Importing Libraries

In [None]:
! pip install -q pytorch-lightning

In [None]:
! pip install -q bs4
! pip install -q transformers
! pip install spacy

In [None]:
import pandas as pd
import numpy as np
import re
import spacy

import transformers
from transformers import BertModel,BertTokenizer,AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Importing Data

In [None]:
# Read the three Excel workbooks this notebook works on. Judging by the file
# names these are the training sentences, their tags, and the parliament
# corpus that is classified at the end -- confirm against the data.
df_content = pd.read_excel("/content/TRAINING_SENTENCES_CONTENT.xlsx")
df_cat = pd.read_excel("/content/TRAINING_SENTENCES_TAGS.xlsx")
pment_prelim = pd.read_excel("/content/PARLIAMENT_PROTOCOL_CORPUS.xlsx")

# Whitespace-separated word count of every entry in the corpus 'text' column.
questions = pment_prelim['text']
word_cnt = list(map(lambda entry: len(entry.split()), questions))

# Histogram of those word counts (10 bins).
plt.figure(figsize=[8, 5])
plt.hist(word_cnt, bins=10)
plt.xlabel('Word Count/Question')
plt.ylabel('# of Occurences')
plt.title("Frequency of Word Counts/sentence")
plt.show()

Data Cleaning

In [None]:
# Sentence-splitting stage: optionally break every corpus entry into single
# sentences so that each sentence is classified on its own later on.
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 20000000  # raise spaCy's per-document character limit

# "yes": split every entry of pment_prelim into sentences.
# "no" : use the corpus entries unchanged.
EIN_SATZ = "yes"
print(EIN_SATZ)

if EIN_SATZ == "no":
    # Work on a copy so columns added to pment later do not leak into pment_prelim.
    pment = pment_prelim.copy()
else:
    # Only sentences with more than `wortlimit` spaCy tokens are kept.
    wortlimit = 3

    # Collect the rows in a plain list and build the frame once. The previous
    # version emptied a `pment` frame that did not exist yet on a fresh kernel
    # and grew it with DataFrame.append (removed in pandas 2.0, quadratic cost).
    sentence_rows = []
    for raw_text in pment_prelim["text"]:
        for sent in nlp(raw_text).sents:
            sent_text = str(sent)
            if sent_text != " " and len(sent) > wortlimit:
                # ids are 1-based and consecutive over all kept sentences
                sentence_rows.append({'id': len(sentence_rows) + 1, 'text': sent_text})

    pment = pd.DataFrame(sentence_rows, columns=['id', 'text'])

    # Show the first sentence as a quick sanity check (skipped if nothing was kept).
    if not pment.empty:
        print(pment.loc[0, "text"])

    file_name = "/content/Test_eng.xlsx"
    pment.to_excel(file_name)

In [None]:
def pre_process(text):
  """Normalise one raw sentence into lower-case, letters-only text.

  The steps run in an order in which each one can still see what it is
  looking for:

  1. remove URLs (``http://``, ``https://`` and ``www.`` prefixes),
  2. drop non-ASCII characters,
  3. remove apostrophe suffixes (``don't`` -> ``don``),
  4. remove tokens that contain digits,
  5. replace every remaining non-letter by a space and lower-case,
  6. collapse runs of whitespace into a single space.

  Previously the letters-only filter ran first, which turned every later
  step into a no-op and left URL fragments such as ``example com`` behind.

  Args:
    text: the raw sentence. Non-string values (e.g. ``None`` or the NaN
      pandas uses for empty Excel cells) are treated as empty text.

  Returns:
    The cleaned string. It is not stripped, so a single leading or trailing
    space may remain.
  """
  if not isinstance(text, str):
    return ""

  # URLs must go while '://' and '.' are still present in the text.
  text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
  text = text.encode('ascii', 'ignore').decode()
  # Contraction / possessive suffixes, then any token containing a digit.
  text = re.sub(r"'\w+", '', text)
  text = re.sub(r'\w*\d+\w*', '', text)
  # Everything that is still not a letter becomes a separator.
  text = re.sub("[^a-zA-Z]", " ", text)
  text = text.lower()
  text = re.sub(r'\s{2,}', ' ', text)
  return text
# Add a cleaned copy of the raw text column to the training frame and to the
# corpus frame; the raw columns themselves are left untouched.
for frame, raw_col, clean_col in (
    (df_content, 'Content', 'Content_clean'),
    (pment, 'text', 'text_clean'),
):
    frame[clean_col] = frame[raw_col].apply(pre_process)

Initialising BERT

In [None]:
# Name of the pretrained checkpoint; used for the tokenizer here and for
# BertModel.from_pretrained() inside QTagClassifier.
# NOTE(review): this is the *cased* checkpoint, while pre_process() lower-cases
# every text before it is tokenised -- confirm whether an uncased checkpoint
# was intended.
BERT_MODEL_NAME = "bert-base-cased"
Bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [None]:
print(df_content.head(2))
df_content.shape

In [None]:
# Collapse the tag table to one row per 'Line': the new 'tags' column holds the
# array of all 'Tag' values that share that 'Line' value.
df_cat_grouped = df_cat.groupby('Line').apply(lambda x:x['Tag'].values).reset_index(name='tags')
# Show three random rows as a sanity check.
df_cat_grouped.sample (3)

In [None]:
df_calc = pd.merge(df_content,df_cat_grouped,how='inner',on='Line')
print(df_calc.sample(3))
df_calc.shape

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# Encode each row's tag collection as a multi-hot vector: one column per
# distinct tag (column order is given by mlb.classes_), 1 where the tag applies.
mlb = MultiLabelBinarizer()
yt = mlb.fit_transform(df_calc['tags'])
print(yt)
# (number of merged rows, number of distinct tags)
yt.shape

In [None]:
print(yt[0])
print(mlb.inverse_transform(yt[0].reshape(1,-1)))
print(mlb.classes_)

In [None]:
x = df_calc['Content_clean'].tolist()
len(x)

Splitting the Data into Training, Validation and Test Sets

In [None]:
from sklearn.model_selection import train_test_split
# First hold out 10% of all samples as the test set ...
x_train,x_test,y_train,y_test = train_test_split(x, yt, test_size=0.1, random_state=RANDOM_SEED,shuffle=True)
# ... then split the remaining 90% into 80% training / 20% validation
# (roughly 72% / 18% / 10% of the full data set overall).
x_tr,x_val,y_tr,y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=RANDOM_SEED,shuffle=True)

In [None]:
len(x_tr) ,len(x_val), len(x_test)

Model Training

In [None]:
class QTagDataset(Dataset):
    """Map-style dataset pairing each text with its label vector.

    Every item is tokenised on access with ``tokenizer.encode_plus`` and
    padded / truncated to ``max_len`` tokens.

    Args:
        quest: indexable collection of texts.
        tags: indexable collection of label rows, aligned with ``quest``.
        tokenizer: object providing ``encode_plus`` (a BERT tokenizer in this
            notebook).
        max_len: ``max_length`` handed to the tokenizer.
    """

    def __init__(self, quest, tags, tokenizer, max_len):
        self.text = quest
        self.labels = tags
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item_idx):
        """Return the flattened ids, attention mask and float label tensor of one sample."""
        encoding = self.tokenizer.encode_plus(
            self.text[item_idx],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch axis; flatten() removes it
        # so the DataLoader can stack the samples itself.
        sample = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        sample['label'] = torch.tensor(self.labels[item_idx], dtype=torch.float)
        return sample

In [None]:
class QTagDataModule (pl.LightningDataModule):
    """Lightning data module wrapping the train / validation / test splits.

    Args:
        x_tr, y_tr: training texts and their label rows.
        x_val, y_val: validation texts and their label rows.
        x_test, y_test: test texts and their label rows.
        tokenizer: tokenizer handed to every ``QTagDataset``.
        batch_size: batch size used by all three data loaders.
        max_token_len: maximum token length handed to every ``QTagDataset``.
    """

    def __init__(self,x_tr,y_tr,x_val,y_val,x_test,y_test,tokenizer,batch_size=16,max_token_len=200):
        super().__init__()
        self.tr_text = x_tr
        self.tr_label = y_tr
        self.val_text = x_val
        self.val_label = y_val
        self.test_text = x_test
        self.test_label = y_test
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def _make_dataset(self, texts, labels):
        """Build a QTagDataset that shares this module's tokenizer and length limit."""
        return QTagDataset(quest=texts, tags=labels, tokenizer=self.tokenizer, max_len=self.max_token_len)

    def setup(self, stage=None):
        """Create the three datasets; ``stage`` is accepted but not used."""
        self.train_dataset = self._make_dataset(self.tr_text, self.tr_label)
        self.val_dataset = self._make_dataset(self.val_text, self.val_label)
        self.test_dataset = self._make_dataset(self.test_text, self.test_label)

    def train_dataloader(self):
        """Shuffled training loader using two worker processes."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2)

    def val_dataloader(self):
        """Validation loader (not shuffled)."""
        # Honour the configured batch size; it used to be hard-coded to 16.
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Test loader (not shuffled)."""
        # Honour the configured batch size; it used to be hard-coded to 16.
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [None]:
# Count how many texts tokenise to more than `max_word_cnt` BERT tokens
# (special tokens included) -- i.e. how many would be truncated at that length.
max_word_cnt = 300
quest_cnt = sum(
    1
    for question in x
    if len(Bert_tokenizer.encode(question, add_special_tokens=True)) > max_word_cnt
)
print(f'# Question having word count > {max_word_cnt}: is  {quest_cnt}')

In [None]:
# Training hyperparameters used by the cells below.
N_EPOCHS = 12    # Trainer max_epochs and the model's n_epochs
BATCH_SIZE = 32  # batch size handed to QTagDataModule
MAX_LEN = 300    # tokenizer max_length (inputs are padded / truncated to it)
LR = 2e-05       # learning rate handed to the optimizer

In [None]:
QTdata_module = QTagDataModule(x_tr,y_tr,x_val,y_val,x_test,y_test,Bert_tokenizer,BATCH_SIZE,MAX_LEN)
QTdata_module.setup()

In [None]:
class QTagClassifier(pl.LightningModule):
    """Multi-label text classifier: pretrained BERT encoder plus one linear layer.

    The linear layer maps BERT's pooled output to ``n_classes`` logits; the
    loss is ``BCEWithLogitsLoss``, so ``forward`` returns raw logits and
    callers must apply a sigmoid to obtain probabilities.

    Args:
        n_classes: number of output labels.
        steps_per_epoch: optimizer steps per epoch, used to size the learning
            rate schedule. If ``None`` the total step count is taken from the
            attached trainer instead.
        n_epochs: number of training epochs the schedule is laid out for.
        lr: peak learning rate.
    """

    def __init__(self, n_classes=5, steps_per_epoch=None, n_epochs=3, lr=2e-5 ):
        super().__init__()
        # Store the constructor arguments in every checkpoint so that
        # load_from_checkpoint() rebuilds the same architecture instead of
        # silently falling back to the defaults above.
        self.save_hyperparameters()

        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size,n_classes)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self,input_ids, attn_mask):
        """Return the logits (no sigmoid applied) for a batch of encoded texts."""
        output = self.bert(input_ids = input_ids ,attention_mask = attn_mask)
        output = self.classifier(output.pooler_output)

        return output

    def _shared_step(self, batch, log_name):
        """Run the model on one batch, log the loss as ``log_name``; return (loss, logits, labels)."""
        labels = batch['label']
        logits = self(batch['input_ids'], batch['attention_mask'])
        loss = self.criterion(logits, labels)
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss, logits, labels

    def training_step(self,batch,batch_idx):
        """Compute and log the training loss for one batch."""
        loss, outputs, labels = self._shared_step(batch, 'train_loss')
        # Only "loss" is back-propagated; the predictions are detached so the
        # returned dict does not keep the autograd graph alive.
        return {"loss" :loss, "predictions":outputs.detach(), "labels": labels }

    def validation_step(self,batch,batch_idx):
        """Compute and log the validation loss for one batch."""
        loss, _, _ = self._shared_step(batch, 'val_loss')
        return loss

    def test_step(self,batch,batch_idx):
        """Compute and log the test loss for one batch."""
        loss, _, _ = self._shared_step(batch, 'test_loss')
        return loss

    def configure_optimizers(self):
        """Create AdamW with a linear warmup / linear decay schedule stepped per batch.

        Warmup lasts a third of one epoch. The schedule is given the full
        number of training steps (warmup included), and Lightning is told to
        advance it after every optimizer step. Without ``interval: "step"`` it
        would advance only once per epoch, leaving the learning rate at zero
        for the whole first epoch.
        """
        optimizer = AdamW(self.parameters() , lr=self.lr)

        steps_per_epoch = self.steps_per_epoch
        if steps_per_epoch is None:
            # No explicit value given: ask the attached trainer for the total.
            total_steps = int(self.trainer.estimated_stepping_batches)
            steps_per_epoch = total_steps // max(1, self.n_epochs)
        else:
            total_steps = steps_per_epoch * self.n_epochs
        warmup_steps = steps_per_epoch // 3

        scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,total_steps)

        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
        }

In [None]:
# Optimizer steps per epoch. The training DataLoader keeps the final partial
# batch (drop_last is not set), so round up instead of flooring.
steps_per_epoch = (len(x_tr) + BATCH_SIZE - 1) // BATCH_SIZE

# Size the output layer from the fitted label binarizer instead of a
# hard-coded 5, so the model always matches the width of the label vectors.
model = QTagClassifier(
    n_classes=len(mlb.classes_),
    steps_per_epoch=steps_per_epoch,
    n_epochs=N_EPOCHS,
    lr=LR,
)

In [None]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    filename='QTag-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,
    mode='min',
)

In [None]:
# Train on the GPU when one is available and fall back to the CPU otherwise;
# a hard-coded accelerator="gpu" makes the Trainer fail on CPU-only runtimes.
trainer = pl.Trainer(
    max_epochs=N_EPOCHS,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    callbacks=[checkpoint_callback],
)

In [None]:
trainer.fit(model, QTdata_module)

In [None]:
model_path = checkpoint_callback.best_model_path
model_path

In [None]:
len(y_test), len(x_test)

In [None]:
print(f'Number of Questions = {len(x_test)}')

In [None]:
from torch.utils.data import TensorDataset

# Tokenise every test text on its own, padded / truncated to MAX_LEN tokens.
encoded_test = [
    Bert_tokenizer.encode_plus(
        quest,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_token_type_ids=False,
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt',
    )
    for quest in x_test
]

# Each encoding carries a leading batch axis of size 1; concatenating along
# that axis yields one row per test text.
input_ids = torch.cat([enc['input_ids'] for enc in encoded_test], dim=0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encoded_test], dim=0)
labels = torch.tensor(y_test)
TEST_BATCH_SIZE = 64

# Sequential (unshuffled) loader so predictions stay aligned with x_test.
pred_data = TensorDataset(input_ids, attention_masks, labels)
pred_sampler = SequentialSampler(pred_data)
pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=TEST_BATCH_SIZE)

In [None]:
flat_pred_outs = 0
flat_true_labels = 0

In [None]:
# Put the trained model on the selected device and switch it to inference mode.
model = model.to(device)
model.eval()

# Per-batch sigmoid probabilities and the matching true labels, as numpy arrays.
pred_outs, true_labels = [], []
with torch.no_grad():
    for batch in pred_dataloader:
        b_input_ids, b_attn_mask, b_labels = (t.to(device) for t in batch)

        # The model returns logits; the sigmoid turns them into probabilities.
        probabilities = torch.sigmoid(model(b_input_ids, b_attn_mask))

        pred_outs.append(probabilities.detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())

In [None]:
pred_outs[0][0]

In [None]:
flat_pred_outs = np.concatenate(pred_outs, axis=0)
flat_true_labels = np.concatenate(true_labels, axis=0)

In [None]:
flat_pred_outs.shape , flat_true_labels.shape

In [None]:
flat_pred_outs

In [None]:
threshold  = np.arange(0.4,0.51,0.01)
threshold

In [None]:
def classify(pred_prob,thresh):
    """Binarise a table of probabilities.

    Args:
        pred_prob: iterable of rows, each an iterable of probabilities.
        thresh: cut-off; values greater than or equal to it become 1.

    Returns:
        A list of lists of 0/1 ints with the same layout as ``pred_prob``.
    """
    return [
        [1 if tag_prob >= thresh else 0 for tag_prob in row]
        for row in pred_prob
    ]

In [None]:
flat_pred_outs[3]

In [None]:
flat_true_labels[5]

In [None]:
flat_pred_outs[5]

Identifying Optimal Threshold Value

In [None]:
from sklearn import metrics

# All true labels flattened into one long 0/1 vector.
y_true = flat_true_labels.ravel()

# F1 score of the flattened predictions for every candidate threshold, in the
# same order as `threshold`.
# NOTE(review): flat_pred_outs comes from the test split, so the threshold is
# tuned on the same data it is evaluated on -- consider using the validation
# split instead.
scores = [
    metrics.f1_score(y_true, np.array(classify(flat_pred_outs, thresh)).ravel())
    for thresh in threshold
]

In [None]:
opt_thresh = threshold[scores.index(max(scores))]
print(f'Optimal Threshold Value = {opt_thresh}')

In [None]:
y_pred_labels = classify(flat_pred_outs,opt_thresh)
y_pred = np.array(y_pred_labels).ravel()
y_pred[90]

In [None]:
print(metrics.classification_report(y_true,y_pred))

In [None]:
y_pred = mlb.inverse_transform(np.array(y_pred_labels))
y_act = mlb.inverse_transform(flat_true_labels)

df_train = pd.DataFrame({'Body':x_test,'Actual Tags':y_act,'Predicted Tags':y_pred})

Predicting Classifications of Test Data Set

In [None]:
df_train.sample(50)

In [None]:
# Reload the best checkpoint for inference. n_classes is passed explicitly so
# the rebuilt classifier head always matches the fitted label binarizer rather
# than relying on the constructor default.
QTmodel = QTagClassifier.load_from_checkpoint(model_path, n_classes=len(mlb.classes_))
QTmodel.eval()

Predicting Classifications

In [None]:
def predict(question):
    """Predict the tags of a single (already cleaned) text.

    The text is tokenised like the training data, run through ``QTmodel``, and
    every label whose sigmoid probability reaches ``opt_thresh`` is selected.

    Args:
        question: the text to classify.

    Returns:
        The result of ``mlb.inverse_transform`` for the single input row: a
        one-element list holding the tuple of predicted tags (an empty tuple
        if no label reaches the threshold).
    """
    text_enc = Bert_tokenizer.encode_plus(
            question,
            None,
            add_special_tokens=True,
            max_length= MAX_LEN,
            padding = 'max_length',
            return_token_type_ids= False,
            return_attention_mask= True,
            truncation=True,
            return_tensors = 'pt'
    )
    # Run on whatever device the loaded model lives on.
    model_device = next(QTmodel.parameters()).device
    with torch.no_grad():
        logits = QTmodel(
            text_enc['input_ids'].to(model_device),
            text_enc['attention_mask'].to(model_device),
        )
    # opt_thresh was tuned on sigmoid probabilities, so the logits must be
    # squashed before they are compared with it (same `>=` rule as classify()).
    probs = torch.sigmoid(logits)[0].cpu().numpy()
    new_preds = (probs >= opt_thresh).astype(int).reshape(1,-1)
    pred_tags = mlb.inverse_transform(new_preds)
    return pred_tags

Analysing and Predicting Large Text Corpus

In [None]:
print(len(pment))

# Classify every cleaned text and store the result as one plain string per
# row: the predicted tags joined by ", ", or "keineZuordnung" when no tag
# reaches the threshold. The column is assigned once from a list; the former
# per-cell .loc assignment stored set(tags), i.e. a set wrapping the whole
# tag tuple rather than the tags themselves.
analyse = []
for clean_text in pment["text_clean"]:
    tags = predict(clean_text)[0]
    analyse.append(", ".join(map(str, tags)) if tags else "keineZuordnung")
pment["analyse"] = analyse

In [None]:
print(pment)

In [None]:
print(len(pment))

# The classification loop that used to be repeated here was an exact copy of
# the earlier classification cell. The model is in eval mode, so re-running it
# produced the same 'analyse' column while doubling the inference time. The
# column is only checked for presence now.
if "analyse" not in pment.columns:
    raise RuntimeError("Column 'analyse' is missing - run the classification cell above first.")

In [None]:
 print(pment.sample(15))

Excel-Output of Classified Text Units

In [None]:
file_name = "/content/Output_Parlament_eng_multi.xlsx" 
pment.to_excel(file_name)