In [None]:
import pandas as pd
import numpy as np
import torch
import torchvision
import sklearn
from sklearn.model_selection import train_test_split
import transformers
import pytorch_lightning as pl
import torchmetrics
from torchmetrics import F1Score
import seaborn as snn

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"



In [None]:
#Random Seed
# Seed the NumPy and PyTorch global random number generators with one value.
seed = 42
np.random.seed(seed)
# Last expression of the cell: torch.manual_seed returns the Generator object
# that the notebook shows as this cell's output.
torch.manual_seed(seed)

<torch._C.Generator at 0x7f8450665e90>

In [None]:
#Training Files
# One training file per language, each read as JSON Lines
# (lines=True: one JSON object per line of the file).
df1 = pd.read_json("en-train.json", lines=True)
df2 = pd.read_json("es-train.json", lines=True)
df3 = pd.read_json("pr-train.json", lines=True)

In [None]:
#Compiling Training Data
# Stack the three frames row-wise. Each frame keeps its own index, so index
# labels can repeat in the combined frame.
df = pd.concat([df1, df2 , df3], axis=0)
# frac=1 samples every row, i.e. returns the whole frame in shuffled order.
df = df.sample(frac=1)

In [None]:
import html
import re

#Text Cleaning
def clean_text(df):
    """Normalise the ``text`` column of ``df`` and return the same frame.

    Each string goes through these steps, in this order:

    1. HTML entities are unescaped (``&amp;`` -> ``&``).
    2. ``@mention`` tokens are replaced by a space.
    3. ``#`` characters are replaced by a space (the tag word is kept).
    4. Newlines are replaced by a space.
    5. ``scheme://...`` URLs are replaced by a space.
    6. Every run of whitespace is collapsed to a single space.

    Leading/trailing spaces are not stripped.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        The same DataFrame object; its ``text`` column is overwritten in place.
    """
    # Raw strings: '\w', '\S' and '\s' inside ordinary string literals are
    # invalid escape sequences and trigger warnings on recent Python versions.
    mention_re = re.compile(r'@\w+')
    url_re = re.compile(r'\w+://\S+')
    whitespace_re = re.compile(r'\s+')

    def _clean_one(s):
        s = html.unescape(s)
        s = mention_re.sub(' ', s)
        s = s.replace('#', ' ')
        s = s.replace('\n', ' ')
        s = url_re.sub(' ', s)
        return whitespace_re.sub(' ', s)

    # Single pass over the column instead of one pass per substitution.
    df['text'] = df['text'].apply(_clean_one)
    return df
    
df = clean_text(df)

In [None]:
df

Unnamed: 0,id,text,label
8151,108151,"District faces fallout, say agents PUBLISHED :...",0
433,95433,En plenas bodas de plata de La venganza será t...,0
2154,102154,"Sonia to appoint CLP leader TNN | Aug 28, 2001...",0
727,90727,21/10/2002 - 21h17 Termina greve geral contra...,1
6270,106270,China LNG shares dumped as its prospects defla...,0
...,...,...,...
960,90960,27/05/2002 - 14h39 Veja as últimas pesquisas ...,0
5191,105191,Former V-C walks out of meet over affiliation ...,0
5390,105390,DEVELOPMENT Sports officials angry over plan t...,0
860,100860,Sinha confident of achieving indirect tax targ...,0


In [None]:
# Hold out 10% of the labelled rows for validation; random_state fixes the split.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=29)
#train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df.label)
# The test set goes through the same clean_text() as the training text so the
# model sees identically formatted input at inference time.
test_df = clean_text(pd.read_json("Portuguese_test.json", lines=True))
# train_df = df

In [None]:
label_column = ["label"]

In [None]:
#Samples
# Take the row at position 42 of the shuffled frame as a sample.
sample_row = df.iloc[42]
sample_comment = sample_row.text
# label_column is a list, so this indexing yields a Series rather than a scalar.
sample_labels = sample_row[label_column]

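Change the model name in `Bert_Model` below to try a different pretrained model. Note that the tokenizer and the stand-alone encoder in this notebook are loaded through XLM-RoBERTa-specific classes.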

In [None]:
#Choosing Model and Tokenizer
Bert_Model = "xlm-roberta-base"
# AutoTokenizer picks the tokenizer class that belongs to the checkpoint, so
# changing Bert_Model does not also require editing this line (ModelTrainEval
# already loads the encoder through AutoModel). use_fast=False requests the
# slow (pure-Python) tokenizer class, which for "xlm-roberta-base" is the same
# XLMRobertaTokenizer that was previously named explicitly.
tokenizer = transformers.AutoTokenizer.from_pretrained(Bert_Model, use_fast = False)

In [None]:
#Sample Encoding
# Encode the sample document as a single padded sequence of 512 token ids.
# truncation=True matches DocumentDataset: without it a document longer than
# max_length is returned at its full length instead of being cut to 512.
encoding = tokenizer.encode_plus(
    sample_comment,
    add_special_tokens = True,
    max_length = 512,
    return_token_type_ids = False,
    padding = "max_length",
    truncation = True,
    return_attention_mask = True,
    return_tensors = "pt",
)

In [None]:
tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze())

In [None]:
#Dataset Class
class DocumentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one document per item.

    Args:
        data: DataFrame with a ``text`` column. Unless ``testData`` is true it
            must also contain the columns named in the module-level
            ``label_column`` list.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_length: Length every document is padded/truncated to.
        testData: When true, items carry no ``labels`` entry and the label
            columns are never read, so an unlabelled frame can be used.

    Each item is a dict with ``comment_text`` (the raw string), ``input_ids``
    and ``attention_mask`` (1-D tensors) and, for labelled data, ``labels``
    (a float tensor with one value per label column).
    """

    def __init__(self, data: pd.DataFrame, tokenizer: "transformers.XLMRobertaTokenizer", max_length: int = 256, testData = False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.test_data = testData

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        # Positional lookup: the frame's index labels are not assumed unique.
        data_row = self.data.iloc[index]
        comment_text = data_row.text

        encoding = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens = True,
            max_length = self.max_length,
            return_token_type_ids = False,
            padding = "max_length",
            truncation = True,
            return_attention_mask = True,
            return_tensors = "pt",
        )

        # return_tensors="pt" yields a leading batch axis; flatten() drops it
        # so the DataLoader can add its own.
        item = dict(
            comment_text = comment_text,
            input_ids = encoding["input_ids"].flatten(),
            attention_mask = encoding["attention_mask"].flatten(),
        )

        if not self.test_data:
            # Labels are read only for labelled data; test frames may not
            # have the label column at all.
            label_values = data_row[label_column].to_numpy(dtype = "float32")
            item["labels"] = torch.tensor(label_values)

        return item
            

In [None]:
#Initialising Dataset Class and Model
# Dataset over the training split with the default max_length.
train_dataset = DocumentDataset(train_df, tokenizer)
# First item of the training dataset, kept as a sample input for later cells.
sample = train_dataset[0]
# Stand-alone encoder (no classification head) loaded from the same checkpoint name.
bert_model = transformers.XLMRobertaModel.from_pretrained(Bert_Model, return_dict = True)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
prediction = bert_model(sample["input_ids"].unsqueeze(dim = 0), sample["attention_mask"].unsqueeze(dim = 0))

In [None]:
#Data Module Class for Organizing Different Datasets for Train/Predict
class DocumentDataModule(pl.LightningDataModule):
    """Bundles the train/validation/test frames and builds their DataLoaders.

    Args:
        train_df: Labelled training frame.
        val_df: Labelled validation frame.
        test_df: Test frame; wrapped with ``testData=True`` so no labels are read.
        tokenizer: Tokenizer passed on to every ``DocumentDataset``.
        batch_size: Batch size of the training loader. The validation and test
            loaders always use a batch size of 1.
        max_length: Token length passed on to every ``DocumentDataset``.
        num_workers: Worker processes per DataLoader (previously fixed at 20).
    """

    def __init__(self, train_df, val_df, test_df, tokenizer, batch_size = 32, max_length = 256, num_workers = 20):
        super().__init__()

        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_length
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Create the three datasets; ``stage`` is accepted but not used."""
        self.train_dataset = DocumentDataset(
            self.train_df,
            self.tokenizer,
            self.max_length
        )

        self.val_dataset = DocumentDataset(
            self.val_df,
            self.tokenizer,
            self.max_length
        )

        self.test_dataset = DocumentDataset(
            self.test_df,
            self.tokenizer,
            self.max_length,
            testData = True
        )

    def _eval_loader(self, dataset):
        """Loader that yields one item at a time, in dataset order."""
        return torch.utils.data.DataLoader(
            dataset,
            batch_size = 1,
            shuffle = False,
            num_workers = self.num_workers
        )

    def train_dataloader(self):
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size = self.batch_size,
            shuffle = True,
            num_workers = self.num_workers
        )

    def val_dataloader(self):
        return self._eval_loader(self.val_dataset)

    def test_dataloader(self):
        return self._eval_loader(self.test_dataset)

In [None]:
#Initialization and Hyperparameters
epochs = 20
batch_size = 32
# max_length is left at the DocumentDataModule default.
data_module = DocumentDataModule(train_df, val_df, test_df, tokenizer, batch_size)
# setup() is called by hand so the dataloaders can be used without a Trainer.
data_module.setup()
# Module-level loss object; ModelTrainEval creates its own BCELoss in __init__.
criterion = torch.nn.BCELoss()

In [None]:
#Actual Training Class
class ModelTrainEval(pl.LightningModule):
    """Pretrained transformer encoder with a sigmoid classification head.

    The encoder named by the module-level ``Bert_Model`` is loaded through
    ``transformers.AutoModel``. Its pooled output feeds one linear layer whose
    sigmoid activations are scored with binary cross-entropy.

    Args:
        n_classes: Number of output units of the linear head.
        steps_per_epoch: Optimiser steps per epoch; only used to size the
            learning-rate schedule.
        epochs: Planned number of epochs; only used to size the schedule.
        learning_rate: Learning rate handed to AdamW.
    """

    def __init__(self, n_classes: int, steps_per_epoch = None, epochs = None, learning_rate = 5e-5):
        super().__init__()

        self.bert = transformers.AutoModel.from_pretrained(Bert_Model, return_dict = True)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, n_classes)
        self.steps_per_epoch = steps_per_epoch
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.criterion = torch.nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels = None):
        """Return ``(loss, probabilities)``; ``loss`` is 0 when no labels are given."""
        output = self.bert(input_ids, attention_mask = attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)

        loss = 0
        # Identity check: comparing a tensor to None with != is not a plain
        # boolean test.
        if labels is not None:
            loss = self.criterion(output, labels)

        return loss, output

    def _labelled_step(self, batch, log_name):
        """Forward a labelled batch and log its loss under ``log_name``."""
        labels = batch["labels"]
        loss, output = self(batch["input_ids"], batch["attention_mask"], labels)

        self.log(log_name, loss, prog_bar = True, logger = True)

        return {"loss": loss, "predictions": output, "labels": labels}

    # The three step methods use the hook names the Lightning Trainer looks
    # for. They must not be called train/validate/test: a `train(batch,
    # batch_idx)` method overrides torch.nn.Module.train(mode), which eval()
    # and freeze() rely on.
    def training_step(self, batch, batch_idx):
        return self._labelled_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        return self._labelled_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        # Test batches carry no labels, so the returned loss is always 0.
        loss, output = self(batch["input_ids"], batch["attention_mask"])

        return loss

    @staticmethod
    def _gather_outputs(outputs):
        """Concatenate the per-step ``labels`` (as int) and ``predictions``."""
        labels = torch.cat([step["labels"].detach().int() for step in outputs])
        predictions = torch.cat([step["predictions"].detach() for step in outputs])
        return labels, predictions

    def end_training(self, outputs):
        """Collect labels and predictions from a list of training-step outputs.

        This is a plain helper, not a Lightning hook name; call it explicitly
        with the dicts returned by ``training_step``.

        Returns:
            ``(labels, predictions)`` tensors covering all given steps.
        """
        return self._gather_outputs(outputs)

    def end_validation(self, outputs):
        """Same as ``end_training`` for validation-step outputs."""
        return self._gather_outputs(outputs)

    def configure_optimizers(self):
        """Build AdamW plus, when the schedule can be sized, a warm-up/cosine schedule."""
        # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
        # weight_decay are set to that class's defaults to keep the same update.
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr = self.learning_rate,
            eps = 1e-6,
            weight_decay = 0.0
        )

        # Without both sizes the schedule cannot be computed; train with a
        # constant learning rate instead of failing on None arithmetic.
        if self.steps_per_epoch is None or self.epochs is None:
            return optimizer

        warmup_steps = self.steps_per_epoch // 3
        # NOTE(review): the schedule's total length normally includes warm-up,
        # so subtracting warmup_steps ends it early - kept as is; confirm intent.
        total_steps = self.steps_per_epoch * self.epochs - warmup_steps

        scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer,
            warmup_steps,
            total_steps
        )

        # warmup_steps/total_steps are counted in optimiser steps, so the
        # scheduler has to advance every step rather than once per epoch
        # (Lightning's default interval).
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [None]:
#Initialising Model with required Hyperparameters
# One output unit; steps_per_epoch is the number of full training batches.
model = ModelTrainEval(n_classes= 1, steps_per_epoch= len(train_df) // batch_size, epochs= epochs)
# Overrides the constructor's default learning rate after construction.
model.learning_rate = 2.75e-05

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
_, prediction = model(sample["input_ids"].unsqueeze(dim = 0), sample["attention_mask"].unsqueeze(dim = 0))

In [None]:
#Initializing Trainer for Model Training
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import  ModelCheckpoint

# Checkpointing and early stopping both watch the logged validation loss.
checkpoint_callback = ModelCheckpoint(monitor='val_loss')

# accelerator/devices replace the deprecated `gpus=1` argument.
trainer = pl.Trainer(
    max_epochs = epochs,
    accelerator = "gpu",
    devices = 1,
    callbacks = [
        # Stop once val_loss has failed to improve for 2 consecutive checks.
        EarlyStopping(monitor='val_loss', patience=2),
        checkpoint_callback,
    ],
)

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
#trainer.fit(model, data_module)

In [None]:
#Save/Load Models
#torch.save(model.state_dict(), "all_train.pt")
# map_location="cpu" makes the checkpoint loadable regardless of the device its
# tensors were saved from; load_state_dict then copies them into the model.
model.load_state_dict(torch.load("XLMR_all_lang.pt", map_location = "cpu"))


<All keys matched successfully>

In [None]:
model.freeze()

In [None]:
device = torch.device("cuda:0")


In [None]:
#Storing True labels and Prediction Labels for Val Dataset
predictions = []
targets = []
# no_grad: inference only. This keeps the outputs free of autograd history
# (so .numpy() works on them) without depending on model.freeze() having run.
with torch.no_grad():
    for testObj in data_module.val_dataloader():
        _, test_pred = model(testObj["input_ids"], testObj["attention_mask"])
        targets.append(testObj["labels"])
        predictions.append(test_pred)

In [None]:
# One scalar per validation batch: the first element of each flattened output.
prediction_np = [batch_pred.flatten().numpy()[0] for batch_pred in predictions]

In [None]:
# One integer label per validation batch: the first element of each flattened target.
targets_np = [int(batch_target.flatten().numpy()[0]) for batch_target in targets]

In [None]:
#Function to find Threshold for best f1 score and the best f1 score.
def max_f1(target, prediction):
    """Scan decision thresholds and return the one with the best macro F1.

    Thresholds 0.01, 0.02, ..., 0.98 are tried; a score counts as positive
    when it is strictly greater than the threshold.

    Args:
        target: Sequence of true 0/1 labels.
        prediction: Sequence of predicted scores, same length as ``target``.

    Returns:
        ``(threshold, f1_max)`` where ``threshold`` is the best threshold as an
        integer percentage (e.g. 60 means 0.60) and ``f1_max`` its macro F1.
        On ties the lowest threshold wins; ``(0, 0)`` if no threshold scores
        above 0.
    """
    # Imported here so the function does not rely on `import sklearn` having
    # loaded the metrics submodule as a side effect.
    from sklearn.metrics import f1_score

    # Convert once instead of on every loop iteration.
    target = np.asarray(target)
    prediction = np.asarray(prediction)

    f1_max = 0
    threshold = 0
    for i in range(1, 99, 1):
        score = f1_score(target, prediction > (i / 100), average= "macro")

        if score > f1_max:
            f1_max = score
            threshold = i

    return threshold, f1_max

# Evaluate on the validation labels/scores collected in the cells above;
# without this call the names printed below would be undefined.
threshold, f1_max = max_f1(targets_np, prediction_np)
print(f"Threshold = {threshold}, F1 Score = {f1_max}")
    



In [None]:
#Storing Predictions of Test Dataset
predictions = []
# no_grad: inference only. This keeps the outputs free of autograd history
# (so .numpy() works on them) without depending on model.freeze() having run.
with torch.no_grad():
    for testObj in data_module.test_dataloader():
        _, test_pred = model(testObj["input_ids"], testObj["attention_mask"])
        predictions.append(test_pred)

# One scalar per test batch: the first element of each flattened output.
prediction_np = [pred.cpu().flatten().numpy()[0] for pred in predictions]

In [None]:
prediction_np

In [None]:
len(prediction_np)

In [None]:
test_df

In [None]:
# Binarise the scores: 1 when the score is at least 0.6, otherwise 0.
ans = [int(score >= 0.6) for score in prediction_np]
ans


In [None]:
# Build one {"prediction", "id"} record per test document, in test_df order.
# Predictions are matched to ids by position, so the two must be equally long.
if len(ans) != len(test_df):
    raise ValueError(
        f"{len(ans)} predictions for {len(test_df)} test documents"
    )

json_list = []
# doc_id rather than `id`, which would shadow the builtin for the rest of the notebook.
for doc_id, label in zip(test_df["id"], ans):
    sub = {"prediction": label, "id": doc_id}
    print(str(sub))
    json_list.append(sub)


In [None]:
import json
# Write the submission as JSON Lines: one serialised record per line.
with open("Portuguese_XLMR_Softmax_Submission.json", "w", encoding="utf-8") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in json_list)