In [None]:
# Install the Hugging Face `transformers` package that the import cell below relies on.
# NOTE(review): the version is not pinned; the `AdamW` import used later may not exist
# in every transformers release — confirm the version this notebook was written against.
!pip install transformers

In [23]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

import transformers
from transformers import AutoTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

#### Setting seeds for reproducibility

In [25]:
# Single seed shared by every random number generator this notebook touches.
SEED = 42

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) exactly once each.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Turn off the cuDNN auto-tuner and request deterministic cuDNN algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


# For use in DataLoader:
def seed_worker(worker_id):
    """Seed NumPy and Python's ``random`` inside a DataLoader worker.

    The seed is derived from ``torch.initial_seed()`` reduced modulo 2**32 so
    that it fits the range accepted by ``np.random.seed``.

    Args:
        worker_id: Index passed by the DataLoader; not used by this function.
    """
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


# Generator handed to every DataLoader built by this notebook.
g = torch.Generator()
g.manual_seed(SEED)

<torch._C.Generator at 0x7ff9f99dc1f0>

In [26]:
class EvidenceTypeDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Each item is a dict holding the thread id, comment id and raw sentence as
    strings, the tokenizer's ``input_ids`` and ``attention_mask`` flattened to
    one dimension, and the integer target as a ``torch.long`` tensor.

    Args:
        thread_id: Indexable collection of thread identifiers.
        comment_id: Indexable collection of comment identifiers.
        sentence: Indexable collection of sentences; its length is the dataset length.
        target: Indexable collection of integer class labels.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Length every sentence is padded / truncated to.
    """

    def __init__(self, thread_id, comment_id, sentence, target, tokenizer, max_len):
        self.thread_id, self.comment_id = thread_id, comment_id
        self.sentence, self.targets = sentence, target
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, item):
        # Identifiers and text are always handed on as plain strings.
        text = str(self.sentence[item])
        thread = str(self.thread_id[item])
        comment = str(self.comment_id[item])

        tokenizer_options = dict(
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        sample = {
            'thread_id': thread,
            'comment_id': comment,
            'sentence_text': text,
            # The tokenizer output is collapsed to a single dimension.
            'input_ids': encoded['input_ids'].reshape(-1),
            'attention_mask': encoded['attention_mask'].reshape(-1),
            'targets': torch.tensor(self.targets[item], dtype=torch.long),
        }
        return sample

In [27]:
class EvidenceTypeClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification layer.

    Args:
        num_classes: Number of output logits.
        model_name: Class (or object) exposing ``from_pretrained``; used to
            build the encoder with ``return_dict=False``.
        pretrained_model_name: Identifier forwarded to ``from_pretrained``.
    """

    def __init__(self, num_classes, model_name, pretrained_model_name):
        super().__init__()

        # return_dict=False makes the encoder return a plain tuple.
        self.bert = model_name.from_pretrained(pretrained_model_name, return_dict=False)
        self.drop = nn.Dropout(p=0.5)
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of tokenized sentences."""
        # The encoder yields a two-element tuple; only the second element is used.
        _first_output, pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.out(self.drop(pooled))
        return logits

In [28]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap a dataframe in an ``EvidenceTypeDataset`` and return a DataLoader over it.

    Args:
        df: DataFrame with ``thread_id``, ``comment_id``, ``sentence`` and
            ``label_int`` columns.
        tokenizer: Tokenizer handed to the dataset.
        max_len: Padding / truncation length handed to the dataset.
        batch_size: Number of items per batch.
        shuffle: Whether to reshuffle the data every epoch. Defaults to ``False``,
            which keeps the original dataframe order.
        num_workers: Number of loader worker processes. Defaults to 4.

    Returns:
        A ``DataLoader`` that uses the notebook-level ``seed_worker`` and
        generator ``g`` so worker seeding and shuffling are reproducible.
    """
    dataset = EvidenceTypeDataset(
        thread_id=df.thread_id.to_numpy(),
        comment_id=df.comment_id.to_numpy(),
        sentence=df.sentence.to_numpy(),
        target=df.label_int.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        worker_init_fn=seed_worker,
        generator=g,
    )

In [29]:
def train_model(model, data_loader, loss_function, optimizer, scheduler, num_examples, device):
    """Run one training pass over ``data_loader`` and report its metrics.

    For every batch the loss is back-propagated, gradients are clipped to a
    norm of 1.0, and both the optimizer and the scheduler are stepped.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        loss_function: Callable taking ``(outputs, targets)``.
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler stepped once per batch.
        num_examples: Denominator used for the accuracy.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss, f1)``: accuracy as a double tensor, the mean of
        the per-batch losses, and the weighted F1 score.
    """
    model = model.train()
    batch_losses = []
    pred_batches = []
    target_batches = []
    correct_predictions = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Index of the largest logit per row is the predicted class.
        preds = torch.max(logits, dim=1)[1]
        loss = loss_function(logits, targets)

        pred_batches.append(preds)
        target_batches.append(targets)

        correct_predictions += (preds == targets).sum()
        batch_losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    all_predictions = torch.cat(pred_batches).cpu()
    all_targets = torch.cat(target_batches).cpu()
    f1 = f1_score(all_targets, all_predictions, average='weighted')

    accuracy = correct_predictions.double() / num_examples
    return accuracy, np.mean(batch_losses), f1

In [30]:
def evaluate_model(model, data_loader, loss_function, num_examples, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    The model is put in evaluation mode and every forward pass runs under
    ``torch.no_grad()``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        loss_function: Callable taking ``(outputs, targets)``.
        num_examples: Denominator used for the accuracy.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss, f1)``: accuracy as a double tensor, the mean of
        the per-batch losses, and the weighted F1 score.
    """
    model = model.eval()
    batch_losses = []
    pred_batches = []
    target_batches = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)

            # Index of the largest logit per row is the predicted class.
            preds = torch.max(logits, dim=1)[1]
            batch_losses.append(loss_function(logits, targets).item())

            pred_batches.append(preds)
            target_batches.append(targets)
            correct_predictions += (preds == targets).sum()

    all_predictions = torch.cat(pred_batches).cpu()
    all_targets = torch.cat(target_batches).cpu()
    f1 = f1_score(all_targets, all_predictions, average='weighted')

    accuracy = correct_predictions.double() / num_examples
    return accuracy, np.mean(batch_losses), f1

In [31]:
def find_best_model(model, loss_function, df_train, df_val, tokenizer, max_len,
                    batch_size, num_epochs, device, print_graph=True,
                    save_file_name='model_state', save_models='last'):
    """Fine-tune ``model`` and record training / validation metrics per epoch.

    Args:
        model: Classifier to fine-tune (updated in place).
        loss_function: Loss passed to ``train_model`` and ``evaluate_model``.
        df_train: Training dataframe.
        df_val: Validation dataframe.
        tokenizer: Tokenizer used to build both data loaders.
        max_len: Padding / truncation length for the tokenizer.
        batch_size: Batch size for both data loaders.
        num_epochs: Number of passes over the training data.
        device: Device batches are moved to.
        print_graph: Accepted for compatibility; not used by this function.
        save_file_name: Checkpoint prefix, used only when ``save_models='all'``.
        save_models: Which checkpoints to write:
            ``'all'``  - ``<save_file_name>_<epoch>.bin`` after every epoch;
            ``'best'`` - ``best_model_state.bin`` whenever the validation F1
            exceeds the best value seen so far;
            ``'last'`` - ``last_model_state.bin`` after the final epoch.
            Any other value disables saving.

    Returns:
        A ``defaultdict(list)`` with one entry per epoch under the keys
        ``training_accuracy``, ``training_loss``, ``training_f1``,
        ``validation_accuracy``, ``validation_loss`` and ``validation_f1``.
    """
    train_data_loader = create_data_loader(df_train, tokenizer, max_len, batch_size)
    val_data_loader = create_data_loader(df_val, tokenizer, max_len, batch_size)

    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=True, no_deprecation_warning=True)

    # The schedule is configured for every optimizer step of the run
    # (batches per epoch x epochs), with no warm-up steps.
    total_steps = len(train_data_loader) * num_epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    history = defaultdict(list)
    best_f1 = 0
    # `epoch` runs from 0 to num_epochs - 1, so this is the index of the final epoch.
    last_epoch = num_epochs - 1

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 56)

        training_accuracy, training_loss, training_f1 = train_model(
            model,
            train_data_loader,
            loss_function,
            optimizer,
            scheduler,
            len(df_train),
            device
        )

        print('Training:   loss {:.3f} - accuracy {:.3f} - f1-score {:.3f}'.format(training_loss, training_accuracy, training_f1))

        validation_accuracy, validation_loss, validation_f1 = evaluate_model(
            model,
            val_data_loader,
            loss_function,
            len(df_val),
            device
        )

        print('Validation: loss {:.3f} - accuracy {:.3f} - f1-score {:.3f}\n'.format(validation_loss, validation_accuracy, validation_f1))

        # Accuracies arrive as tensors; store plain Python floats in the history.
        history['training_accuracy'].append(training_accuracy.cpu().item())
        history['training_loss'].append(training_loss)
        history['training_f1'].append(training_f1)

        history['validation_accuracy'].append(validation_accuracy.cpu().item())
        history['validation_loss'].append(validation_loss)
        history['validation_f1'].append(validation_f1)

        if save_models == 'all':
            torch.save(model.state_dict(), save_file_name+"_"+str(epoch+1)+".bin")
        elif save_models == 'best':
            if validation_f1 > best_f1:
                torch.save(model.state_dict(), 'best_model_state.bin')
                best_f1 = validation_f1
        elif save_models == 'last':
            # Comparing against num_epochs itself would never match and skip the save.
            if epoch == last_epoch:
                torch.save(model.state_dict(), 'last_model_state.bin')

    return history

In [32]:
def get_predictions(model, data_loader, label_encoder, device, save_predictions=True, save_file_name='predictions.csv'):
    """Predict a class for every item in ``data_loader`` and optionally save a CSV.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of batches with ``thread_id``, ``comment_id``,
            ``sentence_text``, ``input_ids``, ``attention_mask`` and ``targets``.
        label_encoder: Object exposing ``inverse_transform``; only used when
            ``save_predictions`` is true.
        device: Device the batch tensors are moved to.
        save_predictions: When true, write ids, sentence, predicted label and
            real label to ``save_file_name``.
        save_file_name: Path of the CSV file to write.

    Returns:
        ``(thread_ids, comment_ids, sentences, predictions, real_values)``; the
        first three are lists, the last two are CPU tensors of integer labels.
    """
    model = model.eval()
    thread_id_list, comment_id_list, sentence_texts = [], [], []
    pred_batches, target_batches = [], []

    with torch.no_grad():
        for batch in data_loader:
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            targets = batch["targets"].to(device)

            thread_id_list.extend(batch['thread_id'])
            comment_id_list.extend(batch['comment_id'])
            sentence_texts.extend(batch["sentence_text"])
            # Index of the largest logit per row is the predicted class.
            pred_batches.append(torch.max(logits, dim=1)[1])
            target_batches.append(targets)

    predictions = torch.cat(pred_batches).cpu()
    real_values = torch.cat(target_batches).cpu()

    if save_predictions:
        # Integer labels are mapped back to their original names for the CSV only.
        df_pred = pd.DataFrame({
            'thread_id': thread_id_list,
            'comment_id': comment_id_list,
            'sentence': sentence_texts,
            'pred_label': label_encoder.inverse_transform(predictions),
            'real_label': label_encoder.inverse_transform(real_values),
        })
        df_pred.to_csv(save_file_name, index=False)

    return thread_id_list, comment_id_list, sentence_texts, predictions, real_values

#### Load in data

In [33]:
# Threads held out for testing and validation; every other thread is used for training.
test_ids = ['t3_6rwcio', 't3_5jfqhp', 't3_71l9yj', 't3_4mj8v7', 't3_58t7i3',
            't3_64kkxe', 't3_6ihcuk', 't3_5o7nm3', 't3_4tf91m', 't3_4q9qng']
val_ids = ['t3_5ep0mh', 't3_4pbwvb', 't3_4g3nbn', 't3_6tsx1p', 't3_62igvv',
           't3_6694ui', 't3_6h7a4i', 't3_4plwqq', 't3_4otmqi', 't3_57tl4k']

df = pd.read_csv('final_dataset.csv')
df.columns = ['sentence', 'original_label', 'thread_id', 'comment_id', 'label']

# Transform labels to integers.
# The encoder is fitted exactly once, on every label in the dataset, so that a
# given label maps to the same integer in the train, validation and test splits
# (re-fitting per split can assign different integers when a split lacks a class).
label_encoder = LabelEncoder()
df['label_int'] = label_encoder.fit_transform(df['label'])

# Split by thread id. `.copy()` makes each split an independent frame, so later
# column assignments cannot trigger pandas' SettingWithCopyWarning.
df_train = df[~df['thread_id'].isin(test_ids + val_ids)].copy()
df_val = df[df['thread_id'].isin(val_ids)].copy()
df_test = df[df['thread_id'].isin(test_ids)].copy()

#### Instantiate models

##### General settings

In [34]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Weight each class by (1 - its relative frequency in the whole dataset), so rarer
# classes contribute more to the loss. The frequencies are read from the `label`
# column and put in sorted label order, which is the order LabelEncoder uses for
# its integer codes once it has seen every class.
class_weights = (1 - (df['label'].value_counts().sort_index() / len(df))).values
class_weights = torch.from_numpy(class_weights).float().to(device)

num_classes = len(df['label'].unique())
loss_function = nn.CrossEntropyLoss(weight=class_weights).to(device)

# Hyper-parameters shared by every model fine-tuned below.
max_len = 128
batch_size = 32
num_epochs = 50

# Encoder class passed to EvidenceTypeClassifier, which calls its from_pretrained().
model_name = BertModel

#### BERT

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint, which the BERT classifier below also loads.
tokenizer_bert = AutoTokenizer.from_pretrained('bert-base-uncased')

##### Fine-tune the pretrained model and save the training history

###### Uncomment to fine-tune



In [None]:
# Build a classifier on top of the pretrained BERT encoder and fine-tune it.
model_bert = EvidenceTypeClassifier(num_classes, model_name, 'bert-base-uncased').to(device)
training_history_bert = find_best_model(
    model_bert,
    loss_function,
    df_train,
    df_val,
    tokenizer_bert,
    max_len,
    batch_size,
    num_epochs,
    device,
)

# Collect the per-epoch metrics in a fixed column order and write them to CSV,
# with the index shifted so that epochs are numbered from 1.
history_df_bert = pd.DataFrame({
    metric: training_history_bert[metric]
    for metric in (
        'training_accuracy',
        'training_f1',
        'training_loss',
        'validation_accuracy',
        'validation_f1',
        'validation_loss',
    )
})
history_df_bert.index += 1
history_df_bert.to_csv('training_history_bert.csv', index_label='epoch')

#### Load in fine-tuned pretrained model state and predict labels on test set

In [None]:
test_data_loader = create_data_loader(df_test, tokenizer_bert, max_len, batch_size)

model = EvidenceTypeClassifier(num_classes, model_name, 'bert-base-uncased')
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU can also be restored on a CPU-only machine.
# NOTE(review): find_best_model writes '<save_file_name>_<epoch>.bin',
# 'best_model_state.bin' or 'last_model_state.bin'; none of those matches this
# file name, so the checkpoint is presumably renamed or supplied by hand — confirm.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load('fine_tuned_model_state.bin', map_location=device))
model = model.to(device)

# Predict on the test split; the predictions are also written to predictions_bert.csv.
thread_ids, comment_ids, y_sentence_texts, y_pred, y_test = get_predictions(
    model,
    test_data_loader,
    label_encoder,
    device,
    save_file_name='predictions_bert.csv'
    )

# Map the integer labels back to their original names for the report.
y_test = label_encoder.inverse_transform(y_test)
y_pred = label_encoder.inverse_transform(y_pred)

print(classification_report(y_test, y_pred))

#### MiniLM

In [None]:
# Tokenizer for the 'microsoft/MiniLM-L12-H384-uncased' checkpoint, which the MiniLM classifier below also loads.
tokenizer_minilm = AutoTokenizer.from_pretrained('microsoft/MiniLM-L12-H384-uncased')

##### Fine-tune the pretrained model and save the training history

###### Uncomment to fine-tune

In [None]:
# Build a classifier on top of the pretrained MiniLM encoder and fine-tune it.
model_minilm = EvidenceTypeClassifier(
    num_classes, model_name, 'microsoft/MiniLM-L12-H384-uncased'
).to(device)
training_history_minilm = find_best_model(
    model_minilm,
    loss_function,
    df_train,
    df_val,
    tokenizer_minilm,
    max_len,
    batch_size,
    num_epochs,
    device,
)

# Collect the per-epoch metrics in a fixed column order and write them to CSV,
# with the index shifted so that epochs are numbered from 1.
history_df_minilm = pd.DataFrame({
    metric: training_history_minilm[metric]
    for metric in (
        'training_accuracy',
        'training_f1',
        'training_loss',
        'validation_accuracy',
        'validation_f1',
        'validation_loss',
    )
})
history_df_minilm.index += 1
history_df_minilm.to_csv('training_history_minilm.csv', index_label='epoch')

#### Load in fine-tuned pretrained model state and predict labels on test set

In [None]:
test_data_loader = create_data_loader(df_test, tokenizer_minilm, max_len, batch_size)

model = EvidenceTypeClassifier(num_classes, model_name, 'microsoft/MiniLM-L12-H384-uncased')
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU can also be restored on a CPU-only machine.
# NOTE(review): this is the same file name the BERT cell loads, and it matches
# none of the names find_best_model writes; the MiniLM checkpoint presumably has
# to be put in place by hand before running this cell — confirm.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load('fine_tuned_model_state.bin', map_location=device))
model = model.to(device)

# Predict on the test split; the predictions are also written to predictions_minilm.csv.
thread_ids, comment_ids, y_sentence_texts, y_pred, y_test = get_predictions(
    model,
    test_data_loader,
    label_encoder,
    device,
    save_file_name='predictions_minilm.csv'
    )

# Map the integer labels back to their original names for the report.
y_test = label_encoder.inverse_transform(y_test)
y_pred = label_encoder.inverse_transform(y_pred)

print(classification_report(y_test, y_pred))