In [1]:
import torch

use_cuda = torch.cuda.is_available()
device= torch.device("cuda" if use_cuda else "cpu")
device

device(type='cuda')

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
txt = 'asdasdsDADSDAS'.lower()
txt

'asdasdsdadsdas'

In [6]:
df = []
  
for f in os.listdir('dataset'):
  file1 = open(f'dataset/{f}', 'r')
  lines = file1.readlines()

  # Strips the newline character
  for line in lines:
      line = line.strip()
      if(line[:4] == "MISC"):
        df.append([line[5:], 'MISC'])
      elif(line[:4] == 'CONT'):
        df.append([line[5:], 'CONT'])
      elif(line[:4] == 'AIMX'):
        df.append([line[5:], 'AIMX'])
      elif(line[:4] == 'OWNX'):
        df.append([line[5:], 'OWNX'])
      elif(line[:4] == 'BASE'):
        df.append([line[5:], 'BASE'])
    # count += 1
    # print("Line{}: {}".format(count, line.strip()))

In [7]:
df = pd.DataFrame(df, columns = ['text', 'label'])

## Data Analysis

In [8]:
aimx = df[df.label == 'AIMX']
aimx['text']

2       Here, we present the first genome-scale, fine-...
5       Furthermore, it was converted into a mathemati...
32      In this study, we present a new generation of ...
45      Our study provides the atomic resolution descr...
53      It will then be beneficial to understand the m...
                              ...                        
3056    Here, using computer simulations of isolated p...
3079    we show that   can be studied by using algebra...
3095    in this paper  we approach the problem in its ...
3113    this note states two simple results about defe...
3115    in our informal discussions we will be assumin...
Name: text, Length: 194, dtype: object

In [9]:
from tqdm import tqdm

In [10]:
labels = {
    'MISC': 0,
    'CONT': 1,
    'AIMX': 2, 
    'OWNX': 3,
    'BASE': 4
}

In [11]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['label']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 32, truncation=True,
                                return_tensors="pt") for text in tqdm(df['text'], total=len(df))]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])
    
    def get_batch_variables(self, idx):
        # Fetch a batch of labels
        return np.array(self.variables[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [12]:
np.random.seed(40)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

2493 312 312


In [13]:
train, val = Dataset(df_train), Dataset(df_val)

100%|██████████| 2493/2493 [00:02<00:00, 1210.79it/s]
100%|██████████| 312/312 [00:00<00:00, 1326.85it/s]


In [14]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased")

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [15]:
from torch.optim import Adam
from tqdm import tqdm

EPOCHS = 20
model = BertClassifier().cuda()
LR = 5e-6

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
train_loss = []
train_acc = []
val_loss = []
val_acc = []

In [17]:
def train_func(model, train_dataloader, val_dataloader, learning_rate, epochs):

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)
    
    bs = 64

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0
            total_data_train = 0
            bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
            
            for step, (train_input, train_label) in bar:

                bar.set_description(f"Epoch {epoch_num}")

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()

                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc


                model.zero_grad()
                batch_loss.backward()
                optimizer.step()

                total_data_train+=bs
#                 bar.set_postfix(Epoch_Loss=total_loss_train/total_data_train, 
#                                 Batch_Loss=batch_loss.item())

                total_acc_val = 0
                total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()

                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataloader.dataset): .3f} \
                | Train Accuracy: {total_acc_train / len(train_dataloader.dataset): .3f} \
                | Val Loss: {total_loss_val / len(val_dataloader.dataset): .3f} \
                | Val Accuracy: {total_acc_val / len(val_dataloader.dataset): .3f}')
            train_loss.append(total_loss_train / len(train_dataloader.dataset))
            train_acc.append(total_acc_train / len(train_dataloader.dataset))
            val_loss.append(total_loss_val / len(val_dataloader.dataset))
            val_acc.append(total_acc_val / len(val_dataloader.dataset))

In [18]:
train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val, batch_size=32)

In [19]:
train_func(model, train_dataloader, val_dataloader, LR, EPOCHS)

Epoch 0: 100%|██████████| 78/78 [00:14<00:00,  5.45it/s]


Epochs: 1 | Train Loss:  0.041                 | Train Accuracy:  0.521                 | Val Loss:  0.033                 | Val Accuracy:  0.663


Epoch 1: 100%|██████████| 78/78 [00:14<00:00,  5.54it/s]


Epochs: 2 | Train Loss:  0.028                 | Train Accuracy:  0.717                 | Val Loss:  0.027                 | Val Accuracy:  0.747


Epoch 2: 100%|██████████| 78/78 [00:14<00:00,  5.43it/s]


Epochs: 3 | Train Loss:  0.021                 | Train Accuracy:  0.820                 | Val Loss:  0.023                 | Val Accuracy:  0.798


Epoch 3: 100%|██████████| 78/78 [00:13<00:00,  5.59it/s]


Epochs: 4 | Train Loss:  0.018                 | Train Accuracy:  0.850                 | Val Loss:  0.021                 | Val Accuracy:  0.808


Epoch 4: 100%|██████████| 78/78 [00:14<00:00,  5.56it/s]


Epochs: 5 | Train Loss:  0.015                 | Train Accuracy:  0.870                 | Val Loss:  0.019                 | Val Accuracy:  0.837


Epoch 5: 100%|██████████| 78/78 [00:14<00:00,  5.55it/s]


Epochs: 6 | Train Loss:  0.014                 | Train Accuracy:  0.874                 | Val Loss:  0.018                 | Val Accuracy:  0.833


Epoch 6: 100%|██████████| 78/78 [00:14<00:00,  5.53it/s]


Epochs: 7 | Train Loss:  0.012                 | Train Accuracy:  0.876                 | Val Loss:  0.018                 | Val Accuracy:  0.824


Epoch 7: 100%|██████████| 78/78 [00:14<00:00,  5.45it/s]


Epochs: 8 | Train Loss:  0.012                 | Train Accuracy:  0.878                 | Val Loss:  0.019                 | Val Accuracy:  0.821


Epoch 8: 100%|██████████| 78/78 [00:14<00:00,  5.48it/s]


Epochs: 9 | Train Loss:  0.011                 | Train Accuracy:  0.880                 | Val Loss:  0.018                 | Val Accuracy:  0.830


Epoch 9: 100%|██████████| 78/78 [00:14<00:00,  5.47it/s]


Epochs: 10 | Train Loss:  0.010                 | Train Accuracy:  0.886                 | Val Loss:  0.018                 | Val Accuracy:  0.833


Epoch 10: 100%|██████████| 78/78 [00:14<00:00,  5.47it/s]


Epochs: 11 | Train Loss:  0.010                 | Train Accuracy:  0.887                 | Val Loss:  0.018                 | Val Accuracy:  0.840


Epoch 11: 100%|██████████| 78/78 [00:14<00:00,  5.48it/s]


Epochs: 12 | Train Loss:  0.009                 | Train Accuracy:  0.892                 | Val Loss:  0.018                 | Val Accuracy:  0.843


Epoch 12: 100%|██████████| 78/78 [00:14<00:00,  5.47it/s]


Epochs: 13 | Train Loss:  0.009                 | Train Accuracy:  0.897                 | Val Loss:  0.018                 | Val Accuracy:  0.853


Epoch 13: 100%|██████████| 78/78 [00:14<00:00,  5.45it/s]


Epochs: 14 | Train Loss:  0.008                 | Train Accuracy:  0.901                 | Val Loss:  0.019                 | Val Accuracy:  0.837


Epoch 14: 100%|██████████| 78/78 [00:14<00:00,  5.44it/s]


Epochs: 15 | Train Loss:  0.008                 | Train Accuracy:  0.904                 | Val Loss:  0.018                 | Val Accuracy:  0.846


Epoch 15: 100%|██████████| 78/78 [00:14<00:00,  5.42it/s]


Epochs: 16 | Train Loss:  0.007                 | Train Accuracy:  0.905                 | Val Loss:  0.018                 | Val Accuracy:  0.840


Epoch 16: 100%|██████████| 78/78 [00:14<00:00,  5.41it/s]


Epochs: 17 | Train Loss:  0.007                 | Train Accuracy:  0.909                 | Val Loss:  0.019                 | Val Accuracy:  0.853


Epoch 17: 100%|██████████| 78/78 [00:14<00:00,  5.42it/s]


Epochs: 18 | Train Loss:  0.007                 | Train Accuracy:  0.906                 | Val Loss:  0.019                 | Val Accuracy:  0.853


Epoch 18: 100%|██████████| 78/78 [00:14<00:00,  5.42it/s]


Epochs: 19 | Train Loss:  0.007                 | Train Accuracy:  0.909                 | Val Loss:  0.018                 | Val Accuracy:  0.846


Epoch 19: 100%|██████████| 78/78 [00:14<00:00,  5.43it/s]


Epochs: 20 | Train Loss:  0.007                 | Train Accuracy:  0.905                 | Val Loss:  0.018                 | Val Accuracy:  0.837


In [20]:
from sklearn.metrics import f1_score, classification_report

In [21]:
def predict(s):
    np_array = np.array([labels[s.label]])
    label = torch.from_numpy(np_array)
    train_label = label.to(device)

    true = train_label.item()
        
    text = tokenizer(s.text, padding='max_length', max_length = 32, truncation=True,
                            return_tensors="pt")
    
    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].squeeze(1).to(device)
    
    preds = []
    output = model(input_id, mask)
    preds = torch.argmax(output).cpu().detach().numpy()

    return preds, true

In [22]:
y_true = []
y_pred = []
T = 0
F = 0

for i, row in tqdm(df_test.iterrows(), total=len(df_test)):
    p, t = predict(row)
    y_true.append(t)
    y_pred.append(p)
    if(t in p):
        T+=1
    else:
        F+=1

100%|██████████| 312/312 [00:03<00:00, 81.36it/s]


In [23]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.94      0.90       175
           1       0.81      0.59      0.68        22
           2       0.85      0.52      0.65        21
           3       0.76      0.79      0.77        84
           4       0.75      0.30      0.43        10

    accuracy                           0.83       312
   macro avg       0.81      0.63      0.69       312
weighted avg       0.82      0.83      0.82       312



## With weights

In [24]:
from torch.optim import Adam
from tqdm import tqdm

EPOCHS = 20
model = BertClassifier().cuda()
LR = 5e-6

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
from sklearn.utils.class_weight import compute_class_weight

#compute the class weights
class_weights = compute_class_weight(class_weight='balanced', classes=df.label.unique(), y=df_train.label)
class_weights = np.array(class_weights).astype(np.float32)
for i in range(len(class_weights)):
    x = class_weights[i]
    if(x < 0.5):
        class_weights[i] = 0.5
    elif(x > 10):
        class_weights[i] = 10
class_weights = torch.tensor(class_weights)

In [26]:
class_weights

tensor([ 0.5000,  3.8061,  3.2168,  0.7143, 10.0000])

In [27]:
def train_func(model, train_dataloader, val_dataloader, learning_rate, epochs):

    criterion = nn.CrossEntropyLoss(weight = class_weights)
    optimizer = Adam(model.parameters(), lr= learning_rate)
    
    bs = 64

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0
            total_data_train = 0
            bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
            
            for step, (train_input, train_label) in bar:

                bar.set_description(f"Epoch {epoch_num}")

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()

                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc


                model.zero_grad()
                batch_loss.backward()
                optimizer.step()

                total_data_train+=bs
#                 bar.set_postfix(Epoch_Loss=total_loss_train/total_data_train, 
#                                 Batch_Loss=batch_loss.item())

                total_acc_val = 0
                total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()

                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataloader.dataset): .3f} \
                | Train Accuracy: {total_acc_train / len(train_dataloader.dataset): .3f} \
                | Val Loss: {total_loss_val / len(val_dataloader.dataset): .3f} \
                | Val Accuracy: {total_acc_val / len(val_dataloader.dataset): .3f}')
            train_loss.append(total_loss_train / len(train_dataloader.dataset))
            train_acc.append(total_acc_train / len(train_dataloader.dataset))
            val_loss.append(total_loss_val / len(val_dataloader.dataset))
            val_acc.append(total_acc_val / len(val_dataloader.dataset))

In [28]:
train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val, batch_size=32)

In [29]:
train_func(model, train_dataloader, val_dataloader, LR, EPOCHS)

Epoch 0: 100%|██████████| 78/78 [00:14<00:00,  5.44it/s]


Epochs: 1 | Train Loss:  0.050                 | Train Accuracy:  0.315                 | Val Loss:  0.051                 | Val Accuracy:  0.388


Epoch 1: 100%|██████████| 78/78 [00:14<00:00,  5.41it/s]


Epochs: 2 | Train Loss:  0.047                 | Train Accuracy:  0.530                 | Val Loss:  0.047                 | Val Accuracy:  0.628


Epoch 2: 100%|██████████| 78/78 [00:14<00:00,  5.40it/s]


Epochs: 3 | Train Loss:  0.043                 | Train Accuracy:  0.662                 | Val Loss:  0.042                 | Val Accuracy:  0.679


Epoch 3: 100%|██████████| 78/78 [00:14<00:00,  5.41it/s]


Epochs: 4 | Train Loss:  0.035                 | Train Accuracy:  0.726                 | Val Loss:  0.038                 | Val Accuracy:  0.708


Epoch 4: 100%|██████████| 78/78 [00:14<00:00,  5.42it/s]


Epochs: 5 | Train Loss:  0.029                 | Train Accuracy:  0.774                 | Val Loss:  0.033                 | Val Accuracy:  0.766


Epoch 5: 100%|██████████| 78/78 [00:14<00:00,  5.43it/s]


Epochs: 6 | Train Loss:  0.024                 | Train Accuracy:  0.839                 | Val Loss:  0.031                 | Val Accuracy:  0.776


Epoch 6: 100%|██████████| 78/78 [00:14<00:00,  5.43it/s]


Epochs: 7 | Train Loss:  0.020                 | Train Accuracy:  0.860                 | Val Loss:  0.029                 | Val Accuracy:  0.811


Epoch 7: 100%|██████████| 78/78 [00:14<00:00,  5.42it/s]


Epochs: 8 | Train Loss:  0.017                 | Train Accuracy:  0.874                 | Val Loss:  0.030                 | Val Accuracy:  0.804


Epoch 8: 100%|██████████| 78/78 [00:14<00:00,  5.41it/s]


Epochs: 9 | Train Loss:  0.015                 | Train Accuracy:  0.886                 | Val Loss:  0.028                 | Val Accuracy:  0.827


Epoch 9: 100%|██████████| 78/78 [00:14<00:00,  5.41it/s]


Epochs: 10 | Train Loss:  0.012                 | Train Accuracy:  0.894                 | Val Loss:  0.027                 | Val Accuracy:  0.821


Epoch 10: 100%|██████████| 78/78 [00:14<00:00,  5.41it/s]


Epochs: 11 | Train Loss:  0.011                 | Train Accuracy:  0.900                 | Val Loss:  0.027                 | Val Accuracy:  0.821


Epoch 11: 100%|██████████| 78/78 [00:14<00:00,  5.41it/s]


Epochs: 12 | Train Loss:  0.010                 | Train Accuracy:  0.903                 | Val Loss:  0.028                 | Val Accuracy:  0.833


Epoch 12: 100%|██████████| 78/78 [00:14<00:00,  5.43it/s]


Epochs: 13 | Train Loss:  0.009                 | Train Accuracy:  0.907                 | Val Loss:  0.029                 | Val Accuracy:  0.833


Epoch 13: 100%|██████████| 78/78 [00:14<00:00,  5.43it/s]


Epochs: 14 | Train Loss:  0.009                 | Train Accuracy:  0.909                 | Val Loss:  0.029                 | Val Accuracy:  0.830


Epoch 14: 100%|██████████| 78/78 [00:14<00:00,  5.43it/s]


Epochs: 15 | Train Loss:  0.008                 | Train Accuracy:  0.909                 | Val Loss:  0.030                 | Val Accuracy:  0.824


Epoch 15: 100%|██████████| 78/78 [00:14<00:00,  5.42it/s]


Epochs: 16 | Train Loss:  0.007                 | Train Accuracy:  0.914                 | Val Loss:  0.029                 | Val Accuracy:  0.827


Epoch 16: 100%|██████████| 78/78 [00:14<00:00,  5.43it/s]


Epochs: 17 | Train Loss:  0.007                 | Train Accuracy:  0.914                 | Val Loss:  0.030                 | Val Accuracy:  0.833


Epoch 17: 100%|██████████| 78/78 [00:14<00:00,  5.42it/s]


Epochs: 18 | Train Loss:  0.007                 | Train Accuracy:  0.914                 | Val Loss:  0.033                 | Val Accuracy:  0.827


Epoch 18: 100%|██████████| 78/78 [00:14<00:00,  5.42it/s]


Epochs: 19 | Train Loss:  0.007                 | Train Accuracy:  0.915                 | Val Loss:  0.029                 | Val Accuracy:  0.824


Epoch 19: 100%|██████████| 78/78 [00:14<00:00,  5.42it/s]


Epochs: 20 | Train Loss:  0.007                 | Train Accuracy:  0.913                 | Val Loss:  0.032                 | Val Accuracy:  0.830


In [30]:
from sklearn.metrics import f1_score, classification_report

In [31]:
def predict(s):
    np_array = np.array([labels[s.label]])
    label = torch.from_numpy(np_array)
    train_label = label.to(device)

    true = train_label.item()
        
    text = tokenizer(s.text, padding='max_length', max_length = 32, truncation=True,
                            return_tensors="pt")
    
    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].squeeze(1).to(device)
    
    preds = []
    output = model(input_id, mask)
    preds = torch.argmax(output).cpu().detach().numpy()

    return preds, true

In [32]:
y_true = []
y_pred = []
T = 0
F = 0

for i, row in tqdm(df_test.iterrows(), total=len(df_test)):
    p, t = predict(row)
    y_true.append(t)
    y_pred.append(p)
    if(t in p):
        T+=1
    else:
        F+=1

100%|██████████| 312/312 [00:03<00:00, 82.17it/s]


In [33]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       175
           1       0.73      0.73      0.73        22
           2       0.84      0.76      0.80        21
           3       0.80      0.79      0.80        84
           4       0.86      0.60      0.71        10

    accuracy                           0.85       312
   macro avg       0.82      0.76      0.79       312
weighted avg       0.85      0.85      0.85       312

