In [170]:
import gc
import os
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch

In [171]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
#from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from torchmetrics.classification import AUROC
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

In [172]:
# Read the labelled training CSV directly from the GitHub raw URL and preview
# the first five rows.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/theartificialguy/NLP-with-Deep-Learning/master/BERT/Multi%20Label%20Text%20Classification%20using%20BERT%20PyTorch/train.csv'
)
df.head(n=5)

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [173]:
# Build the model input by joining title and abstract with an explicit single
# space. Plain `TITLE + ABSTRACT` only keeps the last word of the title apart
# from the first word of the abstract when the raw abstract happens to start
# with whitespace.
df['context'] = df['TITLE'].str.strip() + ' ' + df['ABSTRACT'].str.strip()
df.head(2)

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,context
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,Reconstructing Subject-Specific Effect Maps P...
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,Rotation Invariance Neural Network Rotation i...


In [174]:
# Drop the raw text and identifier columns; `context` now carries the text.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run after the columns are already gone. `axis=1` was
# redundant next to `columns=` and has been removed.
df = df.drop(columns=['TITLE', 'ABSTRACT', 'ID'], errors='ignore')

In [175]:
# Keep the text column followed by five label columns, in this order.
# NOTE(review): 'Computer Science' is not in the list, so this cell also drops
# that label rather than only reordering -- confirm that this is intended.
df = df.loc[:, ['context', 'Physics', 'Mathematics', 'Statistics',
                'Quantitative Biology', 'Quantitative Finance']]

In [176]:
# Preview the first four rows of the reduced frame.
df.head(n=4)

Unnamed: 0,context,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,Reconstructing Subject-Specific Effect Maps P...,0,0,0,0,0
1,Rotation Invariance Neural Network Rotation i...,0,0,0,0,0
2,Spherical polyharmonics and Poisson kernels fo...,0,1,0,0,0
3,A finite element approximation for the stochas...,0,1,0,0,0


In [177]:
# Rows that carry the Physics, Mathematics and Statistics labels at the same time.
df[df[['Physics', 'Mathematics', 'Statistics']].eq(1).all(axis=1)]

Unnamed: 0,context,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
639,Smallest eigenvalue density for regular or fix...,1,1,1,0,0
9180,A study of periodograms standardized using tra...,1,1,1,0,0
9254,Copy the dynamics using a learning machine Is...,1,1,1,0,0
11294,Low-dose cryo electron ptychography via non-co...,1,1,1,0,0
13077,Construction of and efficient sampling from th...,1,1,1,0,0
15687,An unbiased estimator for the ellipticity from...,1,1,1,0,0
17231,A geometric approach to non-linear correlation...,1,1,1,0,0
17676,Structure and Randomness of Continuous-Time Di...,1,1,1,0,0
20627,Lectures on the mean values of functionals -- ...,1,1,1,0,0


In [178]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train, test = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [179]:
# Report (rows, columns) of both splits, one per line.
print(
    f"Train size shape : {train.shape}",
    f"Test Size shape : {test.shape}",
    sep="\n",
)

Train size shape : (16777, 6)
Test Size shape : (4195, 6)


In [180]:
from transformers import BertTokenizer, BertModel

# Only the tokenizer is needed at this point. The bare BertModel that used to be
# loaded into `model` here was never used before `model = BERTClass()` replaced
# it, so it only cost an extra copy of the pretrained weights in memory.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [181]:
# Every column after the first one (`context`) is a label column.
target_list = list(df.columns[1:])

In [182]:
# Changing the dataset into PyTorch Formats

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one row of text per item.

    :param df: DataFrame with a ``context`` text column and one numeric column
        per label.
    :param tokenizer: object exposing ``encode_plus`` (e.g. ``BertTokenizer``).
    :param max_len: every sample is padded / truncated to this many tokens.
    :param target_cols: names of the label columns. When omitted, the
        notebook-level ``target_list`` is used, as before.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

        if target_cols is None:
            target_cols = target_list
        self.target_cols = list(target_cols)

        # Store plain arrays so that __getitem__ indexes by position. Indexing
        # the Series directly is label-based and raises KeyError (or returns
        # the wrong row) whenever the frame's index is not 0..n-1.
        self.title = df['context'].to_numpy()
        self.targets = df[self.target_cols].to_numpy()

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.title)

    def __getitem__(self, idx):
        """Return the encoded text and float label vector for row ``idx``.

        :return: dict with 1-D ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` tensors plus a float ``targets`` tensor.
        """
        # Collapse runs of whitespace into single spaces. Joining on "" (as the
        # code did before) glued all words together and destroyed the text.
        text = " ".join(str(self.title[idx]).split())

        encoded = self.tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            # return_tensors='pt' yields a leading batch axis; flatten drops it.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'token_type_ids': encoded['token_type_ids'].flatten(),
            # Float targets are what BCEWithLogitsLoss expects.
            'targets': torch.tensor(self.targets[idx], dtype=torch.float),
        }
        

In [183]:
# Split `train` 80/20 into training and validation rows.
# The validation rows are the ones NOT drawn by sample(), so they must be
# selected from `train` using the sampled index *before* that index is reset.
# The previous `train_df.drop(train_df.index)` removed every row from train_df
# itself and always produced an empty validation frame.
train_df = train.sample(frac=0.8, random_state=200)
val_df = train.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

In [184]:
# Token budget per sample: CustomDataset pads or truncates every text to this length.
max_length = 256

# Wrap both splits in CustomDataset so that they can be served by a DataLoader.
train_datasets = CustomDataset(df=train_df, tokenizer=tokenizer, max_len=max_length)
val_datasets = CustomDataset(df=val_df, tokenizer=tokenizer, max_len=max_length)

In [185]:
# Display the dataset object; its default repr only shows the class name and
# memory address, not the data it holds.
train_datasets

<__main__.CustomDataset at 0x393586680>

In [186]:
# Training configuration
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 32  # samples per training / validation batch
EPOCHS = 2                                # passes over the training data
LEARNING_RATE = 1e-5                      # optimizer step size

In [187]:
# Batch both datasets. Training batches are reshuffled on every pass, while
# validation keeps a fixed order; num_workers=0 loads data in the main process.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_datasets,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

val_dataloader = torch.utils.data.DataLoader(
    dataset=val_datasets,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

In [188]:
# Pick the best available backend instead of hard-coding Apple's MPS, so the
# notebook also runs on CUDA machines and on CPU-only hosts. MPS keeps priority
# to preserve the behaviour on the machine this notebook was written on.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [189]:
def load_ckp(checkpoint_path = None, model= None, optimizer=None):
    """
    Restore model (and optionally optimizer) state from a checkpoint file.

    The checkpoint is expected to be a dict with the keys 'state_dict',
    'optimizer', 'epoch' and 'valid_loss_min', i.e. the layout written by the
    training loop in this notebook.

    :param checkpoint_path: path of the checkpoint file to read
    :param model: model whose parameters are overwritten from the checkpoint
    :param optimizer: optimizer to restore; pass None to skip optimizer state
    :return: (model, optimizer, epoch, valid_loss_min) with valid_loss_min as a
        plain Python float
    """
    # Deserialize onto the CPU first so a checkpoint written on one backend
    # (e.g. MPS) can be opened on a machine without it; load_state_dict then
    # copies the values onto whatever device the model already lives on.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    model.load_state_dict(checkpoint['state_dict'])

    # The signature allows optimizer=None, so only restore it when given.
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])

    # The training loop stores a plain float here, on which `.item()` fails.
    # float() accepts both a Python number and a one-element tensor.
    valid_loss_min = float(checkpoint['valid_loss_min'])

    return model, optimizer, checkpoint['epoch'], valid_loss_min



def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    Write a checkpoint and, when it is the best so far, copy it aside.

    :param state: object to serialize with torch.save (the checkpoint dict)
    :param is_best: True when this checkpoint has the lowest validation loss
    :param checkpoint_path: file path that receives the checkpoint
    :param best_model_path: file path that receives the copy when is_best
    :return: None
    """
    # torch.save and shutil.copyfile do not create missing directories, so
    # make sure the target folders exist before writing.
    ckpt_dir = os.path.dirname(checkpoint_path)
    if ckpt_dir:
        os.makedirs(ckpt_dir, exist_ok=True)

    torch.save(state, checkpoint_path)

    if is_best:
        best_dir = os.path.dirname(best_model_path)
        if best_dir:
            os.makedirs(best_dir, exist_ok=True)
        # Copy the file just written instead of serializing the state twice.
        shutil.copyfile(checkpoint_path, best_model_path)

In [190]:
class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear head.

    The head emits one raw logit per label (no sigmoid), which matches the
    BCEWithLogitsLoss used for training in this notebook.

    :param n_classes: number of output logits. Defaults to
        ``len(target_list)`` so the output width always equals the number of
        label columns; the previous hard-coded width of 2 did not match the
        label vectors produced by the dataset and made the loss fail with a
        shape mismatch.
    """

    def __init__(self, n_classes=None):
        super().__init__()
        if n_classes is None:
            n_classes = len(target_list)
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = torch.nn.Dropout(p=0.3)
        # Read the encoder width from its config rather than hard-coding 768.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return logits computed from BERT's pooled output."""
        outputs = self.bert_model(input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids)

        output_dropout = self.dropout(outputs.pooler_output)
        output = self.linear(output_dropout)
        return output
    
# The recorded run of this cell died with an MPS out-of-memory error while
# ~9 GB were already allocated -- presumably models left on the device by
# earlier executions in the same kernel (TODO confirm). Drop the previous model
# reference and hand cached blocks back to the backend before building a new one.
# NOTE: an optimizer created for an older model still pins that model's
# parameters; re-create it after this cell (or restart the kernel).
model = None
gc.collect()
if device.type == 'mps' and hasattr(torch, 'mps') and hasattr(torch.mps, 'empty_cache'):
    torch.mps.empty_cache()
elif device.type == 'cuda':
    torch.cuda.empty_cache()

# Assigning the result avoids echoing the full module tree as the cell output.
model = BERTClass().to(device)

RuntimeError: MPS backend out of memory (MPS allocated: 9.05 GB, other allocations: 688.00 KB, max allowed: 9.07 GB). Tried to allocate 89.42 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and float targets."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

# Adam over every parameter of the model (encoder and classification head).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Module-level buffers that the training loop fills with validation labels and
# sigmoid outputs.
val_targets, val_outputs = [], []

In [None]:
def train_model(n_epochs, training_loader, validation_loader, model, optimizer, checkpoint_path, best_model_path):
    """
    Train ``model`` for ``n_epochs`` and write a checkpoint after every epoch.

    Relies on the notebook-level names ``device``, ``loss_fn``, ``save_ckp``,
    ``val_targets`` and ``val_outputs``. After the call, ``val_targets`` and
    ``val_outputs`` hold the labels and sigmoid outputs of the last validation
    pass only.

    :param n_epochs: number of passes over ``training_loader``
    :param training_loader: iterable of batches; each batch is a dict with the
        keys 'input_ids', 'attention_mask', 'token_type_ids' and 'targets'
    :param validation_loader: same batch format, evaluated without gradients
    :param model: module called as ``model(input_ids, attention_mask, token_type_ids)``
    :param optimizer: optimizer over the model's parameters
    :param checkpoint_path: file that receives the latest checkpoint
    :param best_model_path: file that receives a copy of the best checkpoint
    :return: the trained model
    """
    # Lowest validation loss seen so far (np.Inf no longer exists in NumPy 2).
    valid_loss_min = np.inf

    for epoch in range(1, n_epochs + 1):
        train_loss = 0.0
        valid_loss = 0.0

        model.train()
        print('######### EPOCH {}: Training Start #########'.format(epoch))

        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)

            # One plain update per batch. The earlier "accumulation" stub
            # stepped the optimizer on stale gradients before the loss was
            # even computed and has been removed.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Incremental mean of the batch losses seen so far in this epoch.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('######### EPOCH {}: Training End #########'.format(epoch))
        print('######### EPOCH {}: Validation Start #########'.format(epoch))

        model.eval()

        # Start from empty buffers so they describe this epoch only instead of
        # piling up predictions from every epoch.
        val_targets.clear()
        val_outputs.clear()
        n_val_batches = 0

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
                n_val_batches += 1

        print('######### EPOCH {}: Validation End #########'.format(epoch))

        # train_loss / valid_loss are already per-batch means; dividing them by
        # the loader length again (as before) would understate both.
        print(f'Epoch: {epoch} \tAverage Training Loss: {train_loss:.4f} \tAverage Validation Loss: {valid_loss:.4f}')

        # 'epoch' stores the epoch to resume from, hence epoch + 1.
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'optimizer': optimizer.state_dict(),
            'state_dict': model.state_dict()
        }

        # A validation loss of 0.0 from an empty loader must not count as "best".
        is_best = n_val_batches > 0 and valid_loss <= valid_loss_min
        if is_best:
            print('Validation loss decreased from {:.6f} to {:.6f}'.format(valid_loss_min, valid_loss))
            print('Saving checkpoint...')
            valid_loss_min = valid_loss
        elif n_val_batches == 0:
            print('Validation loader produced no batches; skipping best-model selection.')

        # Single write per epoch; save_ckp copies it to best_model_path if is_best.
        save_ckp(checkpoint, is_best, checkpoint_path, best_model_path)

        print('######### EPOCH {}: Done #########'.format(epoch))

    return model

In [None]:
import os

# Keep the directory and the two checkpoint *files* apart: train_model passes
# ckpt_path to torch.save, which needs a file path, not a folder.
ckpt_dir = os.path.join(os.getcwd(), 'checkpoints')
os.makedirs(ckpt_dir, exist_ok=True)  # torch.save does not create folders

ckpt_path = os.path.join(ckpt_dir, 'current_checkpoint.pth')
best_model_path = os.path.join(ckpt_dir, 'best_model.pth')

In [None]:
# Run the full training / validation loop with the configuration defined above.
trained_model = train_model(
    n_epochs=EPOCHS,
    training_loader=train_dataloader,
    validation_loader=val_dataloader,
    model=model,
    optimizer=optimizer,
    checkpoint_path=ckpt_path,
    best_model_path=best_model_path,
)