In [None]:
from google.colab import drive


drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


## Getting the data and unzipping


In [None]:
import gdown 
 
url = 'https://drive.google.com/uc?id=1XPWPSsGGcUgyp0BfOViGOaz8VKLiFw-v' 
output = 'Dataset.zip' 
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1XPWPSsGGcUgyp0BfOViGOaz8VKLiFw-v
To: /content/Dataset.zip
67.2MB [00:00, 69.8MB/s]


'Dataset.zip'

In [None]:
!unzip Dataset.zip

Archive:  Dataset.zip
   creating: task1/
  inflating: task1/non_chembl_papers.csv  
   creating: __MACOSX/
   creating: __MACOSX/task1/
  inflating: __MACOSX/task1/._non_chembl_papers.csv  
  inflating: task1/chembl_papers.csv  
  inflating: __MACOSX/task1/._chembl_papers.csv  
   creating: task2/
  inflating: task2/als_literature.csv  
   creating: __MACOSX/task2/
  inflating: __MACOSX/task2/._als_literature.csv  


### Installing some libraries

In [None]:

!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 7.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 50.9 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 55.8 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

### imports and data loading

In [None]:
import re
import os
import torch
import random
import string
import numpy as np
import pandas as pd
import wordcloud as wc 
import matplotlib.pyplot as plt
from sklearn import linear_model, model_selection, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit 
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint
from scipy.sparse import vstack
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


In [None]:
# random seeds
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

In [None]:
def load_data(path):
    return pd.read_csv(path, index_col=False).drop(columns= ['Unnamed: 0','journal', 'year'])
    
def pre_process(text):
    text = text.lower()
    text = re.sub('\d+', ' N ', text)
    text = re.sub('[^\w^\s]+', ' P ', text)
    text = re.sub('\s+', ' ', text)
    return text

In [None]:
# loading data and cleaning
df_pos = load_data('./task1/chembl_papers.csv')
df_neg = load_data('./task1/non_chembl_papers.csv')
df_pos['tag'] = '1'
df_neg['tag'] = '-1'
df = pd.concat([df_pos, df_neg])
df =df.dropna(subset=['title'])
df.abstact = df.abstact.fillna('')
df.abstact = df.abstact.apply(pre_process)
df.title = df.title.apply(pre_process)
df.mesh = df.mesh.apply(pre_process)

In [None]:
# splitting data into train, val and test
# Note that this would give us the same splits as the shallow experiment because 
# we are freezing the random seeds
# first shuffle 
df = df.sample(frac=1)
# then split
train_thres = int(len(df)*0.85)
val_thres = int(len(df)*0.95)
train = df[:train_thres]
val = df[train_thres:val_thres]
test = df[val_thres:]

### Fine Tuning BERT using titles only 

In [None]:
# Ideally we would use a hugging face model that is trained on the domain 
# e.g. biomedical text, However I couldn't find any like this that is why I am 
# relaying on a vannilla bert
# another thing to note is that we are applying our simple pre-processig 
# pipline on the data, in the normal setting the bert tokenizer would be 
# trained on the domain data and thus would optimize the best way to pre-process
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# tokenizing the titles
train_titles = tokenizer(train.title.to_list(), padding="max_length", truncation=True)
val_titles = tokenizer(val.title.to_list(), padding="max_length", truncation=True)
test_titles = tokenizer(test.title.to_list(), padding="max_length", truncation=True)

In [None]:
encode_labels = lambda el: int(int(el)>0)
train_labels = train.tag.apply(encode_labels).to_list()
val_labels = val.tag.apply(encode_labels).to_list()
test_labels = test.tag.apply(encode_labels).to_list()

In [None]:
class MDCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MDCDataset(train_titles, train_labels)
val_dataset = MDCDataset(val_titles, val_labels)
test_dataset = MDCDataset(test_titles, test_labels)


In [None]:
# preparing the evaluation metrics for 
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

In [15]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch"
    )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset, 
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1408,0.172713,0.957291
2,0.2144,0.18136,0.956004


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1408,0.172713,0.957291
2,0.2144,0.18136,0.956004
3,0.164,0.139282,0.961473


***** Running Evaluation *****
  Num examples = 12433
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=19815, training_loss=0.1407088475422654, metrics={'train_runtime': 18155.4025, 'train_samples_per_second': 17.462, 'train_steps_per_second': 1.091, 'total_flos': 8.341515032500224e+16, 'train_loss': 0.1407088475422654, 'epoch': 3.0})

In [16]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 6217
  Batch size = 64


{'epoch': 3.0,
 'eval_accuracy': 0.9609136239343735,
 'eval_loss': 0.14033368229866028,
 'eval_runtime': 107.8894,
 'eval_samples_per_second': 57.624,
 'eval_steps_per_second': 0.908}