In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import nltk
import torch 
import torch.nn as nn
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from utils import eval_model, train_model, augment_data_multiclass

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.filterwarnings("ignore")

## Data Preprocessing

In [3]:
# Read the three dataset splits; each file is JSON Lines (one record per line).
train_df = pd.read_json('../train.jsonl', lines=True)
dev_df = pd.read_json('../dev.jsonl', lines=True)
test_df = pd.read_json('../test.jsonl', lines=True)

# Model input is the 'string' column; the target is the 'label' column.
X_train, y_train = train_df['string'], train_df['label']
X_dev, y_dev = dev_df['string'], dev_df['label']
X_test, y_test = test_df['string'], test_df['label']

# Show the training frame as the cell output.
train_df

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,label_confidence,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label2_confidence
0,explicit,175.0,Introduction,168.0,"However, how frataxin interacts with the Fe-S ...",background,1.0000,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,True,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,11,,
1,explicit,36.0,Novel Quantitative Trait Loci for Seminal Root...,16.0,"In the study by Hickey et al. (2012), spikes w...",background,1.0000,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,True,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,2,,
2,explicit,228.0,Introduction,225.0,"The drug also reduces catecholamine secretion,...",background,1.0000,9cdf605beb1aa1078f235c4332b3024daa8b31dc,4e6a17fb8d7a3cada601d942e22eb5da6d01adbd,False,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,0,,
3,explicit,110.0,Discussion,46.0,By clustering with lowly aggressive close kin ...,background,1.0000,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,False,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,3,,
4,explicit,239.0,Discussion,234.0,Ophthalmic symptoms are rare manifestations of...,background,1.0000,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,False,88b86556857f4374842d2af2e359576806239175>a5bb0...,88b86556857f4374842d2af2e359576806239175>a5bb0...,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8238,explicit,50.0,,28.0,"Importantly, the results of Pascalis et al. (2...",background,0.7350,6f68ccd37718366c40ae6aeedf0b935bf560b215,60ed4bdabf92b2fbd6162dbd8979888cccca55d7,True,6f68ccd37718366c40ae6aeedf0b935bf560b215>60ed4...,6f68ccd37718366c40ae6aeedf0b935bf560b215>60ed4...,15,,
8239,explicit,182.0,DISCUSSION,179.0,"As suggested by Nguena et al, there is a need ...",background,0.7508,f2a1c1704f9587c94ed95bc98179dc499e933f5e,574e659da7f6c62c07bfaaacd1f31d65bd75524c,True,f2a1c1704f9587c94ed95bc98179dc499e933f5e>574e6...,f2a1c1704f9587c94ed95bc98179dc499e933f5e>574e6...,1,,
8240,explicit,120.0,DISCUSSION,108.0,Skeletal muscle is also a primary site of dise...,background,1.0000,18c97ea2ff60c110cc2a523e0fdf729608cbb083,fc13b9c3dfcc121013edaa12fa8ce7842aaed21a,False,18c97ea2ff60c110cc2a523e0fdf729608cbb083>fc13b...,18c97ea2ff60c110cc2a523e0fdf729608cbb083>fc13b...,8,,
8241,explicit,221.0,,185.0,ACTIVATION OF TRANSCRIPTION FACTORS Roles for ...,method,,4ec9b89857c0b27e8a4bd3745b7358f387773527,81affdba19e38e2b17cf7b9e93792cc2028cf21d,True,4ec9b89857c0b27e8a4bd3745b7358f387773527>81aff...,4ec9b89857c0b27e8a4bd3745b7358f387773527>81aff...,0,,


In [4]:
# Augment the training split with the project helper from utils.
# The inputs are read from train_df instead of from X_train/y_train so that
# re-running this cell always starts from the raw data and never augments
# data that has already been augmented.
X_train, y_train = augment_data_multiclass(train_df['string'], train_df['label'])

In [5]:
# Learn the label -> integer id mapping from the training labels only
label_encoder = LabelEncoder().fit(y_train)

# Apply the same mapping to every split so class ids agree across train/dev/test
y_train = label_encoder.transform(y_train)
y_dev = label_encoder.transform(y_dev)
y_test = label_encoder.transform(y_test)

print(y_train)

[0 0 0 ... 2 2 2]


## Model Creation

In [6]:
# Name of the pre-trained checkpoint shared by the tokenizer and the model
model_name = 'albert-base-v2'

# Sequence classifier with a 3-way output head on top of the pre-trained encoder
model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=3)
# Tokenizer matching the checkpoint above
tokenizer = AlbertTokenizer.from_pretrained(model_name)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Print the name of every parameter in the model, one per line
for param_name, _ in model.named_parameters():
    print(param_name)

albert.embeddings.word_embeddings.weight
albert.embeddings.position_embeddings.weight
albert.embeddings.token_type_embeddings.weight
albert.embeddings.LayerNorm.weight
albert.embeddings.LayerNorm.bias
albert.encoder.embedding_hidden_mapping_in.weight
albert.encoder.embedding_hidden_mapping_in.bias
albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight
albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias
albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight
albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias
albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight
albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.bias
albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.weight
albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.bias
albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.weight
albert.encoder.al

## Model Training

In [8]:
# Freeze the whole network first, then re-enable gradients for the
# classification head only (the later pair overrides the earlier one).
for module, trainable in ((model, False), (model.classifier, True)):
    for param in module.parameters():
        param.requires_grad = trainable
# Optional variant (disabled): also set requires_grad = True on
# model.albert.embeddings.parameters() to fine-tune the embeddings.

In [9]:
# Training is left commented out because it takes a very long time;
# uncomment the next line to retrain the model.
# model = train_model(model, tokenizer, 10, X_train, y_train, 1e-3)

## First previously trained model
This is the pre-trained ALBERT model with only the classification (last) layer fine-tuned.

We use:
- learning_rate = 1e-2
- num_epoch = 5
- batch_size = 256
- tokenizer = default AlbertTokenizer
- NO preprocessing before using AlbertTokenizer

In [10]:
# Load the previously trained weights into the model.
# map_location='cpu' allows a checkpoint that may have been saved from a GPU to
# be loaded on a CPU-only machine; load_state_dict then copies the tensors
# into the model's own parameters.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load('albert1.pth', map_location='cpu'))

<All keys matched successfully>

In [11]:
# Compute the F1 score on the training data (X_train / y_train as prepared by
# the earlier cells, i.e. after augmentation and label encoding).
# Previously observed result: 0.6923237234957776
# WARNING: this takes quite long without a GPU
eval_model(model, tokenizer, X_train, y_train)

In [12]:
# Compute the F1 score on the test data.
# Previously observed result: 0.7048704066259638
# WARNING: this takes quite long without a GPU
eval_model(model, tokenizer, X_test, y_test)

## Second previously trained model
This is the ALBERT model with all parameters trained. Note that it takes around 6 hours to train this model on the SOC compute cluster.

We use:
- learning_rate = 4e-5
- num_epoch = 10
- batch_size = 16
- tokenizer = default AlbertTokenizer
- NO preprocessing before using AlbertTokenizer

In [13]:
# Load the previously trained weights into the model.
# map_location='cpu' allows a checkpoint that may have been saved from a GPU to
# be loaded on a CPU-only machine; load_state_dict then copies the tensors
# into the model's own parameters.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load('albert2.pth', map_location='cpu'))

<All keys matched successfully>

In [None]:
# Evaluate the loaded model on the test data.
# Previously observed results:
# f1 = 0.789986023104638 and accuracy = 0.8097796883396023
# WARNING: this takes quite long without a GPU
eval_model(model, tokenizer, X_test, y_test)

## Third previously trained model
This is the ALBERT model with all parameters trained. It takes ~6 hours to train on the SOC cluster.

We use:
- learning_rate = 4e-5
- num_epoch = 10
- batch_size = 16
- tokenizer = default AlbertTokenizer
- text preprocessed with RegexpTokenizer, stopword removal, and lemmatization before using AlbertTokenizer

In [8]:
# Load the previously trained weights into the model.
# map_location='cpu' allows a checkpoint that may have been saved from a GPU to
# be loaded on a CPU-only machine; load_state_dict then copies the tensors
# into the model's own parameters.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load('albert3.pth', map_location='cpu'))

<All keys matched successfully>

In [None]:
# Evaluate the loaded model on the test data.
# Previously observed results:
# f1 = 0.7715051817575284 and accuracy = 0.7920472864051585
# NOTE(review): the section text says this model was trained on text
# preprocessed with RegexpTokenizer, stopword removal and lemmatization, but
# X_test is passed here as loaded from the file; confirm whether the same
# preprocessing has to be applied before evaluating.
# WARNING: this takes quite long without a GPU
eval_model(model, tokenizer, X_test, y_test)