In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import nltk
import torch 
import torch.nn as nn
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.metrics import f1_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW
from tqdm import tqdm

warnings.filterwarnings("ignore")

In [3]:
train_df = pd.read_json('train.jsonl', lines=True)
X_train = train_df['string']
y_train = train_df['label']

dev_df = pd.read_json('dev.jsonl', lines=True)
X_dev = dev_df['string']
y_dev = dev_df['label']

test_df = pd.read_json('test.jsonl', lines=True)
test_df = test_df[['string', 'label']]

test_df.describe()

Unnamed: 0,string,label
count,1861,1861
unique,1860,3
top,For datasets with multiple human annotations (...,background
freq,2,997


In [4]:
def cleaning(text):
    stop_words = stopwords.words('english')
    text = text.lower()
    text = ' '.join(x for x in text.split() if x not in stop_words)
    return text

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    words = []
    for x in text.split():
        x = lemmatizer.lemmatize(x)
        words.append(x)
    text = ' '.join(words)
    return text

def preprocessing(text):
    # Tokenization
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
    text = cleaning(text)
    text = lemmatize(text)
    text = ' '.join(tokenizer.tokenize(text))
    return text

In [7]:
roberta_train = train_df[['string','label']]
roberta_train['string'] = roberta_train['string'].apply(lambda x: preprocessing(x))
label_encoder = LabelEncoder()
roberta_train['label'] = label_encoder.fit_transform(roberta_train['label'])
roberta_train

Unnamed: 0,string,label
0,however frataxin interacts fe s cluster biosyn...,0
1,study hickey et al 2012 spike sampled field po...,0
2,drug also reduces catecholamine secretion ther...,0
3,clustering lowly aggressive close kin king 198...,0
4,ophthalmic symptom rare manifestation intracra...,0
...,...,...
8238,importantly result pascalis et al 2005 also re...,0
8239,suggested nguena et al need educate health pro...,0
8240,skeletal muscle also primary site disease mous...,0
8241,activation transcription factor role several t...,1


In [12]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        string = str(self.data.string[index])
        label = int(self.data.label[index])
        encoding = self.tokenizer.encode_plus(
            string,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'string': string,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

## 1st Category: Short data

Define short data as text with number of words <= 25

In [8]:
short_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) <= 25)]

In [10]:
short_df

Unnamed: 0,string,label
9,"After secondary review, 93 studies were includ...",method
15,"[12], is fast and simple to apply as positioni...",background
24,"1a), or individually via sharp electrode penet...",background
33,"According to the literature, the clinical resu...",background
34,The abnormal histological alterations observed...,result
...,...,...
1828,PGA7 has been shown to be upregulated in hypha...,background
1830,bouts of the Windgate Anaerobic Test thus affe...,background
1837,HA have been shown previously to increase surv...,background
1847,"Moreover, DIR1 is required for AA-induced (Jun...",background


In [43]:
roberta_test1 = short_df[['string','label']]
roberta_test1['string'] = roberta_test1['string'].apply(lambda x: preprocessing(x))
roberta_test1['label'] = label_encoder.transform(roberta_test1['label'])
roberta_test1 = roberta_test1.reset_index(drop=True)
roberta_test1

Unnamed: 0,string,label
0,secondary review 93 study included final repor...,1
1,12 fast simple apply positioning irradiation p...,0
2,1a individually via sharp electrode penetratio...,0
3,according literature clinical result tend wors...,0
4,abnormal histological alteration observed grou...,2
...,...,...
257,pga7 shown upregulated hypha 75 regulated bcr1...,0
258,bout windgate anaerobic test thus affecting re...,0
259,ha shown previously increase survival shrimp m...,0
260,moreover dir1 required aa induced jung et al 2...,0


## 2nd Category: Long data

Define long data as text with number of words > 25

In [13]:
long_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) > 25)]

In [14]:
long_df

Unnamed: 0,string,label
0,"Chapel, as well as X10 [2], UPC [3] , CoArray ...",background
1,"In addition, the result of the present study s...",result
2,Several instruments that more specifically add...,background
3,Organotypic hippocampal slice cultures\nInterf...,method
4,Activated PBMC are the basis of the standard P...,background
...,...,...
1855,Recent studies using di¡erent cell types have ...,background
1856,"Additionally, no sex differences were detected...",result
1857,WBRT (40Gy in 20 fractions) along with concurr...,background
1859,"Additionally, encapsulated spheroids may be mu...",background


In [44]:
roberta_test2 = long_df[['string','label']]
roberta_test2['string'] = roberta_test2['string'].apply(lambda x: preprocessing(x))
roberta_test2['label'] = label_encoder.transform(roberta_test2['label'])
roberta_test2 = roberta_test2.reset_index(drop=True)
roberta_test2

Unnamed: 0,string,label
0,chapel well x10 2 upc 3 coarray fortran 6 tita...,0
1,addition result present study support previous...,2
2,several instrument specifically address patien...,0
3,organotypic hippocampal slice culture interfac...,1
4,activated pbmc basis standard pbmc blast assay...,0
...,...,...
1594,recent study using di erent cell type investig...,0
1595,additionally sex difference detected present s...,2
1596,wbrt 40gy 20 fractions along concurrent intrat...,0
1597,additionally encapsulated spheroid may multipl...,0


## 3rd Category: Paragraph data

Define paragraph data as text with number of sentences > 1

In [16]:
paragraph_df = test_df[test_df['string'].apply(lambda x: len(nltk.sent_tokenize(x)) > 1)]

In [18]:
paragraph_df

Unnamed: 0,string,label
3,Organotypic hippocampal slice cultures\nInterf...,method
7,"Therefore, we can compare our findings only wi...",result
8,…as an ocs element (Bouchez et al. 1989; Lam e...,background
9,"After secondary review, 93 studies were includ...",method
13,"This was expected, as the literature has shown...",result
...,...,...
1838,[Ca21]i was ascertained by using the fluoresce...,method
1840,hydrogen-bonded cluster of charged amino acids...,background
1843,One study conducted on elderly patients showed...,background
1849,There are two subpopulations of afferent neuro...,background


In [45]:
roberta_test3 = paragraph_df[['string','label']]
roberta_test3['string'] = roberta_test3['string'].apply(lambda x: preprocessing(x))
roberta_test3['label'] = label_encoder.transform(roberta_test3['label'])
roberta_test3 = roberta_test3.reset_index(drop=True)
roberta_test3

Unnamed: 0,string,label
0,organotypic hippocampal slice culture interfac...,1
1,therefore compare finding data obtained koubek...,2
2,as ocs element bouchez et al 1989 lam et al 19...,0
3,secondary review 93 study included final repor...,1
4,expected literature shown ethylene inhibitory ...,2
...,...,...
408,ca21 i ascertained using fluorescent calcium i...,1
409,hydrogen bonded cluster charged amino acids ca...,0
410,one study conducted elderly patient showed com...,0
411,two subpopulation afferent neuron spiral gangl...,0


## 4th Category: Typo data

In [20]:
def rearrange_letter(word):
    word_list = list(word)
    n = len(word_list)
    if n == 1:
        return ''.join(word_list)
    
    idx = random.randint(0, n - 2)
    word_list[idx], word_list[idx + 1] = word_list[idx + 1], word_list[idx]
    return ''.join(word_list)

def rearrange_word(text):
    words = nltk.word_tokenize(text)
    num_words = len(words)

    # rearrange letter for some random word
    for _ in range(5):
        idx = random.randint(0, num_words - 1)
        words[idx] = rearrange_letter(words[idx])
    
    # rearrange word
    for _ in range(min(3, num_words - 1)):
        idx = random.randint(0, num_words - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]

    return ' '.join(words)

In [21]:
typo_series = test_df['string'].apply(rearrange_word)

typo_df = pd.DataFrame({
    'label': test_df.label,
    'string': typo_series
})

In [22]:
typo_df

Unnamed: 0,label,string
0,background,"Chapel , as well as X10 [ 2 ] , UPC [ 3 ] , Co..."
1,result,"In addition , the result of the present tsudy ..."
2,background,Several instruments that more specifically add...
3,method,Organotypic hippocampal slice cultrues Interfa...
4,background,PBMC Activated are the basis of standard the P...
...,...,...
1856,result,"Additionally , no sex were differences detecte..."
1857,background,WBRT ( 40Gy in 20 along fractions ) with intra...
1858,method,The data from obtained htis crosssectinoal suv...
1859,background,"Additionally , encapsulated spheroids may eb m..."


In [46]:
roberta_test4 = typo_df[['string','label']]
roberta_test4['string'] = roberta_test4['string'].apply(lambda x: preprocessing(x))
roberta_test4['label'] = label_encoder.transform(roberta_test4['label'])
roberta_test4 = roberta_test4.reset_index(drop=True)
roberta_test4

Unnamed: 0,string,label
0,chapel well x10 2 upc 3 coarray fortran 6 tita...,0
1,addition result present tsudy support previous...,2
2,several instrument specifically address patien...,0
3,organotypic hippocampal slice cultrues interfa...,1
4,pbmc activated basis standard pbmc blast assay...,0
...,...,...
1856,additionally sex difference detected present s...,2
1857,wbrt 40gy 20 along fraction intrathecal concur...,0
1858,data obtained htis crosssectinoal suvrey het a...,1
1859,additionally encapsulated spheroid may eb mult...,0


## 5th Category: Synonym data

For each sentence, iterate through the words and convert it to its synonym.

In [25]:
synonymized_test_df = pd.read_json('synonymized.jsonl', lines=True)
synonymized_test_df = synonymized_test_df[['string', 'label']]

synonymized_test_df

Unnamed: 0,string,label
0,"Chapel, as good as X10 [2], UPC [3] , CoArray ...",background
1,"In addition, the effect of the present study b...",result
2,several instrument that more specifically addr...,background
3,Organotypic hippocampal piece civilization int...,method
4,actuate PBMC are the basis of the standard PBM...,background
...,...,...
1856,"Additionally, no sexual practice difference we...",result
1857,WBRT (40Gy in 20 fractions) along with coincid...,background
1858,The information obtain from this crosssectiona...,method
1859,"Additionally, encapsulate ellipsoid of revolut...",background


In [47]:
roberta_test5 = synonymized_test_df[['string','label']]
roberta_test5['string'] = roberta_test5['string'].apply(lambda x: preprocessing(x))
roberta_test5['label'] = label_encoder.transform(roberta_test5['label'])
roberta_test5 = roberta_test5.reset_index(drop=True)
roberta_test5

Unnamed: 0,string,label
0,chapel good x10 2 upc 3 coarray fortran 6 ti 5...,0
1,addition effect present study back old studies...,2
2,several instrument specifically address patien...,0
3,organotypic hippocampal piece civilization int...,1
4,actuate pbmc basis standard pbmc blast check h...,0
...,...,...
1856,additionally sexual practice difference observ...,2
1857,wbrt 40gy 20 fractions along coincidental intr...,0
1858,information obtain crosssectional study dutch ...,1
1859,additionally encapsulate ellipsoid revolution ...,0


## 6th Category: Paraphrased data

In [28]:
paraphrased_test_df = pd.read_json('paraphrased.jsonl', lines=True)
paraphrased_test_df = paraphrased_test_df[['string', 'label']]

paraphrased_test_df

Unnamed: 0,string,label
0,"Chapel, X10, UPC, CoArray Fortran, and Titaniu...",background
1,"Moreover, the findings of this current researc...",result
2,Various tools that are designed to capture pat...,background
3,Organotypic hippocampal slice cultures created...,method
4,Activated PBMCs serve as the fundamental compo...,background
...,...,...
1856,"Moreover, the current study did not find any d...",result
1857,The combination of whole-brain radiation thera...,background
1858,The information collected from this survey con...,method
1859,"Furthermore, combining encapsulated spheroids ...",background


In [48]:
roberta_test6 = paraphrased_test_df[['string','label']]
roberta_test6['string'] = roberta_test6['string'].apply(lambda x: preprocessing(x))
roberta_test6['label'] = label_encoder.transform(roberta_test6['label'])
roberta_test6 = roberta_test6.reset_index(drop=True)
roberta_test6

Unnamed: 0,string,label
0,chapel x10 upc coarray fortran titanium utiliz...,0
1,moreover finding current research align earlie...,2
2,various tool designed capture patient reported...,0
3,organotypic hippocampal slice culture created ...,1
4,activated pbmcs serve fundamental component co...,0
...,...,...
1856,moreover current study find difference based g...,2
1857,combination whole brain radiation therapy admi...,0
1858,information collected survey conducted amsterd...,1
1859,furthermore combining encapsulated spheroid en...,0


In [None]:
from sklearn.metrics import f1_score as calculate_f1_score

MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 2
LEARNING_RATE = 2e-5


tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

train_dataset = CustomDataset(roberta_train, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_datasets = {
    "test1": CustomDataset(roberta_test1, tokenizer, MAX_LEN),
    "test2": CustomDataset(roberta_test2, tokenizer, MAX_LEN),
    "test3": CustomDataset(roberta_test3, tokenizer, MAX_LEN),
    "test4": CustomDataset(roberta_test4, tokenizer, MAX_LEN),
    "test5": CustomDataset(roberta_test5, tokenizer, MAX_LEN),
    "test6": CustomDataset(roberta_test6, tokenizer, MAX_LEN)
}

test_loaders = {
    name: DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
    for name, dataset in test_datasets.items()
}

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [36]:
def train():
    for epoch in range(EPOCHS):
        model.train()
        train_losses = []
        print(f"Epoch {epoch+1}/{EPOCHS}")

        for batch in tqdm(train_loader, total=len(train_loader), desc=f'Epoch {epoch+1} Training'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            train_losses.append(loss.item())

            loss.backward()
            optimizer.step()

        print(f"Avg training loss for Epoch {epoch+1}: {sum(train_losses)/len(train_losses)}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
def evaluate(test_loader):
    model.eval()
    test_losses = []
    test_correct = 0
    test_total = 0
    test_f1_scores = []
    
    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader), desc=f'Evaluation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            test_losses.append(loss.item())

            _, predicted = torch.max(outputs.logits, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

            f1_batch = calculate_f1_score(labels.cpu(), predicted.cpu(), average='macro')
            test_f1_scores.append(f1_batch)

    accuracy = test_correct / test_total
    f1_score_avg = sum(test_f1_scores) / len(test_f1_scores)
    return sum(test_losses)/len(test_losses), accuracy, f1_score_avg

In [37]:
# Execute training and evaluation
print("Starting training...")
train()

Starting training...
Epoch 1/2


Epoch 1 Training: 100%|██████████| 516/516 [1:50:41<00:00, 12.87s/it]


Avg training loss for Epoch 1: 0.5556096460006034
Epoch 2/2


Epoch 2 Training: 100%|██████████| 516/516 [1:49:13<00:00, 12.70s/it]

Avg training loss for Epoch 2: 0.3922859859154668





In [61]:
for name, loader in test_loaders.items():
    print(f"Evaluating on {name}...")
    loss, accuracy, f1_score = evaluate(loader)
    print(f"{name} - Loss: {loss}, Accuracy: {accuracy}, F1-Score: {f1_score}")

Evaluating on test1...


Evaluation: 100%|██████████| 17/17 [01:04<00:00,  3.78s/it]


test1 - Loss: 0.4820978448671453, Accuracy: 0.8091603053435115, F1-Score: 0.7605692935240098
Evaluating on test2...


Evaluation: 100%|██████████| 100/100 [06:33<00:00,  3.93s/it]


test2 - Loss: 0.5194442676007748, Accuracy: 0.8017510944340213, F1-Score: 0.7576678912007958
Evaluating on test3...


Evaluation: 100%|██████████| 26/26 [01:41<00:00,  3.90s/it]


test3 - Loss: 0.5000252537429333, Accuracy: 0.7990314769975787, F1-Score: 0.7841944063761885
Evaluating on test4...


Evaluation: 100%|██████████| 117/117 [07:35<00:00,  3.90s/it]


test4 - Loss: 0.5451847191931855, Accuracy: 0.7909725953788286, F1-Score: 0.735029587644363
Evaluating on test5...


Evaluation: 100%|██████████| 117/117 [07:37<00:00,  3.91s/it]


test5 - Loss: 0.6152601167241223, Accuracy: 0.7667920472864052, F1-Score: 0.6996939579488795
Evaluating on test6...


Evaluation: 100%|██████████| 117/117 [07:36<00:00,  3.91s/it]

test6 - Loss: 0.629040043323468, Accuracy: 0.7581945190757657, F1-Score: 0.7102331887459297



