In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.metrics import f1_score

In [2]:
train_df = pd.read_json('train.jsonl', lines=True)
X_train = train_df['string']
y_train = train_df['label']

dev_df = pd.read_json('dev.jsonl', lines=True)
X_dev = dev_df['string']
y_dev = dev_df['label']

test_df = pd.read_json('test.jsonl', lines=True)
X_test = test_df['string']
y_test = test_df['label']

test_df.describe()

Unnamed: 0,citeEnd,citeStart,excerpt_index,label_confidence
count,1858.0,1858.0,1861.0,193.0
mean,162.018837,143.779871,1.266523,0.822149
std,90.325071,79.448005,2.410082,0.183872
min,3.0,0.0,0.0,0.3892
25%,107.0,92.0,0.0,0.731
50%,151.0,135.0,0.0,0.7691
75%,208.0,189.75,1.0,1.0
max,1162.0,804.0,17.0,1.0


In [3]:
def cleaning(text):
    stop_words = stopwords.words('english')
    text = text.lower()
    text = ' '.join(x for x in text.split() if x not in stop_words)
    return text

In [4]:
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    words = []
    for x in text.split():
        x = lemmatizer.lemmatize(x)
        words.append(x)
    text = ' '.join(words)
    return text

In [5]:
def preprocessing(text):
    # Tokenization
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
    text = cleaning(text)
    text = lemmatize(text)
    text = ' '.join(tokenizer.tokenize(text))
    return text

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [7]:
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from tqdm import tqdm
base_train = train_df[['string','label']]
base_train['string'] = base_train['string'].apply(lambda x: preprocessing(x))
base_test = test_df[['string','label']]
base_test['string'] = base_test['string'].apply(lambda x: preprocessing(x))

label_encoder = LabelEncoder()
base_train['label'] = label_encoder.fit_transform(base_train['label'])
base_test['label'] = label_encoder.transform(base_test['label'])
base_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_train['string'] = base_train['string'].apply(lambda x: preprocessing(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_test['string'] = base_test['string'].apply(lambda x: preprocessing(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_train['label'] = label_encoder.fit_transform(b

Unnamed: 0,string,label
0,however frataxin interacts fe s cluster biosyn...,0
1,study hickey et al 2012 spike sampled field po...,0
2,drug also reduces catecholamine secretion ther...,0
3,clustering lowly aggressive close kin king 198...,0
4,ophthalmic symptom rare manifestation intracra...,0
...,...,...
8238,importantly result pascalis et al 2005 also re...,0
8239,suggested nguena et al need educate health pro...,0
8240,skeletal muscle also primary site disease mous...,0
8241,activation transcription factor role several t...,1


In [8]:
class Dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        string = str(self.data.string[index])
        label = int(self.data.label[index])
        encoding = self.tokenizer.encode_plus(
            string,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'string': string,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


## 1st Category: Short data

Define short data as text with number of words <= 25

In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
short_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) <= 25)]

In [13]:
short_df

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label_confidence
9,explicit,75.0,Results,68.0,"After secondary review, 93 studies were includ...",method,a33b22565c6579e2d728d755be0b31111bb98981,1a454884d27fd063878dadecd0054e862fe63d52,False,a33b22565c6579e2d728d755be0b31111bb98981>1a454...,a33b22565c6579e2d728d755be0b31111bb98981>1a454...,1,,
15,explicit,4.0,Discussion,0.0,"[12], is fast and simple to apply as positioni...",background,33780743b0d4fd2f73d4618f1fa6d833f3a6c1bb,6d8ce1bedc01f4020fe403a8e2237278bb664f07,False,33780743b0d4fd2f73d4618f1fa6d833f3a6c1bb>6d8ce...,33780743b0d4fd2f73d4618f1fa6d833f3a6c1bb>6d8ce...,0,,
24,explicit,96.0,Results,72.0,"1a), or individually via sharp electrode penet...",background,5227ec6b6b9e6630d3eec53e8784b1d43b97febf,61ba484c3fd142d5ddf606eae1a549fe29594208,False,5227ec6b6b9e6630d3eec53e8784b1d43b97febf>61ba4...,5227ec6b6b9e6630d3eec53e8784b1d43b97febf>61ba4...,12,,0.4824
33,explicit,112.0,DISCUSSION,109.0,"According to the literature, the clinical resu...",background,031a4d6f7ab525db9e782396b731d87fdc092cc1,92293f35ea20d289a5b638602e99d42aa3e1d8b5,False,031a4d6f7ab525db9e782396b731d87fdc092cc1>92293...,031a4d6f7ab525db9e782396b731d87fdc092cc1>92293...,1,,
34,explicit,148.0,4. Discussion,141.0,The abnormal histological alterations observed...,result,b84a6ebbf7328c48a4113f17818c60e4cbd4f3f7,,False,b84a6ebbf7328c48a4113f17818c60e4cbd4f3f7>None,b84a6ebbf7328c48a4113f17818c60e4cbd4f3f7>None_0,0,supportive,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1828,explicit,52.0,DISCUSSION,48.0,PGA7 has been shown to be upregulated in hypha...,background,a5052c7b845059b8f66d563c40fe40b0e10cd7e9,ad2df409527703c46ff484098ce2894173e5dff6,False,a5052c7b845059b8f66d563c40fe40b0e10cd7e9>ad2df...,a5052c7b845059b8f66d563c40fe40b0e10cd7e9>ad2df...,0,,
1830,explicit,110.0,Discussion,103.0,bouts of the Windgate Anaerobic Test thus affe...,background,99093d37b4ebfc542f88d14a84807027fd5bc0a0,216a01bc9980348e4e68453c1ab769e447f3d3a7,False,99093d37b4ebfc542f88d14a84807027fd5bc0a0>216a0...,99093d37b4ebfc542f88d14a84807027fd5bc0a0>216a0...,0,,
1837,explicit,109.0,4. Discussion,82.0,HA have been shown previously to increase surv...,background,bb3074e425bfaf99eb204c6130e9a54eee9dec15,11c8eb51792645248ca534461912d88079f7ad4f,False,bb3074e425bfaf99eb204c6130e9a54eee9dec15>11c8e...,bb3074e425bfaf99eb204c6130e9a54eee9dec15>11c8e...,1,,
1847,explicit,126.0,DISCUSSION,103.0,"Moreover, DIR1 is required for AA-induced (Jun...",background,acf4f8e5546446508c0f22bd5e386abb1e91426b,09310473ea701efa2a538e2aa1a5c1d000075be3,True,acf4f8e5546446508c0f22bd5e386abb1e91426b>09310...,acf4f8e5546446508c0f22bd5e386abb1e91426b>09310...,3,,1.0000


In [14]:
base_test1 = short_df[['string','label']]
base_test1['string'] = base_test1['string'].apply(lambda x: preprocessing(x))
base_test1['label'] = label_encoder.transform(base_test1['label'])
base_test1 = base_test1.reset_index(drop=True)
base_test1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_test1['string'] = base_test1['string'].apply(lambda x: preprocessing(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_test1['label'] = label_encoder.transform(base_test1['label'])


Unnamed: 0,string,label
0,secondary review 93 study included final repor...,1
1,12 fast simple apply positioning irradiation p...,0
2,1a individually via sharp electrode penetratio...,0
3,according literature clinical result tend wors...,0
4,abnormal histological alteration observed grou...,2
...,...,...
257,pga7 shown upregulated hypha 75 regulated bcr1...,0
258,bout windgate anaerobic test thus affecting re...,0
259,ha shown previously increase survival shrimp m...,0
260,moreover dir1 required aa induced jung et al 2...,0


## 2nd Category: Long data

Define long data as text with number of words > 25

In [15]:
long_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) > 25)]

In [16]:
long_df

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label_confidence
0,acronym,31.0,,28.0,"Chapel, as well as X10 [2], UPC [3] , CoArray ...",background,2c6797dab4c118cb73197f65ba39dacc99ac743d,95c37bc99982d33873fd141ee00857160fd717a0,True,2c6797dab4c118cb73197f65ba39dacc99ac743d>95c37...,2c6797dab4c118cb73197f65ba39dacc99ac743d>95c37...,6,,
1,explicit,166.0,Discussion,156.0,"In addition, the result of the present study s...",result,fa7145adc9f8cfb8af7a189d9040c13c84ced094,20e23b4f76761d246a7c3b00b80e139e2008f77d,False,fa7145adc9f8cfb8af7a189d9040c13c84ced094>20e23...,fa7145adc9f8cfb8af7a189d9040c13c84ced094>20e23...,0,supportive,
2,explicit,145.0,Discussion,137.0,Several instruments that more specifically add...,background,98a8d8c0c5dae246720d4f339b88e8a9f44e3002,bd222c7ec83dadefba513738290b3624f6dd6b21,True,98a8d8c0c5dae246720d4f339b88e8a9f44e3002>bd222...,98a8d8c0c5dae246720d4f339b88e8a9f44e3002>bd222...,1,,
3,explicit,158.0,Methods,137.0,Organotypic hippocampal slice cultures\nInterf...,method,aeb178ef1910a61152cd74209c28641199c82855,754c04953c261072fa367f4104e3deff082d9484,False,aeb178ef1910a61152cd74209c28641199c82855>754c0...,aeb178ef1910a61152cd74209c28641199c82855>754c0...,1,,1.0
4,explicit,186.0,RESULTS,178.0,Activated PBMC are the basis of the standard P...,background,e4d2591ac3bb65e2ec59f092884a7b15b8018592,f0fb468a54fe8021bc7986a1618222c4fcd16df4,False,e4d2591ac3bb65e2ec59f092884a7b15b8018592>f0fb4...,e4d2591ac3bb65e2ec59f092884a7b15b8018592>f0fb4...,6,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855,explicit,184.0,Discussion,160.0,Recent studies using di¡erent cell types have ...,background,34ca39c05705b98ca3cff26d2e579f26836327f7,6c1b814510a75fe567702e152dc5d1f9059618d2,False,34ca39c05705b98ca3cff26d2e579f26836327f7>6c1b8...,34ca39c05705b98ca3cff26d2e579f26836327f7>6c1b8...,0,,
1856,explicit,155.0,4. Discussion,136.0,"Additionally, no sex differences were detected...",result,3cf9c7cd259a356839f42ecf143af3a8f6ef8b54,74cbd6d0eeb051b036f806d8a86c3a85859f9d7d,False,3cf9c7cd259a356839f42ecf143af3a8f6ef8b54>74cbd...,3cf9c7cd259a356839f42ecf143af3a8f6ef8b54>74cbd...,0,supportive,
1857,explicit,188.0,3. Discussion,184.0,WBRT (40Gy in 20 fractions) along with concurr...,background,e609824e9ea6bee5aca817238d81d1cdd6b462ad,f7bfdcf8892a561b6030ed541924551fb78acf1f,False,e609824e9ea6bee5aca817238d81d1cdd6b462ad>f7bfd...,e609824e9ea6bee5aca817238d81d1cdd6b462ad>f7bfd...,1,,
1859,explicit,360.0,Discussion,321.0,"Additionally, encapsulated spheroids may be mu...",background,62ac94ab9227b84f1317edad1b6312e311981961,df5084196ea93af9250fae27c981ea3d7959599d,False,62ac94ab9227b84f1317edad1b6312e311981961>df508...,62ac94ab9227b84f1317edad1b6312e311981961>df508...,1,,


In [17]:
base_test2 = long_df[['string','label']]
base_test2['string'] = base_test2['string'].apply(lambda x: preprocessing(x))
base_test2['label'] = label_encoder.transform(base_test2['label'])
base_test2 = base_test2.reset_index(drop=True)
base_test2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_test2['string'] = base_test2['string'].apply(lambda x: preprocessing(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_test2['label'] = label_encoder.transform(base_test2['label'])


Unnamed: 0,string,label
0,chapel well x10 2 upc 3 coarray fortran 6 tita...,0
1,addition result present study support previous...,2
2,several instrument specifically address patien...,0
3,organotypic hippocampal slice culture interfac...,1
4,activated pbmc basis standard pbmc blast assay...,0
...,...,...
1594,recent study using di erent cell type investig...,0
1595,additionally sex difference detected present s...,2
1596,wbrt 40gy 20 fractions along concurrent intrat...,0
1597,additionally encapsulated spheroid may multipl...,0


## 3rd Category: Paragraph data

Define paragraph data as text with number of sentences > 1

In [18]:
paragraph_df = test_df[test_df['string'].apply(lambda x: len(nltk.sent_tokenize(x)) > 1)]

In [19]:
paragraph_df

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label_confidence
3,explicit,158.0,Methods,137.0,Organotypic hippocampal slice cultures\nInterf...,method,aeb178ef1910a61152cd74209c28641199c82855,754c04953c261072fa367f4104e3deff082d9484,False,aeb178ef1910a61152cd74209c28641199c82855>754c0...,aeb178ef1910a61152cd74209c28641199c82855>754c0...,1,,1.0
7,explicit,90.0,Discussion,70.0,"Therefore, we can compare our findings only wi...",result,8e8e19bf6e9d7bcf921e91f369a140da2e86bfa3,,False,8e8e19bf6e9d7bcf921e91f369a140da2e86bfa3>None,8e8e19bf6e9d7bcf921e91f369a140da2e86bfa3>None_0,0,supportive,
8,explicit,257.0,Discussion,240.0,…as an ocs element (Bouchez et al. 1989; Lam e...,background,891bcbe15cb6e49c2d95d9de5af618771aa2ea12,facd81eed1e7851736b2213402e1cd523135cb83,False,891bcbe15cb6e49c2d95d9de5af618771aa2ea12>facd8...,891bcbe15cb6e49c2d95d9de5af618771aa2ea12>facd8...,1,,
9,explicit,75.0,Results,68.0,"After secondary review, 93 studies were includ...",method,a33b22565c6579e2d728d755be0b31111bb98981,1a454884d27fd063878dadecd0054e862fe63d52,False,a33b22565c6579e2d728d755be0b31111bb98981>1a454...,a33b22565c6579e2d728d755be0b31111bb98981>1a454...,1,,
13,explicit,163.0,4. Discussion,146.0,"This was expected, as the literature has shown...",result,1a8e2dac8bf06a7a2db9b9928b442921efa69061,533fdd3ebedaa0fdee7760ecbe828c9c4e11af08,True,1a8e2dac8bf06a7a2db9b9928b442921efa69061>533fd...,1a8e2dac8bf06a7a2db9b9928b442921efa69061>533fd...,4,supportive,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,properNoun,91.0,METHODS,75.0,[Ca21]i was ascertained by using the fluoresce...,method,27881300024ef59f3ca30c690fa773e8bdb9a6cf,a17ece8b1a76e94c590a86b7360bb856a9f6ddfe,False,27881300024ef59f3ca30c690fa773e8bdb9a6cf>a17ec...,27881300024ef59f3ca30c690fa773e8bdb9a6cf>a17ec...,0,,
1840,explicit,127.0,Results,117.0,hydrogen-bonded cluster of charged amino acids...,background,0e0283cd0cb50fdd5c3dcb23c679e3b3952148d7,441ed8876cf13c9a50da3adda9b243b62a29b9d4,True,0e0283cd0cb50fdd5c3dcb23c679e3b3952148d7>441ed...,0e0283cd0cb50fdd5c3dcb23c679e3b3952148d7>441ed...,8,,
1843,explicit,216.0,Discussion,212.0,One study conducted on elderly patients showed...,background,25a76ad4b8c70bd007e66ea1ff226c0235e90e81,7835ee1e13225b289a0284c28d2e5232528930bc,False,25a76ad4b8c70bd007e66ea1ff226c0235e90e81>7835e...,25a76ad4b8c70bd007e66ea1ff226c0235e90e81>7835e...,0,,
1849,explicit,226.0,RESULTS,209.0,There are two subpopulations of afferent neuro...,background,6e1ca34603f3587e78ad064af18f6a3906258df9,cb0c9bafdb117efa94931a32415cdbef7103f323,False,6e1ca34603f3587e78ad064af18f6a3906258df9>cb0c9...,6e1ca34603f3587e78ad064af18f6a3906258df9>cb0c9...,0,,


In [20]:
base_test3 = paragraph_df[['string','label']]
base_test3['string'] = base_test3['string'].apply(lambda x: preprocessing(x))
base_test3['label'] = label_encoder.transform(base_test3['label'])
base_test3 = base_test3.reset_index(drop=True)
base_test3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_test3['string'] = base_test3['string'].apply(lambda x: preprocessing(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_test3['label'] = label_encoder.transform(base_test3['label'])


Unnamed: 0,string,label
0,organotypic hippocampal slice culture interfac...,1
1,therefore compare finding data obtained koubek...,2
2,as ocs element bouchez et al 1989 lam et al 19...,0
3,secondary review 93 study included final repor...,1
4,expected literature shown ethylene inhibitory ...,2
...,...,...
408,ca21 i ascertained using fluorescent calcium i...,1
409,hydrogen bonded cluster charged amino acids ca...,0
410,one study conducted elderly patient showed com...,0
411,two subpopulation afferent neuron spiral gangl...,0


## 4th Category: Typo data

In [21]:
def rearrange_letter(word):
    word_list = list(word)
    n = len(word_list)
    if n == 1:
        return ''.join(word_list)

    idx = random.randint(0, n - 2)
    word_list[idx], word_list[idx + 1] = word_list[idx + 1], word_list[idx]
    return ''.join(word_list)

def rearrange_word(text):
    words = nltk.word_tokenize(text)
    num_words = len(words)

    # rearrange letter for some random word
    for _ in range(5):
        idx = random.randint(0, num_words - 1)
        words[idx] = rearrange_letter(words[idx])

    # rearrange word
    for _ in range(min(3, num_words - 1)):
        idx = random.randint(0, num_words - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]

    return ' '.join(words)

In [22]:
typo_series = test_df['string'].apply(rearrange_word)

typo_df = pd.DataFrame({
    'label': test_df.label,
    'string': typo_series
})

In [23]:
typo_df

Unnamed: 0,label,string
0,background,"Chapel , as well as X10 [ 2 ] , UPC [ 3 ] , Co..."
1,result,"addition In , teh result of the present stuyd ..."
2,background,Several instruments that more specifically add...
3,method,Organotypic hippocampal lsiec ucltures Interfa...
4,background,Activated PBMC are the basis of the PBMC stand...
...,...,...
1856,result,"Addiitonally no , sex differences were in dete..."
1857,background,WBRT ( 40Gy in fractions 20 ) with along intra...
1858,method,Teh data obtained from this crosssectional sur...
1859,background,"Additionally , encapsulated spheroids may be m..."


In [24]:
base_test4 = typo_df[['string','label']]
base_test4['string'] = base_test4['string'].apply(lambda x: preprocessing(x))
base_test4['label'] = label_encoder.transform(base_test4['label'])
base_test4 = base_test4.reset_index(drop=True)
base_test4

Unnamed: 0,string,label
0,chapel well x10 2 upc 3 coarray fortran 6 tita...,0
1,addition teh result present stuyd support prev...,2
2,several instrument specifically address patien...,0
3,organotypic hippocampal lsiec ucltures interfa...,1
4,activated pbmc basis pbmc standard blast assay...,0
...,...,...
1856,addiitonally sex difference detected present s...,2
1857,wbrt 40gy fraction 20 along intrathecal concur...,0
1858,teh data obtained crosssectional survey amsted...,1
1859,additionally encapsulated spheroid may multipl...,0


## 5th Category: Synonym data

In [27]:
synonymized_test_df = pd.read_json('synonymized.jsonl', lines=True)
synonymized_test_df = synonymized_test_df[['string', 'label']]

synonymized_test_df

Unnamed: 0,string,label
0,"Chapel, as good as X10 [2], UPC [3] , CoArray ...",background
1,"In addition, the effect of the present study b...",result
2,several instrument that more specifically addr...,background
3,Organotypic hippocampal piece civilization int...,method
4,actuate PBMC are the basis of the standard PBM...,background
...,...,...
1856,"Additionally, no sexual practice difference we...",result
1857,WBRT (40Gy in 20 fractions) along with coincid...,background
1858,The information obtain from this crosssectiona...,method
1859,"Additionally, encapsulate ellipsoid of revolut...",background


In [28]:
base_test5 = synonymized_test_df[['string','label']]
base_test5['string'] = base_test5['string'].apply(lambda x: preprocessing(x))
base_test5['label'] = label_encoder.transform(base_test5['label'])
base_test5 = base_test5.reset_index(drop=True)
base_test5

Unnamed: 0,string,label
0,chapel good x10 2 upc 3 coarray fortran 6 ti 5...,0
1,addition effect present study back old studies...,2
2,several instrument specifically address patien...,0
3,organotypic hippocampal piece civilization int...,1
4,actuate pbmc basis standard pbmc blast check h...,0
...,...,...
1856,additionally sexual practice difference observ...,2
1857,wbrt 40gy 20 fractions along coincidental intr...,0
1858,information obtain crosssectional study dutch ...,1
1859,additionally encapsulate ellipsoid revolution ...,0


## 6th Category: Paraphrased data

In [30]:
paraphrased_test_df = pd.read_json('paraphrased.jsonl', lines=True)
paraphrased_test_df = paraphrased_test_df[['string', 'label']]

paraphrased_test_df

Unnamed: 0,string,label
0,"Chapel, X10, UPC, CoArray Fortran, and Titaniu...",background
1,"Moreover, the findings of this current researc...",result
2,Various tools that are designed to capture pat...,background
3,Organotypic hippocampal slice cultures created...,method
4,Activated PBMCs serve as the fundamental compo...,background
...,...,...
1856,"Moreover, the current study did not find any d...",result
1857,The combination of whole-brain radiation thera...,background
1858,The information collected from this survey con...,method
1859,"Furthermore, combining encapsulated spheroids ...",background


In [31]:
base_test6 = paraphrased_test_df[['string','label']]
base_test6['string'] = base_test6['string'].apply(lambda x: preprocessing(x))
base_test6['label'] = label_encoder.transform(base_test6['label'])
base_test6 = base_test6.reset_index(drop=True)
base_test6

Unnamed: 0,string,label
0,chapel x10 upc coarray fortran titanium utiliz...,0
1,moreover finding current research align earlie...,2
2,various tool designed capture patient reported...,0
3,organotypic hippocampal slice culture created ...,1
4,activated pbmcs serve fundamental component co...,0
...,...,...
1856,moreover current study find difference based g...,2
1857,combination whole brain radiation therapy admi...,0
1858,information collected survey conducted amsterd...,1
1859,furthermore combining encapsulated spheroid en...,0


In [33]:
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 2
LEARNING_RATE = 2e-5

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3个类别

train_dataset = Dataset(base_train, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_datasets = {
    "test1": Dataset(base_test1, tokenizer, MAX_LEN),
    "test2": Dataset(base_test2, tokenizer, MAX_LEN),
    "test3": Dataset(base_test3, tokenizer, MAX_LEN),
    "test4": Dataset(base_test4, tokenizer, MAX_LEN),
    "test5": Dataset(base_test5, tokenizer, MAX_LEN),
    "test6": Dataset(base_test6, tokenizer, MAX_LEN)
}

test_loaders = {
    name: DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
    for name, dataset in test_datasets.items()
}

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [34]:
def train():
    for epoch in range(EPOCHS):
        model.train()
        train_losses = []
        print(f"Epoch {epoch+1}/{EPOCHS}")

        for batch in tqdm(train_loader, total=len(train_loader), desc=f'Epoch {epoch+1} Training'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            train_losses.append(loss.item())

            loss.backward()
            optimizer.step()

        print(f"Avg training loss for Epoch {epoch+1}: {sum(train_losses)/len(train_losses)}")

In [35]:
def evaluate(test_loader):
    model.eval()
    test_losses = []
    test_correct = 0
    test_total = 0
    test_f1_scores = []

    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader), desc=f'Evaluation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            test_losses.append(loss.item())

            _, predicted = torch.max(outputs.logits, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

            f1_batch = calculate_f1_score(labels.cpu(), predicted.cpu(), average='macro')
            test_f1_scores.append(f1_batch)

    accuracy = test_correct / test_total
    f1_score_avg = sum(test_f1_scores) / len(test_f1_scores)
    return sum(test_losses)/len(test_losses), accuracy, f1_score_avg

In [36]:
# Execute training and evaluation
print("Starting training...")
train()

Starting training...
Epoch 1/2


Epoch 1 Training: 100%|██████████| 516/516 [02:52<00:00,  3.00it/s]


Avg training loss for Epoch 1: 0.5444613478124835
Epoch 2/2


Epoch 2 Training: 100%|██████████| 516/516 [02:57<00:00,  2.91it/s]

Avg training loss for Epoch 2: 0.3300209336935781





In [38]:
from sklearn.metrics import f1_score as calculate_f1_score
for name, loader in test_loaders.items():
    print(f"Evaluating on {name}...")
    loss, accuracy, f1_score = evaluate(loader)
    print(f"{name} - Loss: {loss}, Accuracy: {accuracy}, F1-Score: {f1_score}")

Evaluating on test1...


Evaluation: 100%|██████████| 17/17 [00:02<00:00,  7.72it/s]


test1 - Loss: 0.4442212165278547, Accuracy: 0.8396946564885496, F1-Score: 0.7727915059639701
Evaluating on test2...


Evaluation: 100%|██████████| 100/100 [00:14<00:00,  6.88it/s]


test2 - Loss: 0.5491810807585716, Accuracy: 0.8055034396497811, F1-Score: 0.7521675057073763
Evaluating on test3...


Evaluation: 100%|██████████| 26/26 [00:03<00:00,  7.45it/s]


test3 - Loss: 0.5248704187285441, Accuracy: 0.8159806295399515, F1-Score: 0.7796070325112955
Evaluating on test4...


Evaluation: 100%|██████████| 117/117 [00:15<00:00,  7.38it/s]


test4 - Loss: 0.5712202516121742, Accuracy: 0.7904352498656636, F1-Score: 0.736352156471221
Evaluating on test5...


Evaluation: 100%|██████████| 117/117 [00:15<00:00,  7.67it/s]


test5 - Loss: 0.7398037213163499, Accuracy: 0.7512090274046211, F1-Score: 0.6649697028249052
Evaluating on test6...


Evaluation: 100%|██████████| 117/117 [00:15<00:00,  7.76it/s]

test6 - Loss: 0.6251240577707943, Accuracy: 0.7608812466415905, F1-Score: 0.7085228298497388



