In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, accuracy_score, make_scorer
from sklearn.linear_model import LogisticRegression

## Data Preprocessing

In [2]:
train_df = pd.read_json('../train.jsonl', lines=True)
X_train = train_df['string']
y_train = train_df['label']

dev_df = pd.read_json('../dev.jsonl', lines=True)
X_dev = dev_df['string']
y_dev = dev_df['label']

test_df = pd.read_json('../test.jsonl', lines=True)
X_test = test_df['string']
y_test = test_df['label']

train_df

Unnamed: 0,source,citeEnd,sectionName,citeStart,string,label,label_confidence,citingPaperId,citedPaperId,isKeyCitation,id,unique_id,excerpt_index,label2,label2_confidence
0,explicit,175.0,Introduction,168.0,"However, how frataxin interacts with the Fe-S ...",background,1.0000,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,True,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be...,11,,
1,explicit,36.0,Novel Quantitative Trait Loci for Seminal Root...,16.0,"In the study by Hickey et al. (2012), spikes w...",background,1.0000,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,True,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b>b6642...,2,,
2,explicit,228.0,Introduction,225.0,"The drug also reduces catecholamine secretion,...",background,1.0000,9cdf605beb1aa1078f235c4332b3024daa8b31dc,4e6a17fb8d7a3cada601d942e22eb5da6d01adbd,False,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,9cdf605beb1aa1078f235c4332b3024daa8b31dc>4e6a1...,0,,
3,explicit,110.0,Discussion,46.0,By clustering with lowly aggressive close kin ...,background,1.0000,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,False,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,d9f3207db0c79a3b154f3875c9760cc6b056904b>2cc6f...,3,,
4,explicit,239.0,Discussion,234.0,Ophthalmic symptoms are rare manifestations of...,background,1.0000,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,False,88b86556857f4374842d2af2e359576806239175>a5bb0...,88b86556857f4374842d2af2e359576806239175>a5bb0...,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8238,explicit,50.0,,28.0,"Importantly, the results of Pascalis et al. (2...",background,0.7350,6f68ccd37718366c40ae6aeedf0b935bf560b215,60ed4bdabf92b2fbd6162dbd8979888cccca55d7,True,6f68ccd37718366c40ae6aeedf0b935bf560b215>60ed4...,6f68ccd37718366c40ae6aeedf0b935bf560b215>60ed4...,15,,
8239,explicit,182.0,DISCUSSION,179.0,"As suggested by Nguena et al, there is a need ...",background,0.7508,f2a1c1704f9587c94ed95bc98179dc499e933f5e,574e659da7f6c62c07bfaaacd1f31d65bd75524c,True,f2a1c1704f9587c94ed95bc98179dc499e933f5e>574e6...,f2a1c1704f9587c94ed95bc98179dc499e933f5e>574e6...,1,,
8240,explicit,120.0,DISCUSSION,108.0,Skeletal muscle is also a primary site of dise...,background,1.0000,18c97ea2ff60c110cc2a523e0fdf729608cbb083,fc13b9c3dfcc121013edaa12fa8ce7842aaed21a,False,18c97ea2ff60c110cc2a523e0fdf729608cbb083>fc13b...,18c97ea2ff60c110cc2a523e0fdf729608cbb083>fc13b...,8,,
8241,explicit,221.0,,185.0,ACTIVATION OF TRANSCRIPTION FACTORS Roles for ...,method,,4ec9b89857c0b27e8a4bd3745b7358f387773527,81affdba19e38e2b17cf7b9e93792cc2028cf21d,True,4ec9b89857c0b27e8a4bd3745b7358f387773527>81aff...,4ec9b89857c0b27e8a4bd3745b7358f387773527>81aff...,0,,


In [3]:
def cleaning(text):
    stop_words = stopwords.words('english')
    text = text.lower()
    text = ' '.join(x for x in text.split() if x not in stop_words)
    return text

In [4]:
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    words = []
    for x in text.split():
        x = lemmatizer.lemmatize(x)
        words.append(x)
    text = ' '.join(words)
    return text

In [5]:
def preprocessing(text):
    # Tokenization
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
    text = cleaning(text)
    text = lemmatize(text)
    text = ' '.join(tokenizer.tokenize(text))
    return text

In [6]:
def augment_data_multiclass(X, y):
    df = pd.concat([X, y], axis=1)
    majority_class_size = df['label'].value_counts().max()
    upsampled_dataframes = []
    for class_label in df['label'].unique():
        class_df = df[df['label'] == class_label]
        if len(class_df) < majority_class_size:
            class_df_upsampled = resample(class_df, replace=True, n_samples=majority_class_size, random_state=10)
            upsampled_dataframes.append(class_df_upsampled)
        else:
            upsampled_dataframes.append(class_df)
    upsampled_df = pd.concat(upsampled_dataframes)
    return upsampled_df['string'], upsampled_df['label']

## Feature Extraction

### BOW Representation

In [7]:
#X_train, y_train = augment_data_multiclass(X_train, y_train)
cv = CountVectorizer(ngram_range=(1,1), preprocessor=preprocessing)
X_train_bow = cv.fit_transform(X_train)
X_dev_bow = cv.transform(X_dev)
X_test_bow = cv.transform(X_test)

### TF-IDF Representation

In [8]:
#X_train, y_train = augment_data_multiclass(X_train, y_train)
vectorizer = TfidfVectorizer(preprocessor=preprocessing, ngram_range=(1, 2), min_df=3, max_df=0.5, use_idf=True, smooth_idf=True)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_dev_tfidf = vectorizer.transform(X_dev)
X_test_tfidf = vectorizer.transform(X_test)

## Model Construction and Tuning

#### Logistic Regression

In [9]:
logreg = LogisticRegression(max_iter=1000)

param_grid = {
    'C': [0.1, 1, 10], 
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

In [10]:
f1_macro_scorer = make_scorer(f1_score, average='macro')
bow_grid = GridSearchCV(logreg, param_grid, cv=5, scoring=f1_macro_scorer, n_jobs=-1)
tfidf_grid = GridSearchCV(logreg, param_grid, cv=5, scoring=f1_macro_scorer, n_jobs=-1)

In [11]:
bow_grid.fit(X_train_bow, y_train)
model_bow = bow_grid.best_estimator_

tfidf_grid.fit(X_train_tfidf, y_train)
model_tfidf = tfidf_grid.best_estimator_



In [12]:
y_pred_bow = model_bow.predict(X_test_bow)
f1_score_bow = f1_score(y_test, y_pred_bow, average='macro')
acc_score_bow = accuracy_score(y_test, y_pred_bow)

y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
f1_score_tfidf = f1_score(y_test, y_pred_tfidf, average='macro')
acc_score_tfidf = accuracy_score(y_test, y_pred_tfidf)

In [13]:
list1 = ["LR x BOW", acc_score_bow, f1_score_bow]
list2 = ["LR x TFIDF", acc_score_tfidf, f1_score_tfidf]

df = pd.DataFrame([list1, list2], columns=['Model','Accuracy', 'F1'])
df

Unnamed: 0,Model,Accuracy,F1
0,LR x BOW,0.743149,0.711471
1,LR x TFIDF,0.666308,0.60434


## Evaluate previously trained model with given category

In [14]:
test_df = pd.read_json('../test.jsonl', lines=True)

### 1st Category: Short data

In [15]:
short_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) <= 25)]

X_test_short_bow = cv.transform(short_df['string'])
X_test_short_tfidf = vectorizer.transform(short_df['string'])
y_test_short = short_df['label']

y_short_pred_bow = model_bow.predict(X_test_short_bow)
short_score_bow_acc = accuracy_score(y_test_short, y_short_pred_bow)
short_score_bow_f1 = f1_score(y_test_short, y_short_pred_bow, average='macro')

y_short_pred_tfidf = model_tfidf.predict(X_test_short_tfidf)
short_score_tfidf_acc = accuracy_score(y_test_short, y_short_pred_tfidf)
short_score_tfidf_f1 = f1_score(y_test_short, y_short_pred_tfidf, average='macro')

### 2nd Category: Long data

In [16]:
long_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) > 25)]

X_test_long_bow = cv.transform(long_df['string'])
X_test_long_tfidf = vectorizer.transform(long_df['string'])
y_test_long = long_df['label']

y_long_pred_bow = model_bow.predict(X_test_long_bow)
long_score_bow_acc = accuracy_score(y_test_long, y_long_pred_bow)
long_score_bow_f1 = f1_score(y_test_long, y_long_pred_bow, average='macro')

y_long_pred_tfidf = model_tfidf.predict(X_test_long_tfidf)
long_score_tfidf_acc = accuracy_score(y_test_long, y_long_pred_tfidf)
long_score_tfidf_f1 = f1_score(y_test_long, y_long_pred_tfidf, average='macro')

### 3rd Category: Paragraph data

In [17]:
paragraph_df = test_df[test_df['string'].apply(lambda x: len(nltk.sent_tokenize(x)) > 1)]

X_test_paragraph_bow = cv.transform(paragraph_df['string'])
X_test_paragraph_tfidf = vectorizer.transform(paragraph_df['string'])
y_test_paragraph = paragraph_df['label']

y_paragraph_pred_bow = model_bow.predict(X_test_paragraph_bow)
paragraph_score_bow_acc = accuracy_score(y_test_paragraph, y_paragraph_pred_bow)
paragraph_score_bow_f1 = f1_score(y_test_paragraph, y_paragraph_pred_bow, average='macro')

y_paragraph_pred_tfidf = model_tfidf.predict(X_test_paragraph_tfidf)
paragraph_score_tfidf_acc = accuracy_score(y_test_paragraph, y_paragraph_pred_tfidf)
paragraph_score_tfidf_f1 = f1_score(y_test_paragraph, y_paragraph_pred_tfidf, average='macro')

### 4th Category: Typo data

In [18]:
def rearrange_letter(word):
    word_list = list(word)
    n = len(word_list)
    if n == 1:
        return ''.join(word_list)
    
    idx = random.randint(0, n - 2)
    word_list[idx], word_list[idx + 1] = word_list[idx + 1], word_list[idx]
    return ''.join(word_list)

def rearrange_word(text):
    words = nltk.word_tokenize(text)
    num_words = len(words)

    # rearrange letter for some random word
    for _ in range(5):
        idx = random.randint(0, num_words - 1)
        words[idx] = rearrange_letter(words[idx])
    
    # rearrange word
    for _ in range(min(3, num_words - 1)):
        idx = random.randint(0, num_words - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]

    return ' '.join(words)

In [19]:
typo_series = test_df['string'].apply(rearrange_word)

typo_df = pd.DataFrame({
    'label': test_df.label,
    'string': typo_series
})

X_test_typo_bow = cv.transform(typo_df['string'])
X_test_typo_tfidf = vectorizer.transform(typo_df['string'])
y_test_typo = typo_df['label']

y_typo_pred_bow = model_bow.predict(X_test_typo_bow)
typo_score_bow_acc = accuracy_score(y_test_typo, y_typo_pred_bow)
typo_score_bow_f1 = f1_score(y_test_typo, y_typo_pred_bow, average='macro')

y_typo_pred_tfidf = model_tfidf.predict(X_test_typo_tfidf)
typo_score_tfidf_acc = accuracy_score(y_test_typo, y_typo_pred_tfidf)
typo_score_tfidf_f1 = f1_score(y_test_typo, y_typo_pred_tfidf, average='macro')

### 5th Category: Synonymized data

In [20]:
synonymized_test_df = pd.read_json('../synonymized.jsonl', lines=True)
synonymized_test_df = synonymized_test_df[['string', 'label']]

X_test_synonymized_bow = cv.transform(synonymized_test_df['string'])
X_test_synonymized_tfidf = vectorizer.transform(synonymized_test_df['string'])
y_test_synonymized = synonymized_test_df['label']

y_synonymized_pred_bow = model_bow.predict(X_test_synonymized_bow)
synonymized_score_bow_acc = accuracy_score(y_test_synonymized, y_synonymized_pred_bow)
synonymized_score_bow_f1 = f1_score(y_test_synonymized, y_synonymized_pred_bow, average='macro')

y_synonymized_pred_tfidf = model_tfidf.predict(X_test_synonymized_tfidf)
synonymized_score_tfidf_acc = accuracy_score(y_test_synonymized, y_synonymized_pred_tfidf)
synonymized_score_tfidf_f1 = f1_score(y_test_synonymized, y_synonymized_pred_tfidf, average='macro')

### 6th Category: Paraphrased data

In [21]:
paraphrased_test_df = pd.read_json('../paraphrased.jsonl', lines=True)
paraphrased_test_df = paraphrased_test_df[['string', 'label']]

X_test_paraphrased_bow = cv.transform(paraphrased_test_df['string'])
X_test_paraphrased_tfidf = vectorizer.transform(paraphrased_test_df['string'])
y_test_paraphrased = paraphrased_test_df['label']

y_paraphrased_pred_bow = model_bow.predict(X_test_paraphrased_bow)
paraphrased_score_bow_acc = accuracy_score(y_test_paraphrased, y_paraphrased_pred_bow)
paraphrased_score_bow_f1 = f1_score(y_test_paraphrased, y_paraphrased_pred_bow, average='macro')

y_paraphrased_pred_tfidf = model_tfidf.predict(X_test_paraphrased_tfidf)
paraphrased_score_tfidf_acc = accuracy_score(y_test_paraphrased, y_paraphrased_pred_tfidf)
paraphrased_score_tfidf_f1 = f1_score(y_test_paraphrased, y_paraphrased_pred_tfidf, average='macro')

## Compile Scores on Categories

In [22]:
list1 = ["Short", short_score_bow_acc, short_score_bow_f1]
list2 = ["Long", long_score_bow_acc, long_score_bow_f1]
list3 = ["Paragraph", paragraph_score_bow_acc, paragraph_score_bow_f1]
list4 = ["Typo", typo_score_bow_acc, typo_score_bow_f1]
list5 = ["Synoymized", synonymized_score_bow_acc, synonymized_score_bow_f1]
list6 = ["Paraphrased", paraphrased_score_bow_acc, paraphrased_score_bow_f1]

print("BOW")
bow_data_df = pd.DataFrame([list1, list2, list3, list4, list5, list6], columns=['Category', 'Accuracy', 'F1'])
bow_data_df

BOW


Unnamed: 0,Category,Accuracy,F1
0,Short,0.755725,0.708443
1,Long,0.741088,0.711521
2,Paragraph,0.750605,0.724261
3,Typo,0.727028,0.686457
4,Synoymized,0.584632,0.404069
5,Paraphrased,0.697474,0.653096


In [23]:
list1 = ["Short", short_score_tfidf_acc, short_score_tfidf_f1]
list2 = ["Long", long_score_tfidf_acc, long_score_tfidf_f1]
list3 = ["Paragraph", paragraph_score_tfidf_acc, paragraph_score_tfidf_f1]
list4 = ["Typo", typo_score_tfidf_acc, typo_score_tfidf_f1]
list5 = ["Synoymized", synonymized_score_tfidf_acc, synonymized_score_tfidf_f1]
list6 = ["Paraphrased", paraphrased_score_tfidf_acc, paraphrased_score_tfidf_f1]

print("TFIDF")
tfidf_data_df = pd.DataFrame([list1, list2, list3, list4, list5, list6], columns=['Category', 'Accuracy', 'F1'])
tfidf_data_df

TFIDF


Unnamed: 0,Category,Accuracy,F1
0,Short,0.698473,0.641204
1,Long,0.661038,0.598124
2,Paragraph,0.673123,0.616282
3,Typo,0.65986,0.595014
4,Synoymized,0.555615,0.299686
5,Paraphrased,0.631918,0.511194
