In [21]:
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import tensorflow as tf
import keras

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

import keras.backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Embedding, Dropout, AveragePooling1D, BatchNormalization
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

# Fetch the NLTK resources this notebook relies on: 'stopwords' for the
# stop-word list and 'wordnet' for WordNetLemmatizer; 'punkt' is presumably
# what nltk.word_tokenize / nltk.sent_tokenize need — confirm for the
# installed NLTK version.
nltk.download('punkt')
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TimothyKoei\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TimothyKoei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\TimothyKoei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above.
warnings.filterwarnings("ignore")

In [23]:
# Training split (JSON Lines): text in 'string', class in 'label'.
train_df = pd.read_json('../train.jsonl', lines=True)
X_train, y_train = train_df['string'], train_df['label']

# Development split, same two columns.
dev_df = pd.read_json('../dev.jsonl', lines=True)
X_dev, y_dev = dev_df['string'], dev_df['label']

# Test split; only the text and label columns are kept in the frame.
test_df = pd.read_json('../test.jsonl', lines=True)[['string', 'label']]
X_test, y_test = test_df['string'], test_df['label']

test_df.describe()

Unnamed: 0,string,label
count,1861,1861
unique,1860,3
top,For datasets with multiple human annotations (...,background
freq,2,997


## CNN Model Initialization

In [24]:
@keras.saving.register_keras_serializable()
def f1(y_true, y_pred):
    """Batch-wise F1 score computed with Keras backend ops.

    Precision and recall are derived from sums over the whole batch:
    values are clipped to [0, 1], rounded, and summed. The result is the
    harmonic mean of the two, with ``K.epsilon()`` added to every
    denominator to avoid division by zero.

    NOTE(review): the formula multiplies ``y_true`` by ``y_pred`` element-wise,
    which assumes both are 0/1 indicators. Elsewhere in this notebook the
    labels are integer ids from ``LabelEncoder`` — confirm the value reported
    as "F1" is the quantity intended.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Prediction tensor.

    Returns:
        Scalar tensor ``2 * p * r / (p + r + eps)``.
    """
    eps = K.epsilon()

    def _rounded_total(tensor):
        # Clip into [0, 1], round to 0/1, then sum over every entry.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    true_positives = _rounded_total(y_true * y_pred)
    precision = true_positives / (_rounded_total(y_pred) + eps)
    recall = true_positives / (_rounded_total(y_true) + eps)
    return 2*((precision*recall)/(precision+recall+eps))

In [25]:
# Load the previously trained CNN from disk.
# NOTE(review): presumably the saved model references the custom `f1` metric,
# which is why `f1` is registered with Keras before this cell — confirm.
model = keras.models.load_model("scicite_cnn.keras")

In [26]:
def cleaning(text):
    """Lower-case ``text`` and remove English stop words.

    The text is split on whitespace; tokens found in NLTK's English
    stop-word list are dropped and the rest are re-joined with single spaces.

    Args:
        text: Input string.

    Returns:
        The lower-cased string without stop words.
    """
    # A set gives constant-time membership checks; the plain list returned by
    # NLTK would be scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    return ' '.join(token for token in text.split() if token not in stop_words)

def lemmatize(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Args:
        text: Input string.

    Returns:
        The tokens, each passed through ``WordNetLemmatizer.lemmatize``,
        joined back together with single spaces.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in text.split())

def preprocessing(text):
    """Full text normalisation used for every split.

    Applies ``cleaning`` (lower-casing, stop-word removal) and ``lemmatize``,
    then keeps only runs of ASCII letters and digits.

    Args:
        text: Raw input string.

    Returns:
        The normalised tokens joined with single spaces.
    """
    alnum_tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
    normalised = lemmatize(cleaning(text))
    # Anything that is not a letter or digit acts as a token separator here.
    return ' '.join(alnum_tokenizer.tokenize(normalised))

def augment_data_multiclass(X, y):
    """Balance classes by oversampling every class up to the largest one.

    ``X`` and ``y`` are joined column-wise; the joined frame must expose the
    columns ``'string'`` and ``'label'``. Classes smaller than the largest
    class are resampled with replacement (fixed ``random_state=10``) to that
    size; classes already at that size are passed through unchanged. The
    index is not reset, so resampled rows keep their original index labels.

    Args:
        X: Series of texts (column ``'string'``).
        y: Series of labels (column ``'label'``).

    Returns:
        Tuple ``(strings, labels)`` taken from the balanced frame.
    """
    combined = pd.concat([X, y], axis=1)
    target_size = combined['label'].value_counts().max()

    def _balanced(group):
        # Classes that already match the majority size are left untouched.
        if len(group) >= target_size:
            return group
        return resample(group, replace=True, n_samples=target_size, random_state=10)

    balanced = pd.concat(
        [_balanced(combined[combined['label'] == cls]) for cls in combined['label'].unique()]
    )
    return balanced['string'], balanced['label']

In [27]:
# Oversample the smaller classes so every label has as many training rows as
# the largest class; the dev and test splits are left as loaded.
X_train, y_train = augment_data_multiclass(X_train, y_train)

In [28]:
# Normalise every split with the same text pipeline (`preprocessing`).
X_train_preprocessed = X_train.apply(preprocessing)
X_dev_preprocessed = X_dev.apply(preprocessing)
X_test_preprocessed = X_test.apply(preprocessing)

In [29]:
# Vocabulary size limit handed to the Keras Tokenizer.
num_words = 5000
tokenizer = Tokenizer(num_words=num_words)
# NOTE(review): the vocabulary is fitted on the raw `X_train`, whereas the
# sequences in the next cell are built from the *preprocessed* texts. Confirm
# this matches how the tokenizer was fitted when `scicite_cnn.keras` was
# trained before changing it — the word indices must agree with the ones the
# saved model was trained on.
tokenizer.fit_on_texts(X_train)

In [30]:
# Convert each preprocessed text into a list of integer word indices using
# the tokenizer fitted above.
X_train_tokenized = tokenizer.texts_to_sequences(X_train_preprocessed)
X_dev_tokenized = tokenizer.texts_to_sequences(X_dev_preprocessed)
X_test_tokenized = tokenizer.texts_to_sequences(X_test_preprocessed)

In [31]:
# Fixed sequence length fed to the model; padding='pre' puts the padding at
# the front of each sequence.
# NOTE(review): presumably 250 matches the input length the saved model was
# trained with — confirm.
max_words = 250
X_train_padded = sequence.pad_sequences(X_train_tokenized, maxlen=max_words, padding='pre')
X_dev_padded = sequence.pad_sequences(X_dev_tokenized, maxlen=max_words, padding='pre')
X_test_padded = sequence.pad_sequences(X_test_tokenized, maxlen=max_words, padding='pre')

In [32]:
# Encoder that maps the class labels to integer ids.
label_encoder = LabelEncoder()

# Learn the label -> id mapping from the training labels, then apply the same
# mapping to the dev and test labels.
# NOTE(review): the y_* names are overwritten in place, so this cell is meant
# to be run once per kernel session.
y_train = label_encoder.fit_transform(y_train)
y_dev = label_encoder.transform(y_dev)
y_test = label_encoder.transform(y_test)

## 1st Category: Short data

Short data: test texts containing at most 25 tokens, counted with `nltk.word_tokenize` (punctuation marks count as tokens).

In [33]:
# Keep the test rows whose text has at most 25 tokens according to
# nltk.word_tokenize.
short_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) <= 25)]

In [34]:
# Summary of the short subset: row count, distinct values and most frequent value per column.
short_df.describe()

Unnamed: 0,string,label
count,262,262
unique,262,3
top,"After secondary review, 93 studies were includ...",background
freq,1,146


In [35]:
# Model input (text) and target (label) for the short subset.
X_short = short_df['string']
y_short = short_df['label']

In [36]:
# Same pipeline as the training data: normalise text -> word indices ->
# fixed-length padded sequences.
X_short_preprocessed = X_short.apply(preprocessing)
X_short_tokenized = tokenizer.texts_to_sequences(X_short_preprocessed)
X_short_padded = sequence.pad_sequences(X_short_tokenized, maxlen=max_words, padding='pre')

In [37]:
# Encode from the source frame rather than from `y_short` itself, so that
# re-running this cell never feeds already-encoded integers back into the
# encoder (which would fail as unseen labels).
y_short = label_encoder.transform(short_df['label'])

In [38]:
# Evaluate the loaded model on the short subset. Entries 1 and 2 of the result
# are reported as Accuracy and F1 in the summary table at the end.
short_scores = model.evaluate(X_short_padded, y_short)





## 2nd Category: Long data

Long data: test texts containing more than 25 tokens, counted with `nltk.word_tokenize` (punctuation marks count as tokens).

In [39]:
# Keep the test rows whose text has more than 25 tokens according to
# nltk.word_tokenize (the complement of the short subset).
long_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) > 25)]

In [40]:
# Summary of the long subset: row count, distinct values and most frequent value per column.
long_df.describe()

Unnamed: 0,string,label
count,1599,1599
unique,1598,3
top,For datasets with multiple human annotations (...,background
freq,2,851


In [41]:
# Model input (text) and target (label) for the long subset.
X_long = long_df['string']
y_long = long_df['label']

In [42]:
# Same pipeline as the training data: normalise text -> word indices ->
# fixed-length padded sequences.
X_long_preprocessed = X_long.apply(preprocessing)
X_long_tokenized = tokenizer.texts_to_sequences(X_long_preprocessed)
X_long_padded = sequence.pad_sequences(X_long_tokenized, maxlen=max_words, padding='pre')

In [43]:
# Encode from the source frame rather than from `y_long` itself, so that
# re-running this cell never feeds already-encoded integers back into the
# encoder (which would fail as unseen labels).
y_long = label_encoder.transform(long_df['label'])

In [44]:
# Evaluate the loaded model on the long subset. Entries 1 and 2 of the result
# are reported as Accuracy and F1 in the summary table at the end.
long_scores = model.evaluate(X_long_padded, y_long)



## 3rd Category: Paragraph data

Paragraph data: test texts that `nltk.sent_tokenize` splits into more than one sentence.

In [45]:
# Keep the test rows that nltk.sent_tokenize splits into more than one sentence.
paragraph_df = test_df[test_df['string'].apply(lambda x: len(nltk.sent_tokenize(x)) > 1)]

In [46]:
# Summary of the paragraph subset: row count, distinct values and most frequent value per column.
paragraph_df.describe()

Unnamed: 0,string,label
count,413,413
unique,413,3
top,Organotypic hippocampal slice cultures\nInterf...,background
freq,1,209


In [47]:
# Model input (text) and target (label) for the paragraph subset.
X_paragraph = paragraph_df['string']
y_paragraph = paragraph_df['label']

In [48]:
# Same pipeline as the training data: normalise text -> word indices ->
# fixed-length padded sequences.
X_paragraph_preprocessed = X_paragraph.apply(preprocessing)
X_paragraph_tokenized = tokenizer.texts_to_sequences(X_paragraph_preprocessed)
X_paragraph_padded = sequence.pad_sequences(X_paragraph_tokenized, maxlen=max_words, padding='pre')

In [49]:
# Encode from the source frame rather than from `y_paragraph` itself, so that
# re-running this cell never feeds already-encoded integers back into the
# encoder (which would fail as unseen labels).
y_paragraph = label_encoder.transform(paragraph_df['label'])

In [50]:
# Evaluate the loaded model on the paragraph subset. Entries 1 and 2 of the
# result are reported as Accuracy and F1 in the summary table at the end.
paragraph_scores = model.evaluate(X_paragraph_padded, y_paragraph)



## 4th Category: Typo data

In [51]:
def rearrange_letter(word):
    """Return ``word`` with one random pair of adjacent characters swapped.

    Words shorter than two characters (including the empty string) have no
    adjacent pair to swap and are returned unchanged.

    Args:
        word: String to perturb.

    Returns:
        A string with the same characters as ``word``; for inputs of length
        two or more, exactly one adjacent pair has been exchanged.
    """
    # Guard both the empty and the single-character case: with fewer than two
    # characters random.randint(0, n - 2) would get an empty range and raise.
    if len(word) < 2:
        return word

    chars = list(word)
    idx = random.randint(0, len(chars) - 2)
    chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]
    return ''.join(chars)

def rearrange_word(text, tokenize=None):
    """Inject typo-like noise into ``text``.

    The text is tokenized, then:
      1. five times, a randomly chosen token gets two adjacent letters swapped
         (the same token may be picked more than once);
      2. up to three times, a randomly chosen pair of adjacent tokens is swapped.
    The tokens are finally joined with single spaces, so the output spacing
    follows the tokenizer's output rather than the original text.

    Args:
        text: Input string.
        tokenize: Optional callable mapping a string to a sequence of tokens.
            Defaults to ``nltk.word_tokenize``.

    Returns:
        The perturbed text, or ``''`` when ``text`` yields no tokens.
    """
    if tokenize is None:
        tokenize = nltk.word_tokenize
    words = list(tokenize(text))
    num_words = len(words)

    # Nothing to perturb; random.randint(0, -1) below would raise ValueError.
    if num_words == 0:
        return ''

    # Swap two adjacent letters inside randomly chosen tokens.
    for _ in range(5):
        idx = random.randint(0, num_words - 1)
        words[idx] = rearrange_letter(words[idx])

    # Swap randomly chosen adjacent tokens (no-op for a single token).
    for _ in range(min(3, num_words - 1)):
        idx = random.randint(0, num_words - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]

    return ' '.join(words)

In [52]:
# Seed the stdlib RNG used by rearrange_letter / rearrange_word so that the
# perturbed test set, and therefore the typo scores, are reproducible across
# kernel restarts.
random.seed(42)
typo_series = test_df['string'].apply(rearrange_word)

# Pair the perturbed texts with the original test labels (aligned on index).
typo_df = pd.DataFrame({
    'label': test_df.label,
    'string': typo_series
})

In [53]:
# Summary of the typo set: row count, distinct values and most frequent value per column.
typo_df.describe()

Unnamed: 0,label,string
count,1861,1861
unique,3,1861
top,background,"Chapel , as well as X10 [ 2 ] , UCP [ 3 ] , Co..."
freq,997,1


In [54]:
# Model input (perturbed text) and target (label) for the typo set.
X_typo = typo_df['string']
y_typo = typo_df['label']

In [55]:
# Same pipeline as the training data: normalise text -> word indices ->
# fixed-length padded sequences.
X_typo_preprocessed = X_typo.apply(preprocessing)
X_typo_tokenized = tokenizer.texts_to_sequences(X_typo_preprocessed)
X_typo_padded = sequence.pad_sequences(X_typo_tokenized, maxlen=max_words, padding='pre')

In [56]:
# Encode from the source frame rather than from `y_typo` itself, so that
# re-running this cell never feeds already-encoded integers back into the
# encoder (which would fail as unseen labels).
y_typo = label_encoder.transform(typo_df['label'])

In [57]:
# Evaluate the loaded model on the typo set. Entries 1 and 2 of the result are
# reported as Accuracy and F1 in the summary table at the end.
typo_scores = model.evaluate(X_typo_padded, y_typo)



## 5th Category: Synonym data

Each word in a sentence is replaced with a synonym. The synonymized test set was generated beforehand and is loaded from `../synonymized.jsonl`.

In [60]:
# Synonymized test set (JSON Lines); only the text and label columns are kept.
synonymized_df = pd.read_json('../synonymized.jsonl', lines=True)[['string', 'label']]

synonymized_df.describe()

Unnamed: 0,string,label
count,1861,1861
unique,1857,3
top,For datasets with multiple human annotation (i...,background
freq,2,997


In [61]:
# Model input (text) and target (label) for the synonymized set.
X_synonymized = synonymized_df['string']
y_synonymized = synonymized_df['label']

In [62]:
# Same pipeline as the training data: normalise text -> word indices ->
# fixed-length padded sequences.
X_synonymized_preprocessed = X_synonymized.apply(preprocessing)
X_synonymized_tokenized = tokenizer.texts_to_sequences(X_synonymized_preprocessed)
X_synonymized_padded = sequence.pad_sequences(X_synonymized_tokenized, maxlen=max_words, padding='pre')

In [63]:
# Encode from the source frame rather than from `y_synonymized` itself, so
# that re-running this cell never feeds already-encoded integers back into the
# encoder (which would fail as unseen labels).
y_synonymized = label_encoder.transform(synonymized_df['label'])

In [64]:
# Evaluate the loaded model on the synonymized set. Entries 1 and 2 of the
# result are reported as Accuracy and F1 in the summary table at the end.
synonymized_scores = model.evaluate(X_synonymized_padded, y_synonymized)



## 6th Category: Paraphrased data

In [65]:
# Paraphrased test set (JSON Lines); only the text and label columns are kept.
paraphrased_test_df = pd.read_json('../paraphrased.jsonl', lines=True)[['string', 'label']]

paraphrased_test_df.describe()

Unnamed: 0,string,label
count,1861,1861
unique,1861,3
top,"Chapel, X10, UPC, CoArray Fortran, and Titaniu...",background
freq,1,997


In [66]:
# Model input (text) and target (label) for the paraphrased set.
X_paraphrased = paraphrased_test_df['string']
y_paraphrased = paraphrased_test_df['label']

In [67]:
# Same pipeline as the training data: normalise text -> word indices ->
# fixed-length padded sequences.
X_paraphrased_preprocessed = X_paraphrased.apply(preprocessing)
X_paraphrased_tokenized = tokenizer.texts_to_sequences(X_paraphrased_preprocessed)
X_paraphrased_padded = sequence.pad_sequences(X_paraphrased_tokenized, maxlen=max_words, padding='pre')

In [68]:
# Encode from the source frame rather than from `y_paraphrased` itself, so
# that re-running this cell never feeds already-encoded integers back into the
# encoder (which would fail as unseen labels).
y_paraphrased = label_encoder.transform(paraphrased_test_df['label'])

In [69]:
# Evaluate the loaded model on the paraphrased set. Entries 1 and 2 of the
# result are reported as Accuracy and F1 in the summary table at the end.
paraphrased_scores = model.evaluate(X_paraphrased_padded, y_paraphrased)



## Compile Scores on Categories

In [71]:
# Collect the per-category evaluation results into one table.
# NOTE(review): positions 1 and 2 of each `model.evaluate` result are taken to
# be accuracy and F1 — this depends on the metrics the saved model was
# compiled with; confirm.
category_scores = [
    ("Short", short_scores),
    ("Long", long_scores),
    ("Paragraph", paragraph_scores),
    ("Typo", typo_scores),
    ("Synonymized", synonymized_scores),  # label previously misspelled "Synoymized"
    ("Paraphrased", paraphrased_scores),
]

df = pd.DataFrame(
    [[category, scores[1], scores[2]] for category, scores in category_scores],
    columns=['Category', 'Accuracy', 'F1'],
)
df

Unnamed: 0,Category,Accuracy,F1
0,Short,0.679389,0.663622
1,Long,0.642902,0.711767
2,Paragraph,0.641647,0.751323
3,Typo,0.641053,0.673452
4,Synoymized,0.585169,0.465973
5,Paraphrased,0.631381,0.690572
