In [7]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import nltk
import torch
import torch.nn as nn
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder

In [8]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
train_df = pd.read_json('train.jsonl', lines=True)
X_train = train_df['string']
y_train = train_df['label']

dev_df = pd.read_json('dev.jsonl', lines=True)
X_dev = dev_df['string']
y_dev = dev_df['label']

test_df = pd.read_json('test.jsonl', lines=True)
test_df = test_df[['string', 'label']]

test_df.describe()

Unnamed: 0,string,label
count,1861,1861
unique,1860,3
top,For datasets with multiple human annotations (...,background
freq,2,997


## 1st Category: Short data

Define short data as text with number of words <= 25

In [10]:
short_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) <= 25)]

In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
short_df.describe()

Unnamed: 0,string,label
count,262,262
unique,262,3
top,"After secondary review, 93 studies were includ...",background
freq,1,146


In [13]:
short_df

Unnamed: 0,string,label
9,"After secondary review, 93 studies were includ...",method
15,"[12], is fast and simple to apply as positioni...",background
24,"1a), or individually via sharp electrode penet...",background
33,"According to the literature, the clinical resu...",background
34,The abnormal histological alterations observed...,result
...,...,...
1828,PGA7 has been shown to be upregulated in hypha...,background
1830,bouts of the Windgate Anaerobic Test thus affe...,background
1837,HA have been shown previously to increase surv...,background
1847,"Moreover, DIR1 is required for AA-induced (Jun...",background


## 2nd Category: Long data

Define long data as text with number of words > 25

In [14]:
long_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) > 25)]

In [15]:
long_df.describe()

Unnamed: 0,string,label
count,1599,1599
unique,1598,3
top,For datasets with multiple human annotations (...,background
freq,2,851


## 3rd Category: Paragraph data

Define paragraph data as text with number of sentences > 1

In [16]:
paragraph_df = test_df[test_df['string'].apply(lambda x: len(nltk.sent_tokenize(x)) > 1)]

In [17]:
paragraph_df.describe()

Unnamed: 0,string,label
count,413,413
unique,413,3
top,Organotypic hippocampal slice cultures\nInterf...,background
freq,1,209


## 4th Category: Typo data

In [18]:
def rearrange_letter(word):
    word_list = list(word)
    n = len(word_list)
    if n == 1:
        return ''.join(word_list)

    idx = random.randint(0, n - 2)
    word_list[idx], word_list[idx + 1] = word_list[idx + 1], word_list[idx]
    return ''.join(word_list)

def rearrange_word(text):
    words = nltk.word_tokenize(text)
    num_words = len(words)

    # rearrange letter for some random word
    for _ in range(5):
        idx = random.randint(0, num_words - 1)
        words[idx] = rearrange_letter(words[idx])

    # rearrange word
    for _ in range(min(3, num_words - 1)):
        idx = random.randint(0, num_words - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]

    return ' '.join(words)

In [19]:
typo_series = test_df['string'].apply(rearrange_word)

typo_df = pd.DataFrame({
    'label': test_df.label,
    'string': typo_series
})

In [20]:
typo_df.describe()

Unnamed: 0,label,string
count,1861,1861
unique,3,1861
top,background,"Chapel , as well as X10 [ 2 ] , UPC [ 3 ] , Co..."
freq,997,1


## 5th Category: Synonym data

For each sentence, iterate through the words and convert it to its synonym.

In [21]:
synonymized_test_df = pd.read_json('synonymized.jsonl', lines=True)
synonymized_test_df = synonymized_test_df[['string', 'label']]

synonymized_test_df.describe()

Unnamed: 0,string,label
count,1861,1861
unique,1857,3
top,For datasets with multiple human annotation (i...,background
freq,2,997


## 6th Category: Paraphrased data

In [22]:
paraphrased_test_df = pd.read_json('paraphrased.jsonl', lines=True)
paraphrased_test_df = paraphrased_test_df[['string', 'label']]

#paraphrased_test_df.describe()
paraphrased_test_df

Unnamed: 0,string,label
0,"Chapel, X10, UPC, CoArray Fortran, and Titaniu...",background
1,"Moreover, the findings of this current researc...",result
2,Various tools that are designed to capture pat...,background
3,Organotypic hippocampal slice cultures created...,method
4,Activated PBMCs serve as the fundamental compo...,background
...,...,...
1856,"Moreover, the current study did not find any d...",result
1857,The combination of whole-brain radiation thera...,background
1858,The information collected from this survey con...,method
1859,"Furthermore, combining encapsulated spheroids ...",background


In [23]:
from tensorflow.keras.models import load_model

# 加载模型
model = load_model('model.h5')


In [24]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
import re
import string
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

#df_test['string_clean'] = df_test['string'].apply(clean_text)
#df_test.head()

stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text

stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Stemm all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))

    return text


from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()



In [26]:
def data_clean(df):
  df['string_clean'] = df['string'].apply(clean_text)
  df['string_clean'] = df['string_clean'].apply(remove_stopwords)
  df['string_clean'] = df['string_clean'].apply(preprocess_data)
  df['string_clean'] = df['string_clean'].apply(stemm_text)
  return df
def label_clean(df):
  le.fit(df['label'])

  df['label_encoded'] = le.transform(df['label'])
  return df


In [27]:
train_df = pd.read_json('train.jsonl', lines=True)
X_train = train_df['string']
y_train = train_df['label']

test_df = pd.read_json('test.jsonl', lines=True)
X_test = test_df['string']
y_test = test_df['label']


In [28]:
df=train_df[['string','label']]
df_test=test_df[['string','label']]
df=data_clean(df)
df_test=data_clean(df_test)
df=label_clean(df)
df_test=label_clean(df_test)

In [29]:
x_train = pd.concat([df['string_clean'], df_test['string_clean']], axis=0).reset_index(drop=True)
y_train = pd.concat([df['label_encoded'], df_test['label_encoded']], axis=0).reset_index(drop=True)
#x_train.loc[:300]

In [30]:
# Use the trained to create a document-term matrix from train and test sets
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
# instantiate the vectorizer
#vect2 = CountVectorizer()
vect.fit(x_train)
x_train_dtm = vect.transform(x_train)


In [31]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
# instantiate the vectorizer
#vect2 = CountVectorizer()
vect.fit(x_train)

In [32]:

texts = pd.concat([df['string_clean'], df_test['string_clean']], axis=0).reset_index(drop=True)
target = pd.concat([df['label_encoded'], df_test['label_encoded']], axis=0).reset_index(drop=True)
# Calculate the length of our vocabulary
from tensorflow.keras.preprocessing.text import Tokenizer
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length = 27693

In [33]:
import tensorflow as tf
from nltk.tokenize import word_tokenize
def embed(corpus):
    return word_tokenizer.texts_to_sequences(corpus)

ltrain = max(texts, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(ltrain))

train_padded_sentences = tf.keras.preprocessing.sequence.pad_sequences(
    embed(texts),
    length_long_sentence,
    padding='post'
)

train_padded_sentences

array([[   63,  3651,   104, ...,     0,     0,     0],
       [    4,  4889,     1, ...,     0,     0,     0],
       [  456,     9,   103, ...,     0,     0,     0],
       ...,
       [   21,    83,  1876, ...,     0,     0,     0],
       [   67,  4558,  7699, ...,     0,     0,     0],
       [ 2938, 27710, 10450, ...,     0,     0,     0]], dtype=int32)

In [34]:
short_df=data_clean(short_df)
long_df=data_clean(long_df)
paragraph_df=data_clean(paragraph_df)
typo_df=data_clean(typo_df)
synonymized_test_df=data_clean(synonymized_test_df)
paraphrased_test_df=data_clean(paraphrased_test_df)

In [35]:
short_df=label_clean(short_df)
long_df=label_clean(long_df)
paragraph_df=label_clean(paragraph_df)
typo_df=label_clean(typo_df)
synonymized_test_df=label_clean(synonymized_test_df)
paraphrased_test_df=label_clean(paraphrased_test_df)

In [36]:
paraphrased_test_df

Unnamed: 0,string,label,string_clean,label_encoded
0,"Chapel, X10, UPC, CoArray Fortran, and Titaniu...",background,chapel upc coarray fortran titanium util part...,0
1,"Moreover, the findings of this current researc...",result,moreov find current research align earlier stu...,2
2,Various tools that are designed to capture pat...,background,various tool design captur patientreport outco...,0
3,Organotypic hippocampal slice cultures created...,method,organotyp hippocamp slice cultur creat interfa...,1
4,Activated PBMCs serve as the fundamental compo...,background,activ pbmcs serv fundament compon convent pbmc...,0
...,...,...,...,...
1856,"Moreover, the current study did not find any d...",result,moreov current studi find differ base gender a...,2
1857,The combination of whole-brain radiation thera...,background,combin wholebrain radiat therapi administ dose...,0
1858,The information collected from this survey con...,method,inform collect survey conduct amsterdam popul ...,1
1859,"Furthermore, combining encapsulated spheroids ...",background,furthermor combin encapsul spheroid enabl use ...,0


In [37]:
x = pd.concat([short_df['string_clean'],long_df['string_clean'],paragraph_df['string_clean'],typo_df['string_clean'],synonymized_test_df['string_clean'],paraphrased_test_df['string_clean']], axis=0).reset_index(drop=True)
y = pd.concat([short_df['label_encoded'],long_df['label_encoded'],paragraph_df['label_encoded'],typo_df['label_encoded'],synonymized_test_df['label_encoded'],paraphrased_test_df['label_encoded']], axis=0).reset_index(drop=True)

In [38]:
print(len(short_df['string_clean']))
print(len(long_df['string_clean']))
print(len(paragraph_df['string_clean']))
print(len(typo_df['string_clean']))
print(len(synonymized_test_df['string_clean']))
print(len(paraphrased_test_df['string_clean']))

262
1599
413
1861
1861
1861


In [39]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
# instantiate the vectorizer
#vect2 = CountVectorizer()
vect.fit(x)

In [41]:
from google.colab import drive
drive.mount('/content/drive')


MessageError: Error: credential propagation was unsuccessful

In [42]:
x_train_dtm = vect.transform(x)

In [43]:
texts=x
target=y
# Calculate the length of our vocabulary
from tensorflow.keras.preprocessing.text import Tokenizer
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

14751

In [44]:
import tensorflow as tf
from nltk.tokenize import word_tokenize
def embed(corpus):
    return word_tokenizer.texts_to_sequences(corpus)

#ltrain = max(texts, key=lambda sentence: len(word_tokenize(sentence)))
#length_long_sentence = len(word_tokenize(ltrain))

train_padded_sentences = tf.keras.preprocessing.sequence.pad_sequences(
    embed(texts),
    length_long_sentence,
    padding='post'
)

train_padded_sentences

array([[ 613,  363,    3, ...,    0,    0,    0],
       [ 677,  906,  338, ...,    0,    0,    0],
       [  84,  450, 1242, ...,    0,    0,    0],
       ...,
       [  90,  279,  972, ...,    0,    0,    0],
       [ 462,  143, 3100, ...,    0,    0,    0],
       [ 839, 8013, 9849, ...,    0,    0,    0]], dtype=int32)

In [45]:
len(train_padded_sentences)

7857

In [46]:
262
1599
413
1861
1861
1861


1861

In [47]:
short_df=train_padded_sentences[:262]
long_df = train_padded_sentences[262:1861]
paragraph_df=train_padded_sentences[1861:2274]
typo_df=train_padded_sentences[2274:4135]
synonymized_test_df=train_padded_sentences[4135:5996]
paraphrased_test_df=train_padded_sentences[5996:]

In [48]:
short_y=y[:262]
long_y = y[262:1861]
paragraph_y=y[1861:2274]
typo_y=y[2274:4135]
synonymized_test_y=y[4135:5996]
paraphrased_test_y=y[5996:]

In [49]:
short_df

array([[ 613,  363,    3, ...,    0,    0,    0],
       [ 677,  906,  338, ...,    0,    0,    0],
       [  84,  450, 1242, ...,    0,    0,    0],
       ...,
       [5282,   43,    7, ...,    0,    0,    0],
       [ 393,  252, 8262, ...,    0,    0,    0],
       [  32,  103, 1075, ...,    0,    0,    0]], dtype=int32)

In [50]:
short_pre=model.predict(short_df)
long_pre=model.predict(long_df)
typo_pre=model.predict(typo_df)
paragraph_pre=model.predict(paragraph_df)
synonymized_test_pre=model.predict(synonymized_test_df)
paraphrased_test_pre=model.predict(paraphrased_test_df)



In [51]:
# categorized = [2 if x <= 0.33 else 1 if 0.33 < x <= 0.66 else 0 for x in y_pre]
def assign_label(y_pre):
  categorized=[]
  for x in y_pre:
    if x[0]<=0.33:
      categorized.append(2)
    elif 0.33<x[0]<=0.66:
      categorized.append(0)
    else:
      categorized.append(1)
  y_pred=np.array(categorized)
  return y_pred



In [52]:
short_pred=assign_label(short_pre)
long_pred=assign_label(long_pre)
typo_pred=assign_label(typo_pre)
paragraph_pred=assign_label(paragraph_pre)
synonymized_test_pred=assign_label(synonymized_test_pre)
paraphrased_test_pred=assign_label(paraphrased_test_pre)

In [58]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [59]:
short_accuracy = accuracy_score(short_y,short_pred)
short_accuracy

0.5076335877862596

In [62]:
f1_score_short = f1_score(short_y,short_pred,average='macro')
f1_score_short

0.2996041563582385

In [64]:
long_accuracy = accuracy_score(long_y,long_pred)
long_accuracy

0.4365228267667292

In [65]:
f1_score_long = f1_score(long_y,long_pred,average='macro')
f1_score_long

0.2678473679683608

In [66]:
synonymized_accuracy=accuracy_score(synonymized_test_y,synonymized_test_pred)
synonymized_accuracy

0.46856528747984955

In [69]:
typo_accuracy = accuracy_score(typo_y,typo_pred)
typo_accuracy

0.4540569586243955

In [70]:
f1_score_typo = f1_score(typo_y,typo_pred,average='macro')
f1_score_typo

0.2741658206038237

In [71]:
paragraph_accuracy = accuracy_score(paragraph_y,paragraph_pred)
paragraph_accuracy

0.41646489104116224

In [72]:
f1_score_paragraph = f1_score(paragraph_y,paragraph_pred,average='macro')
f1_score_paragraph

0.265847211343949

In [73]:
paraphrased_accuracy = accuracy_score(paraphrased_test_y,paraphrased_test_pred)
paraphrased_accuracy

0.492208490059108

In [74]:
f1_score_paraphrased = f1_score(paraphrased_test_y,paraphrased_test_pred,average='macro')
f1_score_paraphrased

0.29672712082287317

In [None]:
from sklearn.metrics import f1_score

#f1 = f1_score(long_y, long_pred)