In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import nltk
import torch 
import torch.nn as nn
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder

In [54]:
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category is
# shown. NOTE(review): presumably meant to keep cell outputs uncluttered, but it
# also hides deprecation and data-quality warnings — consider narrowing it.
warnings.filterwarnings("ignore")

In [55]:
# Read the three JSON-lines splits from the working directory. For train and
# dev, the text column (`string`) and the target column (`label`) are also
# exposed as separate Series.
train_df = pd.read_json('train.jsonl', lines=True)
X_train, y_train = train_df['string'], train_df['label']

dev_df = pd.read_json('dev.jsonl', lines=True)
X_dev, y_dev = dev_df['string'], dev_df['label']

# Only the text and its label are kept from the test split.
test_df = pd.read_json('test.jsonl', lines=True)[['string', 'label']]

# Quick overview of the test split (displayed as the cell output).
test_df.describe()

Unnamed: 0,string,label
count,1861,1861
unique,1860,3
top,For datasets with multiple human annotations (...,background
freq,2,997


## 1st Category: Short data

Short data: test texts with at most 25 tokens, as counted by `nltk.word_tokenize` (punctuation marks count as tokens).

In [39]:
# Short subset: test rows whose text yields at most 25 tokens from
# nltk.word_tokenize.
short_df = test_df.loc[
    test_df['string'].apply(lambda text: len(nltk.word_tokenize(text)) <= 25)
]

In [40]:
# Summary of the short subset: row count, number of distinct values, and the
# most frequent value (with its frequency) for each column.
short_df.describe()

Unnamed: 0,string,label
count,262,262
unique,262,3
top,"After secondary review, 93 studies were includ...",background
freq,1,146


## 2nd Category: Long data

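Long data: test texts with more than 25 tokens, as counted by `nltk.word_tokenize` (the complement of the short category, so the two subsets partition the test set).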

In [41]:
# Long subset: test rows whose text yields more than 25 tokens from
# nltk.word_tokenize.
long_df = test_df.loc[
    test_df['string'].apply(lambda text: len(nltk.word_tokenize(text)) > 25)
]

In [42]:
# Summary of the long subset: row count, number of distinct values, and the
# most frequent value (with its frequency) for each column.
long_df.describe()

Unnamed: 0,string,label
count,1599,1599
unique,1598,3
top,For datasets with multiple human annotations (...,background
freq,2,851


## 3rd Category: Paragraph data

Paragraph data: test texts that `nltk.sent_tokenize` splits into more than one sentence.

In [43]:
# Paragraph subset: test rows that nltk.sent_tokenize splits into more than
# one sentence.
paragraph_df = test_df.loc[
    test_df['string'].apply(lambda text: len(nltk.sent_tokenize(text)) > 1)
]

In [44]:
# Summary of the paragraph subset: row count, number of distinct values, and
# the most frequent value (with its frequency) for each column.
paragraph_df.describe()

Unnamed: 0,string,label
count,413,413
unique,413,3
top,Organotypic hippocampal slice cultures\nInterf...,background
freq,1,209


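## 4th Category: Typo data

Define typo data as the test texts after random perturbation: five adjacent-letter swaps inside randomly chosen tokens, followed by up to three swaps of adjacent tokens.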

In [64]:
def rearrange_letter(word):
    """Return ``word`` with one random pair of adjacent characters swapped.

    This imitates a transposition typo. The swap position is drawn from the
    global ``random`` generator, so results are only repeatable if the caller
    seeds it.

    Args:
        word: The token to perturb.

    Returns:
        A string with the same characters as ``word``, two neighbouring ones
        exchanged. Inputs with fewer than two characters (including the empty
        string) are returned unchanged.
    """
    chars = list(word)

    # With fewer than two characters there is no adjacent pair to swap. The
    # empty string must be caught here too: random.randint(0, -2) would raise
    # ValueError.
    if len(chars) < 2:
        return ''.join(chars)

    # randint is inclusive on both ends, so the last valid left index is
    # len(chars) - 2.
    left = random.randint(0, len(chars) - 2)
    chars[left], chars[left + 1] = chars[left + 1], chars[left]
    return ''.join(chars)

def rearrange_word(text, num_letter_swaps=5, num_word_swaps=3):
    """Inject typo-like noise into ``text`` and return the perturbed string.

    The text is tokenized with ``nltk.word_tokenize``; then
    ``num_letter_swaps`` times a random token has two adjacent characters
    swapped (via ``rearrange_letter``), and finally up to ``num_word_swaps``
    random pairs of adjacent tokens are exchanged.

    The same token may be drawn more than once, and single-character tokens
    are left unchanged by ``rearrange_letter``, so the number of visible edits
    can be lower than requested. All randomness comes from the global
    ``random`` generator.

    Args:
        text: The input string.
        num_letter_swaps: How many letter-swap attempts to make (default 5).
        num_word_swaps: Upper bound on adjacent-token swaps (default 3); it is
            capped at ``number of tokens - 1``.

    Returns:
        The tokens joined by single spaces. The original spacing is therefore
        not preserved (punctuation ends up as separate, space-delimited
        tokens). Text that produces no tokens yields the empty string.
    """
    words = nltk.word_tokenize(text)
    num_words = len(words)

    # Nothing to perturb; without this guard random.randint(0, -1) below would
    # raise ValueError for empty or whitespace-only text.
    if num_words == 0:
        return ''

    # Letter-level noise: swap two neighbouring characters inside random tokens.
    for _ in range(num_letter_swaps):
        idx = random.randint(0, num_words - 1)
        words[idx] = rearrange_letter(words[idx])

    # Word-level noise: exchange random neighbouring tokens. The cap makes the
    # loop a no-op for single-token text, where no adjacent pair exists.
    for _ in range(min(num_word_swaps, num_words - 1)):
        idx = random.randint(0, num_words - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]

    return ' '.join(words)

In [68]:
# rearrange_word draws from the global `random` generator. Seed it right before
# the perturbation so that re-running this cell (or the whole notebook)
# produces the same typo set. Note that this resets the global `random` state
# for any code that runs afterwards.
TYPO_SEED = 42
random.seed(TYPO_SEED)

typo_series = test_df['string'].apply(rearrange_word)

# Pair each perturbed text with the label of the test row it came from; both
# Series share test_df's index.
typo_df = pd.DataFrame({
    'label': test_df.label,
    'string': typo_series
})

In [72]:
# Summary of the typo set: row count, number of distinct values, and the most
# frequent value (with its frequency) for each column.
typo_df.describe()

Unnamed: 0,label,string
count,1861,1861
unique,3,1861
top,background,"Chapel , as well as X10 2 [ ] , UPC [ 3 ] , Co..."
freq,997,1


## 5th Category: Synonym data

## 6th Category: Paraphrased data

In [3]:
# Paraphrased test set: read from its own JSON-lines file and reduced to the
# text and label columns.
paraphrased_test_df = pd.read_json('paraphrased.jsonl', lines=True)[['string', 'label']]

# Overview of the paraphrased set (displayed as the cell output).
paraphrased_test_df.describe()

Unnamed: 0,string,label
count,1861,1861
unique,1861,3
top,"Chapel, X10, UPC, CoArray Fortran, and Titaniu...",background
freq,1,997
