In [36]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import nltk
import torch 
import torch.nn as nn
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder

In [37]:
import warnings
# Install a process-wide "ignore" filter. No message, category or module is
# given, so the filter applies to every warning category.
# NOTE(review): this also hides deprecation warnings from the libraries used in
# later cells — consider narrowing it to the specific warning being silenced.
warnings.filterwarnings("ignore")

In [38]:
# Load the train / dev / test splits. Each file is read as JSON Lines
# (one record per line) and split into its 'string' and 'label' columns.
train_df = pd.read_json('train.jsonl', lines=True)
X_train, y_train = train_df['string'], train_df['label']

dev_df = pd.read_json('dev.jsonl', lines=True)
X_dev, y_dev = dev_df['string'], dev_df['label']

# For the test split only the 'string' and 'label' columns are kept.
test_df = pd.read_json('test.jsonl', lines=True)[['string', 'label']]

# Displayed as the cell output: count / unique / top / freq per column.
test_df.describe()

Unnamed: 0,string,label
count,1861,1861
unique,1860,3
top,For datasets with multiple human annotations (...,background
freq,2,997


## 1st Category: Short data

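Define short data as text with at most 25 tokens, as counted by `nltk.word_tokenize` (which counts punctuation marks as separate tokens, so this is slightly stricter than a plain word count).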

In [39]:
# Keep the test rows whose text yields at most 25 NLTK tokens.
short_df = test_df.loc[
    test_df['string'].map(lambda text: len(nltk.word_tokenize(text)) <= 25)
]

In [40]:
# Summarise the short subset: row count, number of distinct values, and the
# most frequent value with its frequency, for each column.
short_df.describe()

Unnamed: 0,string,label
count,262,262
unique,262,3
top,"After secondary review, 93 studies were includ...",background
freq,1,146


## 2nd Category: Long data

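Define long data as text with more than 25 tokens, as counted by `nltk.word_tokenize` (punctuation marks count as separate tokens). This is the exact complement of the short category.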

In [41]:
# Keep the test rows whose text yields more than 25 NLTK tokens.
long_df = test_df.loc[
    test_df['string'].map(lambda text: len(nltk.word_tokenize(text)) > 25)
]

In [42]:
# Summarise the long subset: row count, number of distinct values, and the
# most frequent value with its frequency, for each column.
long_df.describe()

Unnamed: 0,string,label
count,1599,1599
unique,1598,3
top,For datasets with multiple human annotations (...,background
freq,2,851


## 3rd Category: Paragraph data

Define paragraph data as text that contains more than one sentence, as split by `nltk.sent_tokenize`.

In [43]:
# Keep the test rows that NLTK's sentence splitter breaks into two or more
# sentences.
paragraph_df = test_df.loc[
    test_df['string'].map(lambda text: len(nltk.sent_tokenize(text)) >= 2)
]

In [44]:
# Summarise the paragraph subset: row count, number of distinct values, and
# the most frequent value with its frequency, for each column.
paragraph_df.describe()

Unnamed: 0,string,label
count,413,413
unique,413,3
top,Organotypic hippocampal slice cultures\nInterf...,background
freq,1,209


## 4th Category: Numerical data

Define numerical data as text that contains at least one digit (0–9).

In [45]:
# Keep the test rows whose text contains at least one ASCII digit.
numerical_df = test_df.loc[test_df['string'].str.contains(r'[0-9]+')]

In [46]:
# Summarise the numerical subset: row count, number of distinct values, and
# the most frequent value with its frequency, for each column.
numerical_df.describe()

Unnamed: 0,string,label
count,1843,1843
unique,1842,3
top,For datasets with multiple human annotations (...,background
freq,2,996


## 5th Category: Non-numerical data

Define non-numerical data as text that contains no digits at all (the complement of the numerical category).

In [47]:
# Keep the test rows whose text contains no ASCII digit: the digit match is
# computed first and then negated for the selection.
non_numerical_df = test_df.loc[~(test_df['string'].str.contains(r'[0-9]+'))]

In [48]:
# Summarise the non-numerical subset: row count, number of distinct values,
# and the most frequent value with its frequency, for each column.
non_numerical_df.describe()

Unnamed: 0,string,label
count,18,18
unique,18,2
top,We choose the NYU dataset to conduct ablation ...,method
freq,1,17


## 6th Category: Equation data

Define equation data as text that contains an equals sign (=).

In [49]:
# Keep the test rows whose text contains an equals sign. regex=False makes
# this a plain substring match, so the literal '=' is never interpreted as a
# regular-expression pattern.
equation_df = test_df[test_df['string'].str.contains('=', regex=False)]

In [50]:
# Summarise the equation subset: row count, number of distinct values, and
# the most frequent value with its frequency, for each column.
equation_df.describe()

Unnamed: 0,string,label
count,17,17
unique,17,3
top,…d (subject to g > 0) gives the analytic solut...,method
freq,1,11


## 7th Category: Multiline data

Define multiline data as text that spans multiple lines, i.e. contains at least one newline character.

In [51]:
# Keep the test rows whose text contains a newline character. regex=False
# makes this a plain substring match rather than a regular-expression search.
multilines_df = test_df[test_df['string'].str.contains('\n', regex=False)]

In [52]:
# Summarise the multiline subset: row count, number of distinct values, and
# the most frequent value with its frequency, for each column.
multilines_df.describe()

Unnamed: 0,string,label
count,132,132
unique,132,3
top,"Chapel, as well as X10 [2], UPC [3] , CoArray ...",method
freq,1,63
