In [None]:
import pandas as pd
import time
import scipy

#for text pre-processing
import re, string
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

Text pre-processing

In [None]:
# Lowercase, strip markup/punctuation/digits and normalise whitespace.
# Patterns are compiled once so the per-document call only runs substitutions.
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Normalise a raw text string for downstream tokenisation.

    Steps, in order:
      1. lowercase and strip surrounding whitespace;
      2. drop anything that looks like an HTML/XML tag (``<...>``);
      3. replace every ASCII punctuation character with a space;
      4. delete any remaining character that is neither a word character
         nor whitespace (e.g. non-ASCII symbols);
      5. replace every digit with a space;
      6. collapse whitespace runs to a single space and strip the ends.

    Args:
        text: the string to clean (must be a ``str``).

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    text = text.lower().strip()
    text = _HTML_TAG_RE.sub('', text)
    # Brackets are punctuation, so "[12]"-style markers lose their brackets
    # here and their digits in the digit pass below; no separate
    # bracket-number pattern is needed.
    text = _PUNCT_RE.sub(' ', text)
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    # Final strip: digits at either end of the text would otherwise leave a
    # stray leading/trailing space behind.
    return _WHITESPACE_RE.sub(' ', text).strip()

 
# STOPWORD REMOVAL
# Cache of stop-word sets keyed by language, filled on first use so the
# corpus is loaded once instead of once per token.
_STOPWORD_SETS = {}


def stopword(string):
    """Remove English stop words from a whitespace-separated string.

    Args:
        string: text whose tokens are separated by whitespace.

    Returns:
        The remaining tokens joined by single spaces, in their original order.
    """
    english = _STOPWORD_SETS.get('english')
    if english is None:
        # A frozenset gives constant-time membership tests; the word list
        # itself does not change between calls.
        english = _STOPWORD_SETS['english'] = frozenset(stopwords.words('english'))
    return ' '.join(token for token in string.split() if token not in english)


# LEMMATIZATION
# Single module-level WordNetLemmatizer instance; lemmatizer() below calls
# wl.lemmatize() on it for every token.
wl = WordNetLemmatizer()
 
# Helper translating an NLTK part-of-speech tag into a WordNet POS constant.
def get_wordnet_pos(tag):
    """Map a POS tag string to the matching ``wordnet`` POS constant.

    The decision is made on the tag's first letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag is treated
    as a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags fall back to noun.
    return wordnet.NOUN

# Tokenize, POS-tag and lemmatize a string
def lemmatizer(string):
    """Return ``string`` with every token replaced by its lemma.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``; each token is then lemmatized by ``wl`` using the
    WordNet POS derived from its tag. Lemmas are joined with single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        wordnet_pos = get_wordnet_pos(pos_tag)
        lemmas.append(wl.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

def finalpreprocess(string):
    """Run the full cleaning pipeline: preprocess -> stopword -> lemmatizer."""
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    return lemmatizer(without_stopwords)

In [None]:
# Restore variables previously saved with IPython's %store magic, so the
# cleaned data can be reused without recomputing it.
# NOTE(review): the following cell assigns data_toy and data_ready again;
# presumably only one of the two cells is meant to be run — confirm.
%store -r data_toy
%store -r data_ready

In [None]:
# Keep only the rows of data2 whose category is one of the fifteen codes
# below, then renumber the rows 0..n-1.
# NOTE(review): data2 is not defined in the visible part of this notebook —
# confirm where it is loaded.
kept_categories = [
    'hep-ph', 'hep-th', 'quant-ph', 'gr-qc', 'stat', 'math-ph', 'nucl-th', 'q-bio',
    'hep-ex', 'nlin', 'hep-lat', 'q-fin', 'nucl-ex', 'eess', 'econ',
]
data_toy = data2[data2['category'].isin(kept_categories)]
ind = range(len(data_toy))
data_toy = data_toy.set_index(pd.Index(ind))

# Clean every abstract and print the elapsed wall-clock time in seconds.
s = time.time()
data_toy['clean_text'] = data_toy['abstract'].apply(finalpreprocess)
f = time.time()
print(f-s)

# Two-column frame holding the cleaned text and its category label.
data_ready = pd.DataFrame({column: data_toy[column] for column in ('clean_text', 'category')})

hep-ph      50603
quant-ph    44679
hep-th      39013
gr-qc       25712
stat        19868
math-ph     17597
nucl-th     13730
q-bio       13316
hep-ex      10158
nlin         8934
hep-lat      6625
q-fin        5979
nucl-ex      5735
eess         1509
econ          246
Name: category, dtype: int64


Frequency tables for preprocessed data

In [None]:
import pandas as pd
from google.colab import files
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build a PyDrive client from the default
# application credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Fetch the Drive file by id into a local CSV and load it with pandas.
drive_file_id = "1rJUCyDztqA321Uo690atZCQYS8Q1Y6XN"
local_csv_path = 'preprocessed_data.csv'
downloaded = drive.CreateFile({'id': drive_file_id})
downloaded.GetContentFile(local_csv_path)
data_ready = pd.read_csv(local_csv_path)

In [None]:
# Category labels in value_counts() order, i.e. most frequent first.
cat_index = data_ready['category'].value_counts().index

# dataset0: rows belonging to the three most frequent categories, with rows
# containing missing values removed and the index renumbered.
in_top_three = data_ready['category'].isin(cat_index[0:3])
dataset0 = data_ready[in_top_three].dropna().reset_index(drop=True)

hep-ph      50603
quant-ph    44678
hep-th      39013
Name: category, dtype: int64


In [None]:
# dataset1: rows whose category sits at positions 4..8 of cat_index, with
# rows containing missing values removed and the index renumbered.
# NOTE(review): position 3 of cat_index is covered neither here nor by the
# [0:3] slice used for dataset0 — confirm that the gap is intentional.
in_positions_4_to_8 = data_ready['category'].isin(cat_index[4:9])
dataset1 = data_ready[in_positions_4_to_8].dropna().reset_index(drop=True)

stat       19868
math-ph    17597
nucl-th    13730
q-bio      13316
hep-ex     10158
Name: category, dtype: int64


In [None]:
# Explicit category-code -> display-name mapping.
# Pairing each code with its name directly (instead of zipping a hand-ordered
# name list against the order in which codes first appear in the data) keeps
# the labels correct even if the rows of the datasets are reordered.
ind_dict = {
    'hep-ph': 'HE Physics - Phenomenology',
    'hep-th': 'HE Physics - Theory',
    'quant-ph': 'Quantum Physics',
    'hep-ex': 'HE Physics - Experiment',
    'q-bio': 'Quantitative Biology',
    'nucl-th': 'Nuclear Theory',
    'math-ph': 'Mathematical Physics',
    'stat': 'Statistics',
}
# Parallel lists kept for any code that still uses them; both follow the
# insertion order of ind_dict.
cat_codes = list(ind_dict.keys())
cat_names = list(ind_dict.values())

In [None]:
# Number of rows per category in dataset0, labelled with display names.
(
    dataset0['category']
    .value_counts()
    .rename(index=ind_dict)
)

HE Physics - Phenomenology    50603
Quantum Physics               44678
HE Physics - Theory           39013
Name: category, dtype: int64

In [None]:
# Display dataset1 as the cell's output for a quick visual check.
dataset1

Unnamed: 0,clean_text,category
0,shape hadronic form factor f q decay k e nue m...,hep-ex
1,supplement paper arxiv q bio contain text corr...,q-bio
2,multisite phosphorylation dephosphorylation cy...,q-bio
3,investigate coulomb excitation low lie state u...,nucl-th
4,present paper propose seemingly new method fin...,math-ph
...,...,...
74664,background n back paced auditory serial additi...,q-bio
74665,recently prove invariance observables respect ...,nucl-th
74666,study diverse human population related histori...,q-bio
74667,permissive environment e coli double dry mass ...,q-bio


In [None]:
# Number of rows per category in dataset1, labelled with display names.
(
    dataset1['category']
    .value_counts()
    .rename(index=ind_dict)
)

Statistics                 19868
Mathematical Physics       17597
Nuclear Theory             13730
Quantitative Biology       13316
HE Physics - Experiment    10158
Name: category, dtype: int64

In [None]:
# Print the dataset0 category counts as a LaTeX table; the count column is
# titled 'N. di abstract' (Italian for "No. of abstracts").
dataset0_counts = dataset0['category'].value_counts().rename(index=ind_dict)
print(dataset0_counts.to_latex(header=['N. di abstract'], index=True))

\begin{tabular}{lr}
\toprule
{} & N. di abstract \\
\midrule
HE Physics - Phenomenology &          50603 \\
Quantum Physics            &          44678 \\
HE Physics - Theory        &          39013 \\
\bottomrule
\end{tabular}



In [None]:
# Print the dataset1 category counts as a LaTeX table.
# The count column is given the same 'N. di abstract' title (Italian for
# "No. of abstracts") that the dataset0 table uses; without an explicit
# header the column would be titled with the Series name, 'category', which
# mislabels a column of counts.
dataset1_counts = dataset1['category'].value_counts().rename(index=ind_dict)
print(dataset1_counts.to_latex(header=['N. di abstract'], index=True))

\begin{tabular}{lr}
\toprule
{} &  category \\
\midrule
Statistics              &     19868 \\
Mathematical Physics    &     17597 \\
Nuclear Theory          &     13730 \\
Quantitative Biology    &     13316 \\
HE Physics - Experiment &     10158 \\
\bottomrule
\end{tabular}

