In [5]:
import pandas as pd
import time
import scipy

#for text pre-processing
import re, string
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Matteo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Matteo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Matteo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Matteo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Matteo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [1]:
# Restore variables persisted by an earlier `%store` call (IPython storemagic).
# data2 is the frame filtered by category further down.
# NOTE(review): cat_extended is not used in any cell visible here — confirm it is still needed.
%store -r data2
%store -r cat_extended

Text pre-processing

In [8]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    """Normalize raw text into lowercase tokens separated by single spaces.

    Steps, in order: lowercase and trim; delete ``<...>`` markup spans;
    replace ASCII punctuation with spaces; delete any remaining character
    that is neither a word character nor whitespace; replace digits with
    spaces; collapse whitespace runs and trim the result.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned text, with no leading or trailing whitespace.
    """
    text = text.lower().strip()
    # Non-greedy match: removes each "<" up to the nearest following ">".
    text = re.sub(r'<.*?>', '', text)
    # Punctuation becomes a space (not '') so "a-b" splits into two tokens.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Anything still left that is not a word character or whitespace
    # (e.g. non-ASCII symbols) is deleted outright.
    text = re.sub(r'[^\w\s]', '', text)
    # Bracketed citation markers such as "[12]" need no dedicated rule: the
    # brackets were removed as punctuation above and the digits go here.
    text = re.sub(r'\d', ' ', text)
    # Collapse and trim last, because the digit replacement can leave
    # whitespace at either end of the string.
    return re.sub(r'\s+', ' ', text).strip()

 
# STOPWORD REMOVAL
def stopword(string):
    """Remove English stopwords from whitespace-separated text.

    Args:
        string (str): Text whose tokens are separated by whitespace. (The
            parameter name shadows the ``string`` module inside this
            function only; the module is not needed here.)

    Returns:
        str: The tokens that are not in NLTK's English stopword list,
        joined by single spaces.
    """
    # Evaluating stopwords.words('english') inside the filter would rebuild
    # the list for every token and then scan it linearly. Build a set once,
    # keep it on the function object, and reuse it on later calls.
    english = getattr(stopword, "_english", None)
    if english is None:
        english = stopword._english = frozenset(stopwords.words('english'))
    return ' '.join(token for token in string.split() if token not in english)


#LEMMATIZATION
# Module-level WordNet lemmatizer instance, created once; lemmatizer()
# below calls wl.lemmatize() for every token.
wl = WordNetLemmatizer()
 
# Helper that maps NLTK part-of-speech tags to WordNet POS constants
def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into a WordNet POS constant.

    Only the start of the tag is inspected: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Any other tag falls back to noun.

    Args:
        tag (str): Tag string, e.g. the second element of a pair returned
            by ``nltk.pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN

# Tokenize the sentence
def lemmatizer(string):
    """Lemmatize each token of ``string`` according to its part-of-speech tag.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each token is then passed to the shared ``wl``
    lemmatizer together with the WordNet POS from ``get_wordnet_pos``.

    Args:
        string (str): Text to lemmatize.

    Returns:
        str: The lemmas joined by single spaces, in token order.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

def finalpreprocess(string):
    """Run the full cleaning pipeline on one piece of text.

    Applies, in order: ``preprocess`` (normalization), ``stopword``
    (stopword removal) and ``lemmatizer`` (lemmatization).

    Args:
        string (str): Raw text.

    Returns:
        str: The result of the three steps applied in sequence.
    """
    normalized = preprocess(string)
    without_stopwords = stopword(normalized)
    return lemmatizer(without_stopwords)

In [10]:
# Restore previously stored copies of data_toy and data_ready.
# NOTE(review): the next cell rebuilds both variables from data2, so running
# it overwrites what is restored here — presumably this cell is a shortcut
# for skipping the slow preprocessing step; confirm which path is intended.
%store -r data_toy
%store -r data_ready

In [29]:
# Keep only the rows of data2 whose category is in the selected subset.
toy_categories = ['hep-ph', 'hep-th', 'quant-ph', 'gr-qc', 'stat', 'math-ph', 'nucl-th', 'q-bio',
                  'hep-ex', 'nlin', 'hep-lat', 'q-fin', 'nucl-ex', 'eess', 'econ']
in_toy_subset = data2['category'].isin(toy_categories)
data_toy = data2[in_toy_subset]

# Relabel the remaining rows 0..n-1.
ind = range(len(data_toy))
data_toy = data_toy.set_index(pd.Index(ind))

# Clean every abstract and print the elapsed wall-clock time in seconds.
s = time.time()
data_toy['clean_text'] = data_toy['abstract'].apply(finalpreprocess)
f = time.time()
print(f-s)

# Two-column frame holding the cleaned text and its category label.
data_ready = data_toy[['clean_text', 'category']].copy()

hep-ph      50603
quant-ph    44679
hep-th      39013
gr-qc       25712
stat        19868
math-ph     17597
nucl-th     13730
q-bio       13316
hep-ex      10158
nlin         8934
hep-lat      6625
q-fin        5979
nucl-ex      5735
eess         1509
econ          246
Name: category, dtype: int64


In [24]:
# Persist both frames with IPython's %store so a later session can bring
# them back with `%store -r` instead of recomputing them.
%store data_toy
%store data_ready

Stored 'data_toy' (DataFrame)
Stored 'data_ready' (DataFrame)


Extracting vectors from text

In [25]:
#SPLITTING THE TRAINING DATASET INTO TRAIN AND TEST
# Stratified 80/20 split of the cleaned text and its labels.
# random_state is fixed so the split, and every score computed from it, is
# the same on each Restart & Run All; without it each run shuffles differently.
X_train, X_test, y_train, y_test = train_test_split(data_ready["clean_text"], data_ready["category"],
                                                    stratify = data_ready["category"], test_size=0.2,
                                                    shuffle=True, random_state=42)

#Tf-Idf
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that fitted vocabulary and IDF weights.
# From here on X_train / X_test hold the TF-IDF matrices, not the raw text.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

In [26]:
# Sort the stored indices of both sparse feature matrices (done in place).
X_train.sort_indices()
X_test.sort_indices()

# Discard the row labels inherited from data_ready and renumber both label
# Series from 0.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [13]:
%store X_train_tfidf
%store y_train
%store X_test_tfidf
%store y_test

Stored 'X_train_tfidf' (csr_matrix)
Stored 'y_train' (Series)
Stored 'X_test_tfidf' (csr_matrix)
Stored 'y_test' (Series)
