## Compute Classifier

In [1]:
import pandas as pd
import numpy as np
import pickle

##### Load the Cleaned-Text Training Set:

In [2]:
# Path of the pre-processed training set to load; the stemmed variant is active.
# Alternative kept for reference: "./resources/lemmatizated_training_set.pkl"
dataset_path = "./resources/stemmed_training_set.pkl"
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
df = pd.read_pickle(dataset_path)
# Preview the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2549.1.1,Abstract,work machin learn extract focus distinct subpr...,N_PD
1,2549.2.1,Introduction,extract problem convert text newswir articl we...,N_PD
2,2549.2.2,Introduction,increas import brought attent kind automat doc...,N_PD
3,2549.2.3,Introduction,work focus learn approach requir linguist expl...,N_PD
4,2549.2.4,Introduction,time work integr led need special wrapper proc...,N_PD


In [3]:
# Encode every distinct value of 'label_subsection' as an integer id.
# factorize() numbers the labels in order of first appearance, so the
# resulting ids depend on the row order of the loaded data set.
label_codes = pd.factorize(df['label_subsection'])[0]
df['label_id'] = label_codes
df

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id
0,2549.1.1,Abstract,work machin learn extract focus distinct subpr...,N_PD,0
1,2549.2.1,Introduction,extract problem convert text newswir articl we...,N_PD,0
2,2549.2.2,Introduction,increas import brought attent kind automat doc...,N_PD,0
3,2549.2.3,Introduction,work focus learn approach requir linguist expl...,N_PD,0
4,2549.2.4,Introduction,time work integr led need special wrapper proc...,N_PD,0
...,...,...,...,...,...
132703,101131.17.2,C REPRODUCIBILITY,review confer paper iclr main task network rew...,N_PD,0
132711,101144.2.6,1. Introduction,approach preserv spatial spectral tempor struc...,PD,1
132712,101144.2.7,1. Introduction,increas effort automat detect phase start time...,PD,1
132713,101144.2.8,1. Introduction,sampl data descript data numer eeg read partic...,PD,1


In [4]:
# One row per distinct (label, id) pair, ordered by id.
label_df = (
    df[['label_subsection', 'label_id']]
    .drop_duplicates()
    .sort_values('label_id')
)

# Forward lookup (label name -> id) and reverse lookup (id -> label name).
label_dict = dict(zip(label_df['label_subsection'], label_df['label_id']))
label_id_dict = dict(zip(label_df['label_id'], label_df['label_subsection']))
print(label_dict, label_id_dict)

{'N_PD': 0, 'PD': 1} {0: 'N_PD', 1: 'PD'}


##### Feature Extraction:

In [5]:
%%time
corpus = df['text_subsection']

vectorizer_modes = ['tdidf_bigr']#, 'tdidf', 'count_bigr', 'count']

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

vectorizers = {
    'tdidf_bigr': {'name': 'TF-IDF_bigrams',
                   'vect': TfidfVectorizer(max_features=40000, min_df=3, norm='l2', ngram_range=(1, 2))
    },
    'tdidf': {'name': 'TF-IDF',
              'vect': TfidfVectorizer(max_features=40000, norm='l2')
    },
    'count_bigr': {'name': 'Count_bigrams',
              'vect': CountVectorizer(max_features=40000, min_df=3, max_df=0.7, ngram_range=(1, 2))
    },
    'count': {'name': 'Count',
              'vect': CountVectorizer(max_features=40000)
    }
}

for vect_mode in vectorizer_modes:
    vectorizers[vect_mode]['vectorizer'] = vectorizers[vect_mode]['vect']
    vectorizers[vect_mode]['features'] = vectorizers[vect_mode]['vectorizer'].fit_transform(corpus).toarray()

Wall time: 18.8 s


##### Classifier:

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# Fixed seed for the estimators that draw random numbers, so that repeated
# runs of the notebook train identical models.
RANDOM_STATE = 42

# Short ids of the candidate classifiers defined below.
model_modes = ['lr', 'svc', 'rf', 'mnb']

# Registry of candidate classifiers, keyed by mode id. Each entry holds a
# padded display name ('name') and an unfitted estimator ('estimator').
models = {
    'lr': {'name': 'Logistic Regression ',
           'estimator': LogisticRegression()
    },
    'rf': {'name': 'Random Forest       ',
           'estimator': RandomForestClassifier(n_estimators=250, criterion='gini', max_depth=None,
                                               random_state=RANDOM_STATE)
    },
    'svc': {'name': 'Linear SVC         ',
           'estimator': LinearSVC(random_state=RANDOM_STATE)
    },
    'mnb': {'name': 'Multinomial NB     ',
           'estimator': MultinomialNB()
    }
}

Computation of the 'Multinomial NB' classifier with 'tdidf_bigr' features:

In [None]:
## (disabled) save the whole 'tdidf_bigr' registry entry, i.e. the dict with
## the display name, the vectorizer and the extracted feature matrix:
#vectorizer_path = "./resources/tdidf_bigram.pkl"
#with open(vectorizer_path, 'wb') as picklefile:
#    pickle.dump(vectorizers['tdidf_bigr'], picklefile)

## (disabled) save only the fitted vectorizer object:
#vectorizer_path = "./resources/tdidf_bigram_vectorizer.pkl"
#with open(vectorizer_path, 'wb') as picklefile:
#    pickle.dump(vectorizers['tdidf_bigr']['vectorizer'], picklefile)

In [7]:
# Training inputs: features from the bigram TF-IDF entry; targets: integer label ids.
tfidf_bigram_entry = vectorizers['tdidf_bigr']
X_train, y_train = tfidf_bigram_entry['features'], df['label_id']
print(X_train.shape, y_train.shape)

(96380, 40000) (96380,)


In [8]:
%%time
classifier = models['mnb']['estimator']
classifier.fit(X_train, y_train)

Wall time: 2min 12s


MultinomialNB()

In [9]:
# Persist the trained classifier to disk (pickle protocol 4).
# NOTE(review): the fitted vectorizer is not saved by any executed cell shown
# here; confirm it is persisted elsewhere, since predictions need both.
classifier_path = "./resources/tdidf_bigr-mnb.pkl"
with open(classifier_path, mode='wb') as model_file:
    model_file.write(pickle.dumps(classifier, protocol=4))