## Classifiers

In [1]:
import pandas as pd
import numpy as np

##### Load Text-Clean Training Set:

In [2]:
# Alternative input (disabled): the lemmatized variant, going by its file name.
#dataset_path = "./resources/lemmatized_training_set.pkl"
# Active input: the stemmed variant, going by its file name.
dataset_path = "./resources/stemmed_training_set.pkl"
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle(dataset_path)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2549.1.1,Abstract,work machin learn extract focus distinct subpr...,N_PD
1,2549.2.1,Introduction,extract problem convert text newswir articl we...,N_PD
2,2549.2.2,Introduction,increas import brought attent kind automat doc...,N_PD
3,2549.2.3,Introduction,work focus learn approach requir linguist expl...,N_PD
4,2549.2.4,Introduction,time work integr led need special wrapper proc...,N_PD


In [3]:
# Encode the string labels as integer ids.
# sort=True assigns ids in sorted label order ('N_PD' -> 0, 'PD' -> 1) instead of
# order of first appearance, so the encoding no longer depends on which label
# happens to be in the first row. Later cells treat id 1 as the positive class
# (binary 'f1' scoring), so the mapping has to be stable.
df['label_id'] = df['label_subsection'].factorize(sort=True)[0]
df

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id
0,2549.1.1,Abstract,work machin learn extract focus distinct subpr...,N_PD,0
1,2549.2.1,Introduction,extract problem convert text newswir articl we...,N_PD,0
2,2549.2.2,Introduction,increas import brought attent kind automat doc...,N_PD,0
3,2549.2.3,Introduction,work focus learn approach requir linguist expl...,N_PD,0
4,2549.2.4,Introduction,time work integr led need special wrapper proc...,N_PD,0
...,...,...,...,...,...
132703,101131.17.2,C REPRODUCIBILITY,review confer paper iclr main task network rew...,N_PD,0
132711,101144.2.6,1. Introduction,approach preserv spatial spectral tempor struc...,PD,1
132712,101144.2.7,1. Introduction,increas effort automat detect phase start time...,PD,1
132713,101144.2.8,1. Introduction,sampl data descript data numer eeg read partic...,PD,1


In [5]:
# One row per distinct (label, id) pair, ordered by id.
label_df = (
    df.loc[:, ['label_subsection', 'label_id']]
    .drop_duplicates()
    .sort_values(by='label_id')
)
# Forward lookup: label name -> label id.
label_dict = dict(zip(label_df['label_subsection'].tolist(), label_df['label_id'].tolist()))
# Reverse lookup: label id -> label name.
label_id_dict = dict(zip(label_df['label_id'].tolist(), label_df['label_subsection'].tolist()))
print(label_dict, label_id_dict)

{'N_PD': 0, 'PD': 1} {0: 'N_PD', 1: 'PD'}


##### Feature Extraction:

In [8]:
%%time
corpus = df['text_subsection']

vectorizer_modes = ['tdidf_trigr', 'tdidf_bigr', 'tdidf', 'count']

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

vectorizers = {
    'tdidf_trigr': {'name': 'TF-IDF_trigrams  ',
                   'vect': TfidfVectorizer(max_features=40000, min_df=3, norm='l2', ngram_range=(1, 2))
    },
    'tdidf_bigr': {'name': 'TF-IDF_bigrams  ',
                   'vect': TfidfVectorizer(max_features=40000, min_df=3, norm='l2', ngram_range=(1, 2))
    },
    'tdidf': {'name': 'TF-IDF          ',
              'vect': TfidfVectorizer(max_features=40000, norm='l2')
    },
    'count': {'name': 'BoW             ',
              'vect': CountVectorizer(max_features=40000)
    }
}

for vect_mode in vectorizer_modes:
    vectorizers[vect_mode]['vectorizer'] = vectorizers[vect_mode]['vect'] #max_features=1500, min_df=5, max_df=0.7
    vectorizers[vect_mode]['features'] = vectorizers[vect_mode]['vectorizer'].fit_transform(corpus).toarray()

CPU times: user 30.6 s, sys: 10.2 s, total: 40.9 s
Wall time: 42.7 s


In [9]:
# For each vectorizer mode, print the fitted vocabulary size and then the
# shape of the corresponding feature matrix.
for vect_mode in vectorizer_modes:
    fitted = vectorizers[vect_mode]
    vocabulary_size = len(fitted['vectorizer'].vocabulary_)
    print('{}:'.format(vect_mode), vocabulary_size)
    print(fitted['features'].shape)

tdidf_trigr: 40000
(96380, 40000)
tdidf_bigr: 40000
(96380, 40000)
tdidf: 40000
(96380, 40000)
count: 40000
(96380, 40000)


In [10]:
%%time
from sklearn.feature_selection import chi2
N = 20

### all vectorizer_modes: Kernel dies

#for m in vectorizer_modes:
#    for _label, _id in sorted(label_dict.items()):
#        features_chi2 = chi2(vectorizers[m]['features'], df.label_id==_id)
#        print(features_chi2)
#        print(m, "-"*40)
#        indices = np.argsort(features_chi2[0])
#        feature_names = np.array(vectorizers[m]['vectorizer'].get_feature_names())[indices]
#        unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
#        bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
#        trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
#        print("# {}:\t'{}'".format(vectorizers[m]['name'], _label))
#        print("  . Most Correlated UNIGRAMS:\n. {}".format(', '.join(unigrams[-N:] if _id == 1 else unigrams[:N])))
#        print("  . Most Correlated BIGRAMS:\n. {}".format(', '.join(bigrams[-N:] if _id == 1 else bigrams[:N])))
#        print("  . Most Correlated TRIGRAMS:\n. {}\n".format(', '.join(trigrams[-N:] if _id == 1 else trigrams[:N])))


### only 'tdidf_trigr'

for _label, _id in sorted(label_subsection_dict.items()):
    features_chi2 = chi2(vectorizers['tdidf_trigr']['features'], df.label_id == _id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(vectorizers['tdidf_trigr']['vectorizer'].get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
    print("# {}:\t'{}'".format(vectorizers['tdidf_trigr']['name'], _label))
    print("  . Most Correlated UNIGRAMS:\n. {}".format(', '.join(unigrams[-N:] if _id == 1 else unigrams[:N])))
    print("  . Most Correlated BIGRAMS:\n. {}".format(', '.join(bigrams[-N:] if _id == 1 else bigrams[:N])))
    print("  . Most Correlated TRIGRAMS:\n. {}\n".format(', '.join(trigrams[-N:] if _id == 1 else trigrams[:N])))

# TF-IDF_trigrams  :	'N_PD'
  . Most Correlated UNIGRAMS:
. lot, presum, regardless, singlelabel, itembas, sabato, unselect, relianc, cambridg, mirror, zl, curvilinear, exacerb, creat, rq, wave, encourag, feder, demo, timeofday
  . Most Correlated BIGRAMS:
. domain includ, error use, embed learn, polici chang, equat give, approach optim, discret structur, tube predict, network present, demonstr potenti, propos comput, infer result, time featur, train phase, mean map, rout problem, issu address, plan use, dyngraphvecrnn dyngraphvecaernn, function need
  . Most Correlated TRIGRAMS:
. 

# TF-IDF_trigrams  :	'PD'
  . Most Correlated UNIGRAMS:
. defin, rd, cbr, mmt, lovasz, definit, xi, nurs, logo, arcconsist, assum, given, notat, multiteam, formal, goal, denot, descript, problem, statement
  . Most Correlated BIGRAMS:
. notat problem, arg pr, statement introduc, preced constraint, goal learn, denot set, statement assum, formal defin, statement goal, mission multiteam, lovasz extens, statem

##### Classifiers:

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

model_modes = ['lr', 'svc', 'rf', 'mnb']
models = {
    'lr': {'name': 'Logistic Regression ',
           'estimator': LogisticRegression(random_state=0)
    },
    'rf': {'name': 'Random Forest       ',
           'estimator': RandomForestClassifier(n_estimators=250, criterion='gini', max_depth=None, random_state=0)
    },
    'svc': {'name': 'Linear SVC         ',
           'estimator': LinearSVC()
    },
    'mnb': {'name': 'Multinomial NB     ',
           'estimator': MultinomialNB()
    }
}

vectorizer_modes = ['tdidf_bigr', 'tdidf', 'count']
#model_modes = ['lr']
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(model_modes) * len(vectorizer_modes)))
entries = []
for vect_mode in vectorizer_modes:
    for model_mode in model_modes:
        model = models[model_mode]['estimator']
        features = vectorizers[vect_mode]['features']
        labels = df['label_id']
        f1_scores = cross_val_score(model, features, labels, scoring='f1', cv=CV) # all the estimators are classifiers, so StratifiedKFold is used
        accuracy_scores = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
        fold_idx=0
        for f1_score, accuracy_score in zip(f1_scores,accuracy_scores):
            entries.append((vectorizers[vect_mode]['name'].strip(), models[model_mode]['name'].strip(), fold_idx, f1_score, accuracy_score))
            fold_idx+=1
        cv_df = pd.DataFrame(entries, columns=['vectorize_name', 'model_name', 'fold_idx', 'f1_score', 'accuracy_score'])
        print("%s - %s : done." %(vect_mode, model_mode))

In [None]:
# Mean score over the folds for each (model, vectorizer) pair; the fold index
# is averaged along with the scores and then discarded.
cv_df_f1_acc = (
    cv_df
    .groupby(['model_name', 'vectorize_name'])
    .mean()
    .drop(columns='fold_idx')
)
cv_df_f1_acc

In [None]:
# Persist the averaged cross-validation scores (pickle protocol 4).
# Note: this rebinds `dataset_path`, which earlier held the training-set path.
dataset_path = "./resources/cross_validation_df.pkl"
pd.to_pickle(cv_df_f1_acc, dataset_path, protocol=4)