In [1]:
import pandas as pd
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
import time
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import dill as pickle
labels_ = ['Computer Science', 'Physics', 'Mathematics','Statistics', 'Quantitative Biology', 'Quantitative Finance']

Loading raw data and training data

In [2]:
raw_train_data = pd.read_csv('../input_data/train.csv')

X = pd.read_csv('../output_files/train_tfidf_6000f.csv')
y = raw_train_data[labels_]
print(X.shape)
print(y.shape)

(20972, 6000)
(20972, 6)


In [None]:
X.head()

Unnamed: 0,effect,maps,neural,network,neural network,and,poisson,for,functions,finite,...,volunt,browser,pm25,mathcalm,memristor,ecg,lookahead,richclub,pomdp,starless
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.424398,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
y.head()

Unnamed: 0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,0,0,1,0,0,0
3,0,0,1,0,0,0
4,1,0,0,1,0,0


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=25 )

### Helper function
This function takes model, X_test, Y_test and prints f1-score and accuracy scores per topics

In [4]:
def model_scorer(model, xtest, ytest):
    y_pred = model.predict(xtest)
    f1score = f1_score(y_test, y_pred, average='micro')
    print(f'f1 micro score: {f1score}')
    for i in range(y_test.shape[1]):
        print(f'Topic_{i+1} accuracy= {accuracy_score(y_true=y_test.iloc[:,i], y_pred=y_pred[:,i])}')
    print(multilabel_confusion_matrix(ytest, y_pred))

## Multinomial Naive Bayes

In [5]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((16777, 6000), (16777, 6), (4195, 6000), (4195, 6))

In [13]:
# Training is already done. and model is saved.
from sklearn.naive_bayes import MultinomialNB

start_time = time.time()
clf_model_mnb = OneVsRestClassifier(MultinomialNB(alpha=0.1), n_jobs=-1)
clf_model_mnb.fit(X_train, y_train)
print('Training took {:.3f} seconds'.format(time.time() - start_time))

Training took 18.344 seconds


Saving Model to pickle file...

In [6]:
mnb_model_file = '../output_files/mnb_model.sav'

In [14]:
with open(mnb_model_file, 'wb') as fopen:
    pickle.dump(clf_model_mnb, fopen)

In [7]:
with open(mnb_model_file, 'rb') as fread:
    clf_mnb = pickle.load(fread)

In [16]:
model_scorer(clf_mnb, X_test, y_test)

f1 micro score: 0.7980877390326209
Topic_1 accuracy= 0.8429082240762813
Topic_2 accuracy= 0.9218116805721096
Topic_3 accuracy= 0.8905840286054827
Topic_4 accuracy= 0.8707985697258641
Topic_5 accuracy= 0.9711561382598332
Topic_6 accuracy= 0.9892729439809297
[[[2093  420]
  [ 239 1443]]

 [[2844  120]
  [ 208 1023]]

 [[2840  223]
  [ 236  896]]

 [[2808  369]
  [ 173  845]]

 [[4040   21]
  [ 100   34]]

 [[4134   10]
  [  35   16]]]


**5-fold cross validation with grid search to find the best value of alpha**

In [12]:
alpha_arr = [1e-4, 1e-3, 1e-2, 1e-1, 0, 1, 2, 10]
clf_CV = GridSearchCV(clf_model_mnb, {'estimator__alpha':alpha_arr}, scoring='f1_micro', verbose=1, n_jobs=-1)

start_time = time.time()
clf_CV.fit(X_train, y_train)
print('Training took {:.3f} seconds'.format(time.time() - start_time))
clf_CV.best_estimator_

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Training took 236.087 seconds


OneVsRestClassifier(estimator=MultinomialNB(alpha=0.1), n_jobs=-1)

In [17]:
# Printing the result of grid search cross validation
pd.DataFrame(clf_CV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,13.471359,1.285233,3.044115,0.463865,0.0001,{'estimator__alpha': 0.0001},0.802331,0.798436,0.802734,0.797666,0.803041,0.800842,0.002303,4
1,11.517519,2.142521,3.806543,1.467137,0.001,{'estimator__alpha': 0.001},0.80303,0.799764,0.805259,0.798434,0.803929,0.802083,0.002572,3
2,10.031562,1.78762,4.641951,0.981307,0.01,{'estimator__alpha': 0.01},0.803903,0.802253,0.806531,0.800566,0.805284,0.803707,0.00212,2
3,9.185314,1.158198,4.930585,0.448698,0.1,{'estimator__alpha': 0.1},0.804901,0.801724,0.80755,0.801032,0.806323,0.804306,0.002543,1
4,9.584277,0.965773,4.361456,1.132168,0.0,{'estimator__alpha': 0},0.797449,0.791602,0.796901,0.793856,0.8,0.795962,0.002927,6
5,9.533543,1.284787,4.665036,1.420612,1.0,{'estimator__alpha': 1},0.799153,0.79581,0.804995,0.796167,0.80406,0.800037,0.003857,5
6,9.002414,1.268036,4.837592,1.198256,2.0,{'estimator__alpha': 2},0.792918,0.790085,0.800904,0.791908,0.798656,0.794894,0.004153,7
7,8.936553,1.179791,3.474391,1.356673,10.0,{'estimator__alpha': 10},0.71583,0.718511,0.730981,0.721503,0.728689,0.723103,0.005828,8


## Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
start_time = time.time()
clf_model_lr = OneVsRestClassifier(LogisticRegression(C=2), n_jobs=-1)
clf_model_lr.fit(X_train, y_train)
print('Training took {:.3f} seconds'.format(time.time() - start_time))

Training took 109.573 seconds


In [8]:
log_reg_model_file = '../output_files/logreg_model.sav'

In [19]:
with open(log_reg_model_file, 'wb') as fopen:
    pickle.dump(clf_model_lr, fopen)

In [9]:
with open(log_reg_model_file, 'rb') as fread:
    clf_logreg = pickle.load(fread)

In [21]:
model_scorer(clf_logreg, X_test, y_test)

f1 micro score: 0.7983678343949046
Topic_1 accuracy= 0.8479141835518474
Topic_2 accuracy= 0.9234803337306317
Topic_3 accuracy= 0.9041716328963051
Topic_4 accuracy= 0.8812872467222884
Topic_5 accuracy= 0.9706793802145411
Topic_6 accuracy= 0.9895113230035757
[[[2184  329]
  [ 309 1373]]

 [[2859  105]
  [ 216 1015]]

 [[2924  139]
  [ 263  869]]

 [[2974  203]
  [ 295  723]]

 [[4054    7]
  [ 116   18]]

 [[4138    6]
  [  38   13]]]


In [19]:
# Machine hangs when running below code. Needed hard shutdown.

# # C_arr = np.linspace(, 105, 11)
# C_arr = [0.8,0.9]
# # array([ 95.,  96.,  97.,  98.,  99., 100., 101., 102., 103., 104., 105.])
# clf_CV = GridSearchCV(clf_model_lr, {'estimator__C':C_arr}, scoring='f1_micro', verbose=1, n_jobs=-1)

# start_time = time.time()
# clf_CV.fit(X_train, y_train)
# print('Training took {:.3f} seconds'.format(time.time() - start_time))
# clf_CV.best_estimator_

## Voting classifier
Both models are doing great job. Let's combine them for good

In [10]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

In [11]:
estimators = [('mnb', clf_mnb), ('lr', clf_logreg)]
clf_voting = VotingClassifier(estimators= estimators,
                        voting='soft', weights=[2, 1], n_jobs=-1)

In [13]:
start_time = time.time()
clf_model_voting = OneVsRestClassifier(clf_voting, n_jobs=3)
clf_model_voting.fit(X_train, y_train)
print('Training took {:.3f} seconds'.format(time.time() - start_time))

Training took 121.254 seconds


In [15]:
voting_model_file = '../output_files/voting_model.sav'

In [16]:
with open(voting_model_file, 'wb') as fopen:
    pickle.dump(clf_model_voting, fopen)

In [19]:
with open(voting_model_file, 'rb') as fread:
    clf_model_voting = pickle.load(fread)

In [20]:
model_scorer(clf_model_voting, X_test, y_test)

f1 micro score: 0.8053627460302368
Topic_1 accuracy= 0.8522050059594756
Topic_2 accuracy= 0.9244338498212157
Topic_3 accuracy= 0.8974970202622169
Topic_4 accuracy= 0.8779499404052443
Topic_5 accuracy= 0.9706793802145411
Topic_6 accuracy= 0.9892729439809297
[[[2127  386]
  [ 234 1448]]

 [[2858  106]
  [ 211 1020]]

 [[2870  193]
  [ 237  895]]

 [[2856  321]
  [ 191  827]]

 [[4043   18]
  [ 105   29]]

 [[4134   10]
  [  35   16]]]


In [21]:
clf_model_voting.get_params

<bound method BaseEstimator.get_params of OneVsRestClassifier(estimator=VotingClassifier(estimators=[('mnb',
                                                            OneVsRestClassifier(estimator=MultinomialNB(alpha=0.1),
                                                                                n_jobs=-1)),
                                                           ('lr',
                                                            OneVsRestClassifier(estimator=LogisticRegression(C=2),
                                                                                n_jobs=-1))],
                                               n_jobs=-1, voting='soft',
                                               weights=[2, 1]),
                    n_jobs=3)>

## Stacking Classifier

In [22]:
from sklearn.ensemble import StackingClassifier

In [23]:
clf_stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

<IPython.core.display.Javascript object>

In [24]:
start_time = time.time()
clf_model_stack = OneVsRestClassifier(clf_stacking, n_jobs=3)
clf_model_stack.fit(X_train, y_train)
print('Training took {:.3f} seconds'.format(time.time() - start_time))

Training took 505.307 seconds


In [15]:
stacking_model_file = '../output_files/stacking_model.sav'

In [16]:
with open(stacking_model_file, 'wb') as fopen:
    pickle.dump(clf_model_stack, fopen)

In [19]:
with open(stacking_model_file, 'rb') as fread:
    clf_model_stack = pickle.load(fread)

In [None]:
model_scorer(clf_model_stack, X_test, y_test)

# Conclusion
- Till now best classifier is voting classifier with less training time, high f1-micro score, high topic wise accuracy.

## next steps
- We will use saved voting model in separate python module for future predictions.