In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
from sklearn.metrics import accuracy_score,mean_absolute_error,f1_score
import pickle

In [3]:

df = pd.read_csv('data/training_chunks/training_chunk1.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   10000 non-null  object
 1   text    10000 non-null  object
 2   Label   10000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 234.5+ KB


In [5]:
df.shape

(10000, 3)

In [6]:


df.head()

Unnamed: 0,title,text,Label
0,tension rise north dakota pipeline trump set ...,tension increased week near construction site...,1
1,billboard bay area confirms time impeach trump,billboard bay area confirms time impeach trum...,0
2,texas lawmaker file crucial bill could save c...,tuesday senator jose menendez dsan antonio fi...,0
3,canada obama watch new prime minister call pr...,global wussification male identifying feminis...,0
4,former nypd vip security detail say hillary d...,almost scary idea hillary would lie major hea...,0


In [7]:
df.tail()

Unnamed: 0,title,text,Label
9995,northern ireland life inside fountain,william jackson remembers exodus vividly prot...,1
9996,sweden brink collapse gun purchase way uppepp...,politically correct country sweden paying big...,0
9997,family living traditional lifestyle torn apar...,conform pay ultimate price police seized ten ...,0
9998,trump us tragedy congratulate video,case know explosion lower manhattan saturday ...,0
9999,president trump attend paris july celebration,president donald trump travel france bastille...,1


In [8]:

count_vectorizer = CountVectorizer()
tfid_transformer = TfidfTransformer()

In [9]:
def transform_vectorize(sample):
    count_vectorizer.fit(sample)
    temp = count_vectorizer.transform(sample)
    tfid_transformer.fit(temp)
    mat = tfid_transformer.transform(temp).todense()
    return mat

In [10]:
df['Total'] = df['title']+ ''+df['text']

In [11]:
df.drop(columns=['title','text'],inplace=True)

In [12]:
df.head()

Unnamed: 0,Label,Total
0,1,tension rise north dakota pipeline trump set ...
1,0,billboard bay area confirms time impeach trum...
2,0,texas lawmaker file crucial bill could save c...
3,0,canada obama watch new prime minister call pr...
4,0,former nypd vip security detail say hillary d...


In [13]:
x_mat = transform_vectorize(df['Total'])

In [14]:
x_mat.shape

(10000, 91071)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train,x_test,y_train,y_test = train_test_split(x_mat,df['Label'].values,test_size = 0.3,random_state = 24)

In [17]:
x_train.shape, y_train.shape

((7000, 91071), (7000,))

# Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegressionCV

In [28]:
logistic_model = LogisticRegressionCV(verbose=10,cv=2)

In [29]:
logistic_model.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.7min finished


LogisticRegressionCV(cv=2, verbose=10)

In [41]:
trainpredict_logistic = logistic_model.predict(x_train)

In [42]:
trainscore_log_model = accuracy_score(y_train,trainpredict_logistic)

In [43]:
testpredict_logistic = logistic_model.predict(x_test)

In [44]:
testscore_log_model = accuracy_score(y_test,testpredict_logistic)

In [45]:
log_scores = pd.DataFrame({'Training score':trainscore_log_model,'Testing score':testscore_log_model},index=[0])
log_scores

Unnamed: 0,Training score,Testing score
0,1.0,0.969333


In [46]:
logistic_f1score = f1_score(y_test,testpredict_logistic)
logistic_f1score

0.9695364238410595

   # Decision Tree

In [30]:
from sklearn.tree import DecisionTreeClassifier

In [31]:
tree_model = DecisionTreeClassifier(random_state = 24)

In [32]:
from sklearn.model_selection import GridSearchCV

In [49]:
param_dict = {'max_depth':[70,75,80,100]}

In [50]:
search_cv = GridSearchCV(tree_model,param_dict,verbose=10,cv=2)

In [51]:
search_cv.fit(x_train,y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] max_depth=70 ....................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........................ max_depth=70, score=0.883, total=  58.8s
[CV] max_depth=70 ....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   59.6s remaining:    0.0s


[CV] ........................ max_depth=70, score=0.881, total= 1.1min
[CV] max_depth=75 ....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.1min remaining:    0.0s


[CV] ........................ max_depth=75, score=0.883, total= 1.0min
[CV] max_depth=75 ....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.1min remaining:    0.0s


[CV] ........................ max_depth=75, score=0.881, total= 1.1min
[CV] max_depth=80 ....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.2min remaining:    0.0s


[CV] ........................ max_depth=80, score=0.883, total= 1.1min
[CV] max_depth=80 ....................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.3min remaining:    0.0s


[CV] ........................ max_depth=80, score=0.881, total= 1.1min


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  6.4min remaining:    0.0s


[CV] max_depth=100 ...................................................
[CV] ....................... max_depth=100, score=0.883, total= 1.0min
[CV] max_depth=100 ...................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  7.4min remaining:    0.0s


[CV] ....................... max_depth=100, score=0.881, total= 1.1min


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  8.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  8.5min finished


GridSearchCV(cv=2, estimator=DecisionTreeClassifier(random_state=24),
             param_grid={'max_depth': [70, 75, 80, 100]}, verbose=10)

In [52]:
search_cv.best_params_

{'max_depth': 70}

In [33]:
tree_model.max_depth=70

In [34]:
tree_model.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=70, random_state=24)

In [47]:
trainpredict_tree  = tree_model.predict(x_train)

In [48]:
testpredict_tree = tree_model.predict(x_test)
testpredict_tree[0:5]

array([1, 0, 1, 0, 1], dtype=int64)

In [49]:
trainscore_tree = accuracy_score(y_train,trainpredict_tree)
testscore_tree = accuracy_score(y_test,testpredict_tree)

In [50]:
tree_scores = pd.DataFrame({'Training score':trainscore_tree,'Testing score':testscore_tree},index=[0])
tree_scores

Unnamed: 0,Training score,Testing score
0,1.0,0.907333


In [51]:
tree_f1score = f1_score(y_test,testpredict_tree)
tree_f1score

0.9082508250825082

In [26]:
#

In [27]:
# We will try Random Forest model to overcome overfitting

# Random Forest Model

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
forest_clf = RandomForestClassifier(random_state= 24,verbose=5,max_depth=70)

In [20]:
params = {'n_estimators':[250,275,300]}

In [23]:
search_cv = GridSearchCV(forest_clf,params,verbose=10,cv=2)

In [24]:
search_cv.fit(x_train,y_train)

Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] n_estimators=250 ................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 250


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


building tree 2 of 250


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


building tree 3 of 250


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s remaining:    0.0s


building tree 4 of 250


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.8s remaining:    0.0s


building tree 5 of 250
building tree 6 of 250
building tree 7 of 250
building tree 8 of 250
building tree 9 of 250
building tree 10 of 250
building tree 11 of 250
building tree 12 of 250
building tree 13 of 250
building tree 14 of 250
building tree 15 of 250
building tree 16 of 250
building tree 17 of 250
building tree 18 of 250
building tree 19 of 250
building tree 20 of 250
building tree 21 of 250
building tree 22 of 250
building tree 23 of 250
building tree 24 of 250
building tree 25 of 250
building tree 26 of 250
building tree 27 of 250
building tree 28 of 250
building tree 29 of 250
building tree 30 of 250
building tree 31 of 250
building tree 32 of 250
building tree 33 of 250
building tree 34 of 250
building tree 35 of 250
building tree 36 of 250
building tree 37 of 250
building tree 38 of 250
building tree 39 of 250
building tree 40 of 250
building tree 41 of 250
building tree 42 of 250
building tree 43 of 250
building tree 44 of 250
building tree 45 of 250
building tree 46 of 2

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    0.5s finished


[CV] .................... n_estimators=250, score=0.925, total= 2.0min
[CV] n_estimators=250 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 250


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


building tree 2 of 250


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


building tree 3 of 250


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.1s remaining:    0.0s


building tree 4 of 250


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.5s remaining:    0.0s


building tree 5 of 250
building tree 6 of 250
building tree 7 of 250
building tree 8 of 250
building tree 9 of 250
building tree 10 of 250
building tree 11 of 250
building tree 12 of 250
building tree 13 of 250
building tree 14 of 250
building tree 15 of 250
building tree 16 of 250
building tree 17 of 250
building tree 18 of 250
building tree 19 of 250
building tree 20 of 250
building tree 21 of 250
building tree 22 of 250
building tree 23 of 250
building tree 24 of 250
building tree 25 of 250
building tree 26 of 250
building tree 27 of 250
building tree 28 of 250
building tree 29 of 250
building tree 30 of 250
building tree 31 of 250
building tree 32 of 250
building tree 33 of 250
building tree 34 of 250
building tree 35 of 250
building tree 36 of 250
building tree 37 of 250
building tree 38 of 250
building tree 39 of 250
building tree 40 of 250
building tree 41 of 250
building tree 42 of 250
building tree 43 of 250
building tree 44 of 250
building tree 45 of 250
building tree 46 of 2

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    0.4s finished


[CV] .................... n_estimators=250, score=0.939, total= 1.7min
[CV] n_estimators=275 ................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 275


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


building tree 2 of 275


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


building tree 3 of 275


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s


building tree 4 of 275


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.6s remaining:    0.0s


building tree 5 of 275
building tree 6 of 275
building tree 7 of 275
building tree 8 of 275
building tree 9 of 275
building tree 10 of 275
building tree 11 of 275
building tree 12 of 275
building tree 13 of 275
building tree 14 of 275
building tree 15 of 275
building tree 16 of 275
building tree 17 of 275
building tree 18 of 275
building tree 19 of 275
building tree 20 of 275
building tree 21 of 275
building tree 22 of 275
building tree 23 of 275
building tree 24 of 275
building tree 25 of 275
building tree 26 of 275
building tree 27 of 275
building tree 28 of 275
building tree 29 of 275
building tree 30 of 275
building tree 31 of 275
building tree 32 of 275
building tree 33 of 275
building tree 34 of 275
building tree 35 of 275
building tree 36 of 275
building tree 37 of 275
building tree 38 of 275
building tree 39 of 275
building tree 40 of 275
building tree 41 of 275
building tree 42 of 275
building tree 43 of 275
building tree 44 of 275
building tree 45 of 275
building tree 46 of 2

[Parallel(n_jobs=1)]: Done 275 out of 275 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 275 out of 275 | elapsed:    0.5s finished


[CV] .................... n_estimators=275, score=0.925, total= 2.0min
[CV] n_estimators=275 ................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 275


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


building tree 2 of 275


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


building tree 3 of 275


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s


building tree 4 of 275


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.6s remaining:    0.0s


building tree 5 of 275
building tree 6 of 275
building tree 7 of 275
building tree 8 of 275
building tree 9 of 275
building tree 10 of 275
building tree 11 of 275
building tree 12 of 275
building tree 13 of 275
building tree 14 of 275
building tree 15 of 275
building tree 16 of 275
building tree 17 of 275
building tree 18 of 275
building tree 19 of 275
building tree 20 of 275
building tree 21 of 275
building tree 22 of 275
building tree 23 of 275
building tree 24 of 275
building tree 25 of 275
building tree 26 of 275
building tree 27 of 275
building tree 28 of 275
building tree 29 of 275
building tree 30 of 275
building tree 31 of 275
building tree 32 of 275
building tree 33 of 275
building tree 34 of 275
building tree 35 of 275
building tree 36 of 275
building tree 37 of 275
building tree 38 of 275
building tree 39 of 275
building tree 40 of 275
building tree 41 of 275
building tree 42 of 275
building tree 43 of 275
building tree 44 of 275
building tree 45 of 275
building tree 46 of 2

[Parallel(n_jobs=1)]: Done 275 out of 275 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 275 out of 275 | elapsed:    0.5s finished


[CV] .................... n_estimators=275, score=0.937, total= 1.9min
[CV] n_estimators=300 ................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  7.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 300


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


building tree 2 of 300


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


building tree 3 of 300


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s


building tree 4 of 300


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.6s remaining:    0.0s


building tree 5 of 300
building tree 6 of 300
building tree 7 of 300
building tree 8 of 300
building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300
building tree 14 of 300
building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300
building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300
building tree 30 of 300
building tree 31 of 300
building tree 32 of 300
building tree 33 of 300
building tree 34 of 300
building tree 35 of 300
building tree 36 of 300
building tree 37 of 300
building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
building tree 43 of 300
building tree 44 of 300
building tree 45 of 300
building tree 46 of 3

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.5s finished


[CV] .................... n_estimators=300, score=0.924, total= 2.1min
[CV] n_estimators=300 ................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  9.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 300


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


building tree 2 of 300


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


building tree 3 of 300


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.1s remaining:    0.0s


building tree 4 of 300


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.5s remaining:    0.0s


building tree 5 of 300
building tree 6 of 300
building tree 7 of 300
building tree 8 of 300
building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300
building tree 14 of 300
building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300
building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300
building tree 30 of 300
building tree 31 of 300
building tree 32 of 300
building tree 33 of 300
building tree 34 of 300
building tree 35 of 300
building tree 36 of 300
building tree 37 of 300
building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
building tree 43 of 300
building tree 44 of 300
building tree 45 of 300
building tree 46 of 3

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.5s finished


[CV] .................... n_estimators=300, score=0.938, total= 2.1min


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 11.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 11.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 250


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


building tree 2 of 250


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.9s remaining:    0.0s


building tree 3 of 250


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.8s remaining:    0.0s


building tree 4 of 250


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.7s remaining:    0.0s


building tree 5 of 250
building tree 6 of 250
building tree 7 of 250
building tree 8 of 250
building tree 9 of 250
building tree 10 of 250
building tree 11 of 250
building tree 12 of 250
building tree 13 of 250
building tree 14 of 250
building tree 15 of 250
building tree 16 of 250
building tree 17 of 250
building tree 18 of 250
building tree 19 of 250
building tree 20 of 250
building tree 21 of 250
building tree 22 of 250
building tree 23 of 250
building tree 24 of 250
building tree 25 of 250
building tree 26 of 250
building tree 27 of 250
building tree 28 of 250
building tree 29 of 250
building tree 30 of 250
building tree 31 of 250
building tree 32 of 250
building tree 33 of 250
building tree 34 of 250
building tree 35 of 250
building tree 36 of 250
building tree 37 of 250
building tree 38 of 250
building tree 39 of 250
building tree 40 of 250
building tree 41 of 250
building tree 42 of 250
building tree 43 of 250
building tree 44 of 250
building tree 45 of 250
building tree 46 of 2

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  3.8min finished


GridSearchCV(cv=2,
             estimator=RandomForestClassifier(max_depth=70, random_state=24,
                                              verbose=5),
             param_grid={'n_estimators': [250, 275, 300]}, verbose=10)

In [25]:
search_cv.best_params_

{'n_estimators': 250}

In [37]:
forest_clf.n_estimators = 250

In [38]:
forest_clf.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 250


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


building tree 2 of 250


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s remaining:    0.0s


building tree 3 of 250


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.0s remaining:    0.0s


building tree 4 of 250


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.9s remaining:    0.0s


building tree 5 of 250
building tree 6 of 250
building tree 7 of 250
building tree 8 of 250
building tree 9 of 250
building tree 10 of 250
building tree 11 of 250
building tree 12 of 250
building tree 13 of 250
building tree 14 of 250
building tree 15 of 250
building tree 16 of 250
building tree 17 of 250
building tree 18 of 250
building tree 19 of 250
building tree 20 of 250
building tree 21 of 250
building tree 22 of 250
building tree 23 of 250
building tree 24 of 250
building tree 25 of 250
building tree 26 of 250
building tree 27 of 250
building tree 28 of 250
building tree 29 of 250
building tree 30 of 250
building tree 31 of 250
building tree 32 of 250
building tree 33 of 250
building tree 34 of 250
building tree 35 of 250
building tree 36 of 250
building tree 37 of 250
building tree 38 of 250
building tree 39 of 250
building tree 40 of 250
building tree 41 of 250
building tree 42 of 250
building tree 43 of 250
building tree 44 of 250
building tree 45 of 250
building tree 46 of 2

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  3.7min finished


RandomForestClassifier(max_depth=70, n_estimators=250, random_state=24,
                       verbose=5)

In [52]:
trainpredict_forest = forest_clf.predict(x_train)
testpredict_forest = forest_clf.predict(x_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    0.4s finished


In [53]:
testpredict_forest[0:10]

array([1, 0, 1, 1, 1, 1, 0, 0, 1, 1], dtype=int64)

In [54]:
trainscore_forest = accuracy_score(y_train,trainpredict_forest)
testscore_forest = accuracy_score(y_test,testpredict_forest)

In [55]:
forest_scores = pd.DataFrame({'Training score':trainscore_forest,'Testing score':testscore_forest},index=[0])
forest_scores

Unnamed: 0,Training score,Testing score
0,0.998857,0.943667


In [56]:
forest_f1score = f1_score(y_test,testpredict_forest)
forest_f1score

0.9431167956916864

# Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB

In [19]:
nb_clf = MultinomialNB()

In [20]:
nb_clf.fit(x_train,y_train)

MultinomialNB()

In [21]:
trainpredict_nb = nb_clf.predict(x_train)
testpredict_nb = nb_clf.predict(x_test)

In [22]:
trainscore_nb = accuracy_score(y_train,trainpredict_nb)
testscore_nb = accuracy_score(y_test,testpredict_nb)

In [23]:
nb_scores = pd.DataFrame({'Training score':trainscore_nb,'Testing score':testscore_nb},index=[0])
nb_scores

Unnamed: 0,Training score,Testing score
0,0.918714,0.882


In [24]:
nb_f1score = f1_score(y_test,testpredict_nb)
nb_f1score

0.8704245973645681

# Comparing all the models

In [57]:
compare_models = pd.DataFrame({'Model':['Logistic Regression','Decision Tree','Random Forest','Naive Bayes'],\
                               'Accuracy_score':[testscore_log_model,testscore_tree,testscore_forest,testscore_nb],\
                               'F1_Score':[logistic_f1score,tree_f1score,forest_f1score,nb_f1score]})

In [58]:
compare_models

Unnamed: 0,Model,Accuracy_score,F1_Score
0,Logistic Regression,0.969333,0.969536
1,Decision Tree,0.907333,0.908251
2,Random Forest,0.943667,0.943117
3,Naive Bayes,0.882,0.870425


In [1]:
#

In [61]:
pickle.dump(logistic_model,open('pickled_models/logistic_clf.pkl','wb'))
pickle.dump(tree_model,open('pickled_models/tree_clf.pkl','wb'))
pickle.dump(forest_clf,open('pickled_models/forest_clf.pkl','wb'))

In [62]:
pickle.dump(count_vectorizer,open('pickled_models/count_vectorizer.pkl','wb'))
pickle.dump(tfid_transformer,open('pickled_models/tfid_transformer.pkl','wb'))

In [25]:
pickle.dump(nb_clf,open('pickled_models/nb_clf.pkl','wb'))