In [143]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC,SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump, load
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neural_network import MLPClassifier

In [2]:
#get the preprocessed data
df = pd.read_csv('data_preprocessed.csv')

In [3]:
df.head()

Unnamed: 0,type,text,word_count
0,BILL,641356219cbc 422068f04236 6099f4533cc2 6413562...,254
1,BILL,6ca2dd348663 d38820625542 25c57acdf805 2a49f47...,281
2,BILL,019e6932b862 d7b4f3b093bf b9699ce57810 7f3b857...,252
3,BILL,03ea9b9cef7d b136f6349cf3 036087ac04f9 999d400...,434
4,BILL,586242498a88 9de56cbee28f 7c81ee28575a c5d48b6...,385


In [6]:
#Total rows of data
len(df)

50262

# Spliting Data

In [7]:
# Separate into input and output columns
X = df['text']
y = df['type']

In [8]:
#Lets split the data into train and test
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

# Encoding

I would be using sklearn pipeline for all the transformation/ encoding/ modeling steps
For encoding the text data i would be using scikit-learn CountVectorizer followed by TfidTransformer

In [16]:
#Empty Dictonary for final model comparision
final_eval = dict()

# Applying Naive Bayes:

Naive Bayes is one of the most widely used classification algorithm in text mining applications. Its a good starting point to test its performance on the provided data.

In [19]:
# Create MultinomialNB model pipeline
model_nm = 'MultinomialNB'
model = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                  ('tfidf', TfidfTransformer()),
                  ('model',MultinomialNB()), ],verbose=True)

In [20]:
#Lets fit the model
model.fit(X_train,y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total=  19.8s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.9s
[Pipeline] ............. (step 3 of 3) Processing model, total=   0.3s


Pipeline(steps=[('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()), ('model', MultinomialNB())],
         verbose=True)

In [23]:
#Lets test the model
# Make predictions

ytest = np.array(y_test)
pred_y = model.predict(X_test)

# Evaluate predictions
final_eval[model_nm] = accuracy_score(pred_y, y_test)
print('{} accuracy : {}'.format(model_nm,accuracy_score(pred_y, y_test)))
#print(classification_report(ytest, pred_y))

MultinomialNB accuracy : 0.7342034060162342


# Applying Logistic Regression

In [33]:
# Create Logistic Regression model pipeline
model_nm = 'LogisticRegression'
model = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                  ('tfidf', TfidfTransformer()),
                  ('model',LogisticRegression(max_iter=1000)), ],verbose=True)

In [34]:
#Lets fit the model
model.fit(X_train,y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total=  21.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.8s
[Pipeline] ............. (step 3 of 3) Processing model, total= 2.9min


Pipeline(steps=[('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('model', LogisticRegression(max_iter=1000))],
         verbose=True)

In [35]:
#Lets test the model
# Make predictions

ytest = np.array(y_test)
pred_y = model.predict(X_test)

# Evaluate predictions
final_eval[model_nm] = accuracy_score(pred_y, y_test)
print('{} accuracy : {}'.format(model_nm,accuracy_score(pred_y, y_test)))
#print(classification_report(ytest, pred_y))

LogisticRegression accuracy : 0.8623269138946363


# Applying LinearSVM

In [27]:
# Create LinearSVC model pipeline
model_nm = 'LinearSVC'
model = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                  ('tfidf', TfidfTransformer()),
                  ('model',LinearSVC()), ],verbose=True)

In [28]:
#Lets fit the model
model.fit(X_train,y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total=  20.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.8s
[Pipeline] ............. (step 3 of 3) Processing model, total=   6.2s


Pipeline(steps=[('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()), ('model', LinearSVC())],
         verbose=True)

In [29]:
#Lets test the model
# Make predictions

ytest = np.array(y_test)
pred_y = model.predict(X_test)

# Evaluate predictions
final_eval[model_nm] = accuracy_score(pred_y, y_test)
print('{} accuracy : {}'.format(model_nm,accuracy_score(pred_y, y_test)))
#print(classification_report(ytest, pred_y))

LinearSVC accuracy : 0.8764125417794048


# Applying MLP Classifier

In [31]:
# Create MLPClassifier model pipeline
model_nm = 'MLPClassifier'
model = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                  ('tfidf', TfidfTransformer()),
                  ('model',MLPClassifier(alpha=1)), ],verbose=True)

In [32]:
#Lets fit the model
model.fit(X_train,y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total=  20.9s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.8s
[Pipeline] ............. (step 3 of 3) Processing model, total=22.9min




Pipeline(steps=[('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('model', MLPClassifier(alpha=1))],
         verbose=True)

In [None]:
#Lets test the model
# Make predictions

ytest = np.array(y_test)
pred_y = model.predict(X_test)

# Evaluate predictions
final_eval[model_nm] = accuracy_score(pred_y, y_test)
print('{} accuracy : {}'.format(model_nm,accuracy_score(pred_y, y_test)))
#print(classification_report(ytest, pred_y))

# Applying SGD Classifier

In [140]:
# Create SGDClassifier model pipeline
model_nm = 'SGDClassifier'
model = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                  ('tfidf', TfidfTransformer()),
                  ('model',SGDClassifier(max_iter=1000,loss='modified_huber',class_weight='balanced')), ],verbose=True)

In [141]:
#Lets fit the model
model.fit(X_train,y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total=  21.9s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.8s
[Pipeline] ............. (step 3 of 3) Processing model, total=   3.1s


Pipeline(steps=[('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('model',
                 SGDClassifier(class_weight='balanced',
                               loss='modified_huber'))],
         verbose=True)

In [142]:
#Lets test the model
# Make predictions

ytest = np.array(y_test)
pred_y = model.predict(X_test)

# Evaluate predictions
final_eval[model_nm] = accuracy_score(pred_y, y_test)
print('{} accuracy : {}'.format(model_nm,accuracy_score(pred_y, y_test)))
#print(classification_report(ytest, pred_y))

SGDClassifier accuracy : 0.8725131306700621


In [61]:
final_eval

{'MultinomialNB': 0.7342034060162342,
 'LogisticRegression': 0.8623269138946363,
 'LinearSVC': 0.8764125417794048,
 'SGDClassifier': 0.8727518701257361}

# Applying RandomForestClassifier

In [63]:
# Create RandomForestClassifier model pipeline
model_nm = 'RandomForestClassifier'
# Create RandomForestClassifier model
model = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                  ('tfidf', TfidfTransformer()),
                  ('model',RandomForestClassifier(n_estimators=50)) ],verbose=True)


In [64]:
#Lets fit the model
model.fit(X_train,y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total=  22.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.8s
[Pipeline] ............. (step 3 of 3) Processing model, total= 3.5min


Pipeline(steps=[('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('model', RandomForestClassifier(n_estimators=50))],
         verbose=True)

In [65]:
#Lets test the model
# Make predictions

ytest = np.array(y_test)
pred_y = model.predict(X_test)

# Evaluate predictions
final_eval[model_nm] = accuracy_score(pred_y, y_test)
print('{} accuracy : {}'.format(model_nm,accuracy_score(pred_y, y_test)))
#print(classification_report(ytest, pred_y))

RandomForestClassifier accuracy : 0.8495941429253542


# Applying K-Neighbours

In [136]:
# Create K-Neighbours model pipeline
model_nm = 'K-Neighbours'
# Create K-Neighbours model
model = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                  ('tfidf', TfidfTransformer()),
                  ('model',KNeighborsClassifier(n_neighbors=14)) ],verbose=True)

In [137]:
#Lets fit the model
model.fit(X_train,y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total=  20.7s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.8s
[Pipeline] ............. (step 3 of 3) Processing model, total=   0.0s


Pipeline(steps=[('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('model', KNeighborsClassifier(n_neighbors=14))],
         verbose=True)

In [138]:
#Lets test the model
# Make predictions

ytest = np.array(y_test)
pred_y = model.predict(X_test)

# Evaluate predictions
final_eval[model_nm] = accuracy_score(pred_y, y_test)
print('{} accuracy : {}'.format(model_nm,accuracy_score(pred_y, y_test)))
#print(classification_report(ytest, pred_y))

K-Neighbours accuracy : 0.8193538118733089


Linear SGD Classifier seems to be performing very well among all the classifiers.
Lets see if selecting the k-best features helps it

# Applying Chi-Square Feature Selection

In [125]:
#SGDClassifier(alpha=0.0001,penalty='elasticnet',n_iter=50)
#k=73000
# Create SGDClassifier model pipeline
model_nm = 'SGDClassifierKbest'
model = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                  ('tfidf', TfidfTransformer()),
                  ('selectkbest',SelectKBest(chi2, k=30000)),
                  ('model',SGDClassifier(max_iter=1000,loss='modified_huber',class_weight='balanced')), ],verbose=True)

In [126]:
#Lets fit the model
model.fit(X_train,y_train)

[Pipeline] .............. (step 1 of 4) Processing vect, total=  19.7s
[Pipeline] ............. (step 2 of 4) Processing tfidf, total=   0.8s
[Pipeline] ....... (step 3 of 4) Processing selectkbest, total=   0.5s
[Pipeline] ............. (step 4 of 4) Processing model, total=   2.0s


Pipeline(steps=[('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('selectkbest',
                 SelectKBest(k=30000,
                             score_func=<function chi2 at 0x7fb403608c80>)),
                ('model',
                 SGDClassifier(class_weight='balanced',
                               loss='modified_huber'))],
         verbose=True)

In [127]:
#Lets test the model
# Make predictions

ytest = np.array(y_test)
pred_y = model.predict(X_test)

# Evaluate predictions
final_eval[model_nm] = accuracy_score(pred_y, y_test)
print('{} accuracy : {}'.format(model_nm,accuracy_score(pred_y, y_test)))
#print(classification_report(ytest, pred_y))

SGDClassifierKbest accuracy : 0.8656692662740729


In [139]:
final_eval

{'MultinomialNB': 0.7342034060162342,
 'LogisticRegression': 0.8623269138946363,
 'LinearSVC': 0.8764125417794048,
 'SGDClassifier': 0.8727518701257361,
 'RandomForestClassifier': 0.8495941429253542,
 'SGDClassifierKbest': 0.8656692662740729,
 'K-Neighbours': 0.8193538118733089}

# Applying GridSearch

In [148]:
model_nm = 'SGDClassifier'
model = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                  ('tfidf', TfidfTransformer()),
                  ('model',SGDClassifier(class_weight='balanced')), ],verbose=True)

In [152]:
pararm_grid = {"model__loss":["hinge","log","squared_hinge","modified_huber"]
              ,"model__alpha":[0.0001,0.001,0.01,0.1]
              ,"model__penalty":["l2","l1","none"]}

In [153]:
grid = GridSearchCV(model,param_grid=pararm_grid)

In [154]:
grid.fit(X_train,y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.0s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.0s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.2s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.2s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  15.9s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.1s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  15.9s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipel



[Pipeline] ............. (step 3 of 3) Processing model, total=  22.6s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=  22.5s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.0s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=  24.1s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=  31.0s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.7s
[Pipeline] ............. (step 3 of 3) Processing model, total=  19.8s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipel

[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   3.1s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.5s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.5s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.4s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.4s
[Pipel

[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.6s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.4s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   5.5s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   5.8s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipel

[Pipeline] ............. (step 3 of 3) Processing model, total=   4.2s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   4.4s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.7s
[Pipeline] ............. (step 3 of 3) Processing model, total=   4.3s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   4.3s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   2.6s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.5s
[Pipel

[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=  15.6s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=  16.7s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=  16.7s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=  14.9s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=  16.7s
[Pipel

[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   3.5s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.3s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   3.9s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.3s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   4.0s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.7s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing model, total=   3.8s
[Pipeline] .............. (step 1 of 3) Processing vect, total=  16.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.6s
[Pipel

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(min_df=5,
                                                        ngram_range=(1, 2))),
                                       ('tfidf', TfidfTransformer()),
                                       ('model',
                                        SGDClassifier(class_weight='balanced'))],
                                verbose=True),
             param_grid={'model__alpha': [0.0001, 0.001, 0.01, 0.1],
                         'model__loss': ['hinge', 'log', 'squared_hinge',
                                         'modified_huber'],
                         'model__penalty': ['l2', 'l1', 'none']})

In [155]:
grid.best_score_
grid.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(min_df=5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('model',
                 SGDClassifier(class_weight='balanced',
                               loss='modified_huber'))],
         verbose=True)