In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter
pd.pandas.set_option('display.max_columns',None)

In [4]:
train_procreesed_data = pd.read_csv("../data/train_processed_data.csv")
print(train_procreesed_data.shape)

(162980, 2)


In [5]:
train_procreesed_data = train_procreesed_data.dropna()

In [6]:
train_procreesed_data.head()

Unnamed: 0,clean_text,category
0,modi promise minimum government maximum govern...,-1.0
1,talk nonsense continue drama vote modi,0.0
2,say vote modi welcome bjp tell rahul main camp...,1.0
3,ask supporter prefix chowkidar name modi great...,1.0
4,answer among powerful world leader today trump...,1.0


In [7]:
cat1 = train_procreesed_data[train_procreesed_data["category"] == -1].iloc[0:3550]
cat2 = train_procreesed_data[train_procreesed_data["category"] == 0].iloc[0:5515]
cat3 = train_procreesed_data[train_procreesed_data["category"] == 1].iloc[0:7224]

In [8]:
train_data = cat1.append([cat2, cat3],ignore_index = True)

In [9]:
train_data.shape

(16289, 2)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
tfidfvector=TfidfVectorizer(ngram_range=(2,2))                                     
traindataset = tfidfvector.fit_transform(train_data['clean_text'])   

In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV,  KFold
from sklearn.metrics import confusion_matrix                              
from sklearn.metrics import accuracy_score                                
from sklearn.metrics import roc_curve, auc                                

In [13]:
traindataset                                                              

<16289x143526 sparse matrix of type '<class 'numpy.float64'>'
	with 224128 stored elements in Compressed Sparse Row format>

In [14]:
x = traindataset
y = train_data['category'].values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size =0.2, random_state=303)

In [15]:
from sklearn import metrics
def model_compare(model_name,model,y_test,y_pred,y_score, roc_auc = None):
    binclass_metrics = {
                        'Accuracy' : metrics.accuracy_score(y_test, y_pred),
                        'Precision' : metrics.precision_score(y_test, y_pred),
                        'Recall' : metrics.recall_score(y_test, y_pred),
                        'F1 Score' : metrics.f1_score(y_test, y_pred),
                        'ROC AUC' : roc_auc
                       }

    df_metrics = pd.DataFrame.from_dict(binclass_metrics, orient='index')
    df_metrics.columns = [model_name] 
    return df_metrics

## DecisionTree

In [52]:
from sklearn.tree import DecisionTreeClassifier

In [53]:
model_dt = DecisionTreeClassifier(random_state=303)
np.random.seed(303)
# start = time.time()
param_dist = {'max_depth': [2, 3, 4, 5, 6, 7],
              'min_samples_leaf':[100,200,70, 30],
              'criterion': ['gini', 'entropy']}

cv_rf = GridSearchCV(model_dt, cv = 10,
                     param_grid=param_dist, 
                     n_jobs = 1)

cv_rf.fit(x_train, y_train)
print('Best Parameters using grid search: \n', cv_rf.best_params_)

Best Parameters using grid search: 
 {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 70}


In [54]:
model_dt = DecisionTreeClassifier(random_state=303,
                                  max_depth=7, 
                                  criterion = 'gini', 
                                  min_samples_leaf=70 )
model_dt.fit(x_train, y_train)
model_dt_score_train = model_dt.score(x_train, y_train)
print("Training score: ",model_dt_score_train)
model_dt_score_test = model_dt.score(x_test, y_test)
print("Testing score: ",model_dt_score_test)            

Training score:  0.4682679763640549
Testing score:  0.46838551258440764


In [55]:
y_pred = model_dt.predict(x_test)
# Model Accuracy
print (model_dt.score(x_test, y_test))

0.46838551258440764


In [24]:
y_pred

array([1., 1., 1., ..., 1., 1., 1.])

In [57]:
y_pred = model_dt.predict(x_test)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr_dt, tpr_dt)
y_score = accuracy_score(y_test, y_pred)
metrix_dt = model_compare("Decision Tree",model_dt, y_test, y_pred, y_score, roc_auc  )
metrix_dt

ValueError: multiclass format is not supported

## Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
model_rf = RandomForestClassifier(random_state=303)

In [60]:
param_dist = {'max_depth': [3,4,5,7,8],
              'bootstrap': [True, False],
              'max_features': ['auto', 'sqrt', 'log2', None],
              'criterion': ['gini', 'entropy']}

cv_rf = GridSearchCV(model_rf, cv = 10,
                     param_grid=param_dist, 
                     n_jobs = 1)

cv_rf.fit(x_train, y_train)
print('Best Parameters using grid search: \n', cv_rf.best_params_)

Best Parameters using grid search: 
 {'bootstrap': True, 'criterion': 'gini', 'max_depth': 8, 'max_features': None}


In [18]:
model_rf.set_params(criterion    = 'gini',
                    max_features = None, 
                    max_depth    = 8,
                    bootstrap    = True
                   )

RandomForestClassifier(max_depth=8, max_features=None, random_state=303)

In [19]:
model_rf.set_params(n_estimators=1500,
                  bootstrap = True,
                  warm_start=False, 
                  oob_score=False)

RandomForestClassifier(max_depth=8, max_features=None, n_estimators=1500,
                       random_state=303)

In [20]:
model_rf.fit(x_train, y_train)

RandomForestClassifier(max_depth=8, max_features=None, n_estimators=1500,
                       random_state=303)

In [21]:
prediction_test = model_rf.predict(x_test)
y_score= metrics.accuracy_score(y_test, prediction_test)

In [22]:
y_score

0.48158379373848986

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [25]:
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, prediction_test)))

print('Micro Precision: {:.2f}'.format(precision_score(y_test, prediction_test, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_test, prediction_test, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, prediction_test, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_test, prediction_test, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_test, prediction_test, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, prediction_test, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_test, prediction_test, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_test, prediction_test, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, prediction_test, average='weighted')))


Accuracy: 0.48

Micro Precision: 0.48
Micro Recall: 0.48
Micro F1-score: 0.48

Macro Precision: 0.60
Macro Recall: 0.38
Macro F1-score: 0.32

Weighted Precision: 0.58
Weighted Recall: 0.48
Weighted F1-score: 0.37


In [26]:
from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_test, prediction_test, target_names=['Negatice  -1', ' Neutral 0', 'Positive 1']))


Classification Report

              precision    recall  f1-score   support

Negatice  -1       0.70      0.08      0.15       703
   Neutral 0       0.64      0.11      0.19      1099
  Positive 1       0.47      0.96      0.63      1456

    accuracy                           0.48      3258
   macro avg       0.60      0.38      0.32      3258
weighted avg       0.58      0.48      0.37      3258



## NavyBayes Classifier

In [27]:
from sklearn.naive_bayes import MultinomialNB

In [28]:
model_nb = MultinomialNB()

In [29]:
model_nb.fit(x_train, y_train)

MultinomialNB()

In [33]:
prediction_test = model_nb.predict(x_test)
y_score= metrics.accuracy_score(y_test, prediction_test)

In [34]:
y_score

0.5276243093922652

In [35]:
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, prediction_test)))

print('Micro Precision: {:.2f}'.format(precision_score(y_test, prediction_test, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_test, prediction_test, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, prediction_test, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_test, prediction_test, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_test, prediction_test, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, prediction_test, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_test, prediction_test, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_test, prediction_test, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, prediction_test, average='weighted')))
print('\nClassification Report\n')
print(classification_report(y_test, prediction_test, target_names=['Negatice  -1', ' Neutral 0', 'Positive 1']))


Accuracy: 0.53

Micro Precision: 0.53
Micro Recall: 0.53
Micro F1-score: 0.53

Macro Precision: 0.68
Macro Recall: 0.43
Macro F1-score: 0.40

Weighted Precision: 0.63
Weighted Recall: 0.53
Weighted F1-score: 0.46

Classification Report

              precision    recall  f1-score   support

Negatice  -1       0.94      0.07      0.12       703
   Neutral 0       0.58      0.35      0.44      1099
  Positive 1       0.51      0.88      0.64      1456

    accuracy                           0.53      3258
   macro avg       0.68      0.43      0.40      3258
weighted avg       0.63      0.53      0.46      3258



In [29]:
train_data.isnull().sum()

clean_text              0
category                0
clean_processed_text    0
dtype: int64

## Training completely on the Data

In [37]:
tfidfvector=TfidfVectorizer(ngram_range=(2,2))                                     
traindataset_completedata = tfidfvector.fit_transform(train_procreesed_data['clean_text']) 

In [38]:
x = traindataset_completedata
y = train_procreesed_data['category'].values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size =0.2, random_state=303)

In [39]:
new_model_nb = MultinomialNB()

In [40]:
new_model_nb.fit(x_train, y_train)

MultinomialNB()

In [41]:
prediction_test = new_model_nb.predict(x_test)
y_score= metrics.accuracy_score(y_test, prediction_test)

In [42]:
y_score

0.5606641907860409

In [46]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_test, prediction_test)

In [47]:
print('Confusion Matrix\n')
print(confusion)
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, prediction_test)))

print('Micro Precision: {:.2f}'.format(precision_score(y_test, prediction_test, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_test, prediction_test, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, prediction_test, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_test, prediction_test, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_test, prediction_test, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, prediction_test, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_test, prediction_test, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_test, prediction_test, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, prediction_test, average='weighted')))
print('\nClassification Report\n')
print(classification_report(y_test, prediction_test))
# , target_names=['Negatice  -1', ' Neutral 0', 'Positive 1']

Confusion Matrix

[[  660  1014  5460]
 [   17  4296  6665]
 [   36  1122 13311]]

Accuracy: 0.56

Micro Precision: 0.56
Micro Recall: 0.56
Micro F1-score: 0.56

Macro Precision: 0.71
Macro Recall: 0.47
Macro F1-score: 0.44

Weighted Precision: 0.66
Weighted Recall: 0.56
Weighted F1-score: 0.50

Classification Report

              precision    recall  f1-score   support

        -1.0       0.93      0.09      0.17      7134
         0.0       0.67      0.39      0.49     10978
         1.0       0.52      0.92      0.67     14469

    accuracy                           0.56     32581
   macro avg       0.71      0.47      0.44     32581
weighted avg       0.66      0.56      0.50     32581



In [54]:
# Saving the TFIDF In a file
import pickle

In [55]:
#Save vectorizer.vocabulary_
pickle.dump(tfidfvector.vocabulary_,open("../data/tfidfvector.pkl","wb"))

In [56]:
filename_path = '../data/finalized_model.sav'
pickle.dump(new_model_nb, open(filename_path, 'wb'))

In [57]:
tfidfvector.vocabulary_

{'modi promise': 556350,
 'promise minimum': 686245,
 'minimum government': 539475,
 'government maximum': 353531,
 'maximum governance': 525842,
 'governance expect': 352389,
 'expect begin': 289766,
 'begin difficult': 96199,
 'difficult job': 239702,
 'job reform': 444757,
 'reform state': 719408,
 'state take': 819103,
 'take year': 845952,
 'year get': 955826,
 'get justice': 339920,
 'justice state': 449911,
 'state business': 818316,
 'business exit': 130440,
 'exit psus': 289562,
 'psus temple': 692563,
 'talk nonsense': 847145,
 'nonsense continue': 601066,
 'continue drama': 189001,
 'drama vote': 252345,
 'vote modi': 919435,
 'say vote': 758627,
 'modi welcome': 559614,
 'welcome bjp': 932313,
 'bjp tell': 113392,
 'tell rahul': 854125,
 'rahul main': 703340,
 'main campaigner': 510322,
 'campaigner modi': 135778,
 'modi think': 558664,
 'think modi': 865515,
 'modi relax': 556790,
 'ask supporter': 70042,
 'supporter prefix': 837048,
 'prefix chowkidar': 676348,
 'chowkida