In [40]:
import multiprocessing as mp
import os
import pickle
import re
import string

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.stem import PorterStemmer
from nltk.translate.bleu_score import corpus_bleu
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from transformers import BertModel, BertTokenizer

from Shallow_ML_Models.DataPreprocessor import TextPreprocessor
from Shallow_ML_Models.DataPreprocessor import TextPreprocessor_withStem

# Raise the column display width so long sentence strings are not truncated in DataFrame output.
pd.set_option('display.max_colwidth', 999)

# spaCy 'en_core_web_sm' pipeline, loaded once when the notebook starts.
nlp = spacy.load('en_core_web_sm')

## Load Train and Test data

In [41]:
# Locations of the financial phrase bank train/test CSV files, relative to the working directory.
train_data_path, test_data_path = (
    os.path.join("data", csv_name)
    for csv_name in ("train_data.csv", "test_data.csv")
)

In [42]:
# Read the train and test CSV files into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, test_data_path)
)

In [43]:
# Sanity check: (rows, columns) of the training DataFrame.
train_data.shape

(5183, 3)

In [44]:
# Sanity check: (rows, columns) of the test DataFrame.
test_data.shape

(1296, 3)

In [45]:
# Model input is the 'sentence' column; the target is the 'sentiment_label' column.
X_train = train_data['sentence']
y_train = train_data['sentiment_label']

X_test = test_data['sentence']
y_test = test_data['sentiment_label']

## Creating model pipelines (with and without stemming)


#### 1. LR Model without Stemming

In [None]:
## LR pipeline built on TextPreprocessor:
## prep (text preprocessing) -> vec (n-gram counts) -> sel (univariate selection) -> clf (logistic regression)

preprocessor = TextPreprocessor()

# Tokens are maximal runs of non-whitespace characters; English stop words are dropped.
# ngram_range and binary are starting values only - both are re-tuned by the grid below.
vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern=r'\S+',
    stop_words='english',
    ngram_range=(1, 3),
    binary=True,
)

# Starting scoring function and k for feature selection; both are re-tuned by the grid below.
func = f_classif
selector = SelectKBest(score_func=func, k=1000)

lr_classfier = LogisticRegression(max_iter=400, random_state=42)

pipe_lr = Pipeline(steps=[
    ('prep', preprocessor),
    ('vec', vectorizer),
    ('sel', selector),
    ('clf', lr_classfier),
])

# Hyperparameter search space; keys are '<step name>__<parameter>'.
grid_params_lr = [{
    # classifier
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 0.5, 0.1],
    'clf__solver': ['liblinear'],

    # feature selection
    'sel__k': [1000, 5000, 10000, 20000, 40000],
    'sel__score_func': [f_classif, chi2],

    # vectorizer
    'vec__ngram_range': [(1, 3), (2, 3), (1, 2)],
    'vec__binary': [True, False],

    # preprocessor: the Stemming flag is searched as well
    'prep__Stemming': [True, False],
}]

## Exhaustive search: 5-fold CV, macro-averaged F1, all CPU cores
LR_model = GridSearchCV(
    pipe_lr,
    grid_params_lr,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)



In [None]:
%%time
### Fit the LR grid search on the training split: every combination in grid_params_lr
### is scored with 5-fold cross-validation on macro F1 (see the GridSearchCV setup cell).
LR_model.fit(X_train, y_train)

In [27]:
# Parameter combination with the highest mean cross-validated score
print('Best params are : %s' % LR_model.best_params_)
# best_score_ is the mean cross-validated f1_macro of that combination
# (not accuracy, and not a score computed on the full training set)
print('Best training f1_macro: %.3f' % LR_model.best_score_)

Best params are : {'clf__C': 1.0, 'clf__penalty': 'l1', 'clf__solver': 'liblinear', 'prep__Stemming': False, 'sel__k': 20000, 'sel__score_func': <function chi2 at 0x00000271F7A023A0>, 'vec__binary': False, 'vec__ngram_range': (1, 3)}
Best training f1_macro: 0.617


In [28]:
# Pipeline carrying the best parameter combination found by the LR grid search
best_LR_model = LR_model.best_estimator_

# Score it on the test sentences and print the per-class report
y_pred_lr = best_LR_model.predict(X_test)
print(
    'Test Data classification report',
    classification_report(y_test, y_pred_lr),
    sep='\n',
)

Test Data classification report
              precision    recall  f1-score   support

    negative       0.65      0.44      0.53       205
     neutral       0.72      0.83      0.77       713
    positive       0.67      0.60      0.63       378

    accuracy                           0.70      1296
   macro avg       0.68      0.62      0.64      1296
weighted avg       0.70      0.70      0.69      1296



#### 2. LR Model with Stemming

In [None]:
# Same LR pipeline, but with TextPreprocessor_withStem as the first step.
# The vectorizer, selector and lr_classfier objects come from the earlier LR cell.
Stempreprocessor = TextPreprocessor_withStem()

pipe_lr_withStem = Pipeline(steps=[
    ('prep', Stempreprocessor),
    ('vec', vectorizer),
    ('sel', selector),
    ('clf', lr_classfier),
])

# Search space for the stemming variant. This rebinds grid_params_lr; unlike the
# earlier grid there is no 'prep__Stemming' entry.
# NOTE(review): the recorded fit output below contains nan scores for some
# combinations - presumably sel__k exceeding the number of extracted n-grams; confirm.
grid_params_lr = [{
    # classifier
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 0.5, 0.1],
    'clf__solver': ['liblinear'],

    # feature selection
    'sel__k': [1000, 5000, 10000, 20000, 40000],
    'sel__score_func': [f_classif, chi2],

    # vectorizer
    'vec__ngram_range': [(1, 3), (2, 3), (1, 2)],
    'vec__binary': [True, False],
}]

## Exhaustive search: 5-fold CV, macro-averaged F1, all CPU cores
LR_model_withStem = GridSearchCV(
    pipe_lr_withStem,
    grid_params_lr,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)

In [26]:
# NOTE(review): looks like a leftover kernel/debug check - it only prints 1 and
# nothing else in the notebook depends on it; candidate for deletion.
print(1)

1


In [25]:
%%time
### Fit the LR grid search that uses the stemming preprocessor on the training split
### (5-fold cross-validation, macro F1, as configured on LR_model_withStem).
LR_model_withStem.fit(X_train, y_train)

 0.61283173 0.45237197 0.62404522 0.60923082 0.45471252 0.61776046
 0.62223877 0.45432318 0.62880825 0.62542008 0.45699271 0.62392592
 0.62158357 0.45345135 0.62785681 0.62200859 0.45943604 0.62363948
 0.62903388 0.46514206 0.62581331 0.62302621 0.46626887 0.62212786
 0.62781576 0.46540586 0.6257255  0.62248861 0.4667154  0.62298478
 0.62631029 0.46537765 0.62467571 0.62107391 0.47019355 0.62312444
 0.62718619 0.46537765 0.62467571 0.62046579 0.46784962 0.62289559
 0.62419172 0.47042182        nan 0.62290603 0.47179336        nan
 0.62419172 0.47042182        nan 0.62393738 0.47044804        nan
 0.6155381  0.45269377 0.62318409 0.61507291 0.45301247 0.6183902
 0.61432417 0.45269377 0.6229426  0.6102064  0.45299762 0.61435307
 0.59000292 0.44522654 0.59413121 0.59411269 0.44590605 0.59409392
 0.58995608 0.44522654 0.59300901 0.59405507 0.44769369 0.59740359
 0.56615999 0.453167   0.59588377 0.56667072 0.45415543 0.59327841
 0.56543291 0.45377004 0.59504875 0.56670199 0.45459664 0.59511

Wall time: 28min 22s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prep', TextPreprocessor_withStem()),
                                       ('vec',
                                        CountVectorizer(binary=True,
                                                        ngram_range=(1, 3),
                                                        stop_words='english',
                                                        token_pattern='\\S+')),
                                       ('sel', SelectKBest(k=1000)),
                                       ('clf',
                                        LogisticRegression(max_iter=400,
                                                           random_state=42))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 0.5, 0.1],
                          'clf__penalty': ['l1', 'l2'],
                          'clf__solver': ['liblinear'],
                          'sel__k': [1000, 5000, 10000, 20000, 40000],
                        

In [29]:
# Parameter combination with the highest mean cross-validated score
print('Best params are : %s' % LR_model_withStem.best_params_)
# best_score_ is the mean cross-validated f1_macro of that combination
# (not accuracy, and not a score computed on the full training set)
print('Best training f1_macro: %.3f' % LR_model_withStem.best_score_)

Best params are : {'clf__C': 1.0, 'clf__penalty': 'l1', 'clf__solver': 'liblinear', 'sel__k': 10000, 'sel__score_func': <function f_classif at 0x00000271F7A02280>, 'vec__binary': True, 'vec__ngram_range': (1, 3)}
Best training f1_macro: 0.629


In [30]:
# Pipeline carrying the best parameter combination found by the stemming LR grid search
best_LR_model_withStem = LR_model_withStem.best_estimator_

# Score it on the test sentences and print the per-class report
y_pred_lr_withStem = best_LR_model_withStem.predict(X_test)
print(
    'Test Data classification report',
    classification_report(y_test, y_pred_lr_withStem),
    sep='\n',
)

Test Data classification report
              precision    recall  f1-score   support

    negative       0.62      0.47      0.53       205
     neutral       0.72      0.83      0.78       713
    positive       0.68      0.57      0.62       378

    accuracy                           0.70      1296
   macro avg       0.68      0.63      0.64      1296
weighted avg       0.69      0.70      0.69      1296



In [None]:
# Persist a single "best LR" artifact as LR_model.pkl.
# This cell used to dump `best_lr_pipe`, a name that no cell in this notebook assigns
# (NameError on Restart & Run All). It is now bound explicitly to the tuned LR pipeline
# whose grid search reached the higher mean cross-validated f1_macro.
# NOTE(review): choosing by CV score is an interpretation of the original intent - confirm.
best_lr_search = max(
    (LR_model, LR_model_withStem),
    key=lambda search: search.best_score_,
)
best_lr_pipe = best_lr_search.best_estimator_

# The context manager closes the handle, so the pickle is flushed to disk deterministically.
with open('LR_model.pkl', 'wb') as model_file:
    pickle.dump(best_lr_pipe, model_file)


#### 3. SVC Model without Stemming

In [32]:
# RBF-kernel SVC; C is a starting value only and is re-tuned by the grid below.
svc_classifier = SVC(kernel='rbf', C=100, random_state=42)

# Same prep -> vec -> sel front end as the LR pipeline; the preprocessor, vectorizer
# and selector objects come from the earlier LR cell.
pipe_svm = Pipeline(steps=[
    ('prep', preprocessor),
    ('vec', vectorizer),
    ('sel', selector),
    ('clf', svc_classifier),
])

# Hyperparameter search space; keys are '<step name>__<parameter>'.
grid_params_svm = [{
    # classifier
    'clf__gamma': ['scale', 'auto'],
    'clf__C': [100, 10, 1.0, 0.1, 0.01],

    # feature selection
    'sel__k': [1000, 5000, 10000, 20000, 40000],
    'sel__score_func': [f_classif, chi2],

    # vectorizer
    'vec__ngram_range': [(1, 3), (2, 3), (1, 2)],
    'vec__binary': [True, False],
}]

# Exhaustive search: 5-fold CV, macro-averaged F1, all CPU cores
SVC_model = GridSearchCV(
    pipe_svm,
    grid_params_svm,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)

In [33]:
%%time
# Fit the SVC grid search on the training split
# (5-fold cross-validation, macro F1, as configured on SVC_model).
SVC_model.fit(X_train, y_train)


 0.57020256 0.44976561 0.56954104 0.5526227  0.4520655  0.54703894
 0.54079968 0.44690291 0.53480528 0.52696057 0.44525818 0.51692856
 0.54041575 0.44690291 0.53394963 0.52914295 0.44827025 0.51961383
 0.51535117 0.45182927 0.53298371 0.50921871 0.44506001 0.52315788
 0.51628592 0.45140654 0.53298371 0.50680402 0.44491202 0.52723015
 0.48471948 0.43644467 0.4972764  0.47656742 0.43460761 0.49656486
 0.48471948 0.43644467 0.4972764  0.47654359 0.43317267 0.49907072
 0.4492967  0.42343758        nan 0.456332   0.42461159        nan
 0.4492967  0.42343758        nan 0.45942158 0.42461159        nan
 0.56106343 0.39941061 0.5761131  0.55830911 0.39873078 0.57217867
 0.56231385 0.39941061 0.57683236 0.5589471  0.39972502 0.56752086
 0.45894802 0.34481934 0.46235908 0.45906105 0.34561725 0.45973116
 0.45796535 0.34481934 0.4621086  0.45848251 0.34643303 0.46175982
 0.40213511 0.32119917 0.40852124 0.40091579 0.32663459 0.40226836
 0.40181448 0.32119917 0.40852124 0.4009158  0.32735322 0.4045

Wall time: 28min 25s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prep', TextPreprocessor()),
                                       ('vec',
                                        CountVectorizer(binary=True,
                                                        ngram_range=(1, 3),
                                                        stop_words='english',
                                                        token_pattern='\\S+')),
                                       ('sel', SelectKBest(k=1000)),
                                       ('clf', SVC(C=100, random_state=42))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [100, 10, 1.0, 0.1, 0.01],
                          'clf__gamma': ['scale', 'auto'],
                          'sel__k': [1000, 5000, 10000, 20000, 40000],
                          'sel__score_func': [<function f_classif at 0x00000271F7A02280>,
                                              <function chi2 at 0x00000271F7A023A0>],
                     

In [34]:
# Parameter combination with the highest mean cross-validated score
print('Best params are : %s' % SVC_model.best_params_)
# best_score_ is the mean cross-validated f1_macro of that combination
# (not accuracy, and not a score computed on the full training set)
print('Best training f1_macro: %.3f' % SVC_model.best_score_)

Best params are : {'clf__C': 10, 'clf__gamma': 'scale', 'sel__k': 1000, 'sel__score_func': <function chi2 at 0x00000271F7A023A0>, 'vec__binary': True, 'vec__ngram_range': (1, 2)}
Best training f1_macro: 0.585


In [35]:
# Pipeline carrying the best parameter combination found by the SVC grid search
best_SVC_model = SVC_model.best_estimator_

# Score it on the test sentences and print the per-class report
y_pred_svc = best_SVC_model.predict(X_test)
print(
    'Test Data classification report',
    classification_report(y_test, y_pred_svc),
    sep='\n',
)

Test Data classification report
              precision    recall  f1-score   support

    negative       0.57      0.46      0.51       205
     neutral       0.71      0.80      0.75       713
    positive       0.61      0.54      0.57       378

    accuracy                           0.67      1296
   macro avg       0.63      0.60      0.61      1296
weighted avg       0.66      0.67      0.66      1296



#### 4. SVC Model with Stemming


In [36]:
# SVC pipeline with the stemming preprocessor as its first step. Stempreprocessor,
# vectorizer, selector and svc_classifier are the objects created in earlier cells.
pipe_svm_withStem = Pipeline(steps=[
    ('prep', Stempreprocessor),
    ('vec', vectorizer),
    ('sel', selector),
    ('clf', svc_classifier),
])

# Same search space (grid_params_svm) and CV settings as the non-stemming SVC search.
SVC_model_withStem = GridSearchCV(
    pipe_svm_withStem,
    grid_params_svm,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)

In [37]:
%%time
# Fit the SVC grid search that uses the stemming preprocessor on the training split
# (5-fold cross-validation, macro F1, as configured on SVC_model_withStem).
SVC_model_withStem.fit(X_train, y_train)


 0.58161955 0.46112608 0.5894212  0.57578904 0.46124472 0.56899675
 0.54036575 0.45633134 0.53414196 0.52616252 0.45591174 0.51211108
 0.54193122 0.45633134 0.53588758 0.52057288 0.45476824 0.51440864
 0.51401509 0.46638726 0.54009474 0.49904874 0.46548872 0.53266802
 0.51474099 0.46649066 0.54074887 0.50411794 0.46495227 0.53068852
 0.48714338 0.44345504 0.51608173 0.48051244 0.44158982 0.52062329
 0.48709891 0.44345504 0.51608173 0.4837496  0.4411883  0.52273013
 0.45590822 0.42684474        nan 0.46482648 0.42493848        nan
 0.45590822 0.42684474        nan 0.46771572 0.4249363         nan
 0.6010072  0.41772341 0.60544901 0.59452458 0.4194193  0.60429901
 0.59997605 0.41772341 0.60615931 0.59057787 0.41881183 0.5998215
 0.49640579 0.34696891 0.51257141 0.49418316 0.34891164 0.50575611
 0.49786553 0.34696891 0.51042678 0.49699668 0.3499904  0.50816238
 0.43054555 0.32149723 0.43660366 0.43023022 0.32652776 0.43110978
 0.42922737 0.32149723 0.43660366 0.43001115 0.32689832 0.43203

Wall time: 1h 5min 7s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prep', TextPreprocessor_withStem()),
                                       ('vec',
                                        CountVectorizer(binary=True,
                                                        ngram_range=(1, 3),
                                                        stop_words='english',
                                                        token_pattern='\\S+')),
                                       ('sel', SelectKBest(k=1000)),
                                       ('clf', SVC(C=100, random_state=42))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [100, 10, 1.0, 0.1, 0.01],
                          'clf__gamma': ['scale', 'auto'],
                          'sel__k': [1000, 5000, 10000, 20000, 40000],
                          'sel__score_func': [<function f_classif at 0x00000271F7A02280>,
                                              <function chi2 at 0x00000271F7A023A0>],
            

In [38]:
# Parameter combination with the highest mean cross-validated score
print('Best params are : %s' % SVC_model_withStem.best_params_)
# best_score_ is the mean cross-validated f1_macro of that combination
# (not accuracy, and not a score computed on the full training set)
print('Best training f1_macro: %.3f' % SVC_model_withStem.best_score_)

Best params are : {'clf__C': 100, 'clf__gamma': 'auto', 'sel__k': 1000, 'sel__score_func': <function chi2 at 0x00000271F7A023A0>, 'vec__binary': True, 'vec__ngram_range': (1, 2)}
Best training f1_macro: 0.606


In [39]:
# Pipeline carrying the best parameter combination found by the stemming SVC grid search
best_SVC_model_withStem = SVC_model_withStem.best_estimator_

# Score it on the test sentences and print the per-class report
y_pred_svc_withStem = best_SVC_model_withStem.predict(X_test)
print(
    'Test Data classification report',
    classification_report(y_test, y_pred_svc_withStem),
    sep='\n',
)

Test Data classification report
              precision    recall  f1-score   support

    negative       0.70      0.41      0.52       205
     neutral       0.71      0.88      0.78       713
    positive       0.68      0.52      0.59       378

    accuracy                           0.70      1296
   macro avg       0.69      0.60      0.63      1296
weighted avg       0.70      0.70      0.68      1296



### Final Model Results on Test Data

||Model|Test Macro F1|
|--|--|--|
||LR without Stemming|0.64|
||LR with Stemming|0.64|
||SVC without Stemming|0.61|
||SVC with Stemming|0.63|

In [46]:
### pickle and save all 4 tuned pipelines
# Every entry is a grid search's best_estimator_. The last entry used to dump the whole
# SVC_model_withStem GridSearchCV wrapper instead of best_SVC_model_withStem, which made
# that file inconsistent with the other three.
# NOTE(review): the two SVC file names carry no '.pkl' suffix; they are kept unchanged
# in case something loads them by name.
models_to_save = {
    'LR_model_withoutStem.pkl': best_LR_model,
    'LR_model_withStem.pkl': best_LR_model_withStem,
    'SVC_model_withoutStem': best_SVC_model,
    'SVC_model_withStem': best_SVC_model_withStem,
}

for model_path, fitted_pipeline in models_to_save.items():
    # The context manager closes each handle, so every pickle is flushed to disk.
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_pipeline, model_file)