In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import joblib
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore", category=UserWarning) 

In [2]:
def tfidf(X_train, X_test):
    from sklearn.feature_extraction.text import TfidfVectorizer
    # from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

    vectorizer = TfidfVectorizer(max_features=2000, ngram=(1, 4))
    X_train = vectorizer.fit_transform(X_train)
    X_train = X_train.toarray()
    print(X_train.shape)

    X_test = vectorizer.transform(X_test)
    X_test = X_test.toarray()
    print(X_test.shape)
    
    return X_train, X_test, vectorizer

In [3]:
def split_data(dfff, ratio):
    Y = dfff['Sentiment'].values
    X = dfff['processed_text']
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    from sklearn.model_selection import train_test_split

    # Random sampling
    return train_test_split(X, Y, test_size=ratio) 

In [4]:
## SVM Linear & Non Linear

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, roc_curve, auc \
                , accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression





def sgd_svm_classifier(X_train, y_train, X_test, y_test, kernel=None):
    if kernel == None:
        alphaList = np.array([0.0001,])
        params_dict = [{'alpha': alphaList}]
        svc_or_sgd = SGDClassifier(loss='hinge', penalty='l2',n_jobs=-1)
        # print(svc_or_sgd)
    
    
    elif kernel == "rbf":
        cList = [0.001,]
        gammaList = [1, 10]
        params_dict = [{'C': cList, 'gamma': gammaList}]
        svc_or_sgd = SVC(kernel='rbf')
        # print(svc_or_sgd)
        
    grid = GridSearchCV(estimator=svc_or_sgd, 
                    param_grid=params_dict, 
                    scoring='f1_micro', n_jobs=3,
                    cv=3, return_train_score=True)
    grid_result = grid.fit(X_train, y_train)

    
    if kernel == None:
        # Model trainig
        model = SGDClassifier(loss='hinge', alpha=grid_result.best_estimator_.get_params()['alpha'],
                                    penalty='l2', 
                                    n_jobs=-1)
        
    elif kernel == "rbf":
        model = SVC(kernel='rbf', C=grid_result.best_estimator_.get_params()['C'],
                    gamma=grid_result.best_estimator_.get_params()['gamma'],
                    probability=True)
        # we have to fit the SGDClassifier so that we can access the coef_
        
    model = model.fit(X_train, y_train)
    calibrator = CalibratedClassifierCV(model, cv=5, 
                                        method='isotonic')
    # Lets refit the calibrator to find probabilities
    calibrator.fit(X_train, y_train)
    y_pred = calibrator.predict(X_test)

    print("Optimal Parameters : ", grid_result.best_estimator_.get_params())

    # Classification report 
    print(classification_report(y_test, y_pred))
      
    return calibrator

In [5]:
## Naive bayes

def nb_classifier(X_train, y_train, X_test, y_test):
    """
    This method is a wrapper over the actual naive bayes classifier. It will return the most optimal value 
    of Alpha based on the results obtained in cross_validation after running the algorithm on the given dataset.
    """
    alphaList = np.array([0.5,])
    params_Dict = {'alpha' : alphaList}
    nb_Optimal = MultinomialNB()
    grid = GridSearchCV(estimator=nb_Optimal, 
                        param_grid=params_Dict,
                        scoring='f1_micro', n_jobs=4, cv=5, return_train_score=True)
    grid_result = grid.fit(X_train, y_train)
    print("Optimal Parameters : ", grid_result.best_estimator_.get_params())

    model = MultinomialNB(alpha=grid_result.best_estimator_.get_params()['alpha'])
    model.fit(X_train, y_train)
    model = CalibratedClassifierCV(model, cv=5, 
                                        method='isotonic')
    # Lets refit the calibrator to find probabilities
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Classification report 
    print(classification_report(y_test, y_pred))
    # returning the optimal value of K using the MSE of cross validation scores
    return model

In [6]:
## KNN

def knn_classifier(X_train, y_train, X_test, y_test):
    '''
    This method run the kNN classification algorithm on the given dataset, using brute force approach. 
    It plots the curves of various metrics employed to gauge the classifier performance, and returns the 
    optimal value of k, based on the cross_validaton_score as defined in the problem
    '''

    # knn_classifier(X_train, y_train, X_test, y_test)
    kNeighbours = [1,]    
    algo = ['ball_tree']
    params_dict = [{'n_neighbors': kNeighbours, 'algorithm':algo}]
    knn = KNeighborsClassifier()

    grid = GridSearchCV(estimator=knn, 
                param_grid=params_dict, 
                scoring='f1_micro', n_jobs=3,
                cv=3, return_train_score=True)
    grid_result = grid.fit(X_train, y_train)
    print("Optimal Parameters : ", grid_result.best_estimator_.get_params())

    model = KNeighborsClassifier(n_neighbors=grid_result.best_estimator_.get_params()['n_neighbors'],
                                       algorithm=grid_result.best_estimator_.get_params()['algorithm'])
    model.fit(X_train, y_train)
    calibrator = CalibratedClassifierCV(model, cv=5, 
                                        method='isotonic')
    # Lets refit the calibrator to find probabilities
    calibrator.fit(X_train, y_train)
    y_pred = calibrator.predict(X_test)

    # Classification report 
    print(classification_report(y_test, y_pred))
    # returning the optimal value of K using the MSE of cross validation scores
    return calibrator

In [7]:
## Logestic Regression

def lr_classifier(X_train, y_train, X_test, y_test):
    cList = np.array([10])
    p = ['l2']
    sol = ['liblinear']                
    params_dict = [{'C': cList, 'penalty':p, 'solver':sol}]
    lr_optimal = LogisticRegression()
    
    grid = GridSearchCV(estimator=lr_optimal, 
                        param_grid=params_dict,
#                           param_distributions=params_dict, 
                        scoring='f1_micro', n_jobs=-1, cv=2,  
                        return_train_score=True
                       )
    
    grid_result = grid.fit(X_train, y_train)
    print("Optimal Parameters : ", grid_result.best_estimator_.get_params())
    
    model = LogisticRegression(C=grid_result.best_estimator_.get_params()['C'],
                                    penalty=grid_result.best_estimator_.get_params()['penalty'],
                                    solver=grid_result.best_estimator_.get_params()['solver'])
    model.fit(X_train, y_train)
                      
    model.fit(X_train, y_train)
    model = CalibratedClassifierCV(model, cv=5, 
                                        method='isotonic')
    # Lets refit the calibrator to find probabilities
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Classification report 
    print(classification_report(y_test, y_pred))
    # returning the optimal value of K using the MSE of cross validation scores
    return model


In [8]:
## Dicission Tree

from sklearn.tree import DecisionTreeClassifier
def dt_classifier(X_train, y_train, X_test, y_test):
    max_depth = np.array([500, 1000])
    min_samples_split = np.array([5,])#10, 100, 500])
    params_dict = [{'max_depth': max_depth, 
                    'min_samples_split': min_samples_split}]
    dt_optimal = DecisionTreeClassifier(random_state=1)
    
    grid = GridSearchCV(estimator=dt_optimal, 
                        param_grid=params_dict, 
                        scoring='f1_micro', n_jobs=4, cv=5,
                        return_train_score=True)
    grid_result = grid.fit(X_train, y_train)
    print("Optimal Parameters : ", grid_result.best_estimator_.get_params())
    

    model = DecisionTreeClassifier(max_depth=grid_result.best_estimator_.get_params()['max_depth'], 
                        min_samples_split=grid_result.best_estimator_.get_params()['min_samples_split'],
                                random_state=1)
    model.fit(X_train, y_train)
    model = CalibratedClassifierCV(model, cv=5, 
                                        method='isotonic')
    # Lets refit the calibrator to find probabilities
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Classification report 
    print(classification_report(y_test, y_pred))
    # returning the optimal value of K using the MSE of cross validation scores
    return model

In [9]:
from sklearn.ensemble import RandomForestClassifier

def rf_classifier(X_train, y_train, X_test, y_test):
    max_depth = np.array([150, 200,])
    min_samples_leaf = np.array([1, 2,])
    n_estimators = np.array([500, 600])
    params_dict = [{'max_depth': max_depth,
                    'min_samples_leaf': min_samples_leaf,
                    'n_estimators': n_estimators}]
    rf_optimal = RandomForestClassifier(random_state=1,n_jobs=6,
                                        class_weight='balanced')
    
    grid = GridSearchCV(estimator=rf_optimal, 
                        param_grid=params_dict, 
                        scoring='f1_micro', n_jobs=4, cv=5,
                        return_train_score=True)
    grid_result = grid.fit(X_train, y_train)
    print("Optimal Parameters : ", grid_result.best_estimator_.get_params())

    model = DecisionTreeClassifier(max_depth=grid_result.best_estimator_.get_params()['max_depth'], 
                        min_samples_split=grid_result.best_estimator_.get_params()['min_samples_split'],
                                random_state=1)
    
    model = RandomForestClassifier(max_depth=grid_result.best_estimator_.get_params()['max_depth'],
                        min_samples_leaf=grid_result.best_estimator_.get_params()['min_samples_leaf'],
                        n_estimators=grid_result.best_estimator_.get_params()['n_estimators'],  
                        n_jobs=-4,
                        random_state=1)
    model.fit(X_train, y_train)
    model = CalibratedClassifierCV(model, cv=5, 
                                        method='isotonic')
    # Lets refit the calibrator to find probabilities
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Classification report 
    print(classification_report(y_test, y_pred))
    # returning the optimal value of K using the MSE of cross validation scores
    return model

In [10]:
df = joblib.load('data/oversampled.df')
df['Sentiment'].value_counts()

2    4000
0    3753
1    3169
Name: Sentiment, dtype: int64

In [11]:
X_train, X_test, y_train, y_test = split_data(df, ratio=20)
X_train, X_test, VEC = tfidf(X_train, X_test)



joblib.dump(VEC, 'tfidf_X_train.model')

(10902, 2000)
(20, 2000)


['tfidf_X_train.model']

In [12]:
# SVM Linear
model = sgd_svm_classifier(X_train, y_train, X_test, y_test, kernel=None)
joblib.dump(model, "data/svm_linear.model")

Optimal Parameters :  {'alpha': 0.0001, 'average': False, 'class_weight': None, 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.0, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 1000, 'n_iter_no_change': 5, 'n_jobs': -1, 'penalty': 'l2', 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
              precision    recall  f1-score   support

           0       0.78      0.78      0.78         9
           1       1.00      1.00      1.00         5
           2       0.67      0.67      0.67         6

    accuracy                           0.80        20
   macro avg       0.81      0.81      0.81        20
weighted avg       0.80      0.80      0.80        20



['data/svm_linear.model']

In [12]:
# SVM Non-Linear
model = sgd_svm_classifier(X_train, y_train, X_test, y_test, kernel='rbf')
joblib.dump(model, "data/svm_nonLinear.model")

Optimal Parameters :  {'C': 0.001, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
              precision    recall  f1-score   support

           0       0.86      0.75      0.80         8
           1       0.78      0.78      0.78         9
           2       0.00      0.00      0.00         3

    accuracy                           0.65        20
   macro avg       0.54      0.51      0.53        20
weighted avg       0.69      0.65      0.67        20



['data/svm_nonLinear.model']

In [12]:
## Niave Bayes
model = nb_classifier(X_train, y_train, X_test, y_test)
joblib.dump(model, "data/nb.model")

Optimal Parameters :  {'alpha': 0.5, 'class_prior': None, 'fit_prior': True}
              precision    recall  f1-score   support

           0       0.82      0.90      0.86        10
           1       1.00      0.75      0.86         8
           2       0.33      0.50      0.40         2

    accuracy                           0.80        20
   macro avg       0.72      0.72      0.70        20
weighted avg       0.84      0.80      0.81        20



['data/nb.model']

In [13]:
## KNN
model = knn_classifier(X_train, y_train, X_test, y_test)
joblib.dump(model, "data/knn.model")

Optimal Parameters :  {'algorithm': 'ball_tree', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       0.73      0.80      0.76        10
           1       0.86      0.75      0.80         8
           2       0.00      0.00      0.00         2

    accuracy                           0.70        20
   macro avg       0.53      0.52      0.52        20
weighted avg       0.71      0.70      0.70        20



['data/knn.model']

In [14]:
## Logistic Regression
model = lr_classifier(X_train, y_train, X_test, y_test)
joblib.dump(model, "data/lr.model")

Traceback (most recent call last):
  File "/home/hamza/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/hamza/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/hamza/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/hamza/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/hamza/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver

Optimal Parameters :  {'C': 10.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        10
           1       1.00      0.75      0.86         8
           2       0.25      0.50      0.33         2

    accuracy                           0.80        20
   macro avg       0.72      0.72      0.70        20
weighted avg       0.88      0.80      0.83        20



['data/lr.model']

In [15]:
## Decision Tree
model = dt_classifier(X_train, y_train, X_test, y_test)
joblib.dump(model, "data/dt.model")

Optimal Parameters :  {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 500, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'random_state': 1, 'splitter': 'best'}
              precision    recall  f1-score   support

           0       0.88      0.70      0.78        10
           1       0.67      0.50      0.57         8
           2       0.17      0.50      0.25         2

    accuracy                           0.60        20
   macro avg       0.57      0.57      0.53        20
weighted avg       0.72      0.60      0.64        20



['data/dt.model']

In [13]:
## Random Forest
model = rf_classifier(X_train, y_train, X_test, y_test)
joblib.dump(model, "data/rf.model")

Optimal Parameters :  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 200, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 600, 'n_jobs': 6, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         8
           1       0.88      0.78      0.82         9
           2       0.33      0.67      0.44         3

    accuracy                           0.75        20
   macro avg       0.74      0.73      0.71        20
weighted avg       0.84      0.75      0.78        20



['data/rf.model']

In [14]:
## Stacking

def stacking_models(x_train, y_train, x_test, y_test, x_cv, y_cv, *models):
    from sklearn.metrics import log_loss,f1_score,  classification_report
    from mlxtend.classifier import StackingClassifier
    from sklearn.calibration import CalibratedClassifierCV

    models_list = []
    for model in models:
        model.fit(x_train, y_train)
        model = CalibratedClassifierCV(model, method="sigmoid")
        models_list.append(model)
        
    alpha = [0.01] 
    best_alpha = 999
    best_f1_score = -99.99
    for i in alpha:
        lr = LogisticRegression(C=i)
        model = StackingClassifier(classifiers=models_list, meta_classifier=lr, use_probas=True)
        model.fit(x_train, y_train)
        y_pred = model.predict_proba(x_cv)
        log_error = log_loss(y_cv, model.predict_proba(x_cv))
        print(f"Stacking Classifer: alpha: {i} Log Loss: {log_error}")
        if best_alpha > log_error:
            best_alpha = log_error         
            
    model = StackingClassifier(classifiers=models_list, meta_classifier=LogisticRegression(C=best_alpha), use_probas=True)
    model.fit(x_train, y_train)

    # Classification report 
    print(classification_report(y_test, model.predict(x_test)))
    
    return model   

In [15]:
def votting_classifier(x_train, y_train, x_test, y_test, x_cv, y_cv, *models):
    #Refer:http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
    from sklearn.ensemble import VotingClassifier
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.metrics import log_loss, classification_report
    
    models_list = []
    for model in models:
        name = str(model).split("(")[0]
        model.fit(x_train, y_train)
        model = CalibratedClassifierCV(model, method="sigmoid")
        models_list.append( tuple((name, model)) )
        
        
    model = VotingClassifier(estimators=models_list, voting='soft')
    model.fit(x_train, y_train)
    
    # Classification report 
    print(classification_report(y_test, model.predict(x_test)))
    
    return model

In [16]:
# from sklearn.model_selection import train_test_split

Y = df['Sentiment'].values
X = df['processed_text']
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, Y,  test_size=0.2)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.2)

from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

vectorizer = TfidfVectorizer(max_features=2000)
X_train = vectorizer.fit_transform(X_train)
X_train = X_train.toarray()
print(X_train.shape)

X_test = vectorizer.transform(X_test)
X_test = X_test.toarray()
print(X_test.shape)

X_cv = vectorizer.transform(X_cv)
X_cv = X_cv.toarray()
print(X_cv.shape)

(6989, 2000)
(2185, 2000)
(1748, 2000)


In [17]:
model = stacking_models(X_train, y_train, X_test, y_test, X_cv, y_cv,
            SGDClassifier(loss='hinge', penalty='l2',n_jobs=-1, alpha=0.0001),  # Linear svm
            SVC(kernel='rbf', C=10, gamma=1),  # non-linear svm
            MultinomialNB(alpha=0.5),  # nb
            LogisticRegression(C=10.0, penalty='l2', solver='lbfgs'), # lr
            DecisionTreeClassifier(random_state=1, max_depth=500, min_samples_split=5), # dt
            RandomForestClassifier(random_state=1, n_jobs=6, max_depth=150, min_samples_leaf=1, n_estimators=500) # 
        )

joblib.dump(model, 'data/stacking_model.model')

Stacking Classifer: alpha: 0.01 Log Loss: 0.5774776432718101
              precision    recall  f1-score   support

           0       0.83      0.80      0.81       726
           1       0.77      0.77      0.77       649
           2       0.73      0.76      0.75       810

    accuracy                           0.77      2185
   macro avg       0.78      0.77      0.78      2185
weighted avg       0.78      0.77      0.77      2185



['data/stacking_model.model']

In [18]:
model = votting_classifier(X_train, y_train, X_test, y_test, X_cv, y_cv,
            SGDClassifier(loss='hinge', penalty='l2',n_jobs=-1, alpha= 0.0001),  # Linear
            SVC(kernel='rbf', C=10, gamma=1),
            MultinomialNB(alpha=0.5),
            # KNeighborsClassifier(algorithm='auto', n_neighbors=1),
            LogisticRegression(C=10.0, penalty='l2', solver='lbfgs'),
            DecisionTreeClassifier(random_state=1, max_depth=500, min_samples_split=5),
            RandomForestClassifier(random_state=1, n_jobs=6, max_depth=150, min_samples_leaf=1, n_estimators=500)
        )

joblib.dump(model, 'data/votting_model.model')

              precision    recall  f1-score   support

           0       0.84      0.78      0.81       726
           1       0.79      0.73      0.76       649
           2       0.70      0.80      0.75       810

    accuracy                           0.77      2185
   macro avg       0.78      0.77      0.77      2185
weighted avg       0.78      0.77      0.77      2185



['data/votting_model.model']

In [19]:
df = joblib.load('data/oversampled.df')

from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

vectorizer = TfidfVectorizer(max_features=2000)
vectorizer.fit_transform(df['processed_text'])

joblib.dump(vectorizer, 'data/tfidf_model.model')

['data/tfidf_model.model']

In [20]:
X_train.shape

(6989, 2000)