# Imports

In [1]:
import numpy as np
import pandas as pd

import gzip
import pickle

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df = pd.read_csv('Datasets/5k_tweets_lda_10.csv', index_col=0)
df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(575949, 32)

In [3]:
with gzip.open('Datasets/features_df.pkl', 'rb') as handle:
    features_df = pickle.load(handle)  
features_df.shape

(575949, 3725)

# Train Test Split

In [4]:
# Fucntion to split into train and test data
def tt_split(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=1)
    return X_train, X_test, y_train, y_test

# Split into train and test data
X_train, X_test, y_train, y_test = tt_split(features_df, df['sentiment'])

X_train.shape, X_test.shape

((431961, 3725), (143988, 3725))

# MinMax Scaler

In [5]:
# MinMax Scaler
mm_scaler = MinMaxScaler()
X_train_mm_scl = mm_scaler.fit_transform(X_train)
X_test_mm_scl = mm_scaler.transform(X_test)

# Decision Tree

In [6]:
from sklearn.model_selection import GridSearchCV

In [7]:
param_grid = {
    'criterion': ['gini','entropy'],
    'max_features': [None, 10, 20, 30],
    'min_samples_leaf': [10, 100, 1000, 10000]
}

In [8]:
dt_grid = GridSearchCV(DecisionTreeClassifier(class_weight='balanced'), param_grid, cv=5,
                     scoring='accuracy', n_jobs=-1, verbose=2, return_train_score=True)

In [None]:
dt_grid.fit(X_train_mm_scl, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
# examine the best model
print(dt_grid.best_score_)
print(dt_grid.best_params_)
print(dt_grid.best_estimator_)

In [None]:
#your code here
y_preds = dt_grid.best_estimator_.predict(X_test_mm_scl)

accuracy_score(y_test, y_preds), f1_score(y_test, y_preds, average='weighted')

# Standard Scaler

In [17]:
# Standard Scaler
std_scaler = StandardScaler()
X_train_std_scl = std_scaler.fit_transform(X_train)
X_test_std_scl = std_scaler.transform(X_test)

# Bernoulli Naive Bayes

In [12]:
# Function to fit bernoulli naive bayes model
def bnb(X_train, y_train):
    bnb = BernoulliNB()
    bnb_fit = bnb.fit(X_train, y_train)
    return bnb_fit

In [13]:
# Fit bernoulli naive bayes model on train data, predict on minmax scaler test data
bnb_fit_mm_scl = bnb(X_train_mm_scl, y_train)
bnb_preds_mm_scl = bnb_fit_mm_scl.predict(X_test_mm_scl)

accuracy_score(bnb_preds_mm_scl, y_test), f1_score(bnb_preds_mm_scl, y_test, average='weighted')

(0.6962663555296275, 0.6938904652543706)

In [18]:
# Fit bernoulli naive bayes model on train data, predict on standard scaler test data
bnb_fit_std_scl = bnb(X_train_std_scl, y_train)
bnb_preds_std_scl = bnb_fit_std_scl.predict(X_test_std_scl)

accuracy_score(bnb_preds_std_scl, y_test), f1_score(bnb_preds_std_scl, y_test, average='weighted')

(0.7007250604217018, 0.698552621939297)

# Multinomial Naive Bayes

In [14]:
# Function to fit multinomial naive bayes model
def mnb(X_train, y_train):
    mnb = MultinomialNB()
    mnb_fit = mnb.fit(X_train, y_train)
    return mnb_fit

In [15]:
# Fit multinomial naive bayes model on train data, predict on minmax scaler test data
mnb_fit_mm_scl = mnb(X_train_mm_scl, y_train)
mnb_preds_mm_scl = mnb_fit_mm_scl.predict(X_test_mm_scl)

accuracy_score(mnb_preds_mm_scl, y_test), f1_score(mnb_preds_mm_scl, y_test, average='weighted')

(0.648366530544212, 0.6560903029607621)

In [25]:
# Function to fit multinomial naive bayes model
def dt(X_train, y_train):
    dt = DecisionTreeClassifier()
    dt_fit = dt.fit(X_train, y_train)
    return dt_fit

In [26]:
# Fit bernoulli naive bayes model on train data, predict on minmax scaler test data
dt_fit_mm_scl = dt(X_train_mm_scl, y_train)
dt_preds_mm_scl = dt_fit_mm_scl.predict(X_test_mm_scl)

accuracy_score(dt_preds_mm_scl, y_test), f1_score(dt_preds_mm_scl, y_test, average='weighted')

(0.723782537433675, 0.7237418687722337)

In [None]:
# Fit bernoulli naive bayes model on train data, predict on standard scaler test data
dt_fit_std_scl = dt(X_train_std_scl, y_train)
dt_preds_std_scl = dt_fit_std_scl.predict(X_test_std_scl)

accuracy_score(dt_preds_std_scl, y_test), f1_score(dt_preds_std_scl, y_test, average='weighted')

# Confusion Matrix

In [21]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion Matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize=15,
                 horizontalalignment="center", verticalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

In [None]:
cm_bnb = confusion_matrix(y_test, y_pred_bnb)
classes = ['Negative', 'Neutral', 'Positive']
plot_confusion_matrix(cm_bnb, classes,
                          normalize=True,
                          title='Bernoulli confusion matrix',
                          cmap=plt.cm.Blues)