In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

import helper
import text_embeddings
import performance
import models
import predict

# 1. Load data

In [2]:
# Load the full dataset through the project helper: x holds the model inputs,
# y the target labels used for fitting and evaluation further down.
x, y = helper.load_data()

# 2. Split data into train/test

In [3]:
# Split inputs and labels into train and test partitions. The split ratio and
# any shuffling are decided inside helper.split_data.
# NOTE(review): confirm the helper fixes a random seed so the split is reproducible.
x_train, x_test, y_train, y_test = helper.split_data(x, y)

In [4]:
# Number of samples in the training partition.
len(x_train)

116322

In [5]:
# Number of samples in the held-out test partition.
len(x_test)

57294

# 3. Text embedding

In [6]:
# Encode the 'text' field of both partitions. The returned vectorizer is kept
# because it is passed to predict.manual_test later to encode new sentences.
# NOTE(review): presumably TF-IDF (going by the function name) fitted on the
# training partition only — confirm in text_embeddings.
x_train_features, x_test_features, vectorizer = text_embeddings.encode_tdfif(x_train, x_test, 'text')

# 4. Grid Search using Multinomial Naive Bayes

-  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier
-  https://scikit-learn.org/stable/modules/kernel_approximation.html  

In [7]:
# Pull the encoded 'text' matrices out of the feature dictionaries; these are
# the inputs every model-fitting and scoring cell below works with.
final_x_train, final_x_test = (
    x_train_features['text']['data'],
    x_test_features['text']['data'],
)

# Labels are used as-is; the aliases only keep the naming symmetric.
final_y_train, final_y_test = y_train, y_test

In [None]:
%%time
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint


alpha_range = [ 0.0001,0.001,0.01,0.1,1,2,3,4,5,6,7,8,9,10,50,100]
bayes = models.multinomialNB()
params = {'alpha' : alpha_range}
search = GridSearchCV(bayes,
                      params,
                      cv=5,
                      scoring="roc_auc",
                     return_train_score=True)


search.fit(final_x_train, final_y_train)
results = pd.DataFrame.from_dict(search.cv_results_)
results = results.sort_values(['param_alpha'])
results['mean_train_score-mean_test_score'] = results['mean_train_score'] - results['mean_test_score']
results[['param_alpha','mean_train_score','mean_test_score', 'mean_train_score-mean_test_score']]

In [None]:
# Plot mean train/validation AUC for every alpha tried by the grid search.
# The candidates span 1e-4 .. 100, so the x-axis is logarithmic: on a linear
# axis every alpha below 1 collapses onto the left edge of the figure.
# `param_alpha` comes out of cv_results_ as an object column, hence the cast.
alphas = results['param_alpha'].astype(float)

fig, ax = plt.subplots()

# Train
ax.plot(alphas, results['mean_train_score'], label="Train AUC")
ax.scatter(alphas, results['mean_train_score'], label="Train AUC points")

# Validation
ax.plot(alphas, results['mean_test_score'], label="Validation AUC")
ax.scatter(alphas, results['mean_test_score'], label="Validation AUC points")

ax.set_xscale("log")
ax.set_xlabel("Alpha: hyperparameter")
ax.set_ylabel("AUC")
ax.set_title("AUC vs alpha curves")
ax.legend()
plt.show()

# 5. Prediction

In [8]:
# Smoothing strength for the final model. It is hard-coded here rather than
# read from the grid search.
# NOTE(review): presumably chosen from the AUC-vs-alpha results — confirm.
alpha = 10

# fit() returns the estimator itself, so construction and fitting chain.
naive_bayes = MultinomialNB(alpha=alpha).fit(final_x_train, final_y_train)

# Per-class probability estimates for each partition; the ROC cell reads
# column 1 of these arrays.
final_y_train_pred, final_y_test_pred = (
    naive_bayes.predict_proba(features)
    for features in (final_x_train, final_x_test)
)



In [None]:
# ROC curves of the fitted model on the train and test partitions.
# roc_curve, auc and plt are imported in the first cell of the notebook, so
# this cell no longer depends on the (optional) grid-search cell having run.
# Column 1 of predict_proba is the score for the second entry of
# naive_bayes.classes_ — presumably the positive label; verify.
train_fpr, train_tpr, train_thresholds = roc_curve(final_y_train, final_y_train_pred[:,1])
test_fpr, test_tpr, test_thresholds = roc_curve(final_y_test, final_y_test_pred[:,1])

# Area under each curve, computed once and reused in the legend.
train_auc = auc(train_fpr, train_tpr)
test_auc = auc(test_fpr, test_tpr)

fig, ax = plt.subplots()
ax.plot(train_fpr, train_tpr, label="train AUC ="+str(train_auc))
ax.plot(test_fpr, test_tpr, label="test AUC ="+str(test_auc))
ax.legend()
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.set_title("ROC PLOTS for Train and Test sets")
ax.grid()
plt.show()

# 6. Performance Measures

In [9]:
# Evaluate the fitted model on the held-out test partition. The helper returns
# a dict of metrics (accuracy, f1, confusion matrix, roc_auc in the output
# below); the bare name on the last line displays it.
performance_measures = performance.get_performance_measures(naive_bayes, final_x_test, final_y_test)
performance_measures

{'accuracy': 0.9287883548015499,
 'f1': 0.9162630325917414,
 'confusion_matrix':              actual_0  actual_1
 predicted_0     30892      2012
 predicted_1      2068     22322,
 'roc_auc': 0.9787500827384884}

# Test

In [10]:
# Run the fitted model over the project's hand-written sample sentences and
# show expected vs. predicted labels. The vectorizer from the embedding step
# is passed along so the helper can encode the sentences.
predict.manual_test(
    naive_bayes,
    model_id="NB",
    vectorizer=vectorizer,
)

Unnamed: 0,Text,is_depressed(expected),is_depressed(model output),model output probability (if any)
0,I love my life,0,0,0.430612
1,I hate my life,1,1,0.93416
2,Nothing ever goes right for me.,1,1,0.722721
3,Why does everything bad happen with me?,1,1,0.784359
4,Today was such a good day!,0,0,0.095174
5,Wow. I had such an amazing time,0,0,0.063746
6,Loving how me and my lovely partner is talking...,0,0,0.451324
7,Happy Thursday everyone. Thought today was Wed...,0,0,0.17114
8,It’s the little things that make me smile. Got...,0,0,0.262349
9,Lately I have been feeling unsure of myself as...,1,1,0.683977


# Future Work
- Explore string kernel https://github.com/timshenkao/StringKernelSVM
- word2vec using SVM (https://shop.tarjomeplus.com/UploadFileEn/TPLUS_EN_3959.pdf)
- https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/