In [2]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt

import helper
import text_embeddings
import performance
import models
import predict
import constants

In [3]:
# Experiment configuration: the estimator and the text embedding are chosen
# here and handed to models.Model in the next cell.

# Classifier: scikit-learn's multinomial Naive Bayes, with no constructor
# overrides (alpha is tuned and set later in the notebook).
model_class = MultinomialNB
model_params = {}

# Text embedding: the TF-IDF option exposed by the project's constants module,
# with no extra arguments.
text_embedder = constants.TFIDF
text_embedder_args = {}

In [4]:
# Build the project's Model wrapper from the configuration above.
# Arguments are positional: estimator class, estimator params, embedder,
# embedder args.
nb = models.Model(
    model_class,
    model_params,
    text_embedder,
    text_embedder_args,
)

In [5]:
# Run the wrapper's initialization step before searching/fitting.
# What initialize() actually does is defined in models.py (not visible here);
# presumably it loads the data and builds the embeddings -- TODO confirm.
nb.initialize()

In [5]:
# Candidate smoothing strengths for MultinomialNB: four sub-unit decades,
# every integer from 1 to 10, then two large values.
alpha_range = [
    0.0001, 0.001, 0.01, 0.1,
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
    50, 100,
]
params_search = {'alpha': alpha_range}

# Candidates are scored by ROC AUC. Kept as the last expression of the cell so
# that whatever the search returns is rendered as the cell output.
nb.search_hyperparameter(params_search, scoring='roc_auc')

Unnamed: 0,param_alpha,mean_train_score,mean_test_score,mean_train_score-mean_test_score
0,0.0001,0.989655,0.980159,0.009495
1,0.001,0.98948,0.981325,0.008155
2,0.01,0.989092,0.982433,0.00666
3,0.1,0.988208,0.983091,0.005117
4,1.0,0.986199,0.98259,0.00361
5,2.0,0.985189,0.982018,0.003172
6,3.0,0.984483,0.981559,0.002925
7,4.0,0.983927,0.981181,0.002746
8,5.0,0.983462,0.980852,0.00261
9,6.0,0.983063,0.980562,0.002501


In [6]:
# Fix the smoothing strength, then train the classifier.
# NOTE(review): among the search rows displayed above, mean_test_score peaks
# at alpha=0.1 and declines through alpha=6; alpha=10 is not in the displayed
# rows. Confirm that alpha=10 is a deliberate choice (e.g. favouring a smaller
# train/test gap) rather than a leftover value.
nb.set_params(alpha=10)
nb.fit()

In [7]:
# Display the underlying estimator to confirm which parameters are in effect
# (the recorded output shows MultinomialNB with alpha=10).
nb.model

MultinomialNB(alpha=10, class_prior=None, fit_prior=True)

In [8]:
# Report evaluation metrics for the fitted model. The recorded output is a
# dict with 'accuracy', 'f1', 'confusion_matrix' and 'roc_auc'.
# NOTE(review): which data split these are computed on is decided inside
# models.py (not visible here) -- presumably a held-out set; confirm.
nb.get_performance_measures()

{'accuracy': 0.9287883548015499,
 'f1': 0.9162630325917414,
 'confusion_matrix':              actual_0  actual_1
 predicted_0     30892      2012
 predicted_1      2068     22322,
 'roc_auc': 0.9787500827384884}

In [9]:
# Smoke-test the fitted model on a single hand-written sentence. The recorded
# output is a dict with the predicted 'class' and its 'prob'.
nb.predict("I hate my life.")

{'class': 1, 'prob': 0.9341597540258263}

In [10]:
# Persist the fitted wrapper through the project's dump helper. The output
# location and serialization format are chosen inside models.py (not visible
# here).
models.dump_model(nb)

# Test

In [None]:
# Manual check of the model through the project's predict helper, tagged with
# model_id "NB".
# NOTE(review): `naive_bayes` and `vectorizer` are not bound in any visible
# cell of this notebook (the trained wrapper above is named `nb`), and this
# cell has never been executed. Unless they are defined elsewhere, it raises
# NameError on Restart & Run All. TODO: pass the objects that
# predict.manual_test actually expects.
predict.manual_test(naive_bayes, 
                         model_id="NB",
                         vectorizer=vectorizer)

In [None]:
# Run the model over the tweets stored in bot/tweets.json and keep the result
# table for the inspection cells below.
# NOTE(review): `naive_bayes` and `vectorizer` are not bound in any visible
# cell of this notebook (the trained wrapper above is named `nb`), and this
# cell has never been executed. Unless they are defined elsewhere, it raises
# NameError on Restart & Run All. TODO: pass the objects that
# predict.test_tweets_from_file actually expects.
test_results = predict.test_tweets_from_file(naive_bayes,
                             "bot/tweets.json",
                             model_id="NB",
                             vectorizer=vectorizer)

In [None]:
# Tally how many tweets fall under each value of the model-output column.
prediction_counts = test_results['is_depressed(model output)'].value_counts()
print(prediction_counts)

In [None]:
# Keep only the rows whose model-output column equals 1.
flagged_mask = test_results['is_depressed(model output)'] == 1
depressed_tweets = test_results[flagged_mask]

In [None]:
# Show the 20 flagged tweets with the highest model probability, highest first.
(
    depressed_tweets
    .sort_values(by="model output probability (if any)", ascending=False)
    .head(20)
)

In [None]:
# Flagged tweets whose model probability is strictly greater than 0.8.
high_confidence_mask = depressed_tweets['model output probability (if any)'] > 0.8
eighty = depressed_tweets[high_confidence_mask]

In [None]:
# Display the raw text of the high-confidence (> 0.8) flagged tweets as an
# array, so that each tweet is shown in full.
eighty['Text'].values

# Future Work
- Explore string kernels for SVMs: https://github.com/timshenkao/StringKernelSVM
- Try word2vec embeddings with an SVM classifier (https://shop.tarjomeplus.com/UploadFileEn/TPLUS_EN_3959.pdf)
- Work through Analytics Vidhya's guide to text classification in Python: https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/