In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt

from models import helper
from models import text_embeddings
from models import performance
from models import model
from models import predict
from models import constants

In [2]:
# Text features: TF-IDF, selected through the project constant.
text_embedder = constants.TFIDF
text_embedder_args = dict(
    min_df=0.01,            # float => threshold is a proportion of documents
    stop_words='english',   # use the built-in English stop-word list
)

# Classifier: multinomial naive Bayes with no explicit constructor parameters.
model_class = MultinomialNB
model_params = dict()

In [3]:
# Bundle the classifier class, its parameters and the text-embedding settings
# into the project's Model wrapper, identified as "nb_tfidf".
nb = model.Model(
    model_class,
    model_params,
    text_embedder,
    text_embedder_args,
    id="nb_tfidf",
)

In [4]:
# Prepare the wrapper before searching/fitting.
# NOTE(review): `initialize` lives in the project `models` package; presumably
# it builds the vectorizer and estimator from the settings above — confirm.
nb.initialize()

In [5]:
# Display the wrapper's vectorizer to check its configuration; the recorded
# output is a TfidfVectorizer with min_df=0.01 and stop_words='english'.
nb.vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0.01, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [12]:
# Search over candidate values of the naive Bayes smoothing parameter `alpha`,
# scored by precision.
# NOTE(review): 12 candidates are supplied, but the recorded output lists only
# 10 rows (up to alpha=1000) — confirm whether the report is truncated.
alpha_range = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 50, 100, 1000, 10000, 100000]
params_search = dict(alpha=alpha_range)

nb.search_hyperparameter(params_search, scoring='precision')

Unnamed: 0,param_alpha,mean_train_score,mean_test_score,mean_train_score-mean_test_score
0,1e-05,0.831892,0.831763,0.000129
1,0.0001,0.831892,0.831763,0.000129
2,0.001,0.831892,0.831763,0.000128
3,0.01,0.831892,0.831763,0.000129
4,0.1,0.831895,0.831764,0.000131
5,1.0,0.831984,0.831847,0.000137
6,10.0,0.832145,0.831994,0.00015
7,50.0,0.832903,0.832768,0.000135
8,100.0,0.83372,0.833618,0.000102
9,1000.0,0.843688,0.843661,2.7e-05


In [13]:
# Search over candidate values of the naive Bayes smoothing parameter `alpha`,
# scored by F1.
# NOTE(review): 12 candidates are supplied, but the recorded output lists only
# 10 rows (up to alpha=1000) — confirm whether the report is truncated.
alpha_range = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 50, 100, 1000, 10000, 100000]
params_search = dict(alpha=alpha_range)

nb.search_hyperparameter(params_search, scoring='f1')

Unnamed: 0,param_alpha,mean_train_score,mean_test_score,mean_train_score-mean_test_score
0,1e-05,0.858169,0.858101,6.8e-05
1,0.0001,0.858169,0.858101,6.8e-05
2,0.001,0.858168,0.858101,6.7e-05
3,0.01,0.858168,0.858101,6.8e-05
4,0.1,0.858168,0.858098,7e-05
5,1.0,0.858172,0.858099,7.3e-05
6,10.0,0.85799,0.857897,9.2e-05
7,50.0,0.857281,0.857192,8.9e-05
8,100.0,0.856453,0.856376,7.7e-05
9,1000.0,0.846731,0.846739,-8e-06


In [14]:
# Search over candidate values of the naive Bayes smoothing parameter `alpha`,
# scored by ROC AUC.
# NOTE(review): 12 candidates are supplied, but the recorded output lists only
# 10 rows (up to alpha=1000) — confirm whether the report is truncated.
alpha_range = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 50, 100, 1000, 10000, 100000]
params_search = dict(alpha=alpha_range)

nb.search_hyperparameter(params_search, scoring='roc_auc')

Unnamed: 0,param_alpha,mean_train_score,mean_test_score,mean_train_score-mean_test_score
0,1e-05,0.938386,0.938331,5.5e-05
1,0.0001,0.938386,0.938331,5.5e-05
2,0.001,0.938386,0.938331,5.5e-05
3,0.01,0.938386,0.938331,5.5e-05
4,0.1,0.938383,0.938328,5.5e-05
5,1.0,0.938359,0.938304,5.5e-05
6,10.0,0.938141,0.938085,5.6e-05
7,50.0,0.937278,0.937231,4.7e-05
8,100.0,0.936359,0.936312,4.7e-05
9,1000.0,0.927199,0.927141,5.8e-05


In [6]:
# Fix the smoothing parameter and train the model.
# alpha=1000 has the highest mean test precision among the rows in the
# recorded search output (0.843661); the same outputs show it with the lowest
# F1 (0.846739) and ROC AUC (0.927141) of the listed candidates.
# NOTE(review): confirm precision is the metric this choice should optimise.
nb.set_params(alpha=1000)  # plain keyword argument; no temporary dict to unpack
nb.fit()

In [None]:
# Presumably reports evaluation metrics for the fitted model — confirm against
# the project's Model class.
# NOTE(review): this cell has no execution count and no recorded output.
nb.get_performance_measures()

In [7]:
# Sample input for a single prediction. Longer alternative inputs are kept
# below, commented out, for manual experimentation:
# test_text = "I really don’t know what’s going on in my life right now. Few months ago I was too happy. And now, I’m lost."
# test_text= "I really hate my life. I have no friends. I'm going to kill myself"
test_text = "I hate my life"



In [8]:
# Classify the sample text; the recorded output is a dict with the predicted
# 'class' and its 'prob'.
nb.predict(test_text)



{'class': 1, 'prob': 0.7893837769609016}

# Test

In [None]:
# Invoke the project's manual test helper for the "NB" model.
# NOTE(review): `naive_bayes` and `vectorizer` are not defined anywhere in the
# visible notebook (the model built earlier is `nb`, whose vectorizer is
# `nb.vectorizer`); on a fresh kernel this cell raises NameError unless they
# are defined elsewhere — confirm and update the arguments.
predict.manual_test(naive_bayes, 
                         model_id="NB",
                         vectorizer=vectorizer)

In [None]:
# Run the model over the tweets stored in "bot/tweets.json" (path is relative
# to the working directory) and keep the returned results.
# NOTE(review): `naive_bayes` and `vectorizer` are not defined anywhere in the
# visible notebook (the model built earlier is `nb`, whose vectorizer is
# `nb.vectorizer`); on a fresh kernel this cell raises NameError unless they
# are defined elsewhere — confirm and update the arguments.
test_results = predict.test_tweets_from_file(naive_bayes,
                             "bot/tweets.json",
                             model_id="NB",
                             vectorizer=vectorizer)

In [None]:
# Count how many rows fall under each value of the model-output label column.
print(test_results['is_depressed(model output)'].value_counts())

In [None]:
# Keep only the rows whose model-output label equals 1.
depressed_tweets = test_results.loc[
    test_results['is_depressed(model output)'].eq(1)
]

In [None]:
# Show the 20 rows with the highest model output probability.
(
    depressed_tweets
    .sort_values(by="model output probability (if any)", ascending=False)
    .head(20)
)

In [None]:
# Subset of the class-1 rows whose model output probability exceeds 0.8.
eighty = depressed_tweets.loc[
    depressed_tweets['model output probability (if any)'].gt(0.8)
]

In [None]:
# Display the 'Text' values of the rows selected into `eighty`.
eighty['Text'].values

# Future Work
- Explore a string-kernel SVM: https://github.com/timshenkao/StringKernelSVM
- Try word2vec embeddings as features for an SVM (https://shop.tarjomeplus.com/UploadFileEn/TPLUS_EN_3959.pdf)
- Work through this guide to text classification in Python: https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/