In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt

from models import helper
from models import text_embeddings
from models import performance
from models import model
from models import predict
from models import constants

In [2]:
# --- Feature extraction -----------------------------------------------------
# TF-IDF embedding, selected through the project's constants module.
text_embedder = constants.TFIDF
# Passed through to the vectorizer: with a float min_df, terms present in less
# than that fraction of documents are ignored; English stop words are removed.
text_embedder_args = dict(min_df=0.01, stop_words='english')

# --- Classifier -------------------------------------------------------------
# Multinomial Naive Bayes with no constructor overrides at this point.
model_class = MultinomialNB
model_params = dict()

In [3]:
# Bundle the classifier class, its parameters and the text-embedding settings
# into one project-level Model wrapper, tagged with the id "nb_tfidf".
nb = model.Model(
    model_class,
    model_params,
    text_embedder,
    text_embedder_args,
    id="nb_tfidf",
)

In [4]:
# Prepare the wrapper for use. The following cell reads `nb.vectorizer`, so this
# call presumably builds the vectorizer (and estimator) -- confirm in models.model.
nb.initialize()

In [5]:
# Display the vectorizer held by the wrapper to verify its configuration
# (the recorded output shows min_df=0.01 and stop_words='english' were applied).
nb.vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0.01, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
# Candidate values for the Naive Bayes `alpha` parameter, from 1e-4 up to 1e5.
alpha_range = [0.0001, 0.001, 0.01, 0.1, 1, 10, 50, 100, 1000, 10000, 100000]
params_search = dict(alpha=alpha_range)

# Search over the candidates, ranking them by ROC AUC.
nb.search_hyperparameter(params_search, scoring='roc_auc')

In [6]:
# Fix the smoothing parameter and train.
# NOTE(review): alpha=1000 is hard-coded; presumably it was picked from the
# hyperparameter search cell -- confirm and record the reason.
nb.set_params(alpha=1000)
nb.fit()

In [None]:
# Report evaluation metrics for the fitted model.
# NOTE(review): which metrics and which data split are used is decided inside
# models.model and is not visible from this notebook.
nb.get_performance_measures()

In [7]:
# Sample sentence for a quick single-input prediction.
# Alternative inputs kept for manual experimentation:
# test_text = "I really don’t know what’s going on in my life right now. Few months ago I was too happy. And now, I’m lost."
# test_text= "I really hate my life. I have no friends. I'm going to kill myself"
test_text = 'I hate my life'



In [8]:
# Classify the sample sentence. The recorded output is a dict with a 'class'
# label and a 'prob' value.
nb.predict(test_text)



{'class': 1, 'prob': 0.7893837769609016}

# Test

In [None]:
# Hand-check the fitted classifier through the project's manual-test helper.
# This cell previously passed `naive_bayes` and `vectorizer`, neither of which
# is defined anywhere in this notebook, so it raised NameError on a fresh
# kernel. It now uses the `nb` wrapper created earlier and the vectorizer it
# holds.
# NOTE(review): confirm predict.manual_test accepts the model.Model wrapper;
# if it needs the bare estimator, pass that attribute of `nb` instead.
predict.manual_test(nb,
                    model_id="NB",
                    vectorizer=nb.vectorizer)

In [None]:
# Run the classifier over the tweets stored in bot/tweets.json and keep the
# result for the inspection cells that follow.
# This cell previously passed `naive_bayes` and `vectorizer`, neither of which
# is defined anywhere in this notebook, so it raised NameError on a fresh
# kernel. It now uses the `nb` wrapper created earlier and the vectorizer it
# holds.
# NOTE(review): confirm predict.test_tweets_from_file accepts the model.Model
# wrapper; if it needs the bare estimator, pass that attribute of `nb` instead.
test_results = predict.test_tweets_from_file(nb,
                                             "bot/tweets.json",
                                             model_id="NB",
                                             vectorizer=nb.vectorizer)

In [None]:
# Count how many tweets fall under each value of the model's output label.
print(test_results['is_depressed(model output)'].value_counts())

In [None]:
# Keep only the rows whose model output label equals 1.
depressed_tweets = test_results[test_results['is_depressed(model output)'].eq(1)]

In [None]:
# Show the 20 flagged tweets with the highest model output probability.
(
    depressed_tweets
    .sort_values("model output probability (if any)", ascending=False)
    .head(20)
)

In [None]:
# Flagged tweets whose model output probability is strictly above 0.8.
eighty = depressed_tweets[depressed_tweets['model output probability (if any)'].gt(0.8)]

In [None]:
# Display the text of the high-probability tweets as a plain array of values.
eighty['Text'].values

# Future Work
- Explore string kernels for SVMs: https://github.com/timshenkao/StringKernelSVM
- Try word2vec embeddings with an SVM classifier (https://shop.tarjomeplus.com/UploadFileEn/TPLUS_EN_3959.pdf)
- Review other text-classification approaches in Python: https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/