# Creating a Stress Detection Tool using Data From Mental Health Subreddits: Modeling

#### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings that may matter
# for the models fitted below (e.g. solver convergence) — consider narrowing it.
warnings.filterwarnings("ignore")

#### Import dataframe from pickle

In [3]:
# Load the pickled DataFrame; the cells below read its 'text' and 'score' columns.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
df = pd.read_pickle('df.pickle')

#### Define x and y

In [4]:
# Features come from the 'text' column; the classification target is 'score'.
x, y = df['text'], df['score']

#### Define the vectorizer (with English stop words removed)

In [5]:
# Bag-of-words vectorizer configured to drop English stop words.
vect=CountVectorizer(stop_words="english")

#### Vectorize the text, then train/test split x and y

In [6]:
# Build the document-term matrix straight from the source column rather than
# from `x` itself, so this cell can be re-run safely: re-running the previous
# `x = vect.fit_transform(x)` would feed the matrix back into the vectorizer.
# NOTE(review): the vocabulary is learned from all rows before the train/test
# split below; fit on the training rows only if strict isolation is required.
x = vect.fit_transform(df['text'])

In [7]:
# Hold out a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

#### Naive Bayes

In [8]:
# Fit Multinomial Naive Bayes on the training split, then predict the
# held-out rows.
mb = MultinomialNB()
mb.fit(x_train, y_train)
m1 = mb.predict(x_test)

In [9]:
# accuracy_score takes (y_true, y_pred); pass the true labels first, matching
# the Decision Tree cell. Accuracy is symmetric, so the printed value is the same.
print("Naive Bayes Model Accuracy is {p}%".format(p=round(accuracy_score(y_test, m1) * 100, 2)))

Naive Bayes Model Accuracy is 75.58%


#### Logistic Regression

In [10]:
# Fit Logistic Regression on the training split and measure its accuracy on
# the held-out split.
m2 = LogisticRegression()
m2.fit(x_train, y_train)
score = m2.score(x_test, y_test)

In [11]:
# Report the held-out accuracy as a percentage rounded to two decimals.
print(
    "Logistic Regression Model Accuracy is {p}%".format(
        p=round(score * 100, 2)
    )
)

Logistic Regression Model Accuracy is 80.14%


#### Decision Tree

In [12]:
# Seed the tree so the fitted model (and the accuracy reported below) is
# reproducible across kernel restarts; the same seed is used for the split.
d = DecisionTreeClassifier(random_state=42)
d.fit(x_train, y_train)
m3 = d.predict(x_test)

In [13]:
# Report the held-out accuracy as a percentage rounded to two decimals.
print(
    "Decision Tree Model Accuracy is {p}%".format(
        p=round(accuracy_score(y_test, m3) * 100, 2)
    )
)

Decision Tree Model Accuracy is 70.59%


#### Testing the models
* Here I will enter a sentence that I, as the user, know to be positive, to see how each model classifies it

In [14]:
# Prompt for a sentence interactively; the cells below classify whatever is typed here.
user_input = input("Enter a sentence: ")

Enter a sentence: I am very happy and relaxed today! I got everything done on my to-do list!


#### Naive Bayes

In [15]:
# Vectorize the sentence with the already-fitted vocabulary. The sparse row is
# passed to the model as-is, so no dense copy is made.
p = vect.transform([user_input])
# predict() returns one label per input row; take the single label so the
# checks below are plain scalar comparisons rather than array comparisons.
op = mb.predict(p)[0]
print("Using the Naive Bayes model...")
if op == 'neg':
    print("...This input indicated that the user is stressed")
elif op == 'pos':
    print("...This input indicated that the user is NOT stressed")

Using the Naive Bayes model...
...This input indicated that the user is NOT stressed


#### Logistic Regression

In [16]:
# Vectorize the sentence with the already-fitted vocabulary. The sparse row is
# passed to the model as-is, so no dense copy is made.
p = vect.transform([user_input])
# predict() returns one label per input row; take the single label so the
# checks below are plain scalar comparisons rather than array comparisons.
op = m2.predict(p)[0]
print("Using the Logistical Regression model...")
if op == 'neg':
    print("...This input indicated that the user is stressed")
elif op == 'pos':
    print("...This input indicated that the user is NOT stressed")

Using the Logistical Regression model...
...This input indicated that the user is NOT stressed


#### Decision Tree

In [17]:
# Vectorize the sentence with the already-fitted vocabulary. The sparse row is
# passed to the model as-is, so no dense copy is made.
p = vect.transform([user_input])
# predict() returns one label per input row; take the single label so the
# checks below are plain scalar comparisons rather than array comparisons.
op = d.predict(p)[0]
print("Using the Decision Tree model...")
if op == 'neg':
    print("...This input indicated that the user is stressed")
elif op == 'pos':
    print("...This input indicated that the user is NOT stressed")

Using the Decision Tree model...
...This input indicated that the user is NOT stressed


#### Outcomes of first test:
* All three models performed as expected

#### Creating the function
* Because all three models have a high enough accuracy and have performed well preliminarily, I will include all three in a final user-input function

In [18]:
def stress_test(x, models=None, vectorizer=None):
    """Classify one sentence with each model and print every model's verdict.

    Parameters
    ----------
    x : str
        The sentence to classify.
    models : list of (str, estimator) pairs, optional
        Display name and fitted classifier for each model to run, in print
        order. When omitted, the notebook's fitted Naive Bayes (``mb``),
        Logistic Regression (``m2``) and Decision Tree (``d``) models are used.
    vectorizer : object with a ``transform`` method, optional
        Fitted vectorizer used to turn the sentence into features. When
        omitted, the notebook's fitted ``vect`` is used.

    Returns
    -------
    None
        Results are printed, one block per model.
    """
    if vectorizer is None:
        vectorizer = vect
    if models is None:
        models = [
            ("Naive Bayes", mb),
            ("Logistical Regression", m2),
            ("Decision Tree", d),
        ]

    # Vectorize the argument itself (not the notebook-level `user_input`), and
    # do it once: every model is scored on the same feature row.
    features = vectorizer.transform([x])

    for name, model in models:
        # predict() yields one label per input row; there is exactly one row.
        label = model.predict(features)[0]
        print("Using the {} model...".format(name))
        if label == 'neg':
            print("...This input indicated that the user is stressed\n")
        elif label == 'pos':
            print("...This input indicated that the user is NOT stressed\n")

#### More testing!

In [19]:
# Prompt for a new sentence and print each model's verdict for it.
user_input = input("Enter a sentence: ")
stress_test(user_input)

Enter a sentence: I had a horrible day, it started out bad when I missed the bus, then I was late to work, and then it started raining
Using the Naive Bayes model...
...This input indicated that the user is stressed

Using the Logistical Regression model...
...This input indicated that the user is stressed

Using the Decision Tree model...
...This input indicated that the user is stressed



In [20]:
# Prompt for a new sentence and print each model's verdict for it.
user_input = input("Enter a sentence: ")
stress_test(user_input)

Enter a sentence: I've been having a really hard time studying for my midterm, sometimes I think I'm stupid
Using the Naive Bayes model...
...This input indicated that the user is stressed

Using the Logistical Regression model...
...This input indicated that the user is stressed

Using the Decision Tree model...
...This input indicated that the user is NOT stressed



In [21]:
# Prompt for a new sentence and print each model's verdict for it.
user_input = input("Enter a sentence: ")
stress_test(user_input)

Enter a sentence: I worked really hard all semester and studied hard for my final, and I got an A! Let's go!
Using the Naive Bayes model...
...This input indicated that the user is NOT stressed

Using the Logistical Regression model...
...This input indicated that the user is NOT stressed

Using the Decision Tree model...
...This input indicated that the user is NOT stressed



#### Conclusion
* As you can see with some further testing, the models are NOT 100% accurate!
* The number of 'positive' and 'negative' words used, along with certain punctuation, can make a big difference in accuracy because of the scoring method used in previous notebooks
* Overall though, all three models are adequately accurate for the purpose