# Creating a Stress Detection Tool using Data From Subreddits: Modeling

#### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [2]:
import warnings
warnings.filterwarnings("ignore")

#### Import dataframe from pickle

In [3]:
# Load the DataFrame stored in 'df.pickle' (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load a file you created or trust.
df = pd.read_pickle('df.pickle')

#### Define x and y

In [4]:
# Feature column: 'text'; target column: 'stress_label'.
x, y = df['text'], df['stress_label']

#### Define the vectorizer (with English stopwords removed)

In [5]:
# Bag-of-words vectorizer; stop_words="english" removes scikit-learn's built-in English stop-word list.
vect=CountVectorizer(stop_words="english")

#### Vectorize the text and train/test split x and y

In [6]:
x=vect.fit_transform(x)

In [7]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42)

#### Naive Bayes

In [8]:
# Fit a multinomial Naive Bayes classifier on the training split,
# then keep its predictions for the test split in `m1`.
mb = MultinomialNB()
mb.fit(x_train, y_train)
m1 = mb.predict(x_test)

In [9]:
# accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the printed value
# is unchanged, but the call now follows the documented argument order.
nb_accuracy = accuracy_score(y_test, m1)
print("Naive Bayes Model Accuracy is {p}%".format(p=round(nb_accuracy * 100, 2)))

Naive Bayes Model Accuracy is 79.24%


#### Logistic Regression

In [10]:
# Fit a Logistic Regression classifier with default settings and
# record its mean accuracy on the test split.
m2 = LogisticRegression()
m2.fit(x_train, y_train)
score = m2.score(x_test, y_test)

In [11]:
# Report the test accuracy as a percentage rounded to two decimals.
lr_accuracy_pct = round(score * 100, 2)
print("Logistic Regression Model Accuracy is {p}%".format(p=lr_accuracy_pct))

Logistic Regression Model Accuracy is 80.95%


#### Decision Tree

In [12]:
# Fix the seed: the tree randomly permutes candidate features while searching for
# splits, so without random_state the fitted tree (and its accuracy) can change
# from one run to the next.
d = DecisionTreeClassifier(random_state=42)
d.fit(x_train, y_train)
m3 = d.predict(x_test)

In [13]:
# Report the test accuracy as a percentage rounded to two decimals.
dt_accuracy_pct = round(accuracy_score(y_test, m3) * 100, 2)
print("Decision Tree Model Accuracy is {p}%".format(p=dt_accuracy_pct))

Decision Tree Model Accuracy is 71.0%


#### Testing the models
* Here I will enter a sentence that I, as the user, know to be a positive (stressed) example to see what the models do

In [14]:
# Read one sentence from the keyboard; the cells below classify it with each model.
user_input = input("Enter a sentence: ")

Enter a sentence: I am sad today


#### Naive Bayes

In [15]:
# Vectorize the sentence with the fitted CountVectorizer and classify it with Naive Bayes.
p = vect.transform([user_input]).toarray()
op = mb.predict(p)

print("Using the Naive Bayes model...")
# `op` holds exactly one prediction, so comparing it to a label yields a single truth value.
if op == 'stress':
    print("...This input indicated that the user is stressed")
elif op == 'no stress':
    print("...This input indicated that the user is NOT stressed")

Using the Naive Bayes model...
...This input indicated that the user is stressed


#### Logistic Regression

In [16]:
# Vectorize the sentence with the fitted CountVectorizer and classify it with Logistic Regression.
p = vect.transform([user_input]).toarray()
op = m2.predict(p)

print("Using the Logistic Regression model...")
# Compare against the exact label strings ('stress' / 'no stress'); a misspelled
# label would make a prediction fall through without printing anything.
if op == 'stress':
    print("...This input indicated that the user is stressed")
elif op == 'no stress':
    print("...This input indicated that the user is NOT stressed")

Using the Logistic Regression model...
...This input indicated that the user is NOT stressed


#### Decision Tree

In [17]:
# Vectorize the sentence with the fitted CountVectorizer and classify it with the Decision Tree.
p = vect.transform([user_input]).toarray()
op = d.predict(p)

print("Using the Decision Tree model...")
# `op` holds exactly one prediction, so comparing it to a label yields a single truth value.
if op == 'stress':
    print("...This input indicated that the user is stressed")
elif op == 'no stress':
    print("...This input indicated that the user is NOT stressed")

Using the Decision Tree model...
...This input indicated that the user is stressed


#### Outcomes of first test:
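* The Naive Bayes and Decision Tree models classified the input as stressed, as expected, but the Logistic Regression model classified it as NOT stressed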

#### Creating the function
* Because all three models performed well with a user input test sentence, I will choose the model with the highest accuracy, which is the Logistic Regression model

In [18]:
def stress_test_demo(x):
    """Classify one sentence with the Logistic Regression model and print the verdict.

    Parameters
    ----------
    x : str
        The sentence to classify.

    Returns
    -------
    None
        The verdict is printed, not returned.

    Notes
    -----
    Relies on the notebook-level ``vect`` (the fitted CountVectorizer) and ``m2``
    (the fitted LogisticRegression) defined in earlier cells.
    """
    # Classify the argument itself rather than the global `user_input`, so the
    # function works for any sentence it is handed.
    p1 = vect.transform([x]).toarray()
    op1 = m2.predict(p1)
    print("Using the Logistic Regression model...")
    if op1 == 'stress':
        print("...This input indicated that the user is stressed\n")
    elif op1 == 'no stress':
        print("...This input indicated that the user is NOT stressed\n")

#### More testing!

In [19]:
# Prompt for a sentence and pass it to stress_test_demo.
user_input = input("Enter a sentence: ")
stress_test_demo(user_input)

Enter a sentence: I had a great day today!
Using the Logistic Regression model...
...This input indicated that the user is NOT stressed



In [20]:
# Prompt for another sentence and pass it to stress_test_demo.
user_input = input("Enter a sentence: ")
stress_test_demo(user_input)

Enter a sentence: I studied really hard and got an A on my test!
Using the Logistic Regression model...
...This input indicated that the user is NOT stressed



In [21]:
# Prompt for one more sentence and pass it to stress_test_demo.
user_input = input("Enter a sentence: ")
stress_test_demo(user_input)

Enter a sentence: I had a really tough day today
Using the Logistic Regression model...
...This input indicated that the user is NOT stressed



#### Conclusion
* As you can see with some further testing, the model is NOT 100% accurate!
* The number of 'positive' and 'negative' words used, along with certain punctuation, can make a big difference in accuracy based on our method of scoring in previous notebooks
* Overall, though, all three models are adequately accurate for the purpose; however, I selected the logistic regression model as the final contender because it has the highest accuracy