## Capstone Project
-------

### Stage 2 - Modelling phase
------

#### Importing packages and data
------

Possible packages that need to be installed:

1. SpaCy

<code> conda install -c spacy spacy </code>

2. 'en_core_web_md' - library used in SpaCy

<code> python -m spacy download en_core_web_md </code>

3. WordCloud

<code> conda install -c conda-forge wordcloud </code>

4. Hyperas

<code> conda install -c jaikumarm hyperas </code>

------

In [1]:
# import packages
# Hyperas/TensorFlow
# the __future__ import command must be in the beginning of the notebook
from __future__ import print_function

from hyperopt import Trials, STATUS_OK, tpe
from tensorflow.python.keras.layers.core import Dense, Dropout, Activation, Flatten
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.utils import np_utils

from hyperas import optim
from hyperas.distributions import choice, uniform

import tensorflow as tf
from tensorflow.keras.layers import LSTM, BatchNormalization
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping

# Basics
import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np
import random

# Graphs
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Filter warnings
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Preprocessing; model selection and evaluation
from sklearn import pipeline, preprocessing
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import f1_score

# text handling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.neural_network import MLPClassifier
import statsmodels.api as sm

# for custom countvectorizer with SpaCy lemmatization
import spacy
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, VectorizerMixin
from sklearn.base import TransformerMixin, BaseEstimator
from scipy.sparse import csr_matrix

# WordCloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


In [2]:
# import packages
data = pd.read_csv("saved_csv/df.csv")
data.drop(columns = "Unnamed: 0",inplace=True)

df = data.copy()

## Question 3

### Can we predict one's comfort level in discussing MH at workplace using participants' qualitative responses of ways to improve MH support?
------

In [117]:
# Grabbing the responses as independent variables
corpus = df.iloc[:,-9]

# Dependent variables
question = "Would you feel comfortable discussing a mental health issue with your coworkers?"

answers = ["Maybe","No","Not Applicable","Yes"]

dep = df[question].copy()

for num in range(len(answers)):
    if num != 3:
        dep[dep==answers[num]] = 0 #Hesitant
    else:
        dep[dep==answers[num]] = 1 #Comfortable

In [118]:
# Creating a table with both independent and dependent variables
table = pd.concat([corpus,dep],axis=1)

# dropping columns that did not answer the question
index = table[table.iloc[:,0]=="Did not answer"].index
table.drop(index,axis=0,inplace=True)

# resetting the index
table = table.reset_index()
table.drop("index",axis=1,inplace=True)

In [119]:
# split the dataset into training/test sets
x_train, x_test, y_train, y_test = train_test_split(table.iloc[:,0].values,
                                                    table.iloc[:,1].values,test_size = 0.2, stratify = table.iloc[:,1].values)

In [120]:
a = pd.DataFrame({"Question":x_train,"class":y_train})
b = a[a.iloc[:,1]==1]
print(b.shape)
d = a[a.iloc[:,1]==0]
print(d.shape)
c = pd.concat([d,b,b],ignore_index=True)
# x_train2 = c.iloc[:,0].values
# y_train2 = c.iloc[:,1].values

(178, 2)
(453, 2)


In [57]:
np.save("saved_csv/x_test.npy", x_test)
np.save("saved_csv/y_test.npy", y_test)

c.to_csv("saved_csv/c.csv")

In [61]:
# Tuning hyperparameter with Hyperas
# Code source: https://github.com/maxpumperla/hyperas

# for RNN

def data():
    """
    Data providing function:

    This function is separated from create_model() so that hyperopt
    won't reload data for each evaluation run.
    """
    
    table = pd.read_csv("saved_csv/c.csv")
    table.drop(columns = "Unnamed: 0",inplace=True)
    
    x_train = table.iloc[:,0].values
    y_train = table.iloc[:,1].values
    
    x_train,x_validation,y_train,y_validation = train_test_split(x_train,y_train,test_size=0.2,stratify=y_train)
    
    x_test = np.load("saved_csv/x_test.npy")
    y_test = np.load("saved_csv/y_test.npy")
    
    docs = x_train

    vocab_size = 2000
    tokenizer = Tokenizer(num_words= vocab_size)
    tokenizer.fit_on_texts(x_train)

    sequences = tokenizer.texts_to_sequences(x_train)
    x_train = pad_sequences(sequences, maxlen=50)
    
    sequences = tokenizer.texts_to_sequences(x_validation)
    x_validation = pad_sequences(sequences, maxlen=50)
    
    sequences = tokenizer.texts_to_sequences(x_test)
    x_test = pad_sequences(sequences, maxlen=50)
    
    return x_train, y_train, x_validation, y_validation, x_test, y_test


def create_model(x_train, y_train, x_validation, y_validation, x_test, y_test):
    """
    Model providing function:

    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.
    """
    model = Sequential()

    model.add(Embedding(vocab_size, {{choice([32,128])}}, input_length = 50))
    model.add(LSTM({{choice([32,128])}}, return_sequences=True, dropout={{uniform(0,0.1)}}))
    model.add(BatchNormalization())

    model.add(LSTM({{choice([32,64,96,128])}}))
    model.add(BatchNormalization())

    model.add(Dense({{choice([16,32])}}, activation='elu'))
    model.add(Dropout({{uniform(0,1)}}))

    model.add(Dense(2, activation="softmax"))

    # setting up SGD (optimizer) hyperparameters
    sgd = SGD(lr={{uniform(0,0.1)}},decay=0.0, momentum = 0.0, nesterov=False, clipnorm=2.0)

    # compile model
    model.compile(loss="sparse_categorical_crossentropy", optimizer = sgd, metrics = ["accuracy"])

    es = EarlyStopping(monitor='val_loss', mode='min', min_delta=0.0001, patience=12, verbose=1)

    result = model.fit(x_train,y_train, batch_size = 64, epochs = 5, callbacks = [es], validation_data=[x_validation,y_validation])

    validation_acc = np.amax(result.history['val_acc']) 
    print('Best validation acc of epoch:', validation_acc)
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}


if __name__ == '__main__':
    best_run, best_model = optim.minimize(model=create_model,data=data,algo=tpe.suggest,max_evals=5,trials=Trials(),
                                          notebook_name='Capstone modelling stage - RNN test')
    x_train, y_train, x_validation, y_validation, x_test, y_test = data()
#     print("Evaluation of best performing model:")
#     print(best_model.evaluate(X_test, Y_test))
    print("Best performing model chosen hyper-parameters:")
    print(best_run)

>>> Imports:
#coding=utf-8

from __future__ import print_function

try:
    from hyperopt import Trials, STATUS_OK, tpe
except:
    pass

try:
    from tensorflow.python.keras.layers.core import Dense, Dropout, Activation, Flatten
except:
    pass

try:
    from tensorflow.python.keras.models import Sequential
except:
    pass

try:
    from tensorflow.python.keras.utils import np_utils
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform
except:
    pass

try:
    import tensorflow as tf
except:
    pass

try:
    from tensorflow.keras.layers import LSTM, BatchNormalization
except:
    pass

try:
    from tensorflow.keras.optimizers import SGD, Adam, RMSprop
except:
    pass

try:
    from tensorflow.keras.callbacks import EarlyStopping
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import random
except:
    pass

try:
    from matplotli

 - ETA: 1s - loss: 1.1117 - acc: 0.4809            
                                                   
 - ETA: 0s - loss: 1.1329 - acc: 0.4609            
                                                   
 - 16s 25ms/step - loss: 1.1267 - acc: 0.4621 - val_loss: 0.6911 - val_acc: 0.5432

Epoch 2/5                                          
 64/647 [=>............................]           
 - ETA: 2s - loss: 0.8208 - acc: 0.5625            
                                                   
128/647 [====>.........................]           
 - ETA: 2s - loss: 0.8676 - acc: 0.5469            
                                                   
 - ETA: 2s - loss: 0.8551 - acc: 0.5521            
                                                   
 - ETA: 1s - loss: 0.8544 - acc: 0.5469            
                                                   
 - ETA: 1s - loss: 0.8432 - acc: 0.5437            
                                                   
 - ETA: 1s - loss: 0.8676 - acc:

 - ETA: 26s - loss: 1.6502 - acc: 0.4427                                     
                                                                            
 - ETA: 17s - loss: 1.5416 - acc: 0.4375                                     
                                                                            
 - ETA: 11s - loss: 1.4803 - acc: 0.4437                                     
                                                                            
 - ETA: 8s - loss: 1.4916 - acc: 0.4479                                      
                                                                             
 - ETA: 5s - loss: 1.4275 - acc: 0.4598                                      
                                                                             
 - ETA: 3s - loss: 1.4226 - acc: 0.4590                                      
                                                                             
 - ETA: 1s - loss: 1.3820 - acc: 0.4635                            

 - ETA: 1s - loss: 0.8219 - acc: 0.5344                                      
                                                                             
 - ETA: 1s - loss: 0.8080 - acc: 0.5286                                      
                                                                             
 - ETA: 0s - loss: 0.8140 - acc: 0.5179                                      
                                                                             
 - ETA: 0s - loss: 0.8071 - acc: 0.5117                                      
                                                                             
 - ETA: 0s - loss: 0.8109 - acc: 0.5017                                      
                                                                             
 - ETA: 0s - loss: 0.8234 - acc: 0.4813                                      
                                                                             
 - 3s 5ms/step - loss: 0.8226 - acc: 0.4791 - val_loss: 0.6882 -

                                                                             
 - ETA: 1s - loss: 0.7958 - acc: 0.5182                                      
                                                                             
 - ETA: 0s - loss: 0.7790 - acc: 0.5379                                      
                                                                             
 - ETA: 0s - loss: 0.7891 - acc: 0.5293                                      
                                                                             
 - ETA: 0s - loss: 0.7775 - acc: 0.5434                                      
                                                                             
 - ETA: 0s - loss: 0.7690 - acc: 0.5469                                      
                                                                             
 - 4s 6ms/step - loss: 0.7720 - acc: 0.5425 - val_loss: 0.6762 - val_acc: 0.5617

Epoch 3/5                                                   

 - ETA: 1s - loss: 0.6404 - acc: 0.6473                                      
                                                                             
 - ETA: 0s - loss: 0.6448 - acc: 0.6543                                      
                                                                             
 - ETA: 0s - loss: 0.6457 - acc: 0.6458                                      
                                                                             
 - ETA: 0s - loss: 0.6436 - acc: 0.6453                                      
                                                                             
 - 4s 6ms/step - loss: 0.6424 - acc: 0.6461 - val_loss: 0.6775 - val_acc: 0.5556

Best validation acc of epoch:                                                
0.5617284009486069                                                           
Train on 647 samples, validate on 162 samples                                
Epoch 1/5                                                   

 - ETA: 0s - loss: 0.8199 - acc: 0.4844                                      
                                                                             
 - ETA: 0s - loss: 0.8197 - acc: 0.4844                                      
                                                                             
 - ETA: 0s - loss: 0.8181 - acc: 0.4875                                      
                                                                             
 - 2s 4ms/step - loss: 0.8188 - acc: 0.4869 - val_loss: 0.6928 - val_acc: 0.4877

Epoch 4/5                                                                    
 64/647 [=>............................]                                     
 - ETA: 1s - loss: 0.7578 - acc: 0.6094                                      
                                                                             
128/647 [====>.........................]                                     
 - ETA: 1s - loss: 0.7755 - acc: 0.5391                     

                                                                             
 - ETA: 1s - loss: 0.9834 - acc: 0.5104                                      
                                                                             
 - ETA: 0s - loss: 0.9732 - acc: 0.5031                                      
                                                                             
 - 17s 26ms/step - loss: 0.9717 - acc: 0.5023 - val_loss: 0.6746 - val_acc: 0.6111

Epoch 2/5                                                                    
 64/647 [=>............................]                                     
 - ETA: 2s - loss: 1.1788 - acc: 0.5469                                      
                                                                             
128/647 [====>.........................]                                     
 - ETA: 2s - loss: 0.9829 - acc: 0.5312                                      
                                                          

 - ETA: 0s - loss: 0.6841 - acc: 0.5891                                      
                                                                             
 - 3s 5ms/step - loss: 0.6844 - acc: 0.5889 - val_loss: 0.7016 - val_acc: 0.4074

Epoch 5/5                                                                    
 64/647 [=>............................]                                     
 - ETA: 2s - loss: 0.7558 - acc: 0.5781                                      
                                                                             
128/647 [====>.........................]                                     
 - ETA: 2s - loss: 0.7205 - acc: 0.5547                                      
                                                                             
 - ETA: 2s - loss: 0.6772 - acc: 0.6094                                      
                                                                             
 - ETA: 1s - loss: 0.7047 - acc: 0.5820                     

In [121]:
x_train = c.iloc[:,0].values
y_train = c.iloc[:,1].values

x_train,x_validation,y_train,y_validation = train_test_split(x_train,y_train,test_size=0.2,stratify=y_train)

docs = x_train

vocab_size = 2000
tokenizer = Tokenizer(num_words= vocab_size)
tokenizer.fit_on_texts(x_train)

sequences = tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(sequences, maxlen=50)

sequences = tokenizer.texts_to_sequences(x_validation)
x_validation = pad_sequences(sequences, maxlen=50)

model = Sequential()

model.add(Embedding(vocab_size, 32, input_length = 50))
model.add(LSTM(128, return_sequences=True, dropout=0.05838315653286106))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(BatchNormalization())

model.add(Dense(16, activation='elu'))
model.add(Dropout(0.08713141896816126))

model.add(Dense(2, activation="softmax"))

# setting up SGD (optimizer) hyperparameters
sgd = SGD(lr=0.03323327852409652,decay=0.0, momentum = 0.0, nesterov=False, clipnorm=2.0)

# compile model
model.compile(loss="sparse_categorical_crossentropy", optimizer = sgd, metrics = ["accuracy"])

es = EarlyStopping(monitor='val_loss', mode='min', min_delta=0.0001, patience=12, verbose=1)

model.fit(x_train,y_train, batch_size = 64, epochs = 5, callbacks = [es], validation_data=[x_validation,y_validation])

Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x1c2a2b6a90>>
Traceback (most recent call last):
  File "/Users/nattiechan/anaconda3/envs/myenv/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1391, in __del__
    def __del__(self):
KeyboardInterrupt: 


Train on 647 samples, validate on 162 samples
Epoch 1/5


KeyboardInterrupt: 

In [66]:
x_test

array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,  10,   7,   9],
       [308,  20,   9, ...,  23,  77, 584],
       ...,
       [  0,   0,   0, ..., 288,  17,  61],
       [  0,   0,   0, ...,  38,  45, 592],
       [  0,   0,   0, ...,   4, 545,   7]], dtype=int32)

In [68]:
# sequences = tokenizer.texts_to_sequences(x_test)
# x_test = pad_sequences(sequences, maxlen=50)

y_pred = model.predict(x_test)

my_list = []
for num in range(len(y_pred)):
    if y_pred[num][0] > 0.5:
        my_list.append(0)
    else:
        my_list.append(1)

f1_score(y_test.astype(int),my_list)

0.0

In [122]:
from imblearn.over_sampling import SMOTE
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding

x_train = c.iloc[:,0].values
y_train = c.iloc[:,1].values

x_train,x_validation,y_train,y_validation = train_test_split(x_train,y_train,test_size=0.2,stratify=y_train)

docs = x_train

vocab_size = 2000
tokenizer = Tokenizer(num_words= vocab_size)
tokenizer.fit_on_texts(x_train)

sequences = tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(sequences, maxlen=50)

sequences = tokenizer.texts_to_sequences(x_validation)
x_validation = pad_sequences(sequences, maxlen=50)

sequences = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(sequences, maxlen=50)

model = Sequential()

model.add(Embedding(vocab_size, 128, input_length = 50))
model.add(LSTM(128, return_sequences=True, dropout=0.2))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(BatchNormalization())

model.add(Dense(32, activation='elu'))
model.add(Dropout(0.5))

model.add(Dense(2, activation="softmax"))

# setting up SGD (optimizer) hyperparameters
sgd = SGD(lr=0.04,decay=0.0, momentum = 0.0, nesterov=False, clipnorm=2.0)

# compile model
model.compile(loss="sparse_categorical_crossentropy", optimizer = sgd, metrics = ["accuracy"])

es = EarlyStopping(monitor='val_loss', mode='min', min_delta=0.0001, patience=12, verbose=1)

result = model.fit(x_train,y_train, batch_size = 64, epochs = 5, callbacks = [es], validation_data=[x_validation,y_validation])

Train on 647 samples, validate on 162 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [123]:
y_pred = model.predict(x_test)

my_list = []
for num in range(len(y_pred)):
    if y_pred[num][0] > 0.5:
        my_list.append(0)
    else:
        my_list.append(1)
    
f1_score(y_test.astype(int),my_list)

0.43564356435643564

In [127]:
# input an response and see if the model predicts correctly
response = input("Briefly describe what you think the tech industry as a whole and/or \
employers could do to improve mental health support for employees.")

print("Processing...")

sequences = tokenizer.texts_to_sequences(response)
test = pad_sequences(sequences, maxlen=50)
y_pred = model.predict(test)

print("Almost there...")
y_pred

# # printing the result
# if y_pred[0][0] > 0.5:
#     print("The model predicts you are hesitant with discussing MH issue with your coworkers.")
# else:
#     print("The model predicts you to have comfortable with discussing MH issue with your coworkers.")

Briefly describe what you think the tech industry as a whole and/or employers could do to improve mental health support for employees.Shorter hours worklife balance dogs allowed in office
Processing...
Almost there...


array([[0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47668   , 0.5233199 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47945943, 0.5205405 ],
       [0.47532666, 0.5246733 ],
       [0.47668   , 0.5233199 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47902042, 0.5209795 ],
       [0.47532666, 0.5246733 ],
       [0.47902042, 0.5209795 ],
       [0.47532666, 0.5246733 ],
       [0.47532666, 0.5246733 ],
       [0.47668   , 0.5233199 ],
       [0.