## Capstone Project
-------

### Stage 2 - Modelling phase - Neural Networks
------

#### Importing packages and data
------

Possible packages that need to be installed:

1. Hyperas

<code> conda install -c jaikumarm hyperas </code>

The packages listed below were already used in the previous notebook. If a separate virtual environment is used for the neural network work, however, they must also be installed in that environment for this notebook to run properly.

3. SpaCy

<code> conda install -c spacy spacy </code>

4. 'en_core_web_md' - library used in SpaCy

<code> python -m spacy download en_core_web_md </code>

5. wordcloud

<code> conda install -c conda-forge wordcloud </code>

------

In [16]:
# import packages

# Hyperas/TensorFlow
# the __future__ import command must be in the beginning of the notebook
from __future__ import print_function

from hyperopt import Trials, STATUS_OK, tpe
from tensorflow.python.keras.layers.core import Dense, Dropout, Activation, Flatten
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.utils import np_utils

from hyperas import optim
from hyperas.distributions import choice, uniform

import tensorflow as tf
from tensorflow.keras.layers import LSTM, BatchNormalization
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping

# Basics
import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np

# Filter warnings
import warnings
warnings.filterwarnings("ignore")

# Preprocessing; model selection and evaluation
from sklearn import pipeline, preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV

# text handling
from sklearn.feature_extraction.text import TfidfVectorizer

# for custom countvectorizer with SpaCy lemmatization
import spacy
from sklearn.feature_extraction.text import CountVectorizer, VectorizerMixin
from sklearn.base import TransformerMixin, BaseEstimator
from scipy.sparse import csr_matrix

# WordCloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

------
If we need to switch to a separate virtual environment in order to use TensorFlow, the following will need to be installed in it:

1. spacy

<code> conda install -c spacy spacy </code>

2. 'en_core_web_md'

<code> python -m spacy download en_core_web_md </code>

3. wordcloud

<code> conda install -c conda-forge wordcloud </code>

------

In [17]:
# load the survey data from disk and discard the "Unnamed: 0" column
# (presumably the index written out by an earlier to_csv call - confirm)
data = pd.read_csv("saved_csv/df.csv").drop(columns="Unnamed: 0")

# work on a copy so that the frame as loaded stays available
df = data.copy()

### Creating a model to predict comfort level using text responses
------

In [18]:
# Independent variable: the text responses, taken from the ninth column from the end
corpus = df.iloc[:, -9]

# Dependent variable: the answer to the coworker-comfort question
question = "Would you feel comfortable discussing a mental health issue with your coworkers?"

answers = ["Maybe", "No", "Not Applicable", "Yes"]

dep = df[question].copy()

# Binarise the target: "Yes" becomes 1, every other listed answer becomes 0
for answer in answers:
    dep[dep == answer] = 1 if answer == "Yes" else 0

In [19]:
# Put the text responses (column 0) and the binary target (column 1) side by side
table = pd.concat([corpus, dep], axis=1)

# Drop the rows whose text response is the "Did not answer" placeholder
index = table.index[table.iloc[:, 0] == "Did not answer"]
table = table.drop(index=index)

In [20]:
# Start from a private copy of wordcloud's stop-word list
stopwords = set(STOPWORDS)

# Negations and modal verbs are taken out of the stop-word set so that they
# survive vectorisation (presumably because they change the meaning of a response).
words = ["aren't","can't","can","cannot","could","couldn't","did","didn't","doing","don't","hasn't","hadn't","shan't"]

# difference_update skips entries that are not in the set, whereas set.remove
# raises KeyError as soon as one of the words is missing from STOPWORDS.
stopwords.difference_update(words)

In [21]:
# Hold out 20% of the responses for testing.
# The seed is fixed so that Restart & Run All reproduces the same split; the
# vocabulary size and the SMOTE row count used further down depend on it.
x_train, x_test, y_train, y_test = train_test_split(table.iloc[:,0].values,
                                                    table.iloc[:,1].values,
                                                    test_size = 0.2,
                                                    random_state = 42)

In [22]:
# Text processing to prepare data for RNN

# Lemmatization using SpaCy
nlp = spacy.load('en_core_web_md')

# One string per training response: its lemmas joined by single spaces
sentences = [" ".join(token.lemma_ for token in nlp(text)) for text in x_train]

# Processing text with TfidfVectorizer
# stop_words is passed as a list: recent scikit-learn releases validate this
# parameter and reject a set. Sorting just makes the list order deterministic.
tf_model = TfidfVectorizer(stop_words=sorted(stopwords),ngram_range=(1,3), min_df=3)
tf_vectors = tf_model.fit_transform(sentences)
tf_vectors

<631x918 sparse matrix of type '<class 'numpy.float64'>'
	with 9205 stored elements in Compressed Sparse Row format>

In [23]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Oversample with SMOTE on the dense TF-IDF matrix; the seed makes the
# synthetic samples (and therefore the trained model) reproducible.
sm = SMOTE(n_jobs = 6, random_state = 42)

# NOTE(review): y_train is presumably an object-dtype array after the label
# replacement above (a later cell casts y_res with astype(int)); casting here
# hands scikit-learn plain integer class labels - confirm.
X_res,y_res = sm.fit_resample(tf_vectors.toarray(),y_train.astype(int))

In [24]:
# Lemmatise the held-out responses: one space-joined string of lemmas each
sentences = [" ".join(token.lemma_ for token in nlp(text)) for text in x_test]

# Vectorise with the already-fitted TF-IDF model (transform only, no refit)
x_test_vectors = tf_model.transform(sentences)
x_test_vectors

<158x918 sparse matrix of type '<class 'numpy.float64'>'
	with 2107 stored elements in Compressed Sparse Row format>

In [25]:
# Write to disk the two files that the Hyperas data() function loads
# NOTE(review): tf_vectors was fitted on x_train only, while table holds every
# answered row; data() splits the two together, so confirm that the row counts
# of the saved files agree.
np.save("saved_csv/tf_vectors.npy", tf_vectors.toarray())

table.to_csv("saved_csv/table.csv")

In [34]:
# Tuning hyperparameter with Hyperas
# Code source: https://github.com/maxpumperla/hyperas

# for RNN

def data():
    """
    Data providing function:

    This function is separated from create_model() so that hyperopt
    won't reload data for each evaluation run.

    Loads the saved TF-IDF matrix and the saved table, splits them 80/20 with
    a fixed seed, and reshapes the arrays for the LSTM layers.

    Returns
    -------
    x_train, y_train, x_test, y_test
        x_* have shape (samples, features, 1) and y_* have shape (samples, 1).
    """
    tf_vectors = np.load("saved_csv/tf_vectors.npy")

    table = pd.read_csv("saved_csv/table.csv")
    table.drop(columns = "Unnamed: 0",inplace=True)

    # NOTE(review): train_test_split requires tf_vectors and the label column
    # to have the same number of rows - confirm the two saved files are aligned.
    # The seed is fixed because data() is called again after the search to score
    # the best model; an unseeded second call would draw a different split, so
    # the "test" rows could include rows the model was tuned on.
    x_train, x_test, y_train, y_test = train_test_split(tf_vectors,table.iloc[:,1].values,test_size = 0.2,
                                                        random_state = 42)

    # Shapes are derived from the arrays instead of being hard-coded, so the
    # function keeps working when the row count or vocabulary size changes.
    x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
    y_train = y_train.reshape(-1, 1)
    x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
    y_test = y_test.reshape(-1, 1)

    return x_train, y_train, x_test, y_test


def create_model(x_train, y_train, x_test, y_test):
    """
    Model providing function:

    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.

    The network is two LSTM layers (each followed by batch normalisation), a
    Dense layer with dropout, and a two-unit output layer. It is trained with
    SGD and early stopping on the validation loss. x_test and y_test are
    accepted but not used inside this function.
    """
    model = Sequential()

    # First LSTM returns the full sequence so that the second LSTM receives a sequence as input.
    # NOTE(review): the dropout rates are drawn from uniform(0,1), so values close to 1
    # are possible - consider narrowing the range.
    model.add(LSTM({{choice([32,64,96,128])}},activation={{choice(["relu","elu"])}}, 
                   input_shape = (x_train.shape[1:]), return_sequences=True, dropout={{uniform(0,1)}}))
    model.add(BatchNormalization())

    # Second LSTM keeps the default return_sequences, i.e. one output vector per sample
    model.add(LSTM({{choice([32,64,96,128])}}, activation={{choice(["relu","elu"])}}, dropout={{uniform(0,1)}}))
    model.add(BatchNormalization())

    model.add(Dense({{choice([32,64,96,128])}}, activation={{choice(["relu","elu"])}}))
    model.add(Dropout({{uniform(0,1)}}))

    # Two output units, one per class; the output activation is part of the search space
    model.add(Dense(2, activation={{choice(["softmax","sigmoid"])}}))

    # setting up optimizer hyperparameters
    # only the learning rate is searched; the other SGD settings are fixed
    sgd = SGD(lr={{uniform(0,0.01)}},decay=0.0, momentum = 0.0, nesterov=False, clipnorm=2.0)

    # compile model
    model.compile(loss="sparse_categorical_crossentropy", optimizer = sgd, metrics = ["accuracy"])

    # stop training once val_loss has not improved for 2 consecutive epochs
    es = EarlyStopping(monitor='val_loss', mode="min", patience=2, verbose=1)

    # 20% of the training data is held out for validation; batch size and epoch count are searched
    result = model.fit(x_train,y_train, batch_size = {{choice([16, 32, 64])}}, epochs = {{choice([5, 10, 15])}}, 
                       callbacks = [es], validation_split=0.2)

    # NOTE(review): the history key is 'val_acc' in the training logs of this notebook;
    # other TensorFlow versions may name it 'val_accuracy' - confirm for the installed version.
    validation_acc = np.amax(result.history['val_acc']) 
    print('Best validation acc of epoch:', validation_acc)
    # the validation accuracy is negated because the returned loss is minimized
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}


if __name__ == '__main__':
    # Hyperparameter search: 5 evaluations of create_model suggested by TPE
    best_run, best_model = optim.minimize(
        model=create_model,
        data=data,
        algo=tpe.suggest,
        max_evals=5,
        trials=Trials(),
        notebook_name='Capstone modelling stage v.3-RNN'
    )

    # Load the data again and score the best model on the test split
    X_train, Y_train, X_test, Y_test = data()

    print("Evaluation of best performing model:")
    print(best_model.evaluate(X_test, Y_test))

    print("Best performing model chosen hyper-parameters:")
    print(best_run)

>>> Imports:
#coding=utf-8

from __future__ import print_function

try:
    from hyperopt import Trials, STATUS_OK, tpe
except:
    pass

try:
    from tensorflow.python.keras.layers.core import Dense, Dropout, Activation, Flatten
except:
    pass

try:
    from tensorflow.python.keras.models import Sequential
except:
    pass

try:
    from tensorflow.python.keras.utils import np_utils
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform
except:
    pass

try:
    import tensorflow as tf
except:
    pass

try:
    from tensorflow.keras.layers import LSTM, BatchNormalization
except:
    pass

try:
    from tensorflow.keras.layers import Conv1D
except:
    pass

try:
    from tensorflow.keras.layers import MaxPooling1D
except:
    pass

try:
    from tensorflow.keras.layers import GlobalAveragePooling1D
except:
    pass

try:
    from tensorflow.keras.optimizers import SGD, Adam, RMSprop
except:
    pass

try:


Epoch 3/10                                                                    
 64/504 [==>...........................]                                      
 - ETA: 22s - loss: 0.7667 - acc: 0.4062                                      
                                                                             
 - ETA: 19s - loss: 0.7230 - acc: 0.5156                                      
                                                                             
 - ETA: 16s - loss: 0.7116 - acc: 0.5417                                      
                                                                             
 - ETA: 13s - loss: 0.7076 - acc: 0.5039                                      
                                                                             
 - ETA: 10s - loss: 0.7182 - acc: 0.4844                                      
                                                                             
 - ETA: 6s - loss: 0.7116 - acc: 0.4922                  

 - ETA: 58s - loss: 0.7206 - acc: 0.6172                                      
                                                                             
 - ETA: 51s - loss: 0.7384 - acc: 0.6000                                      
                                                                             
 - ETA: 45s - loss: 0.7217 - acc: 0.6198                                      
                                                                             
 - ETA: 40s - loss: 0.7079 - acc: 0.6518                                      
                                                                             
 - ETA: 35s - loss: 0.7034 - acc: 0.6406                                      
                                                                             
 - ETA: 30s - loss: 0.7032 - acc: 0.6424                                      
                                                                             
 - ETA: 26s - loss: 0.7047 - acc: 0.6469                  

 - ETA: 27s - loss: 0.6396 - acc: 0.6910                                      
                                                                             
 - ETA: 23s - loss: 0.6285 - acc: 0.6937                                      
                                                                             
 - ETA: 18s - loss: 0.6218 - acc: 0.7017                                      
                                                                             
 - ETA: 14s - loss: 0.6254 - acc: 0.6901                                      
                                                                             
 - ETA: 10s - loss: 0.6250 - acc: 0.6899                                      
                                                                             
 - ETA: 6s - loss: 0.6204 - acc: 0.6964                                       
                                                                              
 - ETA: 2s - loss: 0.6185 - acc: 0.7000                  

                                                                             
 - ETA: 7s - loss: 0.6235 - acc: 0.7210                                       
                                                                              
 - ETA: 3s - loss: 0.6213 - acc: 0.7250                                       
                                                                              
 - 69s 136ms/step - loss: 0.6231 - acc: 0.7202 - val_loss: 0.6602 - val_acc: 0.6614

Epoch 10/15                                                                   
 32/504 [>.............................]                                      
 - ETA: 1:03 - loss: 0.5462 - acc: 0.8438                                     
                                                                              
 64/504 [==>...........................]                                      
 - ETA: 56s - loss: 0.5841 - acc: 0.7500                                      
                                               

 - ETA: 1:35 - loss: 0.5785 - acc: 0.7344                                     
                                                                              
 96/504 [====>.........................]                                      
 - ETA: 1:30 - loss: 0.5572 - acc: 0.7604                                     
                                                                              
 - ETA: 1:29 - loss: 0.5819 - acc: 0.7344                                     
                                                                              
 - ETA: 1:24 - loss: 0.5855 - acc: 0.7250                                     
                                                                              
 - ETA: 1:15 - loss: 0.6233 - acc: 0.7031                                     
                                                                              
 - ETA: 1:07 - loss: 0.6183 - acc: 0.7098                                     
                                                    

 96/504 [====>.........................]                                      
 - ETA: 1:27 - loss: 0.6878 - acc: 0.5104                                     
                                                                              
112/504 [=====>........................]                                      
 - ETA: 1:26 - loss: 0.6999 - acc: 0.5000                                     
                                                                              
 - ETA: 1:22 - loss: 0.6992 - acc: 0.4844                                     
                                                                              
 - ETA: 1:18 - loss: 0.6978 - acc: 0.4722                                     
                                                                              
 - ETA: 1:14 - loss: 0.6991 - acc: 0.4750                                     
                                                                              
 - ETA: 1:11 - loss: 0.6999 - acc: 0.4659           

                                                                              
 - ETA: 1:10 - loss: 0.7039 - acc: 0.4886                                     
                                                                              
 - ETA: 1:08 - loss: 0.7038 - acc: 0.4896                                     
                                                                              
 - ETA: 1:04 - loss: 0.7060 - acc: 0.4952                                     
                                                                              
 - ETA: 1:01 - loss: 0.7039 - acc: 0.5045                                     
                                                                              
 - ETA: 57s - loss: 0.7053 - acc: 0.4958                                      
                                                                             
 - ETA: 54s - loss: 0.7047 - acc: 0.4961                                      
                                                     

 - ETA: 58s - loss: 0.6927 - acc: 0.5583                                      
                                                                             
 - ETA: 55s - loss: 0.6933 - acc: 0.5469                                      
                                                                             
 - ETA: 51s - loss: 0.6937 - acc: 0.5404                                      
                                                                             
 - ETA: 48s - loss: 0.6933 - acc: 0.5347                                      
                                                                             
 - ETA: 45s - loss: 0.6933 - acc: 0.5329                                      
                                                                             
 - ETA: 41s - loss: 0.6940 - acc: 0.5312                                      
                                                                             
 - ETA: 38s - loss: 0.6890 - acc: 0.5476                  

 - ETA: 44s - loss: 0.6889 - acc: 0.5469                                      
                                                                             
 - ETA: 40s - loss: 0.6903 - acc: 0.5476                                      
                                                                             
 - ETA: 36s - loss: 0.6909 - acc: 0.5455                                      
                                                                             
 - ETA: 32s - loss: 0.6916 - acc: 0.5408                                      
                                                                             
 - ETA: 28s - loss: 0.6929 - acc: 0.5286                                      
                                                                             
 - ETA: 24s - loss: 0.6937 - acc: 0.5275                                      
                                                                             
 - ETA: 20s - loss: 0.6932 - acc: 0.5288                  

KeyboardInterrupt: 

In [35]:
# Using results from Hyperas to create the model

def RNN_model(X,y):
    """
    Build, compile and fit the stacked-LSTM classifier.

    Parameters
    ----------
    X : array-like
        Training inputs; the input shape of the first LSTM layer is taken
        from X.shape[1:].
    y : array-like
        Training targets for the two-unit softmax output (integer class ids,
        as used with sparse_categorical_crossentropy).

    Returns
    -------
    model
        The fitted Sequential model.
    result
        The object returned by model.fit.
    """
    model = Sequential()

    # First LSTM returns the full sequence so that the second LSTM receives a sequence
    model.add(LSTM(32,input_shape = (X.shape[1:]), return_sequences=True))
    model.add(BatchNormalization())

    model.add(LSTM(96))
    model.add(BatchNormalization())

    # NOTE(review): this Dense layer has no activation (linear) - confirm that is intended
    model.add(Dense(32))
    model.add(Dropout(0.5))

    model.add(Dense(2, activation="softmax"))

    # setting up optimizer hyperparameters
    sgd = SGD(lr=0.0001,decay=0.0, momentum = 0.0, nesterov=False, clipnorm=2.0)

    # compile model
    # sample_weight_mode='temporal' was dropped: it requests timestep-wise sample
    # weighting, but this model emits one prediction per sample and fit() is not
    # given any sample weights.
    model.compile(loss="sparse_categorical_crossentropy", optimizer = sgd,
                  metrics = ["accuracy"])

    # stop training once val_loss has not improved for 2 consecutive epochs
    es = EarlyStopping(monitor='val_loss', mode="min", patience=2, verbose=1)

    # 20% of the data passed in is held out for validation
    result = model.fit(X,y, batch_size = 16, epochs = 5, callbacks = [es], validation_split=0.2)

    return model, result

In [36]:
y_res = y_res.astype(int)

# Add a trailing axis of size 1 so the LSTM receives (samples, features, 1).
# The sizes come from the arrays themselves: the SMOTE row count and the
# vocabulary size both depend on the train/test split, so hard-coded numbers
# break as soon as the split changes.
X = X_res.reshape(X_res.shape[0], X_res.shape[1], 1)
y = y_res.reshape(-1, 1)

model, result = RNN_model(X,y)

Train on 724 samples, validate on 182 samples
Epoch 1/5

KeyboardInterrupt: 

In [15]:
# Densify the held-out TF-IDF matrix and add the trailing axis of size 1.
# The feature count is read from the matrix itself so that it always matches
# the fitted vocabulary (a hard-coded width fails when the vocabulary differs).
x_test = x_test_vectors.toarray()
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

y_pred = model.predict(x_test)

# Hard labels: 0 when the first output column exceeds 0.5, otherwise 1
my_list = np.where(y_pred[:, 0] > 0.5, 0, 1).tolist()

print(y_pred)
print(y_test)

# y_true = table.iloc[:,1].values.astype(int)

# TODO: y_pred holds probabilities; score the hard labels in my_list instead
#f1_score(y_test.astype(int),y_pred)

[[0.54049    0.45950994]
 [0.5404892  0.4595108 ]
 [0.54039884 0.45960122]
 [0.54061365 0.45938635]
 [0.54049    0.45950994]
 [0.54049003 0.45950997]
 [0.54049    0.45950994]
 [0.5404628  0.45953718]
 [0.5404889  0.45951107]
 [0.54043025 0.45956978]
 [0.5406051  0.4593949 ]
 [0.5406539  0.4593461 ]
 [0.5404859  0.45951408]
 [0.54049    0.45950994]
 [0.54049    0.45950994]
 [0.54049    0.45950994]
 [0.5404463  0.4595537 ]
 [0.540436   0.45956394]
 [0.54064447 0.45935556]
 [0.5404378  0.45956215]
 [0.54049003 0.45950997]
 [0.540616   0.459384  ]
 [0.54036945 0.45963058]
 [0.54049    0.45950994]
 [0.54046917 0.45953086]
 [0.54049    0.45950997]
 [0.54056305 0.45943698]
 [0.54049003 0.45950997]
 [0.54049    0.45950994]
 [0.54049    0.45950994]
 [0.54049    0.45950994]
 [0.54049    0.45950997]
 [0.54049003 0.45950997]
 [0.5404361  0.4595639 ]
 [0.5404723  0.45952764]
 [0.54049    0.45950994]
 [0.5405478  0.4594522 ]
 [0.54049    0.45950994]
 [0.540487   0.45951304]
 [0.5404426  0.45955747]


In [70]:
# ask the user for a response
response = input("Briefly describe what you think the tech industry as a whole and/or \
employers could do to improve mental health support for employees.")

print("Processing...")

# Text processing to prepare data for RNN
nlp = spacy.load('en_core_web_md')

# a one-element list holding the lemmatised response
sentences = [" ".join(token.lemma_ for token in nlp(response))]

print("Almost there...")

# Processing text with TfidfVectorizer
# Stored under its own name so that the training matrix tf_vectors is not overwritten.
response_vectors = tf_model.transform(sentences)

# predicting the result using the model
# -1 lets NumPy take the feature count from the fitted vocabulary instead of a hard-coded width
X_test = response_vectors.toarray().reshape(1, -1, 1)
y_pred = model.predict(X_test)

# printing the result
# the first output column above 0.5 is read as class 0 (not comfortable)
if y_pred[0][0] > 0.5:
    print("The model predicts you are hesitant with discussing MH issue with your coworkers.")
else:
    print("The model predicts you to have comfortable with discussing MH issue with your coworkers.")

Briefly describe what you think the tech industry as a whole and/or employers could do to improve mental health support for employees.they hate me
Processing...
Almost there...


array([[0.4998527 , 0.50014734]], dtype=float32)