## Capstone Project
-------

### Stage 2 - Modelling phase - Neural Networks
------

#### Importing packages and data
------

Possible packages that need to be installed:

1. Hyperas

<code> conda install -c jaikumarm hyperas </code>

2. mlxtend

<code> conda install -c conda-forge mlxtend </code>

These packages were already used in the previous notebook. If a separate virtual environment is used for the neural-network work, however, the following packages must also be installed in it for this notebook to run properly.

3. SpaCy

<code> conda install -c spacy spacy </code>

4. 'en_core_web_md' - the English language model loaded by SpaCy

<code> python -m spacy download en_core_web_md </code>

5. wordcloud

<code> conda install -c conda-forge wordcloud </code>

------

In [2]:
# import packages

# Hyperas/TensorFlow
# the __future__ import command must be in the beginning of the notebook
from __future__ import print_function

from hyperopt import Trials, STATUS_OK, tpe
from tensorflow.python.keras.layers.core import Dense, Dropout, Activation, Flatten
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.utils import np_utils

from hyperas import optim
from hyperas.distributions import choice, uniform

import tensorflow as tf
from tensorflow.keras.layers import LSTM, BatchNormalization
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping

# Basics
import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np

# Filter warnings
import warnings
warnings.filterwarnings("ignore")

# Preprocessing; model selection and evaluation
from sklearn import pipeline, preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV

# Modelling
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from keras.wrappers.scikit_learn import KerasClassifier
from xgboost import XGBClassifier

# text handling
from sklearn.feature_extraction.text import TfidfVectorizer

# for custom countvectorizer with SpaCy lemmatization
import spacy
from sklearn.feature_extraction.text import CountVectorizer, VectorizerMixin
from sklearn.base import TransformerMixin, BaseEstimator
from scipy.sparse import csr_matrix

# WordCloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

Using TensorFlow backend.


------
If we need to switch to a separate virtual environment in order to use TensorFlow, we will also need to install:

1. spacy

<code> conda install -c spacy spacy </code>

2. 'en_core_web_md'

<code> python -m spacy download en_core_web_md </code>

3. wordcloud

<code> conda install -c conda-forge wordcloud </code>

------

In [3]:
# import data
# Read the survey table from disk and discard the "Unnamed: 0" column
# (presumably the index that was written out together with the CSV).
data = pd.read_csv("saved_csv/df.csv").drop(columns="Unnamed: 0")

# Work on a copy so the frame as loaded stays available under `data`
df = data.copy()

### Creating a model to predict comfort level using text responses
------

In [5]:
# Independent variable: the text responses, taken from the ninth column
# counted from the end of the frame
corpus = df.iloc[:, -9]

# Dependent variable.
# (An earlier variant, since disabled, loaded pre-computed labels from
# "saved_csv/q1_dependent_alt.csv" and binarised them at a threshold of 2.)
question = "Would you feel comfortable discussing a mental health issue with your coworkers?"

answers = ["Maybe","No","Not Applicable","Yes"]

dep = df[question].copy()

# Binarise the answers in place: "Yes" becomes 1, every other listed answer
# becomes 0. Values that are not in `answers` are left as they are.
for answer in answers:
    dep[dep == answer] = 1 if answer == "Yes" else 0

In [6]:
# Put the text responses and the binary label side by side
table = pd.concat([corpus, dep], axis=1)

# Locate the rows (not columns) whose response is the "Did not answer"
# placeholder ...
did_not_answer = table.iloc[:, 0] == "Did not answer"
index = table.index[did_not_answer.values]

# ... and remove them from the table
table = table.drop(index=index)

In [30]:
# Start from wordcloud's built-in stop-word list ...
stopwords = set(STOPWORDS)

# ... and take the negations / modal verbs below back out of it, so that they
# are NOT filtered from the responses (presumably because they carry meaning
# for this task). set.remove raises KeyError if a word is not in the list.
words = [
    "aren't", "can't", "can", "cannot", "could", "couldn't", "did",
    "didn't", "doing", "don't", "hasn't", "hadn't", "shan't",
]
for word in words:
    stopwords.remove(word)

In [32]:
# Text processing to prepare data for RNN

# Lemmatization using SpaCy
nlp = spacy.load('en_core_web_md')

# nlp.pipe() streams all responses through the pipeline in batches instead of
# calling nlp() once per row; each response becomes one space-joined string of
# lemmas, in the same order as the rows of `table`.
responses = table.iloc[:, 0].tolist()
sentences = [" ".join(token.lemma_ for token in doc) for doc in nlp.pipe(responses)]

# Processing text with TfidfVectorizer (1- to 3-grams that occur in at least
# 3 responses). The stop words are passed as a list: scikit-learn documents
# `stop_words` as 'english', a list or None, and newer releases validate the
# parameter type, so a set is not accepted there.
# NOTE(review): the vectorizer is fitted on the full corpus before any
# train/test split, so the vocabulary and idf weights also see the test rows.
tf_model = TfidfVectorizer(stop_words=sorted(stopwords), ngram_range=(1,3), min_df=3)
tf_vectors = tf_model.fit_transform(sentences)
tf_vectors

<789x1130 sparse matrix of type '<class 'numpy.float64'>'
	with 11996 stored elements in Compressed Sparse Row format>

In [33]:
# Persist the inputs that the Hyperas data() functions read back from disk.

# TF-IDF features, converted from the sparse matrix to a dense array first
dense_tf_vectors = tf_vectors.toarray()
np.save("saved_csv/tf_vectors.npy", dense_tf_vectors, allow_pickle=True, fix_imports=True)

# Responses and labels; the index is written as well and comes back as the
# "Unnamed: 0" column when the file is read
table.to_csv("saved_csv/table.csv")

In [None]:
# Tuning hyperparameter with Hyperas
# Code source: https://github.com/maxpumperla/hyperas

# for RNN

def data():
    """
    Data providing function (RNN search):

    Loads the dense TF-IDF matrix and the response/label table saved to
    "saved_csv/", splits them 80/20 and adds a trailing axis of length 1 so
    the features have shape (samples, n_features, 1) and the labels
    (samples, 1).

    This function is separated from create_model() so that hyperopt
    won't reload data for each evaluation run.

    Output order: x_train, y_train, x_test, y_test.
    """
    tf_vectors = np.load("saved_csv/tf_vectors.npy")

    table = pd.read_csv("saved_csv/table.csv")
    table.drop(columns = "Unnamed: 0",inplace=True)

    # A fixed seed makes every call produce the same split. The __main__ block
    # calls data() a second time to score the best model; without a seed that
    # second split differs from the one used in training, so "test" rows would
    # overlap rows the model was fitted on.
    x_train, x_test, y_train, y_test = train_test_split(
        tf_vectors, table.iloc[:,1].values, test_size = 0.2, random_state = 42)

    # Shapes are taken from the arrays themselves instead of being hard-coded,
    # so the function keeps working when the corpus or vocabulary size changes.
    x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
    y_train = y_train.reshape(-1, 1)
    x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
    y_test = y_test.reshape(-1, 1)

    return x_train, y_train, x_test, y_test


def create_model(x_train, y_train, x_test, y_test):
    """
    Model providing function:

    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.

    Architecture searched here: two stacked LSTM layers (each followed by
    batch normalisation), one Dense + Dropout block and a 2-unit output layer,
    trained with SGD and early stopping on the validation loss.
    x_test / y_test are accepted but not used inside this function.
    """
    model = Sequential()

    # First LSTM keeps the full sequence (return_sequences=True) so that it can
    # feed the second LSTM.
    # NOTE(review): every dropout rate in this function is sampled from
    # uniform(0,1); rates close to 1 drop almost all units - consider a
    # narrower range.
    model.add(LSTM({{choice([32,64,96,128])}},activation={{choice(["relu","elu"])}}, 
                   input_shape = (x_train.shape[1:]), return_sequences=True, dropout={{uniform(0,1)}}))
    model.add(BatchNormalization())

    model.add(LSTM({{choice([32,64,96,128])}}, activation={{choice(["relu","elu"])}}, dropout={{uniform(0,1)}}))
    model.add(BatchNormalization())

    model.add(Dense({{choice([32,64,96,128])}}, activation={{choice(["relu","elu"])}}))
    model.add(Dropout({{uniform(0,1)}}))

    # Two output units, one per class; the output activation is itself searched
    model.add(Dense(2, activation={{choice(["softmax","sigmoid"])}}))

    # setting up optimizer hyperparameters
    # (only the learning rate is searched; gradients are clipped to norm 2.0)
    sgd = SGD(lr={{uniform(0,0.01)}},decay=0.0, momentum = 0.0, nesterov=False, clipnorm=2.0)

    # compile model
    # sparse_categorical_crossentropy takes integer class labels (0 / 1)
    model.compile(loss="sparse_categorical_crossentropy", optimizer = sgd, metrics = ["accuracy"])

    # Stop once the validation loss has not improved for 2 consecutive epochs
    es = EarlyStopping(monitor='val_loss', mode="min", patience=2, verbose=1)

    # The last 20% of the training rows are held out for validation
    result = model.fit(x_train,y_train, batch_size = {{choice([16, 32, 64])}}, epochs = {{choice([5, 10, 15])}}, 
                       callbacks = [es], validation_split=0.2)

    # NOTE(review): the history key is 'val_acc' in the Keras version used
    # here; newer Keras releases are believed to name it 'val_accuracy' - confirm.
    validation_acc = np.amax(result.history['val_acc']) 
    print('Best validation acc of epoch:', validation_acc)
    # hyperopt minimises 'loss', hence the negated accuracy
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}


if __name__ == '__main__':
    # Run 5 evaluations of the TPE search over the template in create_model()
    best_run, best_model = optim.minimize(
        model=create_model,
        data=data,
        algo=tpe.suggest,
        max_evals=5,
        trials=Trials(),
        notebook_name='Capstone modelling stage v.3-RNN',
    )

    # The test set used below comes from a fresh call to data(), which
    # performs its own train/test split
    X_train, Y_train, X_test, Y_test = data()

    print("Evaluation of best performing model:")
    print(best_model.evaluate(X_test, Y_test))

    print("Best performing model chosen hyper-parameters:")
    print(best_run)

>>> Imports:
#coding=utf-8

from __future__ import print_function

try:
    from hyperopt import Trials, STATUS_OK, tpe
except:
    pass

try:
    from tensorflow.python.keras.layers.core import Dense, Dropout, Activation, Flatten
except:
    pass

try:
    from tensorflow.python.keras.models import Sequential
except:
    pass

try:
    from tensorflow.python.keras.utils import np_utils
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform
except:
    pass

try:
    import tensorflow as tf
except:
    pass

try:
    from tensorflow.keras.layers import LSTM, BatchNormalization
except:
    pass

try:
    from tensorflow.keras.layers import Conv1D
except:
    pass

try:
    from tensorflow.keras.layers import MaxPooling1D
except:
    pass

try:
    from tensorflow.keras.layers import GlobalAveragePooling1D
except:
    pass

try:
    from tensorflow.keras.optimizers import SGD, Adam, RMSprop
except:
    pass

try:


                                                   
 - ETA: 3s - loss: 0.7678 - acc: 0.5670            
                                                   
 - 30s 60ms/step - loss: 0.7563 - acc: 0.5774 - val_loss: 0.6645 - val_acc: 0.6614

Epoch 3/5                                          
 64/504 [==>...........................]           
 - ETA: 25s - loss: 0.7887 - acc: 0.6094           
                                                  
 - ETA: 24s - loss: 0.7503 - acc: 0.5703           
                                                  
 - ETA: 21s - loss: 0.7680 - acc: 0.5938           
                                                  
 - ETA: 16s - loss: 0.7488 - acc: 0.5859           
                                                  
 - ETA: 11s - loss: 0.7895 - acc: 0.5844           
                                                  
 - ETA: 7s - loss: 0.8293 - acc: 0.5625            
                                                   
 - ETA: 3s - loss: 0.8242 - acc: 0.57

Epoch 3/10                                                                    
 64/504 [==>...........................]                                      
 - ETA: 22s - loss: 0.7667 - acc: 0.4062                                      
                                                                             
 - ETA: 19s - loss: 0.7230 - acc: 0.5156                                      
                                                                             
 - ETA: 16s - loss: 0.7116 - acc: 0.5417                                      
                                                                             
 - ETA: 13s - loss: 0.7076 - acc: 0.5039                                      
                                                                             
 - ETA: 10s - loss: 0.7182 - acc: 0.4844                                      
                                                                             
 - ETA: 6s - loss: 0.7116 - acc: 0.4922                  

 - ETA: 15s - loss: 0.6879 - acc: 0.5625                                      
                                                                             
 - ETA: 12s - loss: 0.6846 - acc: 0.5820                                      
                                                                             
 - ETA: 9s - loss: 0.6818 - acc: 0.6094                                       
                                                                              
 - ETA: 6s - loss: 0.6809 - acc: 0.5964                                       
                                                                              
 - ETA: 2s - loss: 0.6901 - acc: 0.6004                                       
                                                                              
 - 28s 55ms/step - loss: 0.6890 - acc: 0.5913 - val_loss: 0.6809 - val_acc: 0.6614

Epoch 8/10                                                                    
 64/504 [==>...........................]         

 - ETA: 58s - loss: 0.7206 - acc: 0.6172                                      
                                                                             
 - ETA: 51s - loss: 0.7384 - acc: 0.6000                                      
                                                                             
 - ETA: 45s - loss: 0.7217 - acc: 0.6198                                      
                                                                             
 - ETA: 40s - loss: 0.7079 - acc: 0.6518                                      
                                                                             
 - ETA: 35s - loss: 0.7034 - acc: 0.6406                                      
                                                                             
 - ETA: 30s - loss: 0.7032 - acc: 0.6424                                      
 40%|████      | 2/5 [08:14<10:00, 200.22s/it, best loss: -0.6614173331598597]

In [57]:
# Using results from Hyperas to create the model

def RNN_model(X,y):
    """Build, compile and train the LSTM classifier with fixed hyperparameters.

    The layer sizes, dropout rates and learning rate are hard-coded values
    (taken from the Hyperas search, per the comment above this cell).

    Parameters
    ----------
    X : array whose shape after the first axis is used as the LSTM input shape
    y : integer class labels, one per sample of X

    Returns
    -------
    (model, result) : the fitted Keras model and the History object of fit()
    """
    layers = [
        # first LSTM keeps the whole sequence so it can feed the second one
        LSTM(32, activation="elu", input_shape=(X.shape[1:]), return_sequences=True,
             dropout=0.3207527760045966),
        BatchNormalization(),
        LSTM(96, activation="elu", dropout=0.7342146978592597),
        BatchNormalization(),
        Dense(32, activation='elu'),
        Dropout(0.692539034315719),
        # one output unit per class
        Dense(2, activation="softmax"),
    ]
    model = Sequential(layers)

    # SGD (optimizer) hyperparameters; gradients are clipped to norm 2.0
    sgd = SGD(lr=0.004371162594318422, decay=0.0, momentum=0.0, nesterov=False, clipnorm=2.0)
    model.compile(loss="sparse_categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])

    # stop when the validation loss has not improved for 2 epochs in a row
    es = EarlyStopping(monitor='val_loss', mode='min', patience=2, verbose=1)

    # the last 20% of the samples are held out for validation
    result = model.fit(X, y, batch_size=64, epochs=5, callbacks=[es], validation_split=0.2)

    return model, result

In [58]:
# Dense TF-IDF features and labels for the whole corpus.
# NOTE: `tf_vectors` must be the full training matrix produced by the
# TfidfVectorizer cell when this cell runs.
X = tf_vectors.toarray()
y = table.iloc[:,1].values

# Add a trailing axis of length 1: X -> (samples, n_features, 1), y -> (samples, 1).
# The sizes come from the data rather than being hard-coded, so the cell keeps
# working if the number of responses or the vocabulary size changes.
X = X.reshape(X.shape[0], X.shape[1], 1)
y = y.reshape(-1, 1)

model, result = RNN_model(X,y)

Train on 631 samples, validate on 158 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 00003: early stopping


In [70]:
# Ask the user for a response to the survey question
response = input("Briefly describe what you think the tech industry as a whole and/or \
employers could do to improve mental health support for employees.")

print("Processing...")

# Text processing to prepare data for RNN: lemmatise the response the same way
# as the training corpus. `nlp` is the spaCy pipeline already loaded in the
# lemmatization cell, so the model is not loaded from disk a second time.
response_doc = nlp(response)
response_sentences = [" ".join(token.lemma_ for token in response_doc)]

print("Almost there...")

# Processing text with the already-fitted TfidfVectorizer.
# The result is deliberately NOT stored in `tf_vectors`: that name holds the
# training matrix, which later cells split again for the other models.
response_vectors = tf_model.transform(response_sentences)

# predicting the result using the model; the input is shaped
# (1, n_features, 1), with the feature count taken from the vectorizer output
response_features = response_vectors.toarray().reshape(1, -1, 1)
y_pred = model.predict(response_features)

# printing the result
# y_pred[0] holds one score per class; class 0 stands for every answer other
# than "Yes", class 1 for "Yes"
if y_pred[0][0] > 0.5:
    print("The model predicts you are hesitant with discussing MH issue with your coworkers.")
else:
    print("The model predicts you to have comfortable with discussing MH issue with your coworkers.")

Briefly describe what you think the tech industry as a whole and/or employers could do to improve mental health support for employees.they hate me
Processing...
Almost there...


array([[0.4998527 , 0.50014734]], dtype=float32)

In [None]:
# for CNN
def data():
    """
    Data providing function (CNN search):

    Loads the dense TF-IDF matrix and the response/label table saved to
    "saved_csv/", splits them 80/20 and adds a trailing axis of length 1 so
    the features have shape (samples, n_features, 1) and the labels
    (samples, 1).

    This function is separated from create_model() so that hyperopt
    won't reload data for each evaluation run.

    Output order: x_train, y_train, x_test, y_test.
    """
    tf_vectors = np.load("saved_csv/tf_vectors.npy")

    table = pd.read_csv("saved_csv/table.csv")
    table.drop(columns = "Unnamed: 0",inplace=True)

    # A fixed seed makes every call produce the same split. The __main__ block
    # calls data() a second time to score the best model; without a seed that
    # second split differs from the one used in training, so "test" rows would
    # overlap rows the model was fitted on.
    x_train, x_test, y_train, y_test = train_test_split(
        tf_vectors, table.iloc[:,1].values, test_size = 0.2, random_state = 42)

    # Shapes are taken from the arrays themselves. The previous hard-coded
    # feature count (1127) did not match the matrix saved earlier in this
    # notebook (1130 columns), which makes reshape fail.
    x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
    y_train = y_train.reshape(-1, 1)
    x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
    y_test = y_test.reshape(-1, 1)

    return x_train, y_train, x_test, y_test


def create_model(x_train, y_train, x_test, y_test):
    """
    Model providing function:

    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.

    Architecture searched here: two Conv1D layers, max pooling, two more
    Conv1D layers, global average pooling, one Dense + Dropout block and a
    2-unit softmax output, trained with SGD and early stopping on the
    validation loss. x_test / y_test are accepted but not used inside this
    function.
    """
    model = Sequential()

    # For every Conv1D the number of filters, the kernel size and the
    # activation are all part of the search space
    model.add(Conv1D({{choice([32,64,96,128])}},{{choice([5,10,15,20])}},activation={{choice(["relu","elu"])}}, 
                   input_shape = (x_train.shape[1:])))
    model.add(Conv1D({{choice([32,64,96,128])}},{{choice([5,10,15,20])}},activation={{choice(["relu","elu"])}}))
    model.add(MaxPooling1D({{choice([1,2,3,4,5,6])}}))

    model.add(Conv1D({{choice([32,64,96,128])}},{{choice([5,10,15,20])}},activation={{choice(["relu","elu"])}}))
    model.add(Conv1D({{choice([32,64,96,128])}},{{choice([5,10,15,20])}},activation={{choice(["relu","elu"])}}))
    model.add(GlobalAveragePooling1D())
              
    # NOTE(review): global average pooling should already yield one vector per
    # sample, so this Flatten is presumably a no-op - confirm.
    model.add(Flatten())
    model.add(Dense({{choice([32,64,96,128])}},activation={{choice(["relu","elu"])}}))
    # NOTE(review): the dropout rate is sampled from uniform(0,1); rates close
    # to 1 drop almost all units - consider a narrower range.
    model.add(Dropout({{uniform(0,1)}}))
    # Two output units, one per class
    model.add(Dense(2, activation='softmax'))
    
    # Only the learning rate is searched; gradients are clipped to norm 2.0
    sgd = SGD(lr={{uniform(0,0.01)}},decay=0.0, momentum = 0.0, nesterov=False, clipnorm=2.0)
              
    # sparse_categorical_crossentropy takes integer class labels (0 / 1)
    model.compile(loss="sparse_categorical_crossentropy",optimizer=sgd, metrics=['accuracy'])

    # Stop once the validation loss has not improved by at least 0.0001 for
    # 2 consecutive epochs
    early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001, mode='min', patience=2, verbose=1)

    # The last 20% of the training rows are held out for validation
    result = model.fit(x_train,y_train, batch_size = {{choice([16, 32, 64])}}, epochs = {{choice([5, 10, 15])}}, 
                       validation_split = 0.2, callbacks=[early_stop])

    # NOTE(review): the history key is 'val_acc' in the Keras version used
    # here; newer Keras releases are believed to name it 'val_accuracy' - confirm.
    validation_acc = np.amax(result.history['val_acc']) 
    print('Best validation acc of epoch:', validation_acc)
    # hyperopt minimises 'loss', hence the negated accuracy
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}


if __name__ == '__main__':
    # Run 5 evaluations of the TPE search over the template in create_model().
    # notebook_name presumably has to match this notebook's file name, which
    # would explain the "-RNN" suffix in the CNN search as well - confirm.
    best_run, best_model = optim.minimize(
        model=create_model,
        data=data,
        algo=tpe.suggest,
        max_evals=5,
        trials=Trials(),
        notebook_name='Capstone modelling stage v.3-RNN',
    )

    # The test set used below comes from a fresh call to data(), which
    # performs its own train/test split
    X_train, Y_train, X_test, Y_test = data()

    print("Evaluation of best performing model:")
    print(best_model.evaluate(X_test, Y_test))

    print("Best performing model chosen hyper-parameters:")
    print(best_run)

In [None]:
# Using results from Hyperas to create the model
def CNN_model(x_train,y_train):
    """Build, compile and train the 1-D CNN classifier with fixed hyperparameters.

    Parameters
    ----------
    x_train : array whose shape after the first axis is used as the Conv1D
        input shape
    y_train : integer class labels (0 / 1), one per sample of x_train

    Returns
    -------
    (model, results) : the fitted Keras model and the History object of fit()
    """
    # The local model uses its own name so it does not shadow this function
    model = Sequential()

    model.add(Conv1D(96,10,activation="relu",input_shape = (x_train.shape[1:])))
    model.add(Conv1D(64,20,activation="relu"))
    # A pool size of 0 is not a valid pooling window. Hyperas reports `choice`
    # results as the *index* of the chosen option, and index 0 of the searched
    # pool sizes [1,2,3,4,5,6] is a pool size of 1.
    model.add(MaxPooling1D(1))

    model.add(Conv1D(64,20,activation="elu"))
    model.add(Conv1D(64,10,activation="relu"))
    model.add(GlobalAveragePooling1D())

    model.add(Flatten())
    model.add(Dense(96,activation="elu"))
    # NOTE(review): a dropout rate of ~0.99 removes nearly every activation of
    # the layer above during training - verify that this value is intended.
    model.add(Dropout(0.9912013870496312))
    # Two output units, one per class, matching the searched template and the
    # binary 0/1 labels (the previous 10 units had no corresponding classes)
    model.add(Dense(2, activation='softmax'))

    sgd = SGD(lr=0.00026079803111884515,decay=0.0, momentum = 0.0, nesterov=False, clipnorm=2.0)

    model.compile(loss="sparse_categorical_crossentropy",optimizer=sgd, metrics=['accuracy'])

    # Stop once the validation loss has not improved by at least 0.0001 for
    # 2 consecutive epochs
    early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001, mode='min', patience=2, verbose=1)

    # The last 20% of the training rows are held out for validation
    results = model.fit(x_train,y_train, batch_size=32, epochs=10, validation_split = 0.2, callbacks=[early_stop])

    return model, results

In [None]:
# 80/20 split of the sparse TF-IDF matrix and the labels.
# NOTE: `tf_vectors` must be the full training matrix produced by the
# TfidfVectorizer cell when this cell runs.
x_train, x_test, y_train, y_test = train_test_split(tf_vectors,table.iloc[:,1].values,test_size = 0.2)

# Densify, then add a trailing axis of length 1 for Conv1D:
# (samples, n_features) -> (samples, n_features, 1). The sizes come from the
# data; the previous hard-coded feature count (1127) did not match the
# 1130-column matrix built earlier in this notebook.
x_train = x_train.toarray()
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
y_train = y_train.reshape(-1, 1)
x_test = x_test.toarray()
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
y_test = y_test.reshape(-1, 1)

# NOTE(review): this rebinds the name CNN_model from the builder function to
# the fitted model, so the cell cannot be run twice without re-running the
# cell that defines the function. The name is kept because the evaluation
# cell reads the fitted model from it.
CNN_model, results = CNN_model(x_train,y_train)

In [None]:
# checking test accuracy
# evaluate() gives back the loss followed by the compiled metrics (accuracy);
# only the accuracy is displayed
test_loss, test_acc = CNN_model.evaluate(x_test, y_test, verbose=0)
test_acc

In [None]:
def NN_model():
    """Build and compile (but do not train) a small dense classifier.

    The network takes 3 input features - presumably one prediction per base
    classifier of the stacking ensemble that uses it as meta classifier -
    and has one softmax output unit per class.

    Returns
    -------
    The compiled Keras model.
    """
    model = Sequential([
        Dense(64, activation="relu", input_shape=(3,)),
        Dropout(0.2),
        BatchNormalization(),

        Dense(64, activation='relu'),
        Dropout(0.2),
        BatchNormalization(),

        Dense(32, activation='relu'),
        Dropout(0.2),

        Dense(2, activation="softmax"),
    ])

    # SGD (optimizer) hyperparameters; gradients are clipped to norm 2.0
    sgd = SGD(lr=0.0001, decay=0.0, momentum=0.0, nesterov=False, clipnorm=2.0)

    # integer class labels are expected by this loss
    model.compile(loss="sparse_categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])

    return model

In [None]:
# NOTE(review): test_size = 0.8 keeps only 20% of the rows for training, the
# reverse of the 0.2 used elsewhere in this notebook - confirm it is intended.
# NOTE: `tf_vectors` must be the full training matrix produced by the
# TfidfVectorizer cell when this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    tf_vectors.toarray(), table.iloc[:,1].values, test_size = 0.8)

# First-level classifiers of the stack
base_models = [
    KNeighborsClassifier(n_neighbors=1),
    LogisticRegression(),
    XGBClassifier(),
]

# Pair every estimator with a "<ClassName>-<position>" label
base_models = [(f'{estimator.__class__.__name__}-{position}', estimator)
               for position, estimator in enumerate(base_models)]

# Second-level (meta) classifier: the Keras network built by NN_model
meta_model = KerasClassifier(build_fn=NN_model, batch_size = 16, epochs = 5, validation_split = 0.2)

stacked_model = StackingCVClassifier(
    classifiers=[estimator for _, estimator in base_models],
    meta_classifier=meta_model,
    use_features_in_secondary=False,
)

# Grid over the base classifiers' parameters; keys are prefixed with the
# lower-cased class name of the estimator they belong to
params = {
    'kneighborsclassifier__n_neighbors': [5,10,15,20],
    'kneighborsclassifier__n_jobs': [6],
    'xgbclassifier__max_depth' : [1,2,3],
    'xgbclassifier__n_estimators' : [50,100,150],
    'xgbclassifier__n_jobs': [6],
    # NOTE(review): whether 'l1' is accepted depends on the solver that
    # LogisticRegression uses by default in the installed scikit-learn - confirm
    'logisticregression__penalty': ['l1','l2'],
    'logisticregression__C': [0.0001,0.01,1,10],
    'logisticregression__n_jobs': [6],
}

# 3-fold grid search; the best configuration is refitted on all of X_train
grid = GridSearchCV(estimator=stacked_model, param_grid=params, cv=3, refit=True)
grid.fit(X_train, y_train)

In [None]:
# Best cross-validated score found by the grid search over the stacked model
grid.best_score_

In [None]:
# The stacked model refitted with the best parameter combination (refit=True)
grid.best_estimator_

In [None]:
# Draw one word cloud per class (0 and 1) from the lemmatised n-gram
# vocabulary of that class's responses.
# NOTE(review): `plt`, `SpacyPipeProcessor` and `SpacyLemmaCountVectorizer`
# are not imported or defined in the visible part of this notebook - they must
# come from elsewhere (e.g. `import matplotlib.pyplot as plt` for `plt`).

# customization stopwords to filter out some words
# (this rebinds the global `stopwords` that was used for the TF-IDF model,
# starting again from wordcloud's full list and adding domain words)
stopwords = set(STOPWORDS)
stopwords.update(["mental","health","issue","work",
                  "take","hour","tech","industry","people","employee"])


for num in range(2):

    # rows whose label (last column of `table`) equals the current class
    classes = table[table.iloc[:,-1]==num]

    # CountVectorizer with SpaCy Lemmatization
    spp = SpacyPipeProcessor(nlp, n_threads=1, multi_iters=True)
    spacy_docs = spp(classes.iloc[:,0]);

    # 1- to 3-grams that occur in at least 3 responses, punctuation ignored
    slcv = SpacyLemmaCountVectorizer(min_df=3,stop_words=stopwords, ngram_range=(1, 3), ignore_chars='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
    slcv.fit(spacy_docs)
    # the count matrix itself is not used further down; only the fitted
    # vocabulary is
    count_vectors = slcv.transform(spacy_docs); count_vectors

    # Pulling out the list of parsed words and put them into a wordcloud
    list_of_words = slcv.vocabulary_.keys()
    list_of_words = list(list_of_words)
    list_of_words.sort()

    # NOTE(review): the cloud is generated from the vocabulary (each n-gram
    # listed once), not from the counts, so word sizes presumably reflect how
    # many n-grams contain a word rather than how often it was used - confirm.
    wordcloud = WordCloud(background_color="white").generate(" ".join(list_of_words))

    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show();