In [1]:
# import packages

# Basics
import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np

# Graphs
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Filter warnings
import warnings
warnings.filterwarnings("ignore")

# Preprocessing; model selection and evaluation
from sklearn import pipeline, preprocessing
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# text handling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.neural_network import MLPClassifier
import statsmodels.api as sm

# for custom countvectorizer with SpaCy lemmatization
import spacy
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, VectorizerMixin
from sklearn.base import TransformerMixin, BaseEstimator
from scipy.sparse import csr_matrix

# WordCloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

------
If we need to switch to a different virtual environment in order to use TensorFlow, the following packages will need to be installed in it:

1. spacy

<code> conda install -c spacy spacy </code>

2. 'en_core_web_md'

<code> python -m spacy download en_core_web_md </code>

3. wordcloud

<code> conda install -c conda-forge wordcloud </code>

------

In [2]:
# load the survey responses; the "Unnamed: 0" column is discarded right after reading
data = pd.read_csv("saved_csv/df.csv").drop(columns="Unnamed: 0")

# keep `data` as loaded and do all further work on a copy
df = data.copy()

### Setting up data for analysis
------

In [3]:
# putting questions into categories

# Questions about mental-health coverage at the current employer.
# Order matters: later cells pick entries from this list by position.
current_mh_coverage = ["Does your employer provide mental health benefits as part of healthcare coverage?",
               "Do you know the options for mental health care available under your employer-provided health coverage?",
               "Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?",
               "Does your employer offer resources to learn more about mental health disorders and options for seeking help?",
               "Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?",
               "If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?"]

# Questions about mental-health coverage at previous employers.
previous_mh_coverage = ["Have your previous employers provided mental health benefits?",
                        "Were you aware of the options for mental health care provided by your previous employers?",
                        "Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?",
                        "Did your previous employers provide resources to learn more about mental health disorders and how to seek help?",
                        "Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?"]

# Mental-health status: survey questions mixed with disorder names.
# Order matters: later cells select entries from this list by position
# (including negative positions counted from the end).
mh_status = ["Do you currently have a mental health disorder?",
             "Have you ever been diagnosed with a mental health disorder?",'Anxiety Disorder', 'Mood Disorder', 
             'Psychotic Disorder','Eating Disorder', 'Neurodevelopmental Disorders','Personality Disorder', 
             'Obsessive-Compulsive Disorder','Post-Traumatic Stress Disorder', 'Dissociative Disorder',
             'Substance-Related and Addictive Disorders', 'Other','Adjustment disorder',
             "Have you had a mental health disorder in the past?",
             "Have you ever sought treatment for a mental health disorder from a mental health professional?",
             "Do you have a family history of mental illness?",
             "How willing would you be to share with friends and family that you have a mental illness?",
             "Would you be willing to bring up a physical health issue with a potential employer in an interview?"]

# Questions about observed or experienced reactions to mental-health issues at work.
witnessed_exp = ["Have your observations of how another individual who discussed a mental health issue made you less likely to reveal a mental health issue yourself in your current workplace?",
                 "Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?",
                 "Have you observed or experienced supportive or well handled response to a mental health issue in your current or previous workplace?"]

# Every column of df whose name contains "Overall"; entries are selected by position later.
ratings = df.columns[df.columns.str.contains("Overall")]

# Demographic questions.
# NOTE: the name `demographics` is re-bound to a DataFrame later in the notebook.
demographics = ["What is your age?","What is your gender?","What country do you live in?",
                "What US state or territory do you live in?","What is your race?"]

# Comfort discussing mental health at the current workplace.
# The last two entries also appear at the end of comfort_talking_previous.
comfort_talking_current = ["Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?",
                           "Have you ever discussed your mental health with your employer?",
                           "Would you feel comfortable discussing a mental health issue with your coworkers?",
                           "Have you ever discussed your mental health with coworkers?",
                           "Have you ever had a coworker discuss their or another coworker's mental health with you?",
                           "Would you feel more comfortable talking to your coworkers about your physical health or your mental health?",
                           "Would you bring up your mental health with a potential employer in an interview?",
                           "Are you openly identified at work as a person with a mental health issue?"]

# Comfort discussing mental health at previous workplaces.
# The last two entries are the same two questions that close comfort_talking_current.
comfort_talking_previous = ["Would you have felt more comfortable talking to your previous employer about your physical health or your mental health?",
                            "Would you have been willing to discuss your mental health with your direct supervisor(s)?",
                            "Did you ever discuss your mental health with your previous employer?",
                            "Would you have been willing to discuss your mental health with your coworkers at previous employers?",
                            "Did you ever discuss your mental health with a previous coworker(s)?",
                            "Did you ever have a previous coworker discuss their or another coworker's mental health with you?",
                            "Would you bring up your mental health with a potential employer in an interview?",
                            "Are you openly identified at work as a person with a mental health issue?"]

# Dependent-variable questions: the six current-workplace questions, then the six
# previous-workplace questions, then the two questions shared by both lists.
# Built from comfort_talking_current / comfort_talking_previous instead of re-typing
# the question text, so the three lists cannot drift apart. The resulting order is
# the same as before, which matters because later cells select these columns by position.
comfort_dependent_var = (comfort_talking_current[:6]
                         + comfort_talking_previous[:6]
                         + comfort_talking_current[6:])

# all question groups gathered in one list (not referenced again in this part of the notebook)
categories = [current_mh_coverage,previous_mh_coverage,mh_status,witnessed_exp,ratings,comfort_talking_current,comfort_talking_previous,comfort_dependent_var]

### Preparing the data
------

#### Independent Variables for Q1
------

In [4]:
def make_dummies(question, columns_to_keep=1, data=None):
    """One-hot encode the answers to one question and keep a single answer column.

    Parameters
    ----------
    question : str
        Name of the column to encode.
    columns_to_keep : int, default 1
        Position of the dummy column to return; negative values count from the end.
    data : pandas.DataFrame, optional
        Frame that holds ``question``. When omitted, the notebook-level ``df_2``
        is used, which is what the function always did before.

    Returns
    -------
    pandas.DataFrame
        One-column frame named ``"<question>__<answer>"``.

    Raises
    ------
    IndexError
        If ``columns_to_keep`` is outside the range of generated dummy columns.
    """
    source = df_2 if data is None else data

    dummies = pd.get_dummies(source.loc[:, question])
    # str() keeps the naming working when the answers are not text (e.g. numeric codes)
    dummies.columns = [question + "__" + str(label) for label in dummies.columns]

    return dummies.iloc[:, [columns_to_keep]]

In [5]:
# working copy of df: make_dummies reads from df_2 and later cells modify it
df_2 = df.copy()

In [6]:
# preparing a table of independent variables for Q1
# (groupings only; neither list is referenced again in this part of the notebook)
independent_var = [current_mh_coverage,previous_mh_coverage,witnessed_exp,mh_status,ratings]
dependent_var = [comfort_talking_current,comfort_talking_previous]

In [7]:
# set up 2 dataframes for concatenating data
# `final` is the accumulator the following cells append feature columns to;
# `omitted` is created here but not filled by the cells in this part of the notebook
omitted = pd.DataFrame(columns = ["Question","Answer"])
final = pd.DataFrame()

In [8]:
# current_mh_coverage
# questions 0, 2 and 3: keep the last dummy column of each
coverage_parts = [make_dummies(current_mh_coverage[i], -1) for i in [0, 2, 3]]

# question 5: keep the dummy columns at positions 2, -2 and -1
coverage_parts += [make_dummies(current_mh_coverage[5], i) for i in [2, -2, -1]]

# append everything to the accumulator in the same column order as before
final = pd.concat([final] + coverage_parts, axis=1)

In [9]:
# witnessed_exp: append the last dummy column of the first witnessed_exp question
result = make_dummies(witnessed_exp[0],-1)
final = pd.concat([final,result],axis = 1)

In [10]:
# mh_status
old_answer_1 = "Possibly"
old_answer_2 = "-1"
answer = "Don't Know"

# questions picked from mh_status by position; only to_dummy[0] and to_dummy[2] are recoded below
to_dummy = [mh_status[i] for i in [0, 1, -5, -3, -1]]

# Recode with one .loc[row_mask, column] assignment per question. The previous chained
# form (df_2.loc[:, col][mask] = value) assigns into an intermediate Series, which pandas
# does not guarantee to write back to df_2 and which is a no-op under copy-on-write.
df_2.loc[df_2[to_dummy[0]] == old_answer_1, to_dummy[0]] = answer
df_2.loc[df_2[to_dummy[2]].isin([old_answer_1, old_answer_2]), to_dummy[2]] = answer

In [11]:
# mh_status
# last dummy column of the first selected question ...
result = make_dummies(to_dummy[0], -1)

# ... followed by the df_2 columns named by mh_status positions 2-13, 15 and 17, unchanged
raw_status_columns = [mh_status[i] for i in list(range(2, 14)) + [15, 17]]
final = pd.concat([final, result, df_2.loc[:, raw_status_columns]], axis=1)

In [12]:
# ratings: append the "Overall" columns found at positions 1 and 4 of `ratings`
rating_columns = [ratings[i] for i in [1, 4]]
final = pd.concat([final, df_2.loc[:, rating_columns]], axis=1)

In [13]:
# independent variables for Q1; this binds a second name to the same object as `final` (no copy is made)
independent_q1 = final

#### Dependent Variables for Q1
------

In [14]:
# creating a table of dependent variables
# (taken from df, not df_2, in the column order of comfort_dependent_var)
dependent = df.loc[:,comfort_dependent_var]

In [15]:
# positions (within `dependent`) encoded together in one get_dummies call on the sub-frame
num_list = [0,2,5,6,7,9,12]
# positions encoded one column at a time and appended afterwards
columns_to_join = [1,3,4,8,10,11,13]

dep_parts = [pd.get_dummies(dependent.iloc[:, num_list])]
dep_parts.extend(pd.get_dummies(dependent.iloc[:, i]) for i in columns_to_join)

final_dep = pd.concat(dep_parts, axis=1)

#### Predicting classes
------

Models Used:

- KMeans Clustering

In [None]:
# KMeans Clustering: record the inertia for k = 1..20 (input to the elbow plot)
X_alt = final_dep

# random_state fixes the centroid initialisation so the inertia curve is the same on every run;
# the per-iteration predict() call was removed because its result was never used
inertia = [KMeans(n_clusters=num, random_state=0).fit(X_alt).inertia_
           for num in range(1, 21)]

# b_alt = pd.DataFrame(y_pred2_alt)
# b_alt[0].groupby(b_alt[0]).count()

In [None]:
# elbow plot: one point per k tried; the x range follows the length of `inertia`
plt.scatter(np.arange(1, len(inertia) + 1), inertia)
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.title("KMeans inertia by number of clusters");

In [None]:
# fit the 4-cluster model, then assign every row of X_alt to a cluster
k_means_model = KMeans(n_clusters=4).fit(X_alt)
y_pred2_alt = k_means_model.predict(X_alt)

# number of rows per cluster
b_alt = pd.DataFrame(y_pred2_alt)
b_alt.groupby(0)[0].count()

In [None]:
# give the single label column a readable name
b_alt = b_alt.rename(columns={0: "Classes"})

In [None]:
#b_alt.to_csv("saved_csv/q1_dependent_alt.csv")

Because KMeans clustering involves randomness, the code produces different cluster counts each time it is run. The goal is to obtain 4 classes whose counts are approximately normally distributed, so the following cluster counts will be used for the analysis:

|Class|Count|
|------|------|
|0|143|
|1|490|
|2|386|
|3|154|

### Q1 Qualitative insights for the tech industry to improve MH support for employees
------

In [16]:
# dependent variable: class labels read back from disk, without the "Unnamed: 0" column
dependent_class = pd.read_csv("saved_csv/q1_dependent_alt.csv").drop(columns='Unnamed: 0')

In [17]:
# simplifying the dependent variable to 0/1: values below 2 become 0, values of 2 and above become 1
dependent_class_alt = (dependent_class
                       .mask(dependent_class < 2, 0)
                       .mask(dependent_class >= 2, 1))

In [18]:
# Code source: https://github.com/mpavlovic/spacy-vectorizers
# create a custom countvectorizer with SpaCy lemmatization

class SpacyPipeInitializer(object):
    """Stores the settings shared by the spaCy pipe helpers.

    The constructor only records its four arguments (``nlp``, ``join_str``,
    ``batch_size``, ``n_threads``) as attributes of the same names.
    """

    def __init__(self, nlp, join_str=" ", batch_size=10000, n_threads=2):
        self.nlp, self.join_str = nlp, join_str
        self.batch_size, self.n_threads = batch_size, n_threads
        
class SpacyPipeProcessor(SpacyPipeInitializer):
    """Callable that sends raw texts through ``nlp.pipe``.

    When ``multi_iters`` compares equal to ``False`` the generator returned by
    ``nlp.pipe`` is handed back unchanged (it can be consumed once); otherwise it
    is materialised into a list so the documents can be iterated repeatedly.
    """

    def __init__(self, nlp, multi_iters=False, join_str=" ", batch_size=10000, n_threads=2):
        super(SpacyPipeProcessor, self).__init__(nlp, join_str, batch_size, n_threads)
        self.multi_iters = multi_iters

    def __call__(self, raw_documents):
        # NOTE(review): n_threads is forwarded as-is; confirm the installed spaCy version accepts it
        docs = self.nlp.pipe(raw_documents,
                             batch_size=self.batch_size,
                             n_threads=self.n_threads)
        if self.multi_iters == False:
            return docs
        return list(docs)
    
class SpacyLemmaCountVectorizer(CountVectorizer):
    """CountVectorizer that counts lemmas of already-parsed spaCy documents.

    ``transform`` and ``fit_transform`` take an iterable of spaCy ``Doc`` objects
    (not raw strings). Each document is turned into a space-joined string of
    lemmas with the characters in ``ignore_chars`` removed, and that string is
    handed to the regular CountVectorizer machinery, which splits it on
    whitespace (see ``build_tokenizer``).

    Parameters
    ----------
    input ... dtype
        Forwarded unchanged to ``CountVectorizer``.
    nlp : object, optional
        Stored on the instance; not used by the methods of this class.
    ignore_chars : str
        Characters deleted from every lemma.
    join_str : str
        Accepted for interface compatibility; the instance always joins lemmas
        with a single space because the tokenizer splits on whitespace.
    use_pron : bool, default False
        When False, tokens whose lemma is ``'-PRON-'`` contribute their
        lower-cased text instead of the lemma.
    """

    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)[^\r\n ]+",
                 ngram_range=(1, 1), analyzer='word',
                 max_df=1.0, min_df=1, max_features=None,
                 vocabulary=None, binary=False, dtype=np.int64,
                 nlp=None, ignore_chars='!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~',
                 join_str=" ", use_pron=False):

        # Keyword arguments: recent scikit-learn releases declare these parameters
        # keyword-only, so passing them positionally raises a TypeError there.
        super().__init__(input=input, encoding=encoding, decode_error=decode_error,
                         strip_accents=strip_accents, lowercase=lowercase,
                         preprocessor=preprocessor, tokenizer=tokenizer,
                         stop_words=stop_words, token_pattern=token_pattern,
                         ngram_range=ngram_range, analyzer=analyzer,
                         max_df=max_df, min_df=min_df, max_features=max_features,
                         vocabulary=vocabulary, binary=binary, dtype=dtype)
        # every constructor parameter needs a same-named attribute so that
        # get_params() / clone() / repr() can read it back
        self.nlp = nlp
        self.ignore_chars = ignore_chars
        self.join_str = ' ' # lemmas have to be joined for splitting
        self.use_pron = use_pron
        # str.translate table that deletes each ignored character
        self.translate_table = dict((ord(char), None) for char in self.ignore_chars)

    def lemmatize_from_docs(self, docs):
        """Yield one lemma string (or lemma list if ``join_str`` is None) per spaCy doc."""
        for doc in docs:
            lemmas_gen = (
                token.lemma_.translate(self.translate_table)
                if self.use_pron or token.lemma_ != '-PRON-'
                else token.lower_.translate(self.translate_table)
                for token in doc
            )
            yield self.join_str.join(lemmas_gen) if self.join_str is not None else [lemma for lemma in lemmas_gen]

    def build_tokenizer(self):
        """Return a tokenizer that splits the joined lemma string on whitespace."""
        return lambda doc: doc.split()

    def transform(self, spacy_docs):
        """Lemmatize ``spacy_docs`` and return their document-term matrix."""
        raw_documents = self.lemmatize_from_docs(spacy_docs)
        return super(SpacyLemmaCountVectorizer, self).transform(raw_documents)

    def fit_transform(self, spacy_docs, y=None):
        """Lemmatize ``spacy_docs``, learn the vocabulary and return the document-term matrix."""
        raw_documents = self.lemmatize_from_docs(spacy_docs)
        return super(SpacyLemmaCountVectorizer, self).fit_transform(raw_documents, y)

In [19]:
# free-text responses (9th column from the end of df) are the independent variable
corpus = df.iloc[:, -9]

# dependent variable -> rating (10th column from the end), moved from 1-5 to 0-4
industry_rating = df.iloc[:, -10] - 1

# responses, rating and binary class side by side
table_q4_alt = pd.concat([corpus, industry_rating, dependent_class_alt], axis=1)

# dropping the rows whose response is "Did not answer"
index = table_q4_alt.loc[table_q4_alt.iloc[:, 0] == "Did not answer"].index
table_q4_alt = table_q4_alt.drop(index=index)

In [20]:
# customization stopwords to filter out some words
stopwords = set(STOPWORDS)
stopwords.update(["mental","health","issue","work",
                  "take","hour","tech","industry","people","employee"])

# CountVectorizer with SpaCy Lemmatization
nlp = spacy.load('en_core_web_md')

# multi_iters=True returns the parsed documents as a list
spp = SpacyPipeProcessor(nlp, n_threads=1, multi_iters=True)
spacy_docs = spp(table_q4_alt.iloc[:,0])

# stop_words is passed as a (sorted) list: scikit-learn documents a list for this
# parameter and recent releases validate the type, rejecting a set
slcv = SpacyLemmaCountVectorizer(min_df=3, stop_words=sorted(stopwords), ngram_range=(1, 3),
                                 ignore_chars='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

# fit_transform lemmatizes the documents once; fit() followed by transform() did it twice
count_vectors = slcv.fit_transform(spacy_docs)
count_vectors

In [None]:
# vocabulary learned by the vectorizer, in alphabetical order, rendered as a wordcloud
list_of_words = sorted(slcv.vocabulary_)

wordcloud = WordCloud(background_color="white").generate(" ".join(list_of_words))

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

### Q2 - Factors that affect comfort level in discussing MH at workplace
------

In [None]:
def make_dummies_q(question, num, drop_pattern=1, data=None):
    '''
    This function creates dummy variables for the text responses to one question.
    It also creates a table of the answer that is omitted in preparation for analysis,
        since n-1 answers for each question are needed to avoid multicollinearity.

    Parameters:
    -------
    question: name of the column to encode.
    num: kept for backward compatibility; it does not influence the result.
    drop_pattern: 1 drops the first dummy column, 2 drops the dummy column at
        position 3; any other value drops nothing.
    data: optional DataFrame holding `question`; defaults to the notebook-level df_2.

    Returns:
    -------
    question_omitted: a table with question and the answer that has been omitted to avoid multicollinearity.
    result: a table of dummy variables named "<question>__<answer>".
    '''
    source = df_2 if data is None else data
    dummies = pd.get_dummies(source.loc[:, question])

    if drop_pattern == 1:
        # Record the column that is actually dropped. The earlier version logged
        # columns[-2] here while dropping columns[0], so the log did not match the data.
        dropped = [dummies.columns[0]]
    elif drop_pattern == 2:
        dropped = [dummies.columns[3]]
    else:
        dropped = []

    # built in one go; DataFrame.append is no longer available in current pandas
    question_omitted = pd.DataFrame(
        [{"Question": question, "Answer": label} for label in dropped],
        columns=["Question", "Answer"])

    result = dummies.drop(columns=dropped)
    # str() keeps the naming working when the answers are not text
    result.columns = [question + "__" + str(label) for label in result.columns]

    return question_omitted, result

In [None]:
# dependent variable: class labels read from disk again, without the "Unnamed: 0" column
dependent_class = pd.read_csv("saved_csv/q1_dependent_alt.csv").drop(columns='Unnamed: 0')

In [None]:
# What is your gender?
# the question is addressed by position (7th column from the end of df_2);
# drop_pattern=2 removes the dummy column at position 3
question_omitted,gender = make_dummies_q(df_2.columns[-7],-7,drop_pattern = 2)

In [None]:
# Grouping countries into continents (plus a "Did not answer" group)

# The country column is addressed by position. Remove a continent column left behind by
# an earlier run of this cell first, otherwise the -6 offset points at a different column.
if "countries_continent" in df_2.columns:
    df_2 = df_2.drop(columns="countries_continent")

country_column = df_2.iloc[:, -6]

# distinct country names in sorted order; the index lists below refer to positions in it
country_names = country_column.groupby(country_column).count().index

north_am = [8,34,55]
south_am = [0,6,9,56]
asia = [3,20,23,24,26,28,29,39,43,44,46]
africa = [14,30,33,37,48]
europe = [2,4,5,7,10,11,13,15,16,17,18,19,21,22,25,27,31,32,35,38,40,41,42,45,47,49,50,51,52,53,54]
oceania = [1,36]
did_not_answer = [12]

continents = [north_am,south_am,asia,africa,europe,oceania,did_not_answer]
names = ["North America", "South America", "Asia", "Africa", "Europe", "Oceania", "Did not answer"]

# country name -> group name
continent_of = {country_names[num]: name
                for name, continent in zip(names, continents)
                for num in continent}

# One plain column assignment. The previous chained form
# (df_2.loc[:, col][mask] = value) is not guaranteed to write back to df_2.
# Values without a mapping are kept unchanged.
df_2["countries_continent"] = country_column.map(lambda country: continent_of.get(country, country))

# dummy-encode the groups; "North America" is the omitted level
countries = pd.get_dummies(df_2["countries_continent"]).drop(columns="North America")

In [None]:
# What is your race?
# dummy-encode the 5th column from the end of df_2; "Caucasian" is the omitted level
races = pd.get_dummies(df_2.iloc[:, -5]).drop(columns="Caucasian")

In [None]:
# company_size
# dummy-encode the column at position 2 of df_2 and discard the "0" level
company_size = pd.get_dummies(df_2.iloc[:, 2]).drop(columns="0")

# keep only the dummy columns at positions 3 and 5
size = company_size.iloc[:, [3, 5]]

In [None]:
# Creating a table of demographics
# NOTE: this re-binds `demographics`, which earlier in the notebook was a list of question texts.
# The df_2 columns are picked by position (-9, 1, 3, 4).
demographics = pd.concat([gender,countries,races,df_2.iloc[:,[-9,1,3,4]],size],axis=1)

In [None]:
# creating features/independent variables
# (Q1 feature columns first, demographic columns after them)
independent_q1_alt = pd.concat([independent_q1,demographics],axis=1)

In [None]:
# p-value WITH DEMOGRAPHICS

X_1 = independent_q1_alt
Y_1 = dependent_class_alt

# standardise every feature column
scaler = preprocessing.StandardScaler()
X_transformed_1 = scaler.fit_transform(X_1)

# prepend a column of ones so the model has an intercept (it becomes column 0)
intercept = np.ones((X_transformed_1.shape[0], 1))
X_transformed_1 = np.concatenate([intercept, X_transformed_1], axis=1)

logit = sm.Logit(Y_1, X_transformed_1)
fitted_model_demo = logit.fit_regularized()
fitted_model_demo.summary()

# alpha = 0.05
# all features together are significant
# Columns that are significant: 3,4,5,20,21,22,33,39,46,47,51

In [None]:
# Create a list of questions that are statistically significant
# NOTE(review): these numbers are used as positions into independent_q1_alt, but the
# regression above was fitted on a matrix with an intercept column prepended at position 0.
# If the numbers were read from that matrix (or from the x1..xN labels of the summary),
# they may be shifted by one relative to independent_q1_alt -- confirm.
num_list = [3,4,5,20,21,22,33,39,46,47,51]
q1_ind_sig = independent_q1_alt.iloc[:,num_list]

In [None]:
# Creating a model using the questions that are statistically significant
X_train, X_test, y_train, y_test = train_test_split(q1_ind_sig, dependent_class_alt, test_size=0.2)

# two-step pipeline; both steps are replaced by the candidates listed in the grid
estimators = [("normalize", preprocessing.StandardScaler()),
              ("model", LogisticRegression())]
pipe = pipeline.Pipeline(estimators)

# candidate 1: gradient-boosted trees, candidate 2: random forest,
# each tried with standard scaling, min-max scaling and no scaling
param_grid = [
    dict(model=[XGBClassifier()],
         normalize=[preprocessing.StandardScaler(), preprocessing.MinMaxScaler(), None],
         model__max_depth=[1,2,3,4,5],
         model__n_estimators=[50,100,150,200],
         model__n_jobs=[6]),
    dict(model=[RandomForestClassifier()],
         normalize=[preprocessing.StandardScaler(), preprocessing.MinMaxScaler(), None],
         model__n_estimators=[100,150,200],
         model__n_jobs=[6]),
]

# 5-fold cross-validated search on the training split
grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=6)
fitted_grid_1 = grid.fit(X_train, y_train)

In [None]:
# pipeline (scaler + model) selected by the grid search
fitted_grid_1.best_estimator_

In [None]:
# cross-validated score of the selected pipeline
fitted_grid_1.best_score_

In [None]:
# score of the selected pipeline on the held-out test split
fitted_grid_1.score(X_test,y_test)

In [None]:
# Ranking of factors that are important for predicting comfort level, from most to least important
importances = fitted_grid_1.best_estimator_.named_steps["model"].feature_importances_

# argsort is ascending, so reverse it to list the most important feature first
indices = importances.argsort()[::-1]

q1_ind_sig.columns[indices]

### What can companies do to encourage their employees to seek treatment?
------

In [None]:
# start again from empty frames for this question
omitted = pd.DataFrame(columns=["Question", "Answer"])
final = pd.DataFrame()

# current_mh_coverage
# questions 0, 2 and 3: keep the last dummy column of each
coverage_parts = [make_dummies(current_mh_coverage[i], -1) for i in [0, 2, 3]]

# question 5: keep the dummy columns at positions 2, -2 and -1
coverage_parts += [make_dummies(current_mh_coverage[5], i) for i in [2, -2, -1]]

final = pd.concat([final] + coverage_parts, axis=1)

In [None]:
# features for the treatment question: coverage dummies first, demographic columns after them
ind_variable = pd.concat([final,demographics],axis=1)

In [None]:
# dependent variable: whether the respondent ever sought professional treatment
dep_question = "Have you ever sought treatment for a mental health disorder from a mental health professional?"

X_1 = ind_variable
Y_1 = df_2.loc[:, dep_question]

# standardise every feature column
scaler = preprocessing.StandardScaler()
X_transformed_1 = scaler.fit_transform(X_1)

# prepend a column of ones so the model has an intercept (it becomes column 0)
intercept = np.ones((X_transformed_1.shape[0], 1))
X_transformed_1 = np.concatenate([intercept, X_transformed_1], axis=1)

logit = sm.Logit(Y_1, X_transformed_1)
fitted_model_sp = logit.fit_regularized()
fitted_model_sp.summary()

# alpha = 0.05
# all features together are significant
# Columns that are significant: 0,5,6,7,8,12

In [None]:
# Create a list of questions that are statistically significant
# NOTE(review): these numbers are used as positions into ind_variable, but the regression
# above was fitted on a matrix with an intercept column prepended at position 0 (so 0 there
# is the intercept, not a question). Confirm the positions are not shifted by one.
num_list = [0,5,6,7,8,12]
ind_sig = ind_variable.iloc[:,num_list]

In [None]:
# Creating a model using the questions that are statistically significant
X_train, X_test, y_train, y_test = train_test_split(ind_sig,df_2.loc[:,dep_question],test_size = 0.2)

# two-step pipeline; both steps are replaced by the candidates listed in the grid
estimators = [("normalize", preprocessing.StandardScaler()),
             ("model",LogisticRegression())]

pipe = pipeline.Pipeline(estimators)

param_grid = [{"model":[XGBClassifier()], 
               "normalize": [preprocessing.StandardScaler(), preprocessing.MinMaxScaler(), None],
               "model__max_depth":[1,2,3,4,5],"model__n_estimators":[50,100,150,200],"model__n_jobs":[6]},
              {"model": [RandomForestClassifier()],
               "normalize": [preprocessing.StandardScaler(), preprocessing.MinMaxScaler(), None],
               "model__n_estimators":[100,150,200],"model__n_jobs":[6]},
              {"model": [LogisticRegression()],
               "normalize": [preprocessing.StandardScaler(), preprocessing.MinMaxScaler(), None],
               # liblinear handles both penalties searched here; the default solver of
               # recent scikit-learn releases (lbfgs) rejects penalty="l1", which would
               # make every l1 candidate fail to fit
               "model__solver":["liblinear"],
               "model__penalty":["l1","l2"],"model__C":[1e-04,1e-02,0.1,1,10,100]}]

# 5-fold cross-validated search on the training split
grid = GridSearchCV(pipe, param_grid, cv=5)
fitted_grid_2 = grid.fit(X_train,y_train)

In [None]:
# pipeline (scaler + model) selected by the grid search
fitted_grid_2.best_estimator_

In [None]:
# cross-validated score of the selected pipeline
fitted_grid_2.best_score_

In [None]:
# score of the selected pipeline on the held-out test split
fitted_grid_2.score(X_test,y_test)

In [None]:
# Ranking of factors that are important for predicting whether treatment was sought,
# from most to least important
best_model = fitted_grid_2.best_estimator_.named_steps["model"]

# The grid also contains LogisticRegression, which exposes coef_ rather than
# feature_importances_; rank by absolute coefficient in that case.
if hasattr(best_model, "feature_importances_"):
    importances = best_model.feature_importances_
else:
    importances = np.abs(best_model.coef_).ravel()

indices = np.flip(np.argsort(importances))

ind_sig.columns[indices]

### Qualitative insights for the tech industry to improve MH support for employees, based on comfort level
------

In [None]:
# wordcloud's stop words plus domain words that would otherwise dominate the clouds
stopwords = set(STOPWORDS) | {"mental", "health", "issue", "work",
                              "take", "hour", "tech", "industry", "people", "employee",
                              "open", "openly", "make", "long"}

In [None]:
# one wordcloud per binary class (value of the last column of table_q4_alt)
for num in range(2):

    table = table_q4_alt[table_q4_alt.iloc[:,-1]==num]

    # CountVectorizer with SpaCy Lemmatization
    spp = SpacyPipeProcessor(nlp, n_threads=1, multi_iters=True)
    spacy_docs = spp(table.iloc[:,0])

    # stop_words is passed as a (sorted) list: scikit-learn documents a list for this
    # parameter and recent releases validate the type, rejecting a set
    slcv = SpacyLemmaCountVectorizer(min_df=3, stop_words=sorted(stopwords), ngram_range=(1, 3),
                                     ignore_chars='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
    # fit_transform lemmatizes the documents once; fit() followed by transform() did it twice
    count_vectors = slcv.fit_transform(spacy_docs)

    # Pulling out the list of parsed words and put them into a wordcloud
    list_of_words = sorted(slcv.vocabulary_)

    wordcloud = WordCloud(background_color="white").generate(" ".join(list_of_words))

    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    # label the figure so the two clouds can be told apart
    plt.title("Class " + str(num))
    plt.show()

In [None]:
# use RNN to do something with the words?

In [None]:
# preview the class labels instead of rendering the whole frame
dependent_class.head()

In [None]:
# Code source: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

# Iterate over the responses themselves rather than looking them up with corpus[idx]:
# that lookup is label-based and only lines up with a 0..n-1 counter while corpus
# still carries its default index. nlp.pipe processes the texts in batches, in order.
lemmatized = [" ".join(token.lemma_ for token in doc) for doc in nlp.pipe(corpus)]

ind_var = pd.DataFrame(lemmatized,columns = ["Responses"])

In [None]:
# number of lemmatized responses
np.array(lemmatized).shape

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, GRU, CuDNNLSTM, BatchNormalization, Flatten, Embedding
                                                               # this is faster than LSTM
                                                               # but you cannot change the activation function
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import plot_model

In [242]:
# responses next to the binary class
table = pd.concat([corpus, dependent_class_alt], axis=1)

# dropping the rows whose response is "Did not answer"
index = table.loc[table.iloc[:, 0] == "Did not answer"].index
table = table.drop(index=index)

In [243]:
# customization stopwords to filter out some words
stopwords = set(STOPWORDS)
stopwords.update(["mental","health","issue","work",
                  "take","hour","tech","industry","people","employee"])

# CountVectorizer with SpaCy Lemmatization
nlp = spacy.load('en_core_web_md')

# multi_iters=True returns the parsed documents as a list
spp = SpacyPipeProcessor(nlp, n_threads=1, multi_iters=True)
spacy_docs = spp(table.iloc[:,0])

# stop_words is passed as a (sorted) list: scikit-learn documents a list for this
# parameter and recent releases validate the type, rejecting a set
slcv = SpacyLemmaCountVectorizer(min_df=3, stop_words=sorted(stopwords), ngram_range=(1, 3),
                                 ignore_chars='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

# fit_transform lemmatizes the documents once; fit() followed by transform() did it twice
count_vectors = slcv.fit_transform(spacy_docs)
count_vectors

<789x755 sparse matrix of type '<class 'numpy.int64'>'
	with 7266 stored elements in Compressed Sparse Row format>

In [247]:
# 80/20 split of the count vectors and the class column
# NOTE: the next cell starts with this same statement and therefore re-splits the data
X_train, X_test, y_train, y_test = train_test_split(count_vectors,table.iloc[:,1].values,test_size = 0.2)

In [252]:
X_train, X_test, y_train, y_test = train_test_split(count_vectors,table.iloc[:,1].values,test_size = 0.2)

# Densify the count vectors and add a trailing axis -> (samples, vocabulary size, 1);
# labels become column vectors. The sizes are taken from the data instead of the
# previously hard-coded 631 / 158 / 755, which only fit one particular run.
X_train = X_train.toarray()[:, :, np.newaxis]
y_train = y_train.reshape(-1, 1)
X_test = X_test.toarray()[:, :, np.newaxis]
y_test = y_test.reshape(-1, 1)

model = Sequential()

# two stacked LSTM layers, each followed by batch normalisation
model.add(LSTM(64,activation="relu", input_shape = (X_train.shape[1:]), return_sequences=True, dropout=0.2))
model.add(BatchNormalization())

model.add(LSTM(64, activation="relu"))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))

# two output units, one per class
model.add(Dense(2, activation="softmax"))

# setting up SGD (optimizer) hyperparameters
sgd = SGD(lr=0.01,decay=0.0, momentum = 0.0, nesterov=False, clipnorm=2.0)

# compile model
model.compile(loss="sparse_categorical_crossentropy", 
              optimizer = sgd,
              metrics = ["accuracy"])

model.summary()

# NOTE: the test split doubles as validation data during training
model.fit(X_train,y_train, batch_size = 48, epochs = 20, validation_data = (X_test, y_test))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_101 (LSTM)              (None, 755, 64)           16896     
_________________________________________________________________
batch_normalization_73 (Batc (None, 755, 64)           256       
_________________________________________________________________
lstm_102 (LSTM)              (None, 64)                33024     
_________________________________________________________________
batch_normalization_74 (Batc (None, 64)                256       
_________________________________________________________________
dense_48 (Dense)             (None, 32)                2080      
_________________________________________________________________
dropout_12 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_49 (Dense)             (None, 2)                 66        
Total para

<tensorflow.python.keras.callbacks.History at 0x1c66e2a358>