## Capstone Project
-------

### Stage 2 - Modelling phase
------

#### Importing packages and data
------

In [1]:
# import packages

# Basics
import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np
from collections import Counter

# Graphs
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Filter warnings
import warnings
warnings.filterwarnings("ignore")

# Preprocessing; model selection and evaluation
from sklearn import pipeline, preprocessing
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

# text handling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import statsmodels.api as sm
from mlxtend.classifier import StackingCVClassifier
from sklearn.ensemble import GradientBoostingClassifier

# for custom countvectorizer with SpaCy lemmatization
import spacy
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, VectorizerMixin
from sklearn.base import TransformerMixin, BaseEstimator
from scipy.sparse import csr_matrix

# WordCloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

  from numpy.core.umath_tests import inner1d


In [2]:
# import packages
data = pd.read_csv("saved_csv/df.csv")
data.drop(columns = "Unnamed: 0",inplace=True)

df = data.copy()

## Question 3

### Can we predict one's comfort level in discussing MH at workplace using participants' qualitative responses of ways to improve MH support?
------

In [3]:
# Grabbing the responses as independent variables
corpus = df.iloc[:,-9]

# Dependent variables
question = "Would you feel comfortable discussing a mental health issue with your coworkers?"

answers = ["Maybe","No","Not Applicable","Yes"]

dep = df[question].copy()

for num in range(len(answers)):
    if num != 3:
        dep[dep==answers[num]] = 0 #Hesitant
    else:
        dep[dep==answers[num]] = 1 #Comfortable

In [4]:
# Creating a table with both independent and dependent variables
table = pd.concat([corpus,dep],axis=1)

# dropping columns that did not answer the question
index = table[table.iloc[:,0]=="Did not answer"].index
table.drop(index,axis=0,inplace=True)

# resetting the index
table = table.reset_index()
table.drop("index",axis=1,inplace=True)

In [5]:
# split the dataset into training/test sets
x_train, x_test, y_train, y_test = train_test_split(table.iloc[:,0].values,table.iloc[:,1].values,
                                                    test_size = 0.2, stratify=table.iloc[:,1].values)

In [6]:
# Process and transform x_train

# Lemmatization using SpaCy
nlp = spacy.load('en_core_web_md')

# customizing stopwords to exclude certain stopwords
stopwords = set(STOPWORDS)

words = ["against","all","aren't","can't","can","cannot","could","couldn't","did",
         "didn't","doing","don't","hasn't","hadn't","ever","few","mustn't","once","shan't"]

for word in words:
    stopwords.remove(word)

sentences = []

for num in range(len(x_train)):
    doc = nlp(x_train[num])

    sentence = []
    for token in doc:
        sentence.append(token.lemma_)

    sentences.append(" ".join(sentence))

# Processing text with TfidfVectorizer
tf_model = TfidfVectorizer(stop_words=stopwords,ngram_range=(1,3), min_df=3)
tf_vectors = tf_model.fit_transform(sentences); tf_vectors

<631x930 sparse matrix of type '<class 'numpy.float64'>'
	with 9438 stored elements in Compressed Sparse Row format>

In [7]:
# Oversampling with SMOTE
sm = SMOTE(n_jobs = 6)
X_res,y_res = sm.fit_resample(tf_vectors.toarray(),y_train)

In [8]:
# Process and transform x_test
sentences = []

for num in range(len(x_test)):
    doc = nlp(x_test[num])

    sentence = []
    for token in doc:
        sentence.append(token.lemma_)

    sentences.append(" ".join(sentence))

x_test_vectors = tf_model.transform(sentences); x_test_vectors

<158x930 sparse matrix of type '<class 'numpy.float64'>'
	with 1967 stored elements in Compressed Sparse Row format>

In [9]:
# Building the model using Stacking CV Classifier
base_models = [SVC(C=1.0, kernel="rbf", class_weight = {0: 1, 1: 2}), 
               LogisticRegression(penalty = "l1", class_weight = {0: 1, 1: 2})]

base_models = [(f'{model.__class__.__name__}-{i}', model) for i, model in enumerate(base_models)]

stacked_model = StackingCVClassifier(classifiers=[model for _, model in base_models],
                                     meta_classifier=GradientBoostingClassifier(n_estimators = 100, max_depth = 3), 
                                     use_features_in_secondary=False)

model = stacked_model.fit(X_res, y_res.astype(int))

In [10]:
y_pred = model.predict(x_test_vectors.toarray())

f1_score(y_test.astype(int),y_pred)

0.4578313253012048

In [11]:
# input an response and see if the model predicts correctly
response = input("Briefly describe what you think the tech industry as a whole and/or \
employers could do to improve mental health support for employees.")

print("Processing...")

# Text processing to prepare data for RNN
nlp = spacy.load('en_core_web_md')

sentences = []
doc = nlp(response)

sentence = []
for token in doc:
    sentence.append(token.lemma_)

sentences.append(" ".join(sentence))

print("Almost there...")

# Processing text with TfidfVectorizer
tf_vectors = tf_model.transform(sentences)

# predicting the result using the model
y_pred = model.predict(tf_vectors.toarray())

# printing the result
if y_pred == 0 :
    print("The model predicts you are hesitant with discussing MH issue with your coworkers.")
else:
    print("The model predicts you to have comfortable with discussing MH issue with your coworkers.")