In [1]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

Using Theano backend.


In [2]:
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

In [3]:
reduced_url = "Datasets/MentalHealth/mental-health-in-tech-survey/reducedDataset/survey.csv"
reduced_names = ["Age","Country","state","family_history","treatment","work_interfere","no_employees","remote_work","benefits","wellness_program","anonymity","leave","phys_health_consequence","coworkers","supervisor","mental_health_interview","phys_health_interview","mental_vs_physical","obs_consequence","mental_health_consequence"]
reduced_dataset = pandas.read_csv(reduced_url, names=reduced_names)

In [4]:
print(reduced_dataset.dtypes)
print(reduced_dataset.shape)

Age                           int64
Country                      object
state                        object
family_history               object
treatment                    object
work_interfere               object
no_employees                 object
remote_work                  object
benefits                     object
wellness_program             object
anonymity                    object
leave                        object
phys_health_consequence      object
coworkers                    object
supervisor                   object
mental_health_interview      object
phys_health_interview        object
mental_vs_physical           object
obs_consequence              object
mental_health_consequence    object
dtype: object
(1259, 20)


In [5]:
obj_columns = reduced_dataset.select_dtypes(['object']).columns
print(obj_columns)

Index(['Country', 'state', 'family_history', 'treatment', 'work_interfere',
       'no_employees', 'remote_work', 'benefits', 'wellness_program',
       'anonymity', 'leave', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'mental_health_consequence'],
      dtype='object')


In [6]:
for col_name in reduced_dataset.columns:
    if(reduced_dataset[col_name].dtype == 'object'):
        reduced_dataset[col_name]= reduced_dataset[col_name].astype('category')

cat_columns = reduced_dataset.select_dtypes(['category']).columns
print(cat_columns)

Index(['Country', 'state', 'family_history', 'treatment', 'work_interfere',
       'no_employees', 'remote_work', 'benefits', 'wellness_program',
       'anonymity', 'leave', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'mental_health_consequence'],
      dtype='object')


In [7]:
reduced_dataset[cat_columns] = reduced_dataset[cat_columns].apply(lambda x: x.cat.codes)
print(reduced_dataset.head(20))

    Age  Country  state  family_history  treatment  work_interfere  \
0    37       45     11               0          1               2   
1    44       45     12               0          0               3   
2    32        7      6               0          0               3   
3    31       44      6               1          1               2   
4    31       45     38               0          0               1   
5    33       45     37               1          0               4   
6    35       45     19               1          1               4   
7    39        7      6               0          0               1   
8    42       45     11               1          1               4   
9    23        7      6               0          0               1   
10   31       45     30               0          1               4   
11   29        6      6               0          0               1   
12   42       45      2               1          1               4   
13   36       45    

In [11]:
#Split-out validation dataset

from sklearn import model_selection
reduced_array = reduced_dataset.values
reduced_X = reduced_array[:,0:19]
reduced_Y = reduced_array[:,19]
validation_size = 0.20
seed = 7
reduced_X_train, reduced_X_validation, reduced_Y_train, reduced_Y_validation = model_selection.train_test_split(reduced_X, reduced_Y, test_size = validation_size, random_state = seed)

In [12]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(reduced_Y)
encoded_Y = encoder.transform(reduced_Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
print(dummy_y)

[[0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [29]:
print(encoded_Y)

[1 0 1 ... 2 2 0]


In [15]:
print(reduced_Y)

[1 0 1 ... 2 2 0]


In [21]:
# define baseline model
def baseline_model():
    # create model
    net_model = Sequential()
    net_model.add(Dense(30, input_dim=19, activation='relu'))
    net_model.add(Dense(3, activation='softmax'))
    # Compile model
    net_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net_model

In [22]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)

In [23]:
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [24]:
results = cross_val_score(estimator, reduced_X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 59.01% (3.07%)


In [27]:
# Predicting the Test set results
import sklearn
estimator.fit(reduced_X,reduced_Y)
predictions = estimator.predict(reduced_X)
predictions = (predictions > 0.5)

In [28]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

print(accuracy_score(reduced_Y, predictions))
print(classification_report(reduced_Y, predictions))

0.5615567911040509
              precision    recall  f1-score   support

           0       0.70      0.57      0.63       477
           1       0.50      0.89      0.64       491
           2       0.00      0.00      0.00       291

   micro avg       0.56      0.56      0.56      1259
   macro avg       0.40      0.49      0.42      1259
weighted avg       0.46      0.56      0.49      1259



  'precision', 'predicted', average, warn_for)
