In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from mlxtend.classifier import StackingClassifier
import tensorflow as tf
import argparse

In [None]:
# Load the raw survey responses and take a first look at their structure.
df = pd.read_csv("survey.csv")
print(df.shape)
# DataFrame.info is a method: call it so the column / dtype / non-null summary
# is printed (printing the attribute only shows the bound-method repr).
df.info()

In [None]:
# Summary statistics using DataFrame.describe's default column selection.
numeric_summary = df.describe()
print(numeric_summary)

In [None]:
# Summary statistics for every column, whatever its dtype.
full_summary = df.describe(include='all')
print(full_summary)

In [None]:
# Checking bad data: fraction of missing values in each column
# (null count divided by the number of rows).
n_rows = len(df)
null_counts = df.isnull().sum()
total = null_counts / n_rows
print(total)

In [None]:
# Drop the columns that are not used in the rest of the analysis in a single
# call, rebinding `df` instead of mutating it in place.
df = df.drop(columns=['comments', 'state', 'Timestamp'])

# Largest per-column count of missing values that is still left. The value is
# printed so that the check is actually visible (a bare expression in the
# middle of a cell is silently discarded).
print(df.isnull().sum().max())

df.head(5)

In [None]:
# Placeholder values written into missing cells, one per data type.
# `defaultString` is referenced again by later cells that map the placeholder
# to a real category.
defaultInt = 0
defaultString = 'NaN'
defaultFloat = 0.0

# Create lists by data type.
intFeatures = ['Age']
floatFeatures = []
# Every remaining non-numeric column is treated as a string feature. Deriving
# the list from the dtypes (instead of leaving it empty) ensures that missing
# values in the categorical columns really are replaced by `defaultString`.
stringFeatures = [
    col for col in df.columns
    if col not in intFeatures
    and col not in floatFeatures
    and not pd.api.types.is_numeric_dtype(df[col])
]

# Clean the NaN's
for feature in df:
    if feature in intFeatures:
        df[feature] = df[feature].fillna(defaultInt)
    elif feature in stringFeatures:
        df[feature] = df[feature].fillna(defaultString)
    elif feature in floatFeatures:
        df[feature] = df[feature].fillna(defaultFloat)
    else:
        # Only reached for numeric columns that were not listed above.
        print('Error: Feature %s not identified.' % feature)

# Show a preview only; printing the full frame floods the notebook output.
df.head()

In [None]:
gender = df['Gender'].unique()
print(gender)

# Map known misspellings / abbreviations onto three canonical categories.
mistakes = ['maile', 'Make', 'Mal', 'msle', 'Mail', 'Malr', 'M', 'm', 'Man']
mistakes_f = ['female', 'f', 'F', 'Woman', 'woman', 'Femake','Female ', 'femail']
mis = ['Trans-female', 'non-binary', 'Trans woman', 'Female (Trans)', 'Neuter']
df['Gender'] = (
    df['Gender']
    .replace(mistakes, 'Male')
    .replace(mistakes_f, 'Female')
    .replace(mis, 'Others')
)

# Categories that are kept. The name deliberately avoids `list`, which would
# shadow the builtin for every later cell of the notebook.
valid_genders = ['Male', 'Female', 'Others']

# NOTE(review): `stk_list` is not referenced by any visible cell; it is kept
# unchanged in case a cell outside this view relies on it.
stk_list = ['A little about you', 'p', 'Male-ish', 'maile', 'Trans-female', 'Cis-female', 'something kinda male?', 'Cis Male', 
            'Mal', 'Make', 'Nah', 'Femake', 'non-binary', 'fluid','queer/she/they', 'All', 'Enby', 'Genderqueer', 'Guy (-ish) ^_^', 'Male (CIS)', 'male leaning androgynous', 'Androgyne', 'Agender','Cis Female', 'cis-female/femme', 'msle', 
            'queer','Female (trans)', 'Female (cis)','Mail', 'cis male', 'Malr', 'femail', 'Cis Man', 'ostensibly male, unsure what that really means']

# Rows whose Gender is not one of the canonical categories are dropped.
# `.copy()` makes the result an independent frame so that later column
# assignments do not trigger SettingWithCopyWarning.
# NOTE(review): values that are not listed above (for example other
# capitalisations) are dropped as well - confirm this is intended.
df = df[df['Gender'].isin(valid_genders)].copy()
print(df['Gender'].unique())

In [None]:
# Current (rows, columns) of the working frame.
print(df.shape)

In [None]:
# Complete missing ages with the median. The column is reassigned rather than
# filled through a chained `inplace=True` call, which is not guaranteed to
# write back to the frame.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Ages below 18 or above 120 are treated as invalid and replaced by the median.
# The median is re-evaluated before each replacement, and `.loc` writes
# straight into the frame instead of going through a Series alias.
df.loc[df['Age'] < 18, 'Age'] = df['Age'].median()
df.loc[df['Age'] > 120, 'Age'] = df['Age'].median()

# Ranges of Age.
# NOTE(review): the bins stop at 100 while ages up to 120 are accepted above;
# an age between 101 and 120 would get a missing age_range.
df['age_range'] = pd.cut(df['Age'], [0,20,30,65,100], labels=["0-20", "21-30", "31-65", "66-100"], include_lowest=True)

# Only a small share of self_employed answers is missing (the original note
# quotes 0.014, presumably a fraction, i.e. about 1.4%), so treat them as
# NOT self employed. Both real NaN values and the "NaN" placeholder string
# from defaultString are mapped to 'No'.
df['self_employed'] = df['self_employed'].fillna('No').replace([defaultString], 'No')
print(df['self_employed'].unique())

In [None]:
# A share of the work_interfere answers is missing (the original note quotes
# 0.20, presumably a fraction, i.e. about 20%), so change them to "Dont know".
# Both real NaN values and the "NaN" placeholder string from defaultString
# are mapped, so the cell does not depend on how missing values were filled
# earlier.
df['work_interfere'] = df['work_interfere'].fillna('Dont know').replace([defaultString], 'Dont know')
print(df['work_interfere'].unique())

In [None]:
# Label-encode every column of the frame (numeric ones included) and remember,
# per column, the original values in encoder order under 'label_<column>'.
labelDict = {}
for feature in df:
    le = preprocessing.LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    labelDict['label_' + feature] = [*le.classes_]

for key, value in labelDict.items():
    print(key, value)

# Country is not used from here on.
df = df.drop(columns=['Country'])
df.head()

In [None]:
# Table of missing values per column: absolute count and share of rows,
# both sorted from most to least missing.
null_counts = df.isnull().sum()
row_counts = df.isnull().count()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / row_counts).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print(missing_data)

In [None]:
# Correlation matrix of all (encoded) columns, drawn as a heatmap on an
# explicitly created axes.
corrmat = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
plt.show()

In [None]:
# Annotated heatmap restricted to the k columns most correlated with treatment.
k = 10
cols = corrmat.nlargest(k, 'treatment')['treatment'].index
cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

In [None]:
# Histogram of the Age column (24 bins).
plt.figure(figsize=(12, 8))
sns.histplot(data=df, x="Age", bins=24)
plt.title("Distribution and density by Age")
plt.xlabel("Age")

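Most respondents fall within the 10 to 20 range on the Age axis. Note that Age was label-encoded in an earlier cell, so the axis shows encoded category indices rather than ages in years, and the histogram covers all respondents rather than only those who sought treatment.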

In [None]:
# One Age histogram per treatment value, side by side.
# FacetGrid.map returns the grid itself, so the calls can be chained.
j = sns.FacetGrid(df, col='treatment').map(sns.histplot, "Age")

In [None]:
# Number of respondents per treatment value.
plt.figure(figsize=(12,8))
# The x axis shows `treatment`, so the tick labels must be the original
# treatment values (not the Gender ones, which have a different length and
# meaning).
labels = labelDict['label_treatment']
j = sns.countplot(x="treatment", data=df)
j.set_xticks(range(len(labels)))
j.set_xticklabels(labels)
plt.title('Total Distribution by treated or not')

In [None]:
# Mean of the encoded treatment flag per age range, split by gender.
age_range_labels = labelDict['label_age_range']
gender_labels = labelDict['label_Gender']
j = sns.catplot(
    data=df, x="age_range", y="treatment", hue="Gender",
    kind="bar", errorbar=None, aspect=2, legend_out=True,
)
j.set_xticklabels(age_range_labels)
plt.title('Probability of mental health condition')
plt.ylabel('Probability x 100')
plt.xlabel('Age')
# Replace the encoded hue values in the legend with the original gender names.
for legend_text, gender_name in zip(j.legend.texts, gender_labels):
    legend_text.set_text(gender_name)
# Leave room on the right for the legend drawn outside the axes.
j.figure.subplots_adjust(top=0.9, right=0.8)
plt.show()

In [None]:
# Mean of the encoded treatment flag per family-history answer, split by gender.
family_history_labels = labelDict['label_family_history']
gender_labels = labelDict['label_Gender']
j = sns.catplot(
    data=df, x="family_history", y="treatment", hue="Gender",
    kind="bar", errorbar=None, aspect=2, legend_out=True,
)
j.set_xticklabels(family_history_labels)
plt.title('Probability of mental health condition')
plt.ylabel('Probability x 100')
plt.xlabel('Family History')
# Replace the encoded hue values in the legend with the original gender names.
for legend_text, gender_name in zip(j.legend.texts, gender_labels):
    legend_text.set_text(gender_name)
# Leave room on the right for the legend drawn outside the axes.
j.figure.subplots_adjust(top=0.9, right=0.8)
plt.show()

In [None]:
# Mean of the encoded treatment flag per care-options answer, split by gender.
care_option_labels = labelDict['label_care_options']
gender_labels = labelDict['label_Gender']
j = sns.catplot(
    data=df, x="care_options", y="treatment", hue="Gender",
    kind="bar", errorbar=None, aspect=2, legend_out=True,
)
j.set_xticklabels(care_option_labels)
plt.title('Probability of mental health condition')
plt.ylabel('Probability x 100')
plt.xlabel('Care options')
# Replace the encoded hue values in the legend with the original gender names.
for legend_text, gender_name in zip(j.legend.texts, gender_labels):
    legend_text.set_text(gender_name)
# Leave room on the right for the legend drawn outside the axes.
j.figure.subplots_adjust(top=0.9, right=0.8)
plt.show()

In [None]:
# Mean of the encoded treatment flag per benefits answer, split by gender.
o = labelDict['label_benefits']
# The tick labels and the axis title describe `benefits`, so that is the
# column that has to be plotted (the previous x="care_options" repeated the
# care-options chart under benefits labels).
j = sns.catplot(x="benefits", y="treatment", hue="Gender", data=df, kind="bar", errorbar=None, aspect=2, legend_out = True)
j.set_xticklabels(o)
plt.title('Probability of mental health condition')
plt.ylabel('Probability x 100')
plt.xlabel('Benefits')
# Replace the encoded hue values in the legend with the original gender names.
new_labels = labelDict['label_Gender']
for t, l in zip(j._legend.texts, new_labels): t.set_text(l)
# Leave room on the right for the legend drawn outside the axes.
j.fig.subplots_adjust(top=0.9,right=0.8)
plt.show()

In [None]:
# Mean of the encoded treatment flag per work-interference answer, split by gender.
work_interfere_labels = labelDict['label_work_interfere']
gender_labels = labelDict['label_Gender']
j = sns.catplot(
    data=df, x="work_interfere", y="treatment", hue="Gender",
    kind="bar", errorbar=None, aspect=2, legend_out=True,
)
j.set_xticklabels(work_interfere_labels)
plt.title('Probability of mental health condition')
plt.ylabel('Probability x 100')
plt.xlabel('Work interfere')
# Replace the encoded hue values in the legend with the original gender names.
for legend_text, gender_name in zip(j.legend.texts, gender_labels):
    legend_text.set_text(gender_name)
# Leave room on the right for the legend drawn outside the axes.
j.figure.subplots_adjust(top=0.9, right=0.8)
plt.show()

In [None]:
# Rescale Age with a min-max scaler fitted on the Age column itself.
scaler = MinMaxScaler()
age_scaled = scaler.fit_transform(df[['Age']])
df['Age'] = age_scaled
df.head()

In [None]:
# Correlation of every column with the treatment column.
correlation_matrix = df.corr()
corre = correlation_matrix['treatment']
print(corre)

In [None]:
# Features used by all models below, the target, and the train/test split.
feature_cols = ['Age', 'family_history', 'benefits', 'care_options', 'anonymity', 'obs_consequence']
x = df[feature_cols]
y = df.treatment
X_train1, X_test1, y_train1, y_test1 = train_test_split(x, y, test_size=0.30, random_state=0)

# Extra-trees forest fitted on all rows, used only to rank the features.
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)
forest.fit(x, y)
importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
# The bars are drawn in `indices` order, so the tick labels must follow the
# same order; labelling them in the original column order would attach the
# wrong feature name to each bar.
labels = [feature_cols[i] for i in indices]

plt.figure(figsize=(12,8))
plt.title("Feature importances")
plt.bar(range(x.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(x.shape[1]), labels, rotation='vertical')
plt.xlim([-1, x.shape[1]])
plt.show()

In [64]:
def evalModel(model,X_test1, y_test1, y_pred_class, plot=True):
    """Print evaluation metrics for a fitted binary classifier and optionally plot them.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test1 : feature matrix the model is evaluated on.
    y_test1 : pandas Series with the true labels (``value_counts``, ``mean``
        and ``values`` are used on it).
    y_pred_class : class labels predicted for ``X_test1``.
    plot : when truthy, also draw the histogram of predicted probabilities
        and the ROC curve.

    Returns
    -------
    The accuracy of ``y_pred_class`` against ``y_test1``.
    """
    accuracy = metrics.accuracy_score(y_test1, y_pred_class)
    print('Accuracy:', accuracy)
    print('Null accuracy:\n', y_test1.value_counts())
    print('Percentage of ones:', y_test1.mean())
    print('Percentage of zeros:',1 - y_test1.mean())
    print('True:', y_test1.values[0:25])
    print('Pred:', y_pred_class[0:25])

    # Confusion matrix, indexed as [actual, predicted].
    confusion = metrics.confusion_matrix(y_test1, y_pred_class)
    TN = confusion[0, 0]
    FP = confusion[0, 1]

    # Visualize the confusion matrix.
    sns.heatmap(confusion,annot=True,fmt="d")
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    print('Classification Accuracy:', accuracy)
    print('Classification Error:', 1 - accuracy)
    fp_rate = FP / float(TN + FP)
    print('False Positive Rate:', fp_rate)
    print('Precision:', metrics.precision_score(y_test1, y_pred_class))
    print('First 10 predicted responses:\n', model.predict(X_test1)[0:10])

    # Compute the class probabilities once and reuse them below.
    class_probabilities = model.predict_proba(X_test1)
    print('First 10 predicted probabilities of class members:\n', class_probabilities[0:10])
    # Probability of the positive class (second column).
    y_pred_prob = class_probabilities[:, 1]

    if plot:
        # Histogram of predicted probabilities.
        plt.rcParams['font.size'] = 12
        plt.hist(y_pred_prob, bins=8)
        plt.xlim(0,1)
        plt.title('Histogram of predicted probabilities')
        plt.xlabel('Predicted probability of treatment')
        plt.ylabel('Frequency')

    print('First 10 predicted probabilities:\n', y_pred_prob[0:10])
    roc_auc = metrics.roc_auc_score(y_test1, y_pred_prob)
    fpr, tpr, thresholds = metrics.roc_curve(y_test1, y_pred_prob)

    if plot:
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.rcParams['font.size'] = 12
        # Name the evaluated model in the title instead of hard-coding one.
        plt.title('ROC curve for treatment classifier using ' + type(model).__name__)
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.legend(loc="lower right")
        plt.show()
    return accuracy

In [65]:
def tuningGridSearch(knn):
    """Grid-search ``n_neighbors`` (1 to 30) for ``knn`` with 10-fold CV accuracy.

    The search is fitted on the global training split (``X_train1``,
    ``y_train1``) so that the held-out test rows do not influence the choice
    of the hyper-parameter.

    Parameters
    ----------
    knn : estimator accepting an ``n_neighbors`` parameter.

    Returns
    -------
    The fitted ``GridSearchCV`` object (callers that ignore the return value
    keep working).
    """
    k_range = range(1,31)
    param_grid = {'n_neighbors': k_range}
    print(param_grid)
    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    grid.fit(X_train1, y_train1)
    print('GridSearch best score', grid.best_score_)
    print('GridSearch best params', grid.best_params_)
    print('GridSearch best estimator', grid.best_estimator_)
    return grid
    

In [66]:
def logisticRegression():
    """Fit a default LogisticRegression on the global training split,
    evaluate it on the global test split with evalModel and print the
    accuracy that evalModel returns."""
    classifier = LogisticRegression()
    classifier.fit(X_train1, y_train1)
    test_predictions = classifier.predict(X_test1)
    print(evalModel(classifier, X_test1, y_test1, test_predictions))

In [None]:
# Train and evaluate the logistic-regression model.
logisticRegression()

In [68]:
def evalModel(model, X_test1, y_test1, y_pred_class, plot=True):
    """Print evaluation metrics for a fitted binary classifier and optionally plot them.

    This definition replaces any earlier ``evalModel`` in the notebook; the
    ROC title is derived from the model's class so the function works for any
    classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test1 : feature matrix the model is evaluated on.
    y_test1 : pandas Series with the true labels (``value_counts``, ``mean``
        and ``values`` are used on it).
    y_pred_class : class labels predicted for ``X_test1``.
    plot : when truthy, also draw the histogram of predicted probabilities
        and the ROC curve.

    Returns
    -------
    The accuracy of ``y_pred_class`` against ``y_test1``.
    """
    accuracy = metrics.accuracy_score(y_test1, y_pred_class)
    print('Accuracy:', accuracy)
    print('Null accuracy:\n', y_test1.value_counts())
    print('Percentage of ones:', y_test1.mean())
    print('Percentage of zeros:',1 - y_test1.mean())
    print('True:', y_test1.values[0:25])
    print('Pred:', y_pred_class[0:25])

    # Confusion matrix, indexed as [actual, predicted].
    confusion = metrics.confusion_matrix(y_test1, y_pred_class)
    TN = confusion[0, 0]
    FP = confusion[0, 1]

    # Visualize the confusion matrix.
    sns.heatmap(confusion,annot=True,fmt="d")
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    print('Classification Accuracy:', accuracy)
    print('Classification Error:', 1 - accuracy)
    fp_rate = FP / float(TN + FP)
    print('False Positive Rate:', fp_rate)
    print('Precision:', metrics.precision_score(y_test1, y_pred_class))
    print('First 10 predicted responses:\n', model.predict(X_test1)[0:10])

    # Compute the class probabilities once and reuse them below.
    class_probabilities = model.predict_proba(X_test1)
    print('First 10 predicted probabilities of class members:\n', class_probabilities[0:10])
    # Probability of the positive class (second column).
    y_pred_prob = class_probabilities[:, 1]

    if plot:
        # Histogram of predicted probabilities.
        plt.rcParams['font.size'] = 12
        plt.hist(y_pred_prob, bins=8)
        plt.xlim(0,1)
        plt.title('Histogram of predicted probabilities')
        plt.xlabel('Predicted probability of treatment')
        plt.ylabel('Frequency')

    print('First 10 predicted probabilities:\n', y_pred_prob[0:10])
    roc_auc = metrics.roc_auc_score(y_test1, y_pred_prob)
    fpr, tpr, thresholds = metrics.roc_curve(y_test1, y_pred_prob)

    if plot:
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.rcParams['font.size'] = 12
        # Name the evaluated model in the title instead of hard-coding one.
        plt.title('ROC curve for treatment classifier using ' + type(model).__name__)
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.legend(loc="lower right")
        plt.show()
    return accuracy

In [None]:
def Knn(n_neighbors=27, weights='uniform'):
    """Run the n_neighbors grid search, then fit and evaluate a KNN classifier.

    Parameters
    ----------
    n_neighbors : number of neighbours of the final model (default 27, the
        value that was previously hard-coded).
    weights : neighbour weighting of the final model (default 'uniform').

    The grid search only reports its findings; the final model is built from
    the arguments above. It is fitted on the global training split and
    evaluated on the global test split with evalModel, whose accuracy is
    printed.
    """
    # Calculating the best parameters (results are printed by the helper).
    tuningGridSearch(KNeighborsClassifier(n_neighbors=6))

    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
    knn.fit(X_train1, y_train1)

    y_pred_class = knn.predict(X_test1)
    accuracy_score = evalModel(knn,X_test1, y_test1, y_pred_class, True)
    print(accuracy_score)
Knn()

In [70]:
def evalModel(model,X_test1, y_test1, y_pred_class, plot=True):
    """Print evaluation metrics for a fitted binary classifier and optionally plot them.

    This definition replaces any earlier ``evalModel`` in the notebook; the
    ROC title is derived from the model's class so the function works for any
    classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test1 : feature matrix the model is evaluated on.
    y_test1 : pandas Series with the true labels (``value_counts``, ``mean``
        and ``values`` are used on it).
    y_pred_class : class labels predicted for ``X_test1``.
    plot : when truthy, also draw the histogram of predicted probabilities
        and the ROC curve.

    Returns
    -------
    The accuracy of ``y_pred_class`` against ``y_test1``.
    """
    accuracy = metrics.accuracy_score(y_test1, y_pred_class)
    print('Accuracy:', accuracy)
    print('Null accuracy:\n', y_test1.value_counts())
    print('Percentage of ones:', y_test1.mean())
    print('Percentage of zeros:',1 - y_test1.mean())
    print('True:', y_test1.values[0:25])
    print('Pred:', y_pred_class[0:25])

    # Confusion matrix, indexed as [actual, predicted].
    confusion = metrics.confusion_matrix(y_test1, y_pred_class)
    TN = confusion[0, 0]
    FP = confusion[0, 1]

    # Visualize the confusion matrix.
    sns.heatmap(confusion,annot=True,fmt="d")
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    print('Classification Accuracy:', accuracy)
    print('Classification Error:', 1 - accuracy)
    fp_rate = FP / float(TN + FP)
    print('False Positive Rate:', fp_rate)
    print('Precision:', metrics.precision_score(y_test1, y_pred_class))
    print('First 10 predicted responses:\n', model.predict(X_test1)[0:10])

    # Compute the class probabilities once and reuse them below.
    class_probabilities = model.predict_proba(X_test1)
    print('First 10 predicted probabilities of class members:\n', class_probabilities[0:10])
    # Probability of the positive class (second column).
    y_pred_prob = class_probabilities[:, 1]

    if plot:
        # Histogram of predicted probabilities.
        plt.rcParams['font.size'] = 12
        plt.hist(y_pred_prob, bins=8)
        plt.xlim(0,1)
        plt.title('Histogram of predicted probabilities')
        plt.xlabel('Predicted probability of treatment')
        plt.ylabel('Frequency')

    print('First 10 predicted probabilities:\n', y_pred_prob[0:10])
    roc_auc = metrics.roc_auc_score(y_test1, y_pred_prob)
    fpr, tpr, thresholds = metrics.roc_curve(y_test1, y_pred_prob)

    if plot:
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.rcParams['font.size'] = 12
        # Name the evaluated model in the title instead of hard-coding one.
        plt.title('ROC curve for treatment classifier using ' + type(model).__name__)
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.legend(loc="lower right")
        plt.show()
    return accuracy

In [72]:
def Classifier():
    """Fit an AdaBoostClassifier on the global training split, evaluate it on
    the global test split with evalModel and print the returned accuracy.

    The estimator is seeded (random_state=0, the same seed used for the data
    split) so that repeated runs of the notebook give the same model.
    """
    booster = AdaBoostClassifier(random_state=0)
    booster.fit(X_train1, y_train1)
    y_pred_class = booster.predict(X_test1)
    accuracy_score = evalModel(booster,X_test1, y_test1, y_pred_class)
    print(accuracy_score)

In [None]:
# Train and evaluate the AdaBoost model.
Classifier()

In [33]:
# Settings for the TensorFlow estimators below.
batch_size = 100    # rows per batch passed to the input functions
train_steps = 1000  # number of training steps passed to model.train
# 70/30 split of the same x / y with random_state=0.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)
def train_input_fn(features, labels, batch_size):
    """Build the training dataset from (features, labels).

    The features are converted to a dict, then the dataset is shuffled with a
    buffer of 1000, repeated without limit and split into batches of
    `batch_size`.
    """
    feature_dict = dict(features)
    dataset = tf.data.Dataset.from_tensor_slices((feature_dict, labels))
    dataset = dataset.shuffle(1000)
    dataset = dataset.repeat()
    return dataset.batch(batch_size)
def eval_input_fn(features, labels, batch_size):
    """Build the evaluation / prediction dataset.

    The features are converted to a dict. When `labels` is None only the
    features are sliced; otherwise (features, labels) pairs are. The dataset
    is batched with `batch_size` and returned without shuffling or repeating.
    """
    feature_dict = dict(features)
    inputs = feature_dict if labels is None else (feature_dict, labels)
    return tf.data.Dataset.from_tensor_slices(inputs).batch(batch_size)

In [34]:
# One numeric feature column per model input.
age = tf.feature_column.numeric_column("Age")
obs = tf.feature_column.numeric_column("obs_consequence")
family_history = tf.feature_column.numeric_column("family_history")
benefits = tf.feature_column.numeric_column("benefits")
care_options = tf.feature_column.numeric_column("care_options")
anonymity = tf.feature_column.numeric_column("anonymity")

# Currently not used as model inputs:
# leave = tf.feature_column.numeric_column("leave")
# work_interfere = tf.feature_column.numeric_column("work_interfere")

feature_column = [
    age,
    obs,
    family_history,
    benefits,
    care_options,
    anonymity,
]

In [None]:
# DNN classifier with two hidden layers of 20 units, ReLU activations and an
# Adam optimizer whose learning rate follows an exponential decay schedule.
# The optimizer is passed as a zero-argument callable.
model = tf.estimator.DNNClassifier(
    feature_columns=feature_column,
    hidden_units=[20, 20],
    activation_fn=tf.nn.relu,
    optimizer=lambda: tf.keras.optimizers.Adam(
        learning_rate=tf.compat.v1.train.exponential_decay(
            learning_rate=0.01,
            global_step=tf.compat.v1.train.get_global_step(),
            decay_steps=10000,
            decay_rate=0.96,
        )
    ),
)

In [None]:
# Train for `train_steps` steps; train_input_fn shuffles, repeats and batches the training split.
model.train(input_fn=lambda:train_input_fn(X_train, y_train, batch_size), steps=train_steps)

In [None]:
# Evaluate the model on the held-out split.
eval_result = model.evaluate(
    input_fn=lambda:eval_input_fn(X_test, y_test, batch_size))
# The blank lines around the message are real newline escapes (the previous
# text printed a literal "n" before and after the sentence).
print('\nTest set accuracy: {accuracy:0.2f}\n'.format(**eval_result))
# Data for final graph: accuracy as a percentage.
accuracy = eval_result['accuracy'] * 100
print(accuracy)

In [None]:
# DNN regressor with the same architecture and optimizer settings as the
# classifier. NOTE(review): this rebinds `model`, so the classifier created
# earlier is no longer reachable under that name.
model = tf.estimator.DNNRegressor(
    feature_columns=feature_column,
    hidden_units=[20, 20],
    activation_fn=tf.nn.relu,
    optimizer=lambda: tf.keras.optimizers.Adam(
        learning_rate=tf.compat.v1.train.exponential_decay(
            learning_rate=0.01,
            global_step=tf.compat.v1.train.get_global_step(),
            decay_steps=10000,
            decay_rate=0.96,
        )
    ),
)

In [None]:
# Train for `train_steps` steps; train_input_fn shuffles, repeats and batches the training split.
model.train(input_fn=lambda:train_input_fn(X_train, y_train, batch_size), steps=train_steps)


In [None]:
# Evaluate the current `model` on the held-out split and print every metric it
# reports. No accuracy figure is extracted here (presumably because the
# regressor's metrics have no 'accuracy' entry - TODO confirm).
def _held_out_input():
    return eval_input_fn(X_test, y_test, batch_size)

eval_result = model.evaluate(input_fn=_held_out_input)
print(eval_result)