In [132]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [133]:
data = pd.read_csv('survey.csv')

In [134]:
data

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,2015-09-12 11:17:21,26,male,United Kingdom,,No,No,Yes,,26-100,...,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,No,
1255,2015-09-26 01:07:35,32,Male,United States,IL,No,Yes,Yes,Often,26-100,...,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,No,
1256,2015-11-07 12:36:58,34,male,United States,CA,No,Yes,Yes,Sometimes,More than 1000,...,Somewhat difficult,Yes,Yes,No,No,No,No,No,No,
1257,2015-11-30 21:25:06,46,f,United States,NC,No,No,No,,100-500,...,Don't know,Yes,No,No,No,No,No,No,No,


# Cleaning

In [135]:
data.isna().sum()

Unnamed: 0,0
Timestamp,0
Age,0
Gender,0
Country,0
state,515
self_employed,18
family_history,0
treatment,0
work_interfere,264
no_employees,0


In [136]:
data = data.drop('comments', axis=1)

In [137]:
data['self_employed'].unique()

array([nan, 'Yes', 'No'], dtype=object)

In [138]:
data['self_employed'].mode()

Unnamed: 0,self_employed
0,No


In [139]:
data['self_employed'] = data['self_employed'].fillna('No')

In [140]:
data['work_interfere'].unique()

array(['Often', 'Rarely', 'Never', 'Sometimes', nan], dtype=object)

In [141]:
data['work_interfere'].mode()

Unnamed: 0,work_interfere
0,Sometimes


In [142]:
data['work_interfere'] = data['work_interfere'].fillna('Sometimes')

# Feature Engineering

In [143]:
data

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,2014-08-27 11:29:31,37,Female,United States,IL,No,No,Yes,Often,6-25,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,2014-08-27 11:29:37,44,M,United States,IN,No,No,No,Rarely,More than 1000,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,2014-08-27 11:29:44,32,Male,Canada,,No,No,No,Rarely,6-25,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,2014-08-27 11:29:46,31,Male,United Kingdom,,No,Yes,Yes,Often,26-100,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,2014-08-27 11:30:22,31,Male,United States,TX,No,No,No,Never,100-500,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,2015-09-12 11:17:21,26,male,United Kingdom,,No,No,Yes,Sometimes,26-100,...,Don't know,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,No
1255,2015-09-26 01:07:35,32,Male,United States,IL,No,Yes,Yes,Often,26-100,...,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,No
1256,2015-11-07 12:36:58,34,male,United States,CA,No,Yes,Yes,Sometimes,More than 1000,...,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No,No,No
1257,2015-11-30 21:25:06,46,f,United States,NC,No,No,No,Sometimes,100-500,...,Don't know,Don't know,Yes,No,No,No,No,No,No,No


In [144]:
data['Timestamp']

Unnamed: 0,Timestamp
0,2014-08-27 11:29:31
1,2014-08-27 11:29:37
2,2014-08-27 11:29:44
3,2014-08-27 11:29:46
4,2014-08-27 11:30:22
...,...
1254,2015-09-12 11:17:21
1255,2015-09-26 01:07:35
1256,2015-11-07 12:36:58
1257,2015-11-30 21:25:06


In [145]:
data['Year'] = data['Timestamp'].apply(lambda x: int(x[0:4]))
data['Month'] = data['Timestamp'].apply(lambda x: int(x[5:7]))
data['Day'] = data['Timestamp'].apply(lambda x: int(x[8:10]))

data['Hour'] = data['Timestamp'].apply(lambda x: int(x[11:13]))
data['Minute'] = data['Timestamp'].apply(lambda x: int(x[14:16]))
data['Second'] = data['Timestamp'].apply(lambda x: int(x[17:19]))

data = data.drop('Timestamp', axis=1)

In [146]:
data

Unnamed: 0,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,...,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,Year,Month,Day,Hour,Minute,Second
0,37,Female,United States,IL,No,No,Yes,Often,6-25,No,...,No,Maybe,Yes,No,2014,8,27,11,29,31
1,44,M,United States,IN,No,No,No,Rarely,More than 1000,No,...,No,No,Don't know,No,2014,8,27,11,29,37
2,32,Male,Canada,,No,No,No,Rarely,6-25,No,...,Yes,Yes,No,No,2014,8,27,11,29,44
3,31,Male,United Kingdom,,No,Yes,Yes,Often,26-100,No,...,Maybe,Maybe,No,Yes,2014,8,27,11,29,46
4,31,Male,United States,TX,No,No,No,Never,100-500,Yes,...,Yes,Yes,Don't know,No,2014,8,27,11,30,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,26,male,United Kingdom,,No,No,Yes,Sometimes,26-100,No,...,No,No,Don't know,No,2015,9,12,11,17,21
1255,32,Male,United States,IL,No,Yes,Yes,Often,26-100,Yes,...,No,No,Yes,No,2015,9,26,1,7,35
1256,34,male,United States,CA,No,Yes,Yes,Sometimes,More than 1000,No,...,No,No,No,No,2015,11,7,12,36,58
1257,46,f,United States,NC,No,No,No,Sometimes,100-500,Yes,...,No,No,No,No,2015,11,30,21,25,6


# Encoding Features

In [147]:
{column: len(data[column].unique()) for column in data.select_dtypes('object').columns}

{'Gender': 49,
 'Country': 48,
 'state': 46,
 'self_employed': 2,
 'family_history': 2,
 'treatment': 2,
 'work_interfere': 4,
 'no_employees': 6,
 'remote_work': 2,
 'tech_company': 2,
 'benefits': 3,
 'care_options': 3,
 'wellness_program': 3,
 'seek_help': 3,
 'anonymity': 3,
 'leave': 5,
 'mental_health_consequence': 3,
 'phys_health_consequence': 3,
 'coworkers': 3,
 'supervisor': 3,
 'mental_health_interview': 3,
 'phys_health_interview': 3,
 'mental_vs_physical': 3,
 'obs_consequence': 2}

In [148]:
{column: list(data[column].unique()) for column in data.select_dtypes('object').columns}

{'Gender': ['Female',
  'M',
  'Male',
  'male',
  'female',
  'm',
  'Male-ish',
  'maile',
  'Trans-female',
  'Cis Female',
  'F',
  'something kinda male?',
  'Cis Male',
  'Woman',
  'f',
  'Mal',
  'Male (CIS)',
  'queer/she/they',
  'non-binary',
  'Femake',
  'woman',
  'Make',
  'Nah',
  'All',
  'Enby',
  'fluid',
  'Genderqueer',
  'Female ',
  'Androgyne',
  'Agender',
  'cis-female/femme',
  'Guy (-ish) ^_^',
  'male leaning androgynous',
  'Male ',
  'Man',
  'Trans woman',
  'msle',
  'Neuter',
  'Female (trans)',
  'queer',
  'Female (cis)',
  'Mail',
  'cis male',
  'A little about you',
  'Malr',
  'p',
  'femail',
  'Cis Man',
  'ostensibly male, unsure what that really means'],
 'Country': ['United States',
  'Canada',
  'United Kingdom',
  'Bulgaria',
  'France',
  'Portugal',
  'Netherlands',
  'Switzerland',
  'Poland',
  'Australia',
  'Germany',
  'Russia',
  'Mexico',
  'Brazil',
  'Slovenia',
  'Costa Rica',
  'Austria',
  'Ireland',
  'India',
  'South Afric

In [149]:
def clean_gender(g):
    g = str(g).strip().lower()
    if g in ['male', 'm', 'man', 'cis male', 'male ', 'cis man']:
        return 'Male'
    elif g in ['female', 'f', 'woman', 'cis female', 'female ']:
        return 'Female'
    else:
        return 'Other'

In [150]:
data['Gender'] = data['Gender'].apply(clean_gender)

In [151]:
target = 'treatment'

binary_features = [
    'self_employed',
    'family_history',
    'remote_work',
    'tech_company',
    'obs_consequence'
]

ordinal_features = [
    'work_interfere',
    'no_employees',
    'leave'
]

nominal_features = [
    'Gender',
    'Country',
    'state',
    'benefits',
    'care_options',
    'wellness_program',
    'seek_help',
    'anonymity',
    'mental_health_consequence',
    'phys_health_consequence',
    'coworkers',
    'supervisor',
    'mental_health_interview',
    'phys_health_interview',
    'mental_vs_physical'
]

In [152]:
def binary_encode(df, columns, positive_values):
    df = df.copy()
    for column, positive_value in zip(columns, positive_values):
        df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df



def onehot_encode(df, columns, prefixes):
    df = df.copy()
    for column, prefix in zip(columns, prefixes):
        dummies = pd.get_dummies(df[column], prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [153]:
data['leave'].unique()

array(['Somewhat easy', "Don't know", 'Somewhat difficult',
       'Very difficult', 'Very easy'], dtype=object)

In [159]:
binary_positive_values = ['Yes' for feature in binary_features]



ordinal_mappings = {
    'work_interfere': {
        'Never': 0,
        'Rarely': 1,
        'Sometimes': 2,
        'Often': 3
    },
    'no_employees': {
        '1-5': 0,
        '6-25': 1,
        '26-100': 2,
        '100-500': 3,
        '500-1000': 4,
        'More than 1000': 5
    },
    'leave': {
        'Very easy': 4,
        'Somewhat easy': 3,
        "Don't know": 2,
        'Somewhat difficult': 1,
        'Very difficult': 0
    }
}


nominal_prefixes = [
    'ge',
    'co',
    'st',
    'be',
    'ca',
    'we',
    'se',
    'an',
    'mc',
    'ph',
    'cw',
    'su',
    'mi',
    'pi',
    'mp'
]


In [160]:
# --- Binary Encoding ---
for col in binary_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])


# --- nominal mapping
for col, mapping in ordinal_mappings.items():
    data[col] = data[col].map(mapping)


# --- One-Hot Encoding (Nominal) ---
ohe = OneHotEncoder(drop='first', sparse=False)  # ✅ For older versions (<1.2)

nominal_encoded = ohe.fit_transform(data[nominal_features])

# Get column names and create DataFrame
ohe_cols = ohe.get_feature_names_out(nominal_features)
nominal_df = pd.DataFrame(nominal_encoded, columns=ohe_cols, index=data.index)

# Drop original nominal columns and concat new one-hot encoded columns
data = pd.concat([data.drop(columns=nominal_features), nominal_df], axis=1)


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [None]:
data

# Encoding Labels

In [None]:
data = binary_encode(data, columns=['treatment'], positive_values=['Yes'])

In [None]:
print("Remaining non-numeric columns:", len(data.select_dtypes('object').columns))

In [None]:
print("Remaining missing values:", data.isna().sum().sum())

# Splitting/Scaling

In [None]:
y = data['treatment'].copy()
X = data.drop('treatment', axis=1).copy()

In [None]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

# Training

In [None]:
X.shape

In [None]:
print("Class Distribution (Positive to Negative): {:.1f}% / {:.1f}%".format(y_train.mean() * 100, (1 - y_train.mean()) * 100))

In [None]:
inputs = tf.keras.Input(shape=(X.shape[1],))
x = tf.keras.layers.Dense(1024, activation='relu')(inputs)
x = tf.keras.layers.Dense(1024, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs, outputs)


model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)


batch_size = 64
epochs = 50

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau()
    ]
)

# Results

In [None]:
plt.figure(figsize=(12, 6))

plt.plot(range(epochs), history.history['accuracy'], label="Training Accuracy")
plt.plot(range(epochs), history.history['val_accuracy'], label="Validation Accuracy")

plt.xlabel("Epoch")
plt.ylabel("Accuracy")

plt.legend()
plt.title("Accuracy Over Time")

plt.show()

In [None]:
model.evaluate(X_test, y_test)