In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

# Set Spark version and Java environment variables
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION']=spark_version
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,688 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,824 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,788 kB]
Get:

In [None]:
# prompt: start spark
from google.colab import drive
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session when there is one,
# so re-running this cell does not start a second session.
session_builder = SparkSession.builder.appName("Mental Health Dataset")
spark = session_builder.getOrCreate()

# Make Google Drive available under /content/drive; the dataset is read from there.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Use a namespaced import for the Spark column functions. The notebook-level
# `from pyspark.sql.functions import col, sum` shadows Python's built-in `sum`,
# and `col` is later overwritten by `for col in ...` loops, which would break
# this cell when it is re-run.
from pyspark.sql import functions as F

# Load dataset into a Spark DataFrame
df = spark.read.csv("/content/drive/My Drive/Colab Notebooks/Mental Health Dataset.csv", header=True, inferSchema=True)

# Display the schema and first few rows of the DataFrame
print("DataFrame Schema:")
df.printSchema()
df.show()

# Count total and distinct rows to identify duplicates
total_rows = df.count()
distinct_rows = df.distinct().count()
duplicate_count = total_rows - distinct_rows
print(f"\nTotal rows before removing duplicates: {total_rows}")
print(f"\nNumber of distinct rows: {distinct_rows}")
print(f"\nNumber of duplicate rows: {duplicate_count}")

# Drop duplicate rows
df = df.dropDuplicates()
print(f"\nTotal rows after removing duplicates: {df.count()}")

# Count and display the number of null values in each column:
# isNull() -> boolean, cast to 0/1, summed per column.
null_counts = df.select(
    [F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns]
)
print("\nNumber of missing values per column:")
null_counts.show()

# Drop rows with any null values
df = df.na.drop()
print(f"Total rows after removing rows with null values: {df.count()}")

# Optionally, show the cleaned DataFrame
df.show()



DataFrame Schema:
root
 |-- Timestamp: timestamp (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- self_employed: string (nullable = true)
 |-- family_history: string (nullable = true)
 |-- treatment: string (nullable = true)
 |-- Days_Indoors: string (nullable = true)
 |-- Growing_Stress: string (nullable = true)
 |-- Changes_Habits: string (nullable = true)
 |-- Mental_Health_History: string (nullable = true)
 |-- Mood_Swings: string (nullable = true)
 |-- Coping_Struggles: string (nullable = true)
 |-- Work_Interest: string (nullable = true)
 |-- Social_Weakness: string (nullable = true)
 |-- mental_health_interview: string (nullable = true)
 |-- care_options: string (nullable = true)

+-------------------+------+--------------+----------+-------------+--------------+---------+------------+--------------+--------------+---------------------+-----------+----------------+-------------+---------

In [None]:
# Save the cleaned DataFrame as CSV. Spark writes a directory of part files
# at `output_path`, not a single CSV file.
output_path = "/cleaned_mentalhealth_data.csv"
# The default save mode raises an error when the path already exists, which
# makes this cell fail on every re-run; overwrite the previous export instead.
df.write.mode("overwrite").option("header", "true").csv(output_path)
print(f"\nCleaned data has been saved to: {output_path}")



Cleaned data has been saved to: /cleaned_mentalhealth_data.csv


In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert Spark DataFrame to Pandas DataFrame for ML processing
pandas_df = df.toPandas()

# Encode target column. `label_encoder` keeps the target's class mapping
# (label_encoder.classes_) because the feature columns below get their own
# encoders instead of refitting this one.
label_encoder = LabelEncoder()
pandas_df['Treatment'] = label_encoder.fit_transform(pandas_df['treatment'])

# Encode other categorical features
categorical_cols = ['Gender', 'Country', 'Occupation', 'self_employed', 'family_history',
                    'Mental_Health_History', 'Mood_Swings', 'Coping_Struggles',
                    'Work_Interest', 'Social_Weakness', 'mental_health_interview', 'care_options']

# One fitted encoder per column, so each integer code can be mapped back to
# its original category later via feature_encoders[name].classes_.
feature_encoders = {}
# The loop variable is deliberately NOT named `col`: that would overwrite the
# `col` function imported from pyspark.sql.functions for the rest of the session.
for column_name in categorical_cols:
    if pandas_df[column_name].dtype == 'object':
        column_encoder = LabelEncoder()
        pandas_df[column_name] = column_encoder.fit_transform(pandas_df[column_name])
        feature_encoders[column_name] = column_encoder



In [None]:
# Show the dtype of every column to spot the ones that are still non-numeric
print(pandas_df.dtypes)

# List the distinct raw values of the columns that have not been encoded yet
for suspicious_column in ('Days_Indoors', 'Growing_Stress', 'Changes_Habits'):
    print(pandas_df[suspicious_column].unique())


Timestamp                  datetime64[ns]
Gender                              int64
Country                             int64
Occupation                          int64
self_employed                       int64
family_history                      int64
treatment                          object
Days_Indoors                       object
Growing_Stress                     object
Changes_Habits                     object
Mental_Health_History               int64
Mood_Swings                         int64
Coping_Struggles                    int64
Work_Interest                       int64
Social_Weakness                     int64
mental_health_interview             int64
care_options                        int64
Treatment                           int64
dtype: object
['More than 2 months' '1-14 days' '31-60 days' 'Go out Every day'
 '15-30 days']
['Yes' 'Maybe' 'No']
['No' 'Maybe' 'Yes']


In [None]:
# Map Days_Indoors to an ordinal scale (least to most time spent indoors).
# The keys must match the categories actually present in the data
# ('Go out Every day', '1-14 days', '15-30 days', '31-60 days',
# 'More than 2 months'); any category missing from the map would silently
# become NaN and then be replaced by the column mean during imputation.
ordinal_map_days = {
    'Go out Every day': 0,
    '1-14 days': 1,
    '15-30 days': 2,
    '31-60 days': 3,
    'More than 2 months': 4,
}
if pandas_df['Days_Indoors'].dtype == 'object':
    # Fail loudly on categories the map does not cover instead of producing NaN.
    unmapped_days = set(pandas_df['Days_Indoors'].dropna().unique()) - set(ordinal_map_days)
    if unmapped_days:
        raise ValueError(f"Unmapped Days_Indoors categories: {sorted(unmapped_days)}")
    pandas_df['Days_Indoors'] = pandas_df['Days_Indoors'].map(ordinal_map_days)

# Encode other object columns if any remain. A fresh encoder is used per
# column, and the loop variable is not named `col` so the pyspark `col`
# function stays usable in the rest of the notebook.
for column_name in ['Growing_Stress', 'Changes_Habits']:
    if pandas_df[column_name].dtype == 'object':
        pandas_df[column_name] = LabelEncoder().fit_transform(pandas_df[column_name])

In [None]:
# === HANDLE MISSING VALUES & SCALE ===
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Prepare features and target
X = pandas_df.drop(columns=['Treatment', 'Timestamp', 'treatment'], errors='ignore')
y = pandas_df['Treatment']

# Fit the preprocessing on training rows only, so that test-set statistics do
# not leak into the imputer means or the scaler mean/variance.
# train_test_split derives its partition from the number of samples,
# test_size and random_state only, so splitting row positions with the same
# settings as the later train/test split cell (test_size=0.2, random_state=42)
# yields exactly the rows that cell will put into X_train.
train_rows, _ = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

# Impute missing values (numeric): means learned from training rows,
# then applied to every row.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X.iloc[train_rows])
X_imputed = imputer.transform(X)

# Standardize: statistics learned from training rows, then applied to every row.
scaler = StandardScaler()
scaler.fit(X_imputed[train_rows])
X_scaled = scaler.transform(X_imputed)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# === TRAIN & EVALUATE MLP ===
# Two hidden layers (64 and 32 units), at most 500 training iterations.
# fit() returns the estimator itself, so construction and training can be chained.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    max_iter=500,
    random_state=42,
).fit(X_train, y_train)

# Predicted class labels for the held-out rows
y_pred = mlp.predict(X_test)

In [None]:
# Score the held-out predictions: overall accuracy, then per-class metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n")
report_text = classification_report(y_test, y_pred)
print(report_text)

# === BUSINESS INSIGHTS ===
# Summing the encoded target counts the rows labelled 1.
# NOTE(review): this treats label 1 as "received treatment" -- confirm
# against the fitted target encoder's class order.
total_cases = len(pandas_df)
treated_cases = pandas_df['Treatment'].sum()
treatment_ratio = treated_cases / total_cases
untreated_cases = total_cases - treated_cases

print("\n --- Business Insight Summary ---")
print(f"Total Participants: {total_cases}")
print(f"Treated: {treated_cases} ({treatment_ratio:.2%})")
print(f"Untreated: {untreated_cases} ({(1 - treatment_ratio):.2%})")




 Model Accuracy: 78.25%

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.75      0.77     28422
           1       0.77      0.82      0.79     28940

    accuracy                           0.78     57362
   macro avg       0.78      0.78      0.78     57362
weighted avg       0.78      0.78      0.78     57362


 --- Business Insight Summary ---
Total Participants: 286808
Treated: 144501 (50.38%)
Untreated: 142307 (49.62%)


In [None]:
# %%
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

# Set Spark version and Java environment variables
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION'] = spark_version
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Install Spark, Java, and dependencies
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark xgboost

# Initialize Spark
import findspark
findspark.init()

spark = SparkSession.builder.appName("Mental Health Dataset").getOrCreate()

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# %%
# Read the raw CSV from Google Drive; the first row holds the column names
# and column types are inferred from the data.
dataset_path = "/content/drive/My Drive/Colab Notebooks/Mental Health Dataset.csv"
df = spark.read.csv(dataset_path, header=True, inferSchema=True)

# Clean in one pass: remove exact duplicate rows, then rows containing any null.
df = df.dropDuplicates().na.drop()

# %%
from sklearn.preprocessing import LabelEncoder

# Convert Spark DataFrame to Pandas DataFrame
pandas_df = df.toPandas()

# Encode target. `label_encoder` keeps the target's class mapping
# (label_encoder.classes_) because every feature column below gets its own
# encoder instead of refitting this one.
label_encoder = LabelEncoder()
pandas_df['Treatment'] = label_encoder.fit_transform(pandas_df['treatment'])

# Nominal categorical columns, plus the two Yes/Maybe/No columns, are
# label-encoded to integer codes.
categorical_cols = ['Gender', 'Country', 'Occupation', 'self_employed', 'family_history',
                    'Mental_Health_History', 'Mood_Swings', 'Coping_Struggles',
                    'Work_Interest', 'Social_Weakness', 'mental_health_interview', 'care_options']

# One fitted encoder per column so codes can be mapped back to categories.
feature_encoders = {}
# The loop variable is deliberately NOT named `col`: that would overwrite the
# `col` function imported from pyspark.sql.functions at the top of this cell.
for column_name in categorical_cols + ['Growing_Stress', 'Changes_Habits']:
    if pandas_df[column_name].dtype == 'object':
        column_encoder = LabelEncoder()
        pandas_df[column_name] = column_encoder.fit_transform(pandas_df[column_name])
        feature_encoders[column_name] = column_encoder

# Ordinal encoding of Days_Indoors (least to most time spent indoors).
# The keys must match the categories actually present in the data; a category
# missing from the map would silently become NaN and later be mean-imputed.
ordinal_map_days = {
    'Go out Every day': 0,
    '1-14 days': 1,
    '15-30 days': 2,
    '31-60 days': 3,
    'More than 2 months': 4,
}
if pandas_df['Days_Indoors'].dtype == 'object':
    # Fail loudly on categories the map does not cover instead of producing NaN.
    unmapped_days = set(pandas_df['Days_Indoors'].dropna().unique()) - set(ordinal_map_days)
    if unmapped_days:
        raise ValueError(f"Unmapped Days_Indoors categories: {sorted(unmapped_days)}")
    pandas_df['Days_Indoors'] = pandas_df['Days_Indoors'].map(ordinal_map_days)

# %%
# Handle missing values & scale
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Prepare features and target
X = pandas_df.drop(columns=['Treatment', 'Timestamp', 'treatment'], errors='ignore')
y = pandas_df['Treatment']

# Split BEFORE fitting any preprocessing so that test-set statistics do not
# leak into the imputer means or the scaler mean/variance.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn imputation means and scaling statistics from the training rows only,
# then apply the fitted transformers to the test rows.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Full-dataset versions, kept so any later code that still refers to
# X_imputed / X_scaled continues to work (transformed with the train-fitted
# imputer and scaler).
X_imputed = imputer.transform(X)
X_scaled = scaler.transform(X_imputed)

# %%
# Train and evaluate XGBoost model
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# `use_label_encoder` is no longer a recognised parameter in current XGBoost
# releases (passing it only produces a "Parameters ... are not used" warning),
# so it is omitted here.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# Evaluate on the held-out rows: overall accuracy, then per-class metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"\nXGBoost Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# Business insight: summing the encoded target counts the rows labelled 1.
# NOTE(review): this treats label 1 as "received treatment" -- confirm
# against the fitted target encoder's class order.
treated_cases = pandas_df['Treatment'].sum()
total_cases = len(pandas_df)
untreated_cases = total_cases - treated_cases
treatment_ratio = treated_cases / total_cases

print("\n --- Business Insight Summary ---")
print(f"Total Participants: {total_cases}")
print(f"Treated: {treated_cases} ({treatment_ratio:.2%})")
print(f"Untreated: {untreated_cases} ({(1 - treatment_ratio):.2%})")

print("\nWhy should businesses care?")
print("- Untreated mental health can lead to burnout, absenteeism.")
print("- This predictive model helps prioritize employee support.")
print("- An accurate model (>75%) supports proactive HR action.")

# %%


0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 261 kB in 4s (60.6 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list

Parameters: { "use_label_encoder" } are not used.




XGBoost Model Accuracy: 78.12%

Classification Report:

              precision    recall  f1-score   support

           0       0.82      0.72      0.77     28422
           1       0.75      0.84      0.80     28940

    accuracy                           0.78     57362
   macro avg       0.79      0.78      0.78     57362
weighted avg       0.79      0.78      0.78     57362


 --- Business Insight Summary ---
Total Participants: 286808
Treated: 144501 (50.38%)
Untreated: 142307 (49.62%)

Why should businesses care?
- Untreated mental health can lead to burnout, absenteeism.
- This predictive model helps prioritize employee support.
- An accurate model (>75%) supports proactive HR action.
