# Clustering
---

Import libraries:


In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

from feature_engine.encoding import OneHotEncoder

---

## Import cleaned data parquet file

Import the cleaned data that was the output of the 1st notebook

In [2]:
# Load the cleaned dataset from its parquet file (relative to this notebook's folder)
cleaned_data_path = "../data/mental_health_social_media_dataset_cleaned.parquet"
df = pd.read_parquet(cleaned_data_path)

---
## Select Features

In [3]:
# Categorical columns used for clustering (these are the ones the pipeline one-hot encodes)
categorical_features = ["gender", "platform", "mental_state"]

# Numeric columns used for clustering
numeric_features = [
    "age",
    "daily_screen_time_min",
    "social_media_time_min",
    "sleep_hours",
    "physical_activity_min",
    "interaction_negative_ratio",
    "stress_level",
    "mood_level",
    "anxiety_level",
]

# Keep only the selected columns (numeric first, then categorical) in an
# independent copy, so later changes to df_cluster never touch df
selected_columns = [*numeric_features, *categorical_features]
df_cluster = df.loc[:, selected_columns].copy()

---
## Build the Pipeline

I will use OneHotEncoder on the categorical columns so that each category value gets its own binary column. The last category of each feature is dropped: it can still be identified because all of its sibling columns are 0, so every category remains representable and redundancy is avoided.

I will use a standard scaler to standardise the values, setting each column's mean to 0 and standard deviation to 1. This is to stop large values such as screen time dominating the distance calculations. Because the scaler runs after the encoder, the one-hot columns are standardised as well.

The model I am using for clustering is k-means. The optimal number of clusters (k) was determined using both the elbow method and silhouette analysis below.

In [4]:
# Steps run in order: encode categoricals -> standardise -> cluster.
# The scaler sits after the encoder, so it standardises the one-hot
# indicator columns together with the numeric columns.
pipeline_steps = [
    (
        "onehot",
        OneHotEncoder(variables=categorical_features, drop_last=True),
    ),
    ("scaler", StandardScaler()),
    (
        "kmeans",
        # Fixed random_state so the cluster assignment is reproducible between runs
        KMeans(n_clusters=4, random_state=42, n_init="auto"),
    ),
]

pipeline = Pipeline(steps=pipeline_steps)

---
## Fit the Pipeline

In [5]:
# Build the feature-only frame first. If this cell is re-run, df_cluster already
# carries the "cluster" column added at the bottom of the cell; dropping it here
# stops the previous labels from being fed back into the pipeline as a feature.
# errors="ignore" makes the drop a no-op on the first run, when the column is absent.
df_cluster_no_labels = df_cluster.drop(columns=["cluster"], errors="ignore")

# Call the fit method to run the full pipeline (encode -> scale -> KMeans)
pipeline.fit(df_cluster_no_labels)

# Get the cluster label assigned to each fitted row
labels = pipeline.named_steps["kmeans"].labels_

# Add the cluster labels to the clustering DataFrame (rows are in the same order
# as the frame the pipeline was fitted on)
df_cluster["cluster"] = labels


View the dataframe at the last transform step before the model is run to see the changes made in the pipeline:

In [None]:
# Run all the steps apart from the last one (KMeans). Use the non-labelled DataFrame,
# as it's the same data the pipeline was fitted on (no "cluster" column)
preprocessed = pipeline[:-1].transform(df_cluster_no_labels)

# Convert the preprocessed array back to a DataFrame for easier inspection.
# Column names come from the one-hot step; the scaler that follows it changes
# the values only. Reusing the source index keeps each transformed row
# aligned with its original row in df_cluster.
transformed_df = pd.DataFrame(
    preprocessed,
    columns=pipeline.named_steps["onehot"].get_feature_names_out(),
    index=df_cluster_no_labels.index,
)

# Display the first few rows of the transformed DataFrame
transformed_df.head()


Unnamed: 0,age,daily_screen_time_min,social_media_time_min,sleep_hours,physical_activity_min,interaction_negative_ratio,stress_level,mood_level,anxiety_level,gender_Male,gender_Female,platform_Instagram,platform_Snapchat,platform_Facebook,platform_WhatsApp,platform_TikTok,platform_YouTube,mental_state_Stressed,mental_state_Healthy
0,0.41146,-0.500581,-0.215325,0.497701,0.500538,0.27064,-0.101292,0.492465,-0.64208,1.029639,-0.989654,2.472322,-0.405147,-0.418105,-0.406818,-0.411149,-0.40882,0.294483,-0.27054
1,-0.484399,0.754215,0.711613,-0.815297,-0.725669,-0.190731,0.840086,-0.823582,0.615914,-0.971214,1.010455,2.472322,-0.405147,-0.418105,-0.406818,-0.411149,-0.40882,0.294483,-0.27054
2,-0.321516,-0.151502,0.290278,0.122559,0.123243,0.27064,-0.101292,0.492465,0.615914,1.029639,-0.989654,-0.404478,2.468238,-0.418105,-0.406818,-0.411149,-0.40882,0.294483,-0.27054
3,2.936156,-1.727073,-0.987773,1.623128,1.726744,-1.574843,-1.042671,0.492465,-0.64208,-0.971214,1.010455,-0.404478,2.468238,-0.418105,-0.406818,-0.411149,-0.40882,0.294483,-0.27054
4,0.085693,0.093796,0.500945,-0.065012,-0.065404,0.27064,-0.101292,0.492465,0.615914,1.029639,-0.989654,-0.404478,2.468238,-0.418105,-0.406818,-0.411149,-0.40882,0.294483,-0.27054
