# Student Lifestyle Analytics — Data Preparation (Step 3)

# 1. Setup & Load

In [1]:
import pandas as pd

df = pd.read_csv("../data/interim/student_lifestyle_clean.csv")
df.head()

Unnamed: 0,Student_ID,Study_Hours_Per_Day,Extracurricular_Hours_Per_Day,Sleep_Hours_Per_Day,Social_Hours_Per_Day,Physical_Activity_Hours_Per_Day,GPA,Stress_Level,GPA_Quartile
0,1,6.9,3.8,8.7,2.8,1.8,2.99,Moderate,Q2
1,2,5.3,3.5,8.0,4.2,3.0,2.75,Low,Q1_lowest
2,3,5.1,3.9,9.2,1.2,4.6,2.67,Low,Q1_lowest
3,4,6.5,2.1,7.2,1.7,6.5,2.88,Moderate,Q1_lowest
4,5,8.1,0.6,6.5,2.2,6.6,3.51,High,Q4_highest


# 2. Handle Outliers

- Cap Physical_Activity_Hours_Per_Day at 10h.

In [2]:
df["Physical_Activity_Hours_Per_Day"] = df["Physical_Activity_Hours_Per_Day"].clip(upper=10)

- Optional: remove rows where total hours > 24.

In [3]:
df["Total_Hours"] = (
    df["Study_Hours_Per_Day"] + df["Sleep_Hours_Per_Day"] +
    df["Social_Hours_Per_Day"] + df["Extracurricular_Hours_Per_Day"] +
    df["Physical_Activity_Hours_Per_Day"]
)
df = df[df["Total_Hours"] <= 24]

# 3. Encode Categorical Variables

- Convert Stress_Level (Low, Moderate, High) into ordered categories.

In [4]:
df["Stress_Level"] = pd.Categorical(
    df["Stress_Level"], categories=["Low","Moderate","High"], ordered=True
)

- For ML modeling later, add dummy variables:

In [5]:
df = pd.get_dummies(df, columns=["Stress_Level"], drop_first=True)

# 4. Feature Engineering

Create useful derived features:

- 4.1 Study-to-Sleep Ratio (balance measure):

In [6]:
df["Study_Sleep_Ratio"] = df["Study_Hours_Per_Day"] / df["Sleep_Hours_Per_Day"]

- 4.2 Total Active Hours (study + physical + extracurricular + social):

In [7]:
df["Active_Hours"] = (
    df["Study_Hours_Per_Day"] + df["Physical_Activity_Hours_Per_Day"] +
    df["Extracurricular_Hours_Per_Day"] + df["Social_Hours_Per_Day"]
)

4.3 GPA Quartile (for comparisons & dashboard filters):

In [8]:
df["GPA_Quartile"] = pd.qcut(df["GPA"], 4, labels=["Q1_Lowest","Q2","Q3","Q4_Highest"])

# 5. Validate Cleaned Data

Quick sanity checks:

In [9]:
print(df.isna().sum())
print(df.describe(include="all"))

Student_ID                         0
Study_Hours_Per_Day                0
Extracurricular_Hours_Per_Day      0
Sleep_Hours_Per_Day                0
Social_Hours_Per_Day               0
Physical_Activity_Hours_Per_Day    0
GPA                                0
GPA_Quartile                       0
Total_Hours                        0
Stress_Level_Moderate              0
Stress_Level_High                  0
Study_Sleep_Ratio                  0
Active_Hours                       0
dtype: int64
         Student_ID  Study_Hours_Per_Day  Extracurricular_Hours_Per_Day  \
count   1766.000000          1766.000000                    1766.000000   
unique          NaN                  NaN                            NaN   
top             NaN                  NaN                            NaN   
freq            NaN                  NaN                            NaN   
mean    1008.422424             7.412288                       1.985844   
std      576.025556             1.418638                

### Confirm:
- No negative hours.
- No totals > 24.
- Stress encoded properly.

# 6. Save Final Dataset

In [11]:
df.to_csv("../data/processed/student_lifestyle_final.csv", index=False)