In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy import stats

In [3]:
# Read the survey CSV into a DataFrame and preview the first five rows.
csv_path = "Student Mental Health Analysis During Online Learning.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Name,Gender,Age,Education Level,Screen Time (hrs/day),Sleep Duration (hrs),Physical Activity (hrs/week),Stress Level,Anxious Before Exams,Academic Performance Change
0,Aarav,Male,15,Class 8,7.1,8.9,9.3,Medium,No,Same
1,Meera,Female,25,MSc,3.3,5.0,0.2,Medium,No,Same
2,Ishaan,Male,20,BTech,9.5,5.4,6.2,Medium,No,Same
3,Aditya,Male,20,BA,10.8,5.6,5.5,High,Yes,Same
4,Anika,Female,17,Class 11,2.8,5.4,3.1,Medium,Yes,Same


In [4]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          1000 non-null   object 
 1   Gender                        1000 non-null   object 
 2   Age                           1000 non-null   int64  
 3   Education Level               1000 non-null   object 
 4   Screen Time (hrs/day)         1000 non-null   float64
 5   Sleep Duration (hrs)          1000 non-null   float64
 6   Physical Activity (hrs/week)  1000 non-null   float64
 7   Stress Level                  1000 non-null   object 
 8   Anxious Before Exams          1000 non-null   object 
 9   Academic Performance Change   1000 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 78.3+ KB


In [5]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Unnamed: 0,0
Name,0
Gender,0
Age,0
Education Level,0
Screen Time (hrs/day),0
Sleep Duration (hrs),0
Physical Activity (hrs/week),0
Stress Level,0
Anxious Before Exams,0
Academic Performance Change,0


In [6]:
# 1. Handle missing values
# Split the column names by dtype so each group can be processed separately:
# int64/float64 columns are treated as numeric, object columns as categorical.
num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(df.select_dtypes(include=['object']).columns)

In [7]:
# Check for missing values: compute the per-column null count, then print it
# under a heading.
missing_counts = df.isnull().sum()
print("Missing values per column:")
print(missing_counts)

Missing values per column:
Name                            0
Gender                          0
Age                             0
Education Level                 0
Screen Time (hrs/day)           0
Sleep Duration (hrs)            0
Physical Activity (hrs/week)    0
Stress Level                    0
Anxious Before Exams            0
Academic Performance Change     0
dtype: int64


In [9]:
# Imputation: fill numeric columns with the column mean and categorical
# columns with the most frequent value.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its own column group and write the filled values back,
# numeric group first, then categorical.
for imputer, target_cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    df[target_cols] = imputer.fit_transform(df[target_cols])

In [10]:
# 2. Detect and remove outliers using per-column z-scores on the numeric columns.
# A row is dropped when any of its numeric values lies Z_THRESHOLD or more
# standard deviations away from that column's mean.
Z_THRESHOLD = 3
z_scores = np.abs(stats.zscore(df[num_cols]))

# A zero-variance column yields NaN z-scores. `NaN >= Z_THRESHOLD` is False, so
# such values are not flagged as outliers; testing `(z < 3).all(axis=1)` instead
# would treat NaN as a failure and silently drop every row.
is_outlier = np.asarray(z_scores >= Z_THRESHOLD)

# Keep only rows without any outlier value. `.copy()` hands later cells an
# independent frame to assign into rather than a slice of the unfiltered one.
df = df.loc[~is_outlier.any(axis=1)].copy()

In [11]:
# 3. Encode the categorical variables with one-hot encoding.
# "Name" is a per-student label rather than a category: dummy-encoding it would
# add a column for nearly every distinct name and write personal names into the
# exported dataset. Drop it and encode only the remaining categorical columns.
ID_COLS = ["Name"]
encode_cols = [col for col in cat_cols if col not in ID_COLS]

df = pd.get_dummies(
    df.drop(columns=ID_COLS, errors="ignore"),  # tolerate the column being absent already
    columns=encode_cols,
    drop_first=True,  # drop one level per feature so the dummies are not redundant
)

In [12]:
# 4. Standardize the numeric columns.
# Fit the scaler on the numeric columns and overwrite them with the scaled values.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_numeric

In [13]:
# Save the result: write the processed frame to CSV without the index column,
# then print a confirmation message.
output_path = "cleaned_student_mental_health.csv"
df.to_csv(output_path, index=False)

print("Data berhasil dibersihkan dan disimpan sebagai 'cleaned_student_mental_health.csv'")

Data berhasil dibersihkan dan disimpan sebagai 'cleaned_student_mental_health.csv'
