In [113]:
import numpy as np
import pandas as pd
import seaborn as sns

In [114]:
# Read the cleaned student-habits dataset and preview its first rows.
cleaned_csv_path = "../data/cleaned_student_habits_performance.csv"
data = pd.read_csv(cleaned_csv_path)
data.head()

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4


In [115]:
# Summarise column names, non-null counts and dtypes to see which columns are
# still text and therefore need encoding.
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   str    
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   str    
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   str    
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   str    
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       1000 non-null   str    
 12  internet_quality               1000 non-null   str    
 13  

In [116]:
# Frequency of each gender label; also lists every category present in the column.
gender_counts = data['gender'].value_counts()
gender_counts

gender
Female    481
Male      477
Other      42
Name: count, dtype: int64

In [117]:
# Number of distinct labels in part_time_job.
part_time_job_levels = data['part_time_job'].nunique()
part_time_job_levels

2

In [118]:
# Frequency of each part_time_job label.
part_time_job_counts = data['part_time_job'].value_counts()
part_time_job_counts

part_time_job
No     785
Yes    215
Name: count, dtype: int64

In [119]:
# Frequency of each diet_quality label.
diet_quality_counts = data['diet_quality'].value_counts()
diet_quality_counts

diet_quality
Fair    437
Good    378
Poor    185
Name: count, dtype: int64

In [120]:
# Frequency of each parental_education_level label.
parental_education_counts = data['parental_education_level'].value_counts()
parental_education_counts

parental_education_level
High School    392
Bachelor       350
Master         167
Unknown         91
Name: count, dtype: int64

In [121]:
# Frequency of each internet_quality label.
internet_quality_counts = data['internet_quality'].value_counts()
internet_quality_counts

internet_quality
Good       447
Average    391
Poor       162
Name: count, dtype: int64

In [122]:
# Frequency of each extracurricular_participation label.
extracurricular_counts = data['extracurricular_participation'].value_counts()
extracurricular_counts

extracurricular_participation
No     682
Yes    318
Name: count, dtype: int64

In [123]:
# Encode on an independent copy so the original `data` frame keeps its text labels.
encoded_data = data.copy(deep=True)

In [124]:
# Ordinal encoding for the features whose labels have a low-to-high order.
# Higher numbers represent 'more' or 'better' of that feature.
# NOTE(review): 'Unknown' parental education is ranked 0, i.e. below 'High School';
# that is a modelling choice, so confirm it is intended.
ordinal_mappings = {
    'parental_education_level': {
        'Unknown': 0,
        'High School': 1,
        'Bachelor': 2,
        'Master': 3
    },
    'internet_quality': {
        'Poor': 1,
        'Average': 2,
        'Good': 3
    },
    'diet_quality': {
        'Poor': 1,
        'Fair': 2,
        'Good': 3
    }
}
ordinal_columns = list(ordinal_mappings)

# Always map from the untouched `data` frame, never from `encoded_data` itself.
# Re-running this cell would otherwise feed already-encoded integers into .map();
# none of them are keys of the mapping, so every value would silently become NaN.
for col, mapping in ordinal_mappings.items():
    mapped = data[col].map(mapping)
    # Series.map returns NaN for any label missing from the dict; fail loudly instead
    # of writing NaN into the encoded frame.
    if mapped.isna().any():
        unmapped = data.loc[mapped.isna(), col].unique().tolist()
        raise ValueError(f"Unmapped values in '{col}': {unmapped}")
    encoded_data[col] = mapped

# Check the results for these columns
print("--- BEFORE Ordinal Encoding ---")
display(data[ordinal_columns].head())

print("\n--- AFTER Ordinal Encoding ---")
display(encoded_data[ordinal_columns].head())

--- BEFORE Ordinal Encoding ---


Unnamed: 0,parental_education_level,internet_quality,diet_quality
0,Master,Average,Fair
1,High School,Average,Good
2,High School,Poor,Poor
3,Master,Good,Poor
4,Master,Good,Fair



--- AFTER Ordinal Encoding ---


Unnamed: 0,parental_education_level,internet_quality,diet_quality
0,3,2,2
1,1,2,3
2,1,1,1
3,3,3,1
4,3,3,2


In [125]:
binary_encoding_config = {
    'part_time_job': {'No': 0, 'Yes': 1},
    'extracurricular_participation': {'No': 0, 'Yes': 1}
}

print("--- BEFORE Binary Encoding ---")
display(data[list(binary_encoding_config.keys())].head())

# Apply the maps
for col, mapping in binary_encoding_config.items():
    encoded_data[col] = encoded_data[col].map(mapping)

print("\n--- AFTER Binary Encoding ---")
display(encoded_data[list(binary_encoding_config.keys())].head())

--- BEFORE Binary Encoding ---


Unnamed: 0,part_time_job,extracurricular_participation
0,No,Yes
1,No,No
2,No,No
3,No,Yes
4,No,No



--- AFTER Binary Encoding ---


Unnamed: 0,part_time_job,extracurricular_participation
0,0,1
1,0,0
2,0,0
3,0,1
4,0,0


In [126]:
print("--- BEFORE One-Hot Encoding ---")
# Read the labels from the untouched `data` frame: `encoded_data` loses its 'gender'
# column below, so reading it from there would raise KeyError when the cell is re-run.
display(data[['gender']].head())

# Apply One-Hot Encoding
# drop_first=True: drops the first category's indicator column (the outputs show
#   'Female' is the one removed); it is implied when all remaining indicators are 0.
# dtype=int: Ensures the result is 0 and 1 instead of True/False.
gender_dummies = pd.get_dummies(data['gender'], prefix='gender', drop_first=True, dtype=int)

# Remove the text column (and any indicator columns left by a previous run) before
# appending the fresh indicators, so running the cell twice gives the same frame.
encoded_data = (
    encoded_data
    .drop(columns=['gender', *gender_dummies.columns], errors='ignore')
    .join(gender_dummies)
)

print("\n--- AFTER One-Hot Encoding ---")
# The new columns are named 'gender_<label>', e.g. 'gender_Male' and 'gender_Other'
display(encoded_data[list(gender_dummies.columns)].head())

--- BEFORE One-Hot Encoding ---


Unnamed: 0,gender
0,Female
1,Female
2,Male
3,Female
4,Female



--- AFTER One-Hot Encoding ---


Unnamed: 0,gender_Male,gender_Other
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0


In [127]:
# List the dtype of every column after encoding.
encoded_dtypes = encoded_data.dtypes
encoded_dtypes

student_id                           str
age                                int64
study_hours_per_day              float64
social_media_hours               float64
netflix_hours                    float64
part_time_job                      int64
attendance_percentage            float64
sleep_hours                      float64
diet_quality                       int64
exercise_frequency                 int64
parental_education_level           int64
internet_quality                   int64
mental_health_rating               int64
extracurricular_participation      int64
exam_score                       float64
gender_Male                        int64
gender_Other                       int64
dtype: object

In [128]:
# Drop the student_id string column so that only numeric columns remain.
# Reassigning (instead of inplace=True) with errors='ignore' makes the cell safe to
# re-run: a second execution is a no-op rather than a KeyError on the missing column.
encoded_data = encoded_data.drop(columns=['student_id'], errors='ignore')
encoded_data.head()

Unnamed: 0,age,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score,gender_Male,gender_Other
0,23,0.0,1.2,1.1,0,85.0,8.0,2,6,3,2,8,1,56.2,0,0
1,20,6.9,2.8,2.3,0,97.3,4.6,3,6,1,2,8,0,100.0,0,0
2,21,1.4,3.1,1.3,0,94.8,8.0,1,1,1,1,1,0,34.3,1,0
3,23,1.0,3.9,1.0,0,71.0,9.2,1,4,3,3,1,1,26.8,0,0
4,19,5.0,4.4,0.5,0,90.9,4.9,2,3,3,3,1,0,66.4,0,0


In [129]:
# Transposed preview: one row per column, one column per sample row.
encoded_data.head().transpose()

Unnamed: 0,0,1,2,3,4
age,23.0,20.0,21.0,23.0,19.0
study_hours_per_day,0.0,6.9,1.4,1.0,5.0
social_media_hours,1.2,2.8,3.1,3.9,4.4
netflix_hours,1.1,2.3,1.3,1.0,0.5
part_time_job,0.0,0.0,0.0,0.0,0.0
attendance_percentage,85.0,97.3,94.8,71.0,90.9
sleep_hours,8.0,4.6,8.0,9.2,4.9
diet_quality,2.0,3.0,1.0,1.0,2.0
exercise_frequency,6.0,6.0,1.0,4.0,3.0
parental_education_level,3.0,1.0,1.0,3.0,3.0


In [131]:
# Write the encoded frame to disk; index=False leaves the row index out of the file.
encoded_csv_path = "../data/final_encoded_data.csv"
encoded_data.to_csv(encoded_csv_path, index=False)
print("Success! 'final_encoded_data.csv' has been saved.")

Success! 'final_encoded_data.csv' has been saved.
