# Student Dataset – Level 2
Feature Engineering, Feature Scaling, and Encoding Categorical Data

In [3]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# Load the cleaned student dataset from a CSV file in the working directory.
# NOTE(review): presumably written by an earlier cleaning notebook — confirm the
# file exists before running this notebook from a fresh kernel.
df = pd.read_csv("cleaned_students.csv")
# Preview the first rows to sanity-check the load.
df.head()


Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB,Python_mean_imputed,Python_median_imputed,studyHOURS_cleaned,Python_cleaned,DB_cleaned
0,Christina,Binger,44,Female,NORWAY,Private,72,Masters,158,59.0,55,59.0,59.0,158,59.0,55
1,Alex,Walekhwa,60,Male,KENYA,Private,79,Diploma,150,60.0,75,60.0,60.0,150,60.0,75
2,Philip,Leo,25,Male,UGANDA,Sognsvann,55,High School,130,74.0,50,74.0,74.0,130,74.0,50
3,Shoni,Hlongwane,22,Female,SOUTH AFRICA,Sognsvann,40,High School,120,,44,75.853333,81.0,123,75.853333,44
4,Maria,Kedibone,23,Female,SOUTH AFRICA,Sognsvann,65,High School,122,91.0,80,91.0,91.0,123,91.0,80


## Part 4 – Feature Engineering

In [15]:

# Programming_Avg: per-student mean of the cleaned Python and DB scores
python_score = df['Python_cleaned']
db_score = df['DB_cleaned']
df['Programming_Avg'] = python_score.add(db_score).div(2)

# isAdult: 1 when Age is 25 or older, otherwise 0
# NOTE(review): 25 is an unusual cutoff for "adult" — confirm it is the intended threshold
df['isAdult'] = df['Age'].ge(25).astype(int)

# StudyHours_cat: cleaned study hours split into three quantile-based bins
study_hour_labels = ['Low', 'Medium', 'High']
df['StudyHours_cat'] = pd.qcut(df['studyHOURS_cleaned'], q=3, labels=study_hour_labels)

# Show the source columns side by side with the engineered ones
preview_cols = ['Age', 'Python_cleaned', 'DB_cleaned', 'Programming_Avg', 'isAdult', 'StudyHours_cat']
df[preview_cols].head()


Unnamed: 0,Age,Python_cleaned,DB_cleaned,Programming_Avg,isAdult,StudyHours_cat
0,44,59.0,55,57.0,1,Medium
1,60,60.0,75,67.5,1,Low
2,25,74.0,50,62.0,1,Low
3,22,75.853333,44,59.926667,0,Low
4,23,91.0,80,85.5,0,Low


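**Question:** Which engineered feature adds the most predictive power?

- `Programming_Avg` likely adds the most value, as it combines performance across Python and DB into a single score. This is a hypothesis rather than a measured result: no model is fitted in this notebook, so confirm it against a target variable (for example with feature importances or cross-validated scores).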

## Part 5 – Feature Scaling

In [54]:

# Numeric columns to scale. This is an explicit, hand-maintained list (nothing is
# auto-detected); columns not named here are carried over unchanged.
numeric_cols = [
    'Age', 'entryEXAM', 'studyHOURS_cleaned',
    'Python_cleaned', 'DB_cleaned', 'Programming_Avg',
]
print("Numeric columns:", numeric_cols)

# Both scalers are fitted on the same unscaled slice of df
numeric_values = df[numeric_cols]

# Copy of df with the numeric columns replaced by StandardScaler output
df_std = df.copy()
scaler_std = StandardScaler()
df_std[numeric_cols] = scaler_std.fit_transform(numeric_values)

# Copy of df with the numeric columns replaced by MinMaxScaler output
df_mm = df.copy()
scaler_mm = MinMaxScaler()
df_mm[numeric_cols] = scaler_mm.fit_transform(numeric_values)

df_std.head()


Numeric columns: ['Age', 'entryEXAM', 'studyHOURS_cleaned', 'Python_cleaned', 'DB_cleaned', 'Programming_Avg']


Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB,Python_mean_imputed,Python_median_imputed,studyHOURS_cleaned,Python_cleaned,DB_cleaned,Programming_Avg,isAdult,StudyHours_cat
0,Christina,Binger,0.855723,Female,NORWAY,Private,-0.290391,Masters,158,59.0,55,59.0,59.0,0.673635,-1.667216,-0.854917,-1.357272,1,Medium
1,Alex,Walekhwa,2.412963,Male,KENYA,Private,0.137261,Diploma,150,60.0,75,60.0,60.0,-0.007743,-1.576215,0.326925,-0.487979,1,Low
2,Philip,Leo,-0.993499,Male,UGANDA,Sognsvann,-1.328974,High School,130,74.0,50,74.0,74.0,-1.711189,-0.302202,-1.150378,-0.943323,1,Low
3,Shoni,Hlongwane,-1.285481,Female,SOUTH AFRICA,Sognsvann,-2.245371,High School,120,,44,75.853333,81.0,-2.307395,-0.133547,-1.50493,-1.114974,0,Low
4,Maria,Kedibone,-1.188154,Female,SOUTH AFRICA,Sognsvann,-0.718043,High School,122,91.0,80,91.0,91.0,-2.307395,1.244814,0.622386,1.002236,0,Low


## Part 6 – Encoding Categorical Data

In [63]:

# Categorical columns to encode. This is an explicit, hand-maintained list
# (nothing is auto-detected).
categorical_cols = ['gender','country','residence','prevEducation','StudyHours_cat']
print("Categorical columns:", categorical_cols)

# Label-encode gender into integer codes. The fitted encoder is kept in
# `gender_encoder` (instead of being thrown away) because the code assigned to
# each label is determined by the data: `gender_encoder.classes_` shows the
# mapping and `gender_encoder.inverse_transform` reverses it.
gender_encoder = LabelEncoder()
df['gender_encoded'] = gender_encoder.fit_transform(df['gender'])

# One-hot encode every remaining categorical column. The list is derived from
# `categorical_cols` so the two cannot drift apart; order is preserved.
# drop_first=True omits the first level of each column to avoid redundant dummies.
one_hot_cols = [col for col in categorical_cols if col != 'gender']
df_encoded = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

df_encoded.head()


Categorical columns: ['gender', 'country', 'residence', 'prevEducation', 'StudyHours_cat']


Unnamed: 0,fNAME,lNAME,Age,gender,entryEXAM,studyHOURS,Python,DB,Python_mean_imputed,Python_median_imputed,...,residence_BI_Residence,residence_Private,residence_Sognsvann,prevEducation_Bachelors,prevEducation_Diploma,prevEducation_Doctorate,prevEducation_High School,prevEducation_Masters,StudyHours_cat_Medium,StudyHours_cat_High
0,Christina,Binger,44,Female,72,158,59.0,55,59.0,59.0,...,False,True,False,False,False,False,False,True,True,False
1,Alex,Walekhwa,60,Male,79,150,60.0,75,60.0,60.0,...,False,True,False,False,True,False,False,False,False,False
2,Philip,Leo,25,Male,55,130,74.0,50,74.0,74.0,...,False,False,True,False,False,False,True,False,False,False
3,Shoni,Hlongwane,22,Female,40,120,,44,75.853333,81.0,...,False,False,True,False,False,False,True,False,False,False
4,Maria,Kedibone,23,Female,65,122,91.0,80,91.0,91.0,...,False,False,True,False,False,False,True,False,False,False


## Final Export

In [70]:

# Save final engineered and encoded dataset to CSV in the working directory.
# index=False keeps the DataFrame index out of the file.
# NOTE(review): df_encoded is built from df (not df_std / df_mm), so the numeric
# columns written here are unscaled — confirm that is the intended export.
df_encoded.to_csv("students_engineered.csv", index=False)
# Confirmation message; only reached if the write above did not raise.
print("✅ Final engineered dataset saved as students_engineered.csv")


✅ Final engineered dataset saved as students_engineered.csv
