In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder , StandardScaler
from sklearn.compose import ColumnTransformer

In [3]:
df = pd.read_csv('SP.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
df = df.drop_duplicates()
print("Data shape after removing duplicates:", df.shape)
df

Data shape after removing duplicates: (1000, 8)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [7]:
#Handling Missing Values
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col] = df[col].fillna(df[col].median())

# For categorical → most common value
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])

print("\n Missing values fixed.")


 Missing values fixed.


In [8]:
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

print("\n Encoding Completed.")


 Encoding Completed.


In [9]:
scaler = StandardScaler()
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("\n Scaling applied.")

print("\n Final Cleaned Data:")
print(df.head())


 Scaling applied.

 Final Cleaned Data:
     gender  race/ethnicity  parental level of education     lunch  \
0 -0.964625       -1.015044                    -0.812640  0.741881   
1 -0.964625       -0.150441                     0.827953  0.741881   
2 -0.964625       -1.015044                     0.281088  0.741881   
3  1.036672       -1.879647                    -1.359505 -1.347925   
4  1.036672       -0.150441                     0.827953  0.741881   

   test preparation course  math score  reading score  writing score  
0                 0.746748    0.390024       0.193999       0.391492  
1                -1.339140    0.192076       1.427476       1.313269  
2                 0.746748    1.577711       1.770109       1.642475  
3                 0.746748   -1.259543      -0.833899      -1.583744  
4                 0.746748    0.653954       0.605158       0.457333  


In [10]:
# Save cleaned data
df.to_csv("SP_cleaned.csv", index=False)
print("\n Cleaned dataset saved as SP_cleaned.csv")


 Cleaned dataset saved as SP_cleaned.csv


Day 4: Feature Engineering

Feature Engineering improves dataset quality.

Steps:
1. Remove duplicates
2. Fix missing values:
   - Numeric → median
   - Categorical → mode
3. Encode text:
   - Label encoding (ordered)
   - OneHot encoding (unordered)
4. Scale values using StandardScaler.

Clean data = better model performance.