# 1. Loading the Data

In [None]:
import pandas as pd

# Read the raw student-performance records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='StudentsPerformance.csv')
# Preview the first five rows to confirm the file parsed as expected.
df.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


Check the data types of the columns.

In [None]:
# Print a structural summary of the frame: index range, each column's
# non-null count and dtype, and the approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB



Check for missing values

In [None]:
# Count the missing entries in each column (isna is the alias of isnull).
df.isna().sum()

Unnamed: 0,0
gender,0
race/ethnicity,0
parental level of education,0
lunch,0
test preparation course,0
math score,0
reading score,0
writing score,0


# 2. Handling Missing Values

Drop rows with significant missing values (if any).

Fill missing values in the score columns (if any) with that column's mean (the median is a more outlier-robust alternative).

Verify again if any missing values remain

In [None]:
# Drop rows that have fewer than 5 non-null values; rows with at least 5
# populated fields are kept.
df.dropna(thresh=5, inplace=True)

# Fill any remaining gaps in each score column with that column's mean.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on df[col]: that form operates on a temporary
# selection, is deprecated in pandas 2.x, and leaves df unchanged once
# Copy-on-Write is enabled.
for score_column in ['math score', 'reading score', 'writing score']:
    df[score_column] = df[score_column].fillna(df[score_column].mean())

# Verify that no missing values remain in any column.
df.isnull().sum()

Unnamed: 0,0
gender,0
race/ethnicity,0
parental level of education,0
lunch,0
test preparation course,0
math score,0
reading score,0
writing score,0


# 3. Standardizing Categorical Data

Removing extra spaces.

Correcting inconsistent case usage (e.g., “Male” vs “male”)

In [None]:
# Text columns whose values should be compared case- and whitespace-insensitively.
categorical_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

# Trim surrounding whitespace and lower-case every value in each text column.
for column_name in categorical_columns:
    trimmed = df[column_name].str.strip()
    df[column_name] = trimmed.str.lower()

# Show the distinct values per column to confirm no near-duplicates remain.
for column_name in categorical_columns:
    print(df[column_name].unique())

['female' 'male']
['group b' 'group c' 'group a' 'group d' 'group e']
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
['standard' 'free/reduced']
['none' 'completed']


# 4. Final Verification

In [None]:
# Summary statistics for the numeric columns. Only the last expression of a
# cell is rendered automatically, so the summary is displayed explicitly;
# otherwise its result would be silently discarded.
score_summary = df.describe()
display(score_summary)  # `display` is provided as a builtin by the IPython kernel

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group b,bachelor's degree,standard,none,72,72,74
1,female,group c,some college,standard,completed,69,90,88
2,female,group b,master's degree,standard,none,90,95,93
3,male,group a,associate's degree,free/reduced,none,47,57,44
4,male,group c,some college,standard,none,76,78,75


In [None]:
# Destination for the cleaned dataset, relative to the notebook's working directory.
cleaned_file_path = 'Cleaned_Student_Performance.csv'

# Write the cleaned frame without the row index so the file holds only data columns.
df.to_csv(path_or_buf=cleaned_file_path, index=False)

# Confirm where the file was written.
print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to Cleaned_Student_Performance.csv
