In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

Load Dataset

In [None]:
# Load the raw student-performance table; the path is resolved relative to
# the notebook's current working directory.
csv_path = "StudentsPerformance.csv"
df = pd.read_csv(csv_path)

In [None]:
# Peek at the first five rows to confirm the file parsed into sensible columns.
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

Initial Data Inspection

In [None]:
# Column names, non-null counts and dtypes; prints a report rather than
# returning a value.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is
# not counted as a duplicate).
df.duplicated(keep="first").sum()

Task 1: Replace Invalid Zero Values with NaN

In [None]:
# Per-column count of cells equal to zero, across every column.
df.eq(0).sum()

In [None]:
# Lowest observed value in each of the three score columns.
df.loc[:, ['math score', 'reading score', 'writing score']].min()

In [None]:
# How many students have a math score of exactly zero.
df['math score'].eq(0).sum()

In [None]:
# How many students have a reading score of exactly zero.
df['reading score'].eq(0).sum()

In [None]:
# How many students have a writing score of exactly zero.
df['writing score'].eq(0).sum()

In [None]:
# Names of the three numeric score columns, reused by the cells below.
score_cols = [f"{subject} score" for subject in ("math", "reading", "writing")]

In [None]:
# Count the zeros for all score columns in one vectorised pass, then report
# one line per column in the order given by score_cols.
zeros_per_column = df[score_cols].eq(0).sum()
for column_name, n_zeros in zeros_per_column.items():
    print(f"{column_name}: {n_zeros} zeros")

In [None]:
# Show the full records of students whose math score is zero.
df.loc[df['math score'].eq(0)]

Note: In this dataset, zero scores are valid (students can score 0), so we won't replace them with NaN.
If there were invalid zeros in other contexts, we would replace them:

In [None]:
# Example: If zeros were invalid, we would do:
# for col in score_cols:
#     df[col] = df[col].replace(0, np.nan)

Task 2: Handle Missing Values

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# isna() is the same check as isnull() (isnull is an alias), so this gives the
# same per-column missing counts.
df.isna().sum()

In [None]:
# Collapse the per-column missing counts into one grand total for the frame.
total_missing = df.isna().sum().sum()
print("Total missing values: {}".format(total_missing))

In [None]:
# Share of missing cells per column, expressed as a percentage of the row count.
missing_percentage = df.isnull().sum().div(len(df)).mul(100)

In [None]:
# Display the per-column missing percentages computed in the previous cell.
missing_percentage

Note: This dataset has no missing values. Demonstrating how to handle them if present:

In [None]:
# Example: Handle missing values with median for numerical columns
# for col in score_cols:
#     df[col] = df[col].fillna(df[col].median())

In [None]:
# Example: Handle missing values with mean
# for col in score_cols:
#     df[col] = df[col].fillna(df[col].mean())

In [None]:
# Example: Handle missing values in categorical columns with mode
# categorical_cols = df.select_dtypes(include='object').columns
# for col in categorical_cols:
#     df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Alternative: Drop rows with missing values
# df_cleaned = df.dropna()

Create artificial missing values for demonstration

In [None]:
# Work on an independent deep copy so the demonstration never alters df.
df_demo = df.copy(deep=True)

In [None]:
# Blank out five randomly chosen rows in each score column of the demo copy so
# the imputation step has something to fill. The seed keeps the chosen rows
# reproducible across runs.
np.random.seed(42)

# NaN cannot be stored in an integer column. If the scores were read as
# integers, assigning NaN through .loc relies on silent upcasting, which newer
# pandas versions warn about and are slated to reject. Casting to float first
# makes the dtype change explicit; it is a no-op when the columns are already
# float, so re-running this cell is safe.
df_demo[score_cols] = df_demo[score_cols].astype("float64")

for col in score_cols:
    # Draw five distinct row labels for this column (no label is repeated).
    missing_indices = np.random.choice(df_demo.index, size=5, replace=False)
    df_demo.loc[missing_indices, col] = np.nan

In [None]:
# Confirm the injected gaps: missing-value count per column of the demo copy.
df_demo.isna().sum()

In [None]:
# Flag rows that have a gap in at least one column, then show the first ten.
has_missing_value = df_demo.isna().any(axis="columns")
df_demo.loc[has_missing_value].head(10)

Handle missing values with median

In [None]:
# Median imputation: each score column's gaps are replaced with that column's
# own median. The medians are computed up front; the columns are independent,
# so this matches computing each one immediately before its fill.
column_medians = df_demo[score_cols].median()
for column_name, column_median in column_medians.items():
    print(f"{column_name} median: {column_median}")
    df_demo[column_name] = df_demo[column_name].fillna(column_median)

In [None]:
# Re-check the demo copy: per-column missing counts after the median fill.
df_demo.isna().sum()

Task 3: Verify Cleaned Data

In [None]:
# Per-column missing counts for the main frame (df, not the demo copy).
df.isna().sum()

In [None]:
# Re-inspect dtypes and non-null counts of the main frame; prints a report.
df.info()

In [None]:
# Summary statistics for the numeric columns of the main frame.
df.describe()

In [None]:
# Same summary statistics, restricted explicitly to the score columns.
df.loc[:, score_cols].describe()

In [None]:
# First five rows of the main frame.
df.head(n=5)

In [None]:
# (rows, columns) of the main frame at this point in the notebook.
df.shape

In [None]:
# Gather the headline figures first, then print them as one framed report.
row_count, column_count = df.shape
missing_cells = df.isnull().sum().sum()
duplicate_rows = df.duplicated().sum()
divider = "=" * 50

print("Data Verification Summary:")
print(divider)
print(f"Total rows: {row_count}")
print(f"Total columns: {column_count}")
print(f"Missing values: {missing_cells}")
print(f"Duplicates: {duplicate_rows}")
print(divider)

Task 4: Remove Duplicates and Inconsistencies

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is kept as the
# original).
df.duplicated(keep="first").sum()

In [None]:
# List the duplicate rows themselves, if there are any.
df.loc[df.duplicated()]

In [None]:
# Record the frame size before de-duplication for comparison.
print("Shape before removing duplicates: {}".format(df.shape))

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each; the
# surviving rows keep their original index labels.
df = df.drop_duplicates(keep="first")

In [None]:
# Frame size after de-duplication; compare with the "before" figure.
print("Shape after removing duplicates: {}".format(df.shape))

In [None]:
# Count the duplicate rows still present after de-duplication.
df.duplicated(keep="first").sum()

Check for inconsistencies in categorical columns

In [None]:
# Frequency of each distinct gender label; variant spellings of one category
# would appear as separate entries.
df['gender'].value_counts()

In [None]:
# Frequency of each distinct race/ethnicity label.
df['race/ethnicity'].value_counts()

In [None]:
# Frequency of each distinct parental-education label.
df['parental level of education'].value_counts()

In [None]:
# Frequency of each distinct lunch label.
df['lunch'].value_counts()

In [None]:
# Frequency of each distinct test-preparation-course label.
df['test preparation course'].value_counts()

In [None]:
# Distinct gender labels as an array; the quoted repr makes stray
# leading/trailing whitespace visible.
df['gender'].unique()

In [None]:
# Distinct race/ethnicity labels as an array.
df['race/ethnicity'].unique()

Check for data range inconsistencies

In [None]:
# Report the observed range of every score column and warn when it falls
# outside the 0-100 band.
for score_col in score_cols:
    lowest, highest = df[score_col].min(), df[score_col].max()
    print(f"{score_col}: min={lowest}, max={highest}")
    out_of_range = lowest < 0 or highest > 100
    if out_of_range:
        print(f"  WARNING: {score_col} has values outside valid range (0-100)")

In [None]:
# Per-column count of scores below zero.
df[score_cols].lt(0).sum()

In [None]:
# Per-column count of scores above 100.
df[score_cols].gt(100).sum()

Strip whitespace from categorical columns (if any)

In [None]:
# Treat every object-dtype column as categorical text.
object_columns_frame = df.select_dtypes(include=['object'])
categorical_cols = object_columns_frame.columns

In [None]:
# Trim leading/trailing whitespace from every categorical column so labels
# that differ only by padding collapse into one value. Duplicates are not
# re-checked in this cell.
for text_col in categorical_cols:
    trimmed = df[text_col].str.strip()
    df[text_col] = trimmed

In [None]:
# First five rows after the whitespace clean-up.
df.head(n=5)

Final Verification

In [None]:
# Final verification report for the main frame.
#
# The two headline metrics are computed once and reused, so the closing verdict
# is driven by the same numbers that are printed. The readiness message is only
# shown when both are zero: whitespace is stripped after de-duplication, so
# rows that became identical only after trimming can still be present here.
missing_total = df.isnull().sum().sum()
duplicate_total = df.duplicated().sum()

print("="*60)
print("FINAL DATA CLEANING SUMMARY")
print("="*60)
print(f"Dataset shape: {df.shape}")
print(f"Missing values: {missing_total}")
print(f"Duplicate rows: {duplicate_total}")
print("\nData types:")
print(df.dtypes)
print("\nNumerical columns summary:")
print(df[score_cols].describe())
print("="*60)
if missing_total == 0 and duplicate_total == 0:
    print("Data is clean and ready for analysis!")
else:
    # Do not claim success when the checks above found remaining issues.
    print("WARNING: data still contains missing values or duplicate rows; review before analysis.")
print("="*60)