In [16]:
import pandas as pd

In [17]:
import numpy as np

In [18]:
import matplotlib.pyplot as plt

In [19]:
import seaborn as sns

Load Dataset

In [20]:
df=pd.read_csv("Student_Performance.csv")

In [21]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,,1,91
1,4,82,No,4.0,2,65
2,8,51,Yes,7.0,2,45
3,5,52,Yes,5.0,2,36
4,7,75,No,,5,66


In [22]:
df.shape

(10000, 6)

Initial Data Inspection

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       9998 non-null   float64
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [24]:
df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,9998.0,10000.0,10000.0
mean,4.9929,69.4457,6.530206,4.5833,55.2248
std,2.589309,17.343152,1.695789,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


In [25]:
df.isnull().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         2
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [26]:
df.duplicated().sum()

np.int64(127)

Task 1: Replace Invalid Zero Values with NaN

In [27]:
(df==0).sum()

Hours Studied                         0
Previous Scores                       0
Extracurricular Activities            0
Sleep Hours                           0
Sample Question Papers Practiced    951
Performance Index                     0
dtype: int64

In [None]:
df[['Performance Index']].min()

Performance Index    10
dtype: int64

In [None]:
(df['Hours Studied']==0).sum()

In [None]:
(df['Previous Scores']==0).sum()

In [None]:
(df['Performance Index']==0).sum()

In [None]:
# Columns inspected by the zero-value, imputation-demo and range checks in this notebook.
# NOTE(review): 'Hours Studied' is not a 0-100 score, yet the range check further
# down applies the 0-100 bound to every column listed here — confirm this grouping.
score_cols=['Hours Studied','Previous Scores','Performance Index']

In [None]:
# Count exact-zero entries per column in one vectorised pass, then report each.
for col, zero_count in df[score_cols].eq(0).sum().items():
    print(f"{col}: {zero_count} zeros")

In [None]:
df[df['Hours Studied']==0]

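Note: None of the columns in `score_cols` contains a zero, so there is nothing to replace with NaN. The only zeros in this dataset are in `Sample Question Papers Practiced` (951 rows), where 0 is plausibly a valid value (no sample papers practiced), so they are kept.
If a column did contain invalid zeros, we would replace them like this: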

In [None]:
# Example: If zeros were invalid, we would do:
# for col in score_cols:
#     df[col] = df[col].replace(0, np.nan)

Task 2: Handle Missing Values

In [None]:
df.isnull().sum()

In [None]:
df.isna().sum()

In [None]:
# Grand total of missing cells: per-column NaN counts summed over all columns.
total_missing = df.isna().sum().sum()
print(f"Total missing values: {total_missing}")

In [None]:
# Share of missing cells per column, expressed as a percentage of the row count.
missing_percentage = df.isna().sum().div(len(df)).mul(100)

In [None]:
missing_percentage

Note: `Sleep Hours` is the only column with missing values (2 of 10,000 rows, i.e. 0.02%). Common ways to handle missing values:

In [None]:
# Handle missing values with the median for numerical columns.
# `Sleep Hours` really does contain NaNs in this dataset, so this step has to
# run rather than stay a commented-out example. The median is used because it
# is less sensitive to extreme values than the mean.
# `median(numeric_only=True)` yields one fill value per numeric column; fillna
# matches them by column label and leaves non-numeric columns untouched.
df = df.fillna(df.median(numeric_only=True))

In [None]:
# Example: Handle missing values with mean
# for col in score_cols:
#     df[col] = df[col].fillna(df[col].mean())

In [None]:
# Example: Handle missing values in categorical columns with mode
# categorical_cols = df.select_dtypes(include='object').columns
# for col in categorical_cols:
#     df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Alternative: Drop rows with missing values
# df_cleaned = df.dropna()

Create artificial missing values for demonstration

In [None]:
df_demo=df.copy()

In [None]:
# Blank out 5 randomly chosen rows per score column in the demo copy.
# Seeding the global NumPy RNG makes the chosen rows reproducible across runs.
np.random.seed(42)
# The score columns are int64 and cannot hold NaN. Cast them to float64 up
# front so the assignment below does not depend on pandas silently upcasting
# the column (deprecated behaviour that newer pandas versions reject).
df_demo = df_demo.astype({col: 'float64' for col in score_cols})
for col in score_cols:
    # replace=False guarantees 5 distinct rows per column.
    missing_indices = np.random.choice(df_demo.index, size=5, replace=False)
    df_demo.loc[missing_indices, col] = np.nan

In [None]:
df_demo.isnull().sum()

In [None]:
df_demo[df_demo.isnull().any(axis=1)].head(10)

Handle missing values with median

In [None]:
# Compute every column's median once, then report it and use it to fill that
# column's gaps in the demo frame.
for col, median_value in df_demo[score_cols].median().items():
    print(f"{col} median: {median_value}")
    df_demo[col] = df_demo[col].fillna(median_value)

In [None]:
df_demo.isnull().sum()

Task 3: Verify Cleaned Data

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df[score_cols].describe()

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# One-shot verification report: size of the frame plus the remaining
# missing-value and duplicate-row counts, framed by separator rules.
print(
    "Data Verification Summary:",
    "="*50,
    f"Total rows: {df.shape[0]}",
    f"Total columns: {df.shape[1]}",
    f"Missing values: {df.isnull().sum().sum()}",
    f"Duplicates: {df.duplicated().sum()}",
    "="*50,
    sep="\n",
)

Task 4: Remove Duplicates and Inconsistencies

In [None]:
df.duplicated().sum()

In [None]:
df[df.duplicated()]

In [None]:
print(f"Shape before removing duplicates: {df.shape}")

In [None]:
# Remove exact duplicate rows (pandas keeps the first occurrence by default).
# ignore_index=True renumbers the result 0..n-1 so the index has no gaps where
# rows were dropped.
df=df.drop_duplicates(ignore_index=True)

In [None]:
print(f"Shape after removing duplicates: {df.shape}")

In [None]:
df.duplicated().sum()

Check for inconsistencies in column values

In [None]:
df['Extracurricular Activities'].value_counts()

In [None]:
df['Hours Studied'].value_counts()

In [None]:
df['Previous Scores'].value_counts()

In [None]:
df['Sleep Hours'].value_counts()

In [None]:
df['Sample Question Papers Practiced'].value_counts()

In [None]:
df['Extracurricular Activities'].unique()

In [None]:
df.columns

Check for data range inconsistencies

In [None]:
# Print each column's observed extremes and warn when they leave 0-100.
# NOTE(review): the 0-100 bound is applied to every column in score_cols,
# including 'Hours Studied' — confirm that this bound is intended for it.
for col in score_cols:
    min_val, max_val = df[col].min(), df[col].max()
    print(f"{col}: min={min_val}, max={max_val}")
    out_of_range = min_val<0 or max_val>100
    if out_of_range:
        print(f"  WARNING: {col} has values outside valid range (0-100)")

In [None]:
(df[score_cols]<0).sum()

In [None]:
(df[score_cols]>100).sum()

Strip whitespace from categorical columns (if any)

In [None]:
categorical_cols=df.select_dtypes(include='object').columns

In [None]:
# Trim leading/trailing whitespace from every value of each column listed in
# categorical_cols, so that e.g. "Yes " and "Yes" are treated as the same label.
for col in categorical_cols:
    df[col]=df[col].str.strip()

In [None]:
df.head()

Final Verification

In [None]:
# Final report. The quality checks are recomputed here and the closing verdict
# is derived from them, so the message can never contradict the printed counts.
missing_total = df.isnull().sum().sum()
duplicate_total = df.duplicated().sum()

print("="*60)
print("FINAL DATA CLEANING SUMMARY")
print("="*60)
print(f"Dataset shape: {df.shape}")
print(f"Missing values: {missing_total}")
print(f"Duplicate rows: {duplicate_total}")
print("\nData types:")
print(df.dtypes)
print("\nNumerical columns summary:")
print(df[score_cols].describe())
print("="*60)
# Only declare the data clean when both checks actually pass.
if missing_total == 0 and duplicate_total == 0:
    print("Data is clean and ready for analysis!")
else:
    print("WARNING: data still contains missing values or duplicate rows - not ready for analysis.")
print("="*60)