In [1]:
import pandas as pd

# -----------------------
# Load dataset
# -----------------------
df = pd.read_csv("student_performance_dataset.csv")

# -----------------------
# 1. Check & Remove duplicates
# -----------------------
print("Duplicates before:", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Duplicates after:", df.duplicated().sum())

# -----------------------
# 2. Handle missing values
# -----------------------
print("\nMissing values before:\n", df.isnull().sum())

# Separate numeric and categorical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Fill numeric columns with median
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Fill categorical columns with mode
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mode()[0], inplace=True)

print("\nMissing values after:\n", df.isnull().sum())

# -----------------------
# 3. Treat outliers (IQR method)
# -----------------------
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1

for col in num_cols:
    lower = Q1[col] - 1.5 * IQR[col]
    upper = Q3[col] + 1.5 * IQR[col]
    df[col] = df[col].clip(lower, upper)

print("\nOutliers treated using IQR capping.")

# -----------------------
# 4. Save cleaned dataset
# -----------------------
df.to_csv("student_performance_dataset_cleaned.csv", index=False)
print("\nCleaned dataset saved as 'student_performance_dataset_cleaned.csv'")


Duplicates before: 208
Duplicates after: 0

Missing values before:
 Student_ID                    0
Gender                        0
Study_Hours_per_Week          0
Attendance_Rate               0
Past_Exam_Scores              0
Parental_Education_Level      0
Internet_Access_at_Home       0
Extracurricular_Activities    0
Final_Exam_Score              0
Pass_Fail                     0
dtype: int64

Missing values after:
 Student_ID                    0
Gender                        0
Study_Hours_per_Week          0
Attendance_Rate               0
Past_Exam_Scores              0
Parental_Education_Level      0
Internet_Access_at_Home       0
Extracurricular_Activities    0
Final_Exam_Score              0
Pass_Fail                     0
dtype: int64

Outliers treated using IQR capping.

Cleaned dataset saved as 'student_performance_dataset_cleaned.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
