In [None]:
# ---------------------------
# Step 1: Upload & Load CSV
# ---------------------------
from google.colab import files
import pandas as pd
import numpy as np

# Upload CSV
# files.upload() opens the Colab upload widget; the mapping it returns is
# keyed by the uploaded file names.
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an opaque IndexError when nothing
    # was uploaded (presumably the result of dismissing the dialog).
    raise ValueError("No file was uploaded; please select a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
filename = next(iter(uploaded))

# Read CSV
df = pd.read_csv(filename)

# Backup before cleaning: an independent copy used later for the
# before/after comparison.
df_before = df.copy()

print("✅ File loaded successfully!\n")

# Show full dataset (before cleaning)
# These options are global and stay in effect for every later display call.
pd.set_option('display.max_rows', None)     # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)        # No truncation
pd.set_option('display.max_colwidth', None) # Show full content

print("📌 Full Dataset (Before Cleaning):")
display(df)


# ---------------------------
# Step 2: Before Cleaning Summary
# ---------------------------
# Summarize the untouched backup: dimensions, per-column missing counts,
# and the number of fully duplicated rows.
shape_before = df_before.shape
missing_before = df_before.isna().sum()
duplicates_before = df_before.duplicated().sum()

print("\n--- BEFORE CLEANING ---")
print("Shape (rows, columns):", shape_before)
print("\nMissing values:\n", missing_before)
print("\nDuplicate rows:", duplicates_before)


# ---------------------------
# Step 3: Cleaning Process
# ---------------------------

# (a) Remove Duplicates
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below no longer trigger pandas' SettingWithCopyWarning.
df = df.drop_duplicates().copy()

# Numeric columns are shared by the imputation and outlier steps; filling
# missing values does not change which columns are numeric.
numeric_cols = df.select_dtypes(include=np.number).columns

# (b) Handle Missing Values
# Numeric columns → fill with mean
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# Categorical columns → fill with mode
for col in df.select_dtypes(include="object").columns:
    modes = df[col].mode()
    # mode() is empty when every value in the column is missing; there is
    # nothing sensible to impute, so such a column is left untouched
    # instead of crashing on modes[0].
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# (c) Treat Outliers (IQR method)
# Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are capped to the nearest
# fence; values inside the fences are kept as they are.
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower, upper=upper)


# ---------------------------
# Step 4: After Cleaning Summary
# ---------------------------
# Summarize the cleaned frame with the same three measures used for the
# pre-cleaning summary so the two printouts can be compared directly.
shape_after = df.shape
missing_after = df.isna().sum()
duplicates_after = df.duplicated().sum()

print("\n--- AFTER CLEANING ---")
print("Shape (rows, columns):", shape_after)
print("\nMissing values:\n", missing_after)
print("\nDuplicate rows:", duplicates_after)


# ---------------------------
# Step 5: Before vs After Report
# ---------------------------
# Row labels of the comparison table, in the order the statistics are listed.
stat_labels = ["Total Rows", "Duplicate Rows", "Missing Values"]

# Row count, fully duplicated rows, and total missing cells of the backup.
before_stats = [
    len(df_before),
    df_before.duplicated().sum(),
    df_before.isna().sum().sum(),
]

# The same three statistics for the cleaned frame.
after_stats = [
    len(df),
    df.duplicated().sum(),
    df.isna().sum().sum(),
]

report = pd.DataFrame(
    {"Before Cleaning": before_stats, "After Cleaning": after_stats},
    index=stat_labels,
)

print("\n📊 Before vs After Cleaning Report:\n")
display(report)

# ---------------------------
# Step 6: Final Cleaned Dataset
# ---------------------------
# Rendered in full because the display limits were lifted earlier in the cell.
print("\n✅ Final Cleaned Dataset (Full):")
display(df)


Saving StudentsPerformance1.csv to StudentsPerformance1.csv
✅ File loaded successfully!

📌 Full Dataset (Before Cleaning):


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72.0,72.0,74.0
1,female,group C,some college,standard,completed,69.0,90.0,88.0
2,female,group B,master's degree,standard,none,90.0,95.0,93.0
3,male,group A,associate's degree,free/reduced,none,47.0,57.0,44.0
4,male,group C,some college,standard,none,76.0,78.0,75.0
5,female,group B,associate's degree,standard,none,71.0,83.0,78.0
6,female,group B,some college,standard,completed,88.0,95.0,92.0
7,male,group B,some college,free/reduced,none,40.0,43.0,39.0
8,male,group D,high school,free/reduced,completed,64.0,64.0,67.0
9,female,group B,high school,free/reduced,none,38.0,60.0,50.0



--- BEFORE CLEANING ---
Shape (rows, columns): (1020, 8)

Missing values:
 gender                         20
race/ethnicity                 20
parental level of education    20
lunch                          20
test preparation course        20
math score                     10
reading score                  10
writing score                  10
dtype: int64

Duplicate rows: 36

--- AFTER CLEANING ---
Shape (rows, columns): (984, 8)

Missing values:
 gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

Duplicate rows: 0

📊 Before vs After Cleaning Report:



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = np.where(df[col] < lower, lower,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

Unnamed: 0,Before Cleaning,After Cleaning
Total Rows,1020,984
Duplicate Rows,36,0
Missing Values,130,0



✅ Final Cleaned Dataset (Full):


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72.0,72.0,74.0
1,female,group C,some college,standard,completed,69.0,90.0,88.0
2,female,group B,master's degree,standard,none,90.0,95.0,93.0
3,male,group A,associate's degree,free/reduced,none,47.0,57.0,44.0
4,male,group C,some college,standard,none,76.0,78.0,75.0
5,female,group B,associate's degree,standard,none,71.0,83.0,78.0
6,female,group B,some college,standard,completed,88.0,95.0,92.0
7,male,group B,some college,free/reduced,none,40.0,43.0,39.0
8,male,group D,high school,free/reduced,completed,64.0,64.0,67.0
9,female,group B,high school,free/reduced,none,38.0,60.0,50.0
