In [3]:
# Colab-only helper module for moving files between the browser and the runtime.
from google.colab import files

# Upload file from local system
# The value returned by files.upload() is kept in `uploaded` for later cells.
# NOTE(review): the recorded output of this cell shows the upload being saved as
# "student_info (1).csv", i.e. a file with the original name already existed.
# Code that later opens "student_info.csv" by name may therefore read the older
# copy rather than this upload — confirm which file is intended.
uploaded = files.upload()

Saving student_info.csv to student_info (1).csv


In [4]:
# Install necessary libraries (if not already installed)
# !pip install pandas

import io

import pandas as pd

# 1. Load the dataset
# Colab renames a re-uploaded file (e.g. "student_info (1).csv"), so opening the
# hard-coded name can silently read a stale copy from an earlier upload. Prefer the
# bytes returned by files.upload() in this session (the `uploaded` variable, if the
# upload cell was run); otherwise fall back to the file on disk.
uploaded_files = globals().get("uploaded") or {}
if uploaded_files:
    source_name, source_bytes = next(iter(uploaded_files.items()))
    print("🔹 Reading uploaded file:", source_name)
    df = pd.read_csv(io.BytesIO(source_bytes))
else:
    df = pd.read_csv("student_info.csv")
print("🔹 Raw Dataset Loaded. Shape:", df.shape)
print(df.head(), "\n")

# 2. Identify missing values (reported only; nothing is imputed in this step)
print("🔹 Missing Values:")
print(df.isnull().sum(), "\n")

# 3. Remove duplicate rows
initial_shape = df.shape
df = df.drop_duplicates()
print(f"🔹 Removed {initial_shape[0] - df.shape[0]} duplicate rows.\n")

# 4. Standardize column headers (lowercase, no spaces)
print("🔹 Column Names BEFORE:")
# Print the actual header names (previously this printed the shape tuple).
print(list(df.columns))
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
print("🔹 Column Names AFTER:")
print(list(df.columns), "\n")

# 5. Standardize text values (e.g., gender)
if 'gender' in df.columns:
    gender_raw = df['gender']
    # astype(str) would turn missing values into the literal string "nan";
    # .where(notna) puts the missing markers back after normalizing the text.
    df['gender'] = gender_raw.astype(str).str.lower().str.strip().where(gender_raw.notna())
    print("🔹 Unique Genders:", df['gender'].unique())

# 6. Standardize any 'neighbourhood' or similar columns if applicable
if 'neighbourhood' in df.columns:
    neighbourhood_raw = df['neighbourhood']
    # Same missing-value protection as for gender (avoids a fake "Nan" neighbourhood).
    df['neighbourhood'] = (
        neighbourhood_raw.astype(str).str.title().str.strip().where(neighbourhood_raw.notna())
    )
    print("🔹 Sample Neighbourhoods:", df['neighbourhood'].unique()[:5], "\n")

# 7. Convert date columns (if any) to dd-mm-yyyy format
# Columns are picked by name, so skip numeric ones: a count such as "days_absent"
# would otherwise be read as epoch nanoseconds and overwritten with 1970 dates.
date_cols = [
    col for col in df.columns
    if ('date' in col or 'day' in col) and not pd.api.types.is_numeric_dtype(df[col])
]
for col in date_cols:
    # Unparseable values become NaT (errors='coerce') and stay missing after formatting.
    df[col] = pd.to_datetime(df[col], errors='coerce')
    df[col] = df[col].dt.strftime('%d-%m-%Y')
if date_cols:
    print("🔹 Date Columns Formatted:\n", df[date_cols].head(), "\n")

# 8. Check and fix data types
# Age should be numeric
if 'age' in df.columns:
    age_numeric = pd.to_numeric(df['age'], errors='coerce')
    # Keep the original float -> int truncation, but store the result in the nullable
    # Int64 dtype and restore the gaps as <NA> instead of inventing age-0 students
    # (which would also skew the describe() summary below).
    df['age'] = age_numeric.fillna(0).astype(int).astype('Int64').mask(age_numeric.isna())
    print("🔹 Age Data Type:", df['age'].dtype)
    print(df['age'].describe(), "\n")

# 9. Final overview
print("🔹 Final Cleaned Dataset Info:")
# DataFrame.info() writes to stdout itself and returns None, so it is not wrapped in print().
df.info()
print("\n🔹 First 5 Rows of Cleaned Data:")
print(df.head())

🔹 Raw Dataset Loaded. Shape: (1000, 15)
  student_id       name gender  age  grade_level  math_score  reading_score  \
0         S1  Student_1  Other   17           10          74             61   
1         S2  Student_2   Male   17           12          99             70   
2         S3  Student_3  Other   17            9          59             60   
3         S4  Student_4  Other   17           12          70             88   
4         S5  Student_5   Male   15            9          85             77   

   writing_score  attendance_rate parent_education  study_hours  \
0             90        94.660002         Master's     4.120192   
1             91        93.173227       Bachelor's     2.886505   
2             99        98.631098              PhD     1.909926   
3             69        96.419620              PhD     1.664740   
4             94        91.332105              PhD     2.330918   

  internet_access       lunch_type extra_activities final_result  
0             Y

In [5]:
from google.colab import files

# Single name for the exported file, used by both the write and the download.
CLEANED_CSV_NAME = "Cleaned_student_info.csv"

# Write the cleaned DataFrame to disk; index=False leaves the row index out of the CSV.
df.to_csv(CLEANED_CSV_NAME, index=False)

# Hand the written file to the browser so it is downloaded to the local system.
files.download(CLEANED_CSV_NAME)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>