In [None]:
# Step 1: Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler

# Step 2: Load the dataset
# Replace DATA_PATH with your dataset path and TARGET_COL with the column
# you want to predict.
DATA_PATH = 'student_data.csv'
TARGET_COL = 'Final_Grade'
TEST_SIZE = 0.2
RANDOM_STATE = 42

df = pd.read_csv(DATA_PATH)

# Step 3: Inspect the data
print(df.head())
df.info()  # info() prints its own report and returns None, so don't wrap it in print()
print(df.isnull().sum())

# Step 4: Separate features and target
# Rows without a target value cannot be used for supervised learning, and
# imputing the target would invent labels, so those rows are dropped instead.
# (Raises KeyError if TARGET_COL is not a column of the dataset.)
df = df.dropna(subset=[TARGET_COL])
X = df.drop(TARGET_COL, axis=1)  # Features
y = df[TARGET_COL]               # Target variable

# LabelEncoder is meant for targets; it is only fitted when the target is
# non-numeric (e.g. letter grades). Keep `le` to map predictions back.
le = LabelEncoder()
if not pd.api.types.is_numeric_dtype(y):
    y = pd.Series(le.fit_transform(y), index=y.index, name=TARGET_COL)

# Step 5: Split data into training and testing sets
# The split happens BEFORE any imputer/encoder/scaler is fitted so that no
# statistic of the test set leaks into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
# Work on explicit copies so the column assignments below never write into
# (or warn about) data shared with X.
X_train = X_train.copy()
X_test = X_test.copy()

# Step 6: Handle missing values (fit on the training set only)
# 'number' covers every numeric width, not just int64/float64.
num_cols = X.select_dtypes(include='number').columns
# Everything that is neither numeric nor boolean is treated as categorical.
cat_cols = X.select_dtypes(exclude=['number', 'bool']).columns

# The imputers reject a frame with zero columns, so each step is skipped when
# the dataset has no column of that kind.
if len(num_cols) > 0:
    # For numerical columns - fill missing values with the training mean
    num_imputer = SimpleImputer(strategy='mean')
    X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
    X_test[num_cols] = num_imputer.transform(X_test[num_cols])

if len(cat_cols) > 0:
    # For categorical columns - fill missing values with the most frequent
    # training value
    cat_imputer = SimpleImputer(strategy='most_frequent')
    train_cat = cat_imputer.fit_transform(X_train[cat_cols])
    test_cat = cat_imputer.transform(X_test[cat_cols])

    # Step 7: Encode categorical features
    # One OrdinalEncoder keeps a separate mapping per column. Categories that
    # only appear in the test set are encoded as -1 instead of raising.
    cat_encoder = OrdinalEncoder(
        handle_unknown='use_encoded_value', unknown_value=-1
    )
    X_train[cat_cols] = cat_encoder.fit_transform(train_cat)
    X_test[cat_cols] = cat_encoder.transform(test_cat)

# Step 8: Feature scaling (Standardization)
# Mean and standard deviation come from the training set; the test set is
# transformed with those same parameters. Both results are NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 9: Check the shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
