In [3]:
# Colab-specific: mount Google Drive at /content/drive so files stored there
# (the dataset path defined below lives under this mount point) can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Import necessary libraries**

In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# **Path to dataset (adjust path if needed)**

In [5]:
# Location of the exam-scores CSV inside the mounted Google Drive.
path = "/content/drive/MyDrive/exams.csv"

# **Load dataset**

In [6]:
# Read the CSV at `path` into `df`, the working DataFrame that the
# preprocessing, scaling and modelling cells below read and modify.
df = pd.read_csv(path)

#  **Data Preprocessing**

In [7]:
# Check that every column the preprocessing cell below reads is present, so a
# wrong or renamed CSV fails here with a clear message instead of a KeyError
# part-way through the encoding steps.
# (This cell previously re-assigned `path` to the same value it already had.)
required_columns = {
    'math score', 'reading score', 'writing score',
    'lunch', 'parental level of education', 'test preparation course',
    'gender', 'race/ethnicity',
}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"Dataset is missing expected columns: {sorted(missing_columns)}"

In [8]:
# --- Average score -----------------------------------------------------------
# Row-wise mean of the three exam scores, rounded and stored as an integer.
score_columns = ['math score', 'reading score', 'writing score']
mean_score = df[score_columns].mean(axis=1)
df['average_score'] = mean_score.round().astype(int)

# --- Ordinal encoding --------------------------------------------------------
# 'lunch' becomes 0 for 'free/reduced' and 1 for 'standard'.
lunch_encoder = OrdinalEncoder(categories=[['free/reduced', 'standard']])
df['lunch'] = lunch_encoder.fit_transform(df[['lunch']])

# Explicit category orders; a value's position in its list is its encoded rank.
education_order = [
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
prep_order = ['none', 'completed']

# Column -> category order, kept together so the two lists cannot drift apart.
ordinal_spec = {
    'parental level of education': education_order,
    'test preparation course': prep_order,
}
ordinal_columns = list(ordinal_spec.keys())
categories = list(ordinal_spec.values())

# One encoder handles both columns; it stays available as `encoder`.
encoder = OrdinalEncoder(categories=categories)
df[ordinal_columns] = encoder.fit_transform(df[ordinal_columns])

# --- One-hot encoding --------------------------------------------------------
# drop_first=True removes the first level of each variable, so that level is
# represented by all of its remaining indicator columns being 0.
df = pd.get_dummies(df, columns=['gender', 'race/ethnicity'], drop_first=True)

# --- Clean-up ----------------------------------------------------------------
# The individual exam scores are replaced by 'average_score' from here on.
df = df.drop(columns=score_columns)

# **Feature Scaling**

In [9]:
# Learn the mean / standard deviation of 'average_score'; the fitted object is
# kept in `scaler` because it is saved to disk at the end of the notebook.
scaler = StandardScaler()
scaler.fit(df[['average_score']])

# Standardised copy of the score, stored alongside the original column.
df['average_score_scaled'] = scaler.transform(df[['average_score']])

def classify(score, high_threshold=85, medium_threshold=70):
    """Map a numeric average score to a performance label.

    Parameters
    ----------
    score : int or float
        Average score on the original (unscaled) scale.
    high_threshold : int or float, default 85
        Scores greater than or equal to this value are labelled 'high'.
    medium_threshold : int or float, default 70
        Scores greater than or equal to this value (and below
        ``high_threshold``) are labelled 'medium'.

    Returns
    -------
    str
        'high', 'medium' or 'low'. Any score for which both comparisons are
        false (including NaN) falls through to 'low'.
    """
    if score >= high_threshold:
        return 'high'
    if score >= medium_threshold:
        return 'medium'
    return 'low'

# Build the target from the exact integer 'average_score' column.
# The scaled copy is deliberately NOT inverse-transformed back first:
# (x - mean) / std * std + mean is not exact in floating point, so an integer
# score sitting on a threshold (e.g. 85) could come back as 84.999... and be
# assigned the wrong label.
df['performance_level'] = df['average_score'].apply(classify)

# The unscaled average was only needed to derive the label; remove it so it is
# not picked up as a feature.
df = df.drop(columns=['average_score'])

# **Prepare Features and Target**

In [10]:
# Target: the categorical performance label.
y = df['performance_level']

# Features: every remaining column of the frame.
# NOTE(review): this includes 'average_score_scaled', a standardised copy of
# the very score that 'performance_level' was thresholded from, so the model
# can recover the label from that single column. That likely explains the
# near-perfect test accuracy -- confirm this is intended before relying on the
# reported metrics.
X = df.drop(columns=['performance_level'])

# **Train/Test Split**

In [11]:
# Hold out 20% of the rows for testing; stratify on the label so each class
# keeps its proportion in both parts, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# **Model Training**

In [12]:
# Random forest with default hyper-parameters and a fixed seed, fitted on the
# training split (fit returns the estimator itself, so it can be chained).
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Labels predicted for the held-out rows; consumed by the evaluation cell.
y_pred = clf.predict(X_test)

# **Evaluation**

In [13]:
# Compute the three summaries first, then print them in the same order.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy on test data:", test_accuracy)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy on test data: 0.995

Classification Report:
               precision    recall  f1-score   support

        high       1.00      0.97      0.98        29
         low       1.00      1.00      1.00        99
      medium       0.99      1.00      0.99        72

    accuracy                           0.99       200
   macro avg       1.00      0.99      0.99       200
weighted avg       1.00      0.99      0.99       200


Confusion Matrix:
 [[28  0  1]
 [ 0 99  0]
 [ 0  0 72]]


# **Predicting on a Random Sample**

In [15]:
# Column order the classifier was trained on
feature_columns = X.columns.tolist()

# Seeded generator so this demo prints the same sample on every
# Restart & Run All (same seed value as the split and the model).
rng = np.random.default_rng(42)

# Indicator columns created by get_dummies(..., drop_first=True). The dropped
# first category (presumably 'group A') has no column of its own: it is
# encoded as all four indicators being 0.
race_groups = ['race/ethnicity_group B', 'race/ethnicity_group C', 'race/ethnicity_group D', 'race/ethnicity_group E']

# Draw a valid value for every non-race feature
random_sample = {
    'parental level of education': rng.integers(0, 6),  # ordinal encoded 0-5
    'lunch': rng.integers(0, 2),  # binary 0 or 1
    'test preparation course': rng.integers(0, 2),  # binary 0 or 1
    'gender_male': rng.integers(0, 2),  # binary 0 or 1
    'average_score_scaled': rng.normal(0, 1),  # standardised score
}

# Start with every race indicator off, then switch on at most one.
# Index len(race_groups) stands for the dropped baseline category, so it stays
# reachable; previously one of B-E was always forced to 1 and the baseline
# could never be sampled.
random_sample.update({column: 0 for column in race_groups})
group_index = rng.integers(0, len(race_groups) + 1)
if group_index < len(race_groups):
    random_sample[race_groups[group_index]] = 1

# Single-row frame with columns in training order
random_df = pd.DataFrame([random_sample], columns=feature_columns)

# Predict on the random sample
random_pred = clf.predict(random_df)

print("\nRandom Sample Input Features:\n", random_df)

print("\nPredicted Performance Level for Random Sample:", random_pred[0])


Random Sample Input Features:
    parental level of education  lunch  test preparation course  gender_male  \
0                            2      1                        1            1   

   race/ethnicity_group B  race/ethnicity_group C  race/ethnicity_group D  \
0                       0                       0                       1   

   race/ethnicity_group E  average_score_scaled  
0                       0             -0.532516  

Predicted Performance Level for Random Sample: low


# **Saving the Model**

In [16]:
import joblib

# Output file names for the two persisted artifacts
MODEL_FILE = 'model.pkl'
SCALER_FILE = 'scaler.pkl'

# Persist the fitted classifier
joblib.dump(clf, MODEL_FILE)

# Persist the fitted scaler as well, so raw average scores can be standardised
# the same way later. Kept as the last expression so its return value remains
# the cell's displayed output.
joblib.dump(scaler, SCALER_FILE)

['scaler.pkl']