<a href="https://colab.research.google.com/github/optacool/Multi-outcome-Prediction-of-Student-Scores/blob/main/Multi_Output_Regression_Model_for_Student_Performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings

# Suppress future warnings for cleaner output
warnings.simplefilter(action='ignore', category=FutureWarning)

print("Starting the machine learning model creation process...")

# --- 1. Load the Dataset ---
# The dataset is read from 'StudentsPerformance.csv' in the current working
# directory and bound to `df` for the rest of the notebook.
try:
    # Only the read sits inside the try, so a failure while printing the
    # preview below can never be misreported as a missing file.
    df = pd.read_csv('StudentsPerformance.csv')
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): exit() is meant for interactive
    # shells, and inside a notebook it may shut down the kernel/runtime rather
    # than simply stopping this cell. Raising stops execution here with a
    # clear message and keeps the session alive.
    raise FileNotFoundError(
        "Error: StudentsPerformance.csv not found. Please ensure the file is in the correct directory."
    ) from err

print("Dataset loaded successfully.")
print("\nFirst 5 rows of the dataset:")
print(df.head())
print("\nDataset Info:")
df.info()  # info() writes its report directly, so it needs no print()

# --- 2. Separate Features (IDVs) and Targets (DVs) ---
# Columns used as model inputs (independent variables).
independent_variables = [
    'gender', 'race/ethnicity', 'parental level of education',
    'lunch', 'test preparation course',
]

# Columns the model has to predict (dependent variables).
dependent_variables = ['math score', 'reading score', 'writing score']

# Slice the loaded frame into an input frame and a target frame.
X = df[independent_variables]
y = df[dependent_variables]

print(f"\nIndependent Variables (X) shape: {X.shape}")
print(f"Dependent Variables (y) shape: {y.shape}")

# --- 3. Preprocessing Categorical Features ---
# Every column of X is treated as categorical, so all of them are one-hot encoded.
categorical_features = list(X.columns)

# A ColumnTransformer that routes the listed columns through OneHotEncoder.
# handle_unknown='ignore' lets transform() accept categories that were not
# present during fit instead of raising.
# No `remainder` argument is passed, so the library default applies
# (documented as 'drop'); since every column of X is listed, there are no
# leftover columns either way.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore'),
            categorical_features,
        ),
    ],
)

print("\nPreprocessing setup complete (One-Hot Encoding for categorical features).")

# --- 4. Splitting Data into Training and Testing Sets ---
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each of the four resulting pieces.
print(f"\nTraining data shape (X_train): {X_train.shape}")
print(f"Testing data shape (X_test): {X_test.shape}")
print(f"Training target shape (y_train): {y_train.shape}")
print(f"Testing target shape (y_test): {y_test.shape}")

# --- 5. Model Selection and Training ---
# Base estimator: a 100-tree RandomForestRegressor with a fixed seed.
# MultiOutputRegressor wraps it so a single fit/predict call covers all three
# score columns; per the scikit-learn documentation it does this by fitting
# one copy of the base estimator per target.
# The Pipeline chains the one-hot preprocessing and the regressor, so raw
# feature frames can be passed straight to fit() and predict().

print("\nInitializing the machine learning pipeline...")

base_forest = RandomForestRegressor(n_estimators=100, random_state=42)
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', MultiOutputRegressor(base_forest)),
])

print("Training the model... This might take a moment.")
model_pipeline.fit(X_train, y_train)
print("Model training complete!")

# --- 6. Prediction and Evaluation ---
print("\nMaking predictions on the test set...")
y_pred = model_pipeline.predict(X_test)

print("\nEvaluating model performance:")

# Score every target on its own: position i in dependent_variables selects
# column i of both the true targets and the predictions.
for i, dv_name in enumerate(dependent_variables):
    print(f"\n--- Evaluation for '{dv_name}' ---")

    actual_scores = y_test.iloc[:, i]
    predicted_column = y_pred[:, i]

    # Mean Absolute Error
    mae = mean_absolute_error(actual_scores, predicted_column)
    print(f"Mean Absolute Error (MAE): {mae:.2f}")

    # Mean Squared Error
    mse = mean_squared_error(actual_scores, predicted_column)
    print(f"Mean Squared Error (MSE): {mse:.2f}")

    # Coefficient of determination
    r2 = r2_score(actual_scores, predicted_column)
    print(f"R-squared (R2): {r2:.2f}")

print("\n--- Overall Model Performance ---")
# Passing the full two-dimensional targets yields a single figure per metric,
# aggregated over the three score columns with the metrics' default settings.
overall_mae = mean_absolute_error(y_test, y_pred)
overall_mse = mean_squared_error(y_test, y_pred)
overall_r2 = r2_score(y_test, y_pred)

print(f"Overall Mean Absolute Error (MAE) across all scores: {overall_mae:.2f}")
print(f"Overall Mean Squared Error (MSE) across all scores: {overall_mse:.2f}")
print(f"Overall R-squared (R2) across all scores: {overall_r2:.2f}")

print("\nModel creation and evaluation process finished.")

# Demonstration: predict the three scores for one hand-written student record.
print("\n--- Example Prediction for a New Student ---")

# One record keyed by the same feature column names the pipeline was trained on.
new_student_profile = {
    'gender': 'female',
    'race/ethnicity': 'group C',
    'parental level of education': 'some college',
    'lunch': 'standard',
    'test preparation course': 'completed',
}
new_student_data = pd.DataFrame([new_student_profile])

predicted_scores = model_pipeline.predict(new_student_data)
print(f"Input for prediction:\n{new_student_data.to_string(index=False)}")
print("\nPredicted scores for the new student:")

# Only one row was submitted, so row 0 holds every predicted score.
student_row = predicted_scores[0]
for i, dv_name in enumerate(dependent_variables):
    print(f"  {dv_name}: {student_row[i]:.2f}")

Starting the machine learning model creation process...
Dataset loaded successfully.

First 5 rows of the dataset:
   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  

Dataset Info:
<class 'pandas.core.frame.Dat

In [None]:
from google.colab import files

# Ask the user to pick files through Colab's upload helper; the result is
# iterated below as a mapping from file name to uploaded content.
# NOTE(review): this cell comes after the cell that reads
# 'StudentsPerformance.csv'; on a fresh runtime the upload presumably has to
# happen first -- consider moving this cell above the loading cell.
uploaded = files.upload()

# Confirm each upload by name and size.
for fn, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload)))