# Multi-Feature Linear Regression for Exam Scores
Linear regression extends naturally to multiple features. We combine **study hours**, **attendance**, and **internal marks** to predict final exam scores. K-Fold Cross-Validation divides the dataset into `k` parts (folds), trains on `k-1` of them, tests on the remaining one, and repeats until every fold has served as the test set once. Averaging the per-fold scores gives a more stable estimate of model performance than a single train/test split.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold  # only for splitting

# Step 1: Load the dataset
df = pd.read_csv(r"C:\Users\avishkar\Downloads\student_exam_scores_12_13.csv")

# Step 2: Select features and target variable
# The column order here fixes the order of the coefficients learned later.
feature_columns = ['hours_studied', 'attendance_percent', 'Internal_marks']
X = df[feature_columns].to_numpy()
y = df['exam_score'].to_numpy().reshape(-1, 1)  # column vector, one row per sample

# Step 3: Add bias (column of 1s for intercept)
# The ones column goes first, so the first coefficient is the intercept.
intercept_column = np.ones((X.shape[0], 1))
X_b = np.hstack([intercept_column, X])

# Step 4: Train Linear Regression (least-squares solution of the Normal Equation)
def linear_regression_train(X, y):
    """Fit ordinary least-squares coefficients for ``X @ theta ~= y``.

    Solves the same problem as the normal equation
    ``theta = (X^T X)^-1 X^T y`` but does not form or invert ``X^T X``
    explicitly. The explicit inverse raises ``LinAlgError`` (or returns
    numerically meaningless values) when columns of ``X`` are collinear;
    ``np.linalg.lstsq`` returns the minimum-norm least-squares solution
    in that case and the identical solution when ``X`` has full column rank.

    Parameters:
        X: 2-D array of shape (n_samples, n_features). Include a column of
           ones if an intercept term is wanted.
        y: Target array of shape (n_samples, 1) or (n_samples,).

    Returns:
        theta: Coefficient array of shape (n_features, 1) when ``y`` is a
        column vector, or (n_features,) when ``y`` is 1-D.
    """
    # rcond=None selects NumPy's machine-precision-based cutoff for small
    # singular values and avoids the legacy-default FutureWarning.
    theta, _residuals, _rank, _singular_values = np.linalg.lstsq(X, y, rcond=None)
    return theta

def linear_regression_predict(X, theta):
    """Return the linear model's predictions for the rows of ``X``.

    Parameters:
        X: Design matrix whose columns line up with the entries of ``theta``
           (including the leading ones column if an intercept was fitted).
        theta: Coefficient array as returned by the training step.

    Returns:
        The matrix product of ``X`` and ``theta``.
    """
    predictions = X.dot(theta)
    return predictions

# Step 5: K-Fold Cross-Validation (Manual)
# KFold only supplies the row indices of each split; fitting and scoring are
# done with the from-scratch functions defined earlier in this script.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# One entry per fold, appended in the order the folds are produced.
mse_scores = []
r2_scores = []

for train_index, test_index in kf.split(X_b):
    # Fit on this fold's training rows only.
    theta = linear_regression_train(X_b[train_index], y[train_index])

    # Predict the held-out rows.
    y_test = y[test_index]
    y_pred = linear_regression_predict(X_b[test_index], theta)

    # Squared residuals feed both the MSE (their mean) and SS_res (their sum).
    squared_errors = (y_test - y_pred) ** 2
    squared_deviations = (y_test - np.mean(y_test)) ** 2
    ss_res = np.sum(squared_errors)
    ss_total = np.sum(squared_deviations)

    mse_scores.append(np.mean(squared_errors))
    # R^2 = 1 - SS_res / SS_total, measured on the held-out fold.
    r2_scores.append(1 - (ss_res / ss_total))

# Step 6: Print average results
# The placeholders are filled with str.format(); passing the value as a second
# print() argument would emit the literal "{:.2f}" text next to the number.
# The fold count is taken from k so the header matches the split actually used.
print("\nLinear Regression (from scratch) using {}-Fold Cross-Validation".format(k))
print("--------------------------------------------------------------")
print("Average MSE: {:.2f}".format(np.mean(mse_scores)))
print("Average R² Score: {:.3f}".format(np.mean(r2_scores)))

# Step 7: Train final model on full dataset
theta_final = linear_regression_train(X_b, y)

print("\nModel Coefficients (θ):")
# Row i of theta_final belongs to the i-th label below: the bias column comes
# first in X_b, followed by the feature columns in the order selected for X.
coefficient_labels = (
    "Intercept",
    "hours_studied",
    "attendance_percent",
    "Internal_marks",
)
for row, label in enumerate(coefficient_labels):
    print("{}: {:.3f}".format(label, theta_final[row][0]))

# Step 8: Predict for new input
# Each prompt is parsed with float(); the values are read in the same order as
# the feature columns of the training matrix.
new_hours = float(input("\nEnter study hours: "))
new_attendance = float(input("Enter attendance percent: "))
new_internal = float(input("Enter internal marks: "))

# Single-row design matrix: the leading 1 multiplies the intercept coefficient.
new_row = [1, new_hours, new_attendance, new_internal]
new_X = np.array([new_row])
predicted_score = linear_regression_predict(new_X, theta_final)
print("\nPredicted Exam Score: {:.2f}".format(predicted_score[0, 0]))

# Step 9: Visualization - Actual vs Predicted
# Predictions of the final model on the same rows it was trained on.
y_pred_full = linear_regression_predict(X_b, theta_final)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y, y_pred_full, color='blue', edgecolors='k', alpha=0.7)

# Red diagonal across the observed score range: a point on this line has a
# predicted score equal to its actual score.
score_range = [y.min(), y.max()]
ax.plot(score_range, score_range, color='red', linewidth=2)

ax.set_title("Actual vs Predicted Exam Scores", fontsize=14)
ax.set_xlabel("Actual Exam Score", fontsize=12)
ax.set_ylabel("Predicted Exam Score", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()
