In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures

In [None]:
# Read the student-performance CSV from the working directory, then report
# the frame's dimensions and show its first five rows.
df = pd.read_csv("StudentPerformanceFactors.csv")
n_rows, n_cols = df.shape
print("Dataset Shape:", (n_rows, n_cols))
print(df.head(n=5))


In [None]:
# Count the NaN entries in each column so gaps are visible before modelling.
print("\nMissing values per column:\n", df.isna().sum())

In [None]:
# Integer-encode every object-dtype column in place so the frame is numeric.
# Each column gets its own LabelEncoder, kept in `label_encoders`, so the
# category -> code mapping stays available afterwards (for example via
# label_encoders[col].classes_ or .inverse_transform) instead of being
# overwritten when the next column is fitted.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes
# carry no ordinal meaning; one-hot encoding may suit the linear models
# better -- confirm before interpreting coefficients.
# NOTE(review): missing values in these columns are not handled here --
# check the per-column missing counts before relying on the encoded frame.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()  # fresh encoder per column; name kept for compatibility
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

print("\nData after encoding:\n", df.head())


In [None]:
# Separate the prediction target from the explanatory columns.
y = df["Exam_Score"]
X = df.drop(columns="Exam_Score")

In [None]:
# Summary of the frame as it stands at this point in the notebook.
# Every result is printed explicitly: a bare expression that is not the last
# line of a cell is evaluated and then silently discarded.
print("Shape:", df.shape)
df.info()  # info() writes its report itself and returns None
print(df.describe())
print(df.isnull().sum())


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Baseline model: fit a linear regression on the training split
# (fit() returns the estimator itself) and predict the held-out rows.
linear_model = LinearRegression().fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Hold-out metrics for the baseline; reported below.
r2_linear = r2_score(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)

print("\nLinear Regression Results")
print("MSE:", mse_linear)
print("R²:", r2_linear)


In [None]:
# Degree-2 polynomial regression evaluated on the SAME hold-out rows as the
# linear baseline.
# The existing X_train / X_test split is reused rather than re-splitting:
# a second train_test_split only lines up with the first while both calls
# share the same test_size and random_state, and it would rebind
# y_train / y_test that the other cells also read.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)  # learn the expansion on training rows
X_test_poly = poly.transform(X_test)        # apply the same expansion to test rows

# Expanded matrix for the full dataset, kept so the name stays defined.
X_poly = poly.transform(X)

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

y_pred_poly = poly_model.predict(X_test_poly)

mse_poly = mean_squared_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)

print("\nPolynomial Regression Results (degree=2)")
print("MSE:", mse_poly)
print("R²:", r2_poly)

In [None]:
# Side-by-side table of the hold-out metrics, one row per model.
results = pd.DataFrame([
    {"Model": "Linear Regression", "MSE": mse_linear, "R²": r2_linear},
    {"Model": "Polynomial Regression (deg=2)", "MSE": mse_poly, "R²": r2_poly},
])
print("\nComparison of Models:\n", results)

In [None]:
# Actual vs predicted exam scores for both models, drawn on one set of axes.
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(y_test, y_pred_linear, alpha=0.7, color="blue", label="Linear")
ax.scatter(y_test, y_pred_poly, alpha=0.5, color="red", label="Polynomial")
ax.set_xlabel("Actual Scores")
ax.set_ylabel("Predicted Scores")
ax.set_title("Actual vs Predicted (Linear vs Polynomial)")
ax.legend()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Pair each feature name with its fitted linear-regression coefficient,
# then order the table from the largest coefficient to the smallest.
coef_table = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": linear_model.coef_,
})
feature_importance = coef_table.sort_values(by="Coefficient", ascending=False)

print("\nFeature Importance (Linear Regression):\n", feature_importance)