In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the raw dataset from its absolute Windows location.
csv_path = r"D:\Projects\Academic Score Prediction Model\ResearchInformation3.csv"
df = pd.read_csv(csv_path)

# Preliminary feature/target arrays: positional columns 4..12 as features and
# the last column as the target. Their shapes are printed as a quick sanity check.
x, y = df.iloc[:, 4:13].values, df.iloc[:, -1].values
for preview in (x, y):
    print(preview.shape)

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Target vector: the last column of the frame.
y = df.iloc[:, -1].values

# "Income" -> dense one-hot indicator columns with the first level dropped.
encoder_Income = OneHotEncoder(sparse_output=False, drop="first")
income_encoded = encoder_Income.fit_transform(df[["Income"]])

# "Hometown", "Job", "Extra" -> integer codes; the category order is left to
# the encoder's default.
nominal_columns = ["Hometown", "Job", "Extra"]
encoder_nominal = OrdinalEncoder()
nominal_encoded = encoder_nominal.fit_transform(df[nominal_columns])

# "Preparation" and "Gaming" share one explicit ordering of time buckets, so
# each column is encoded separately with the same category list
# (codes 0, 1, 2 follow the order written below).
categorical_columns = ["Preparation", "Gaming"]
time_categories = [["0-1 Hour", "2-3 Hours", "More than 3 Hours"]]
time_encoded = []
for col in categorical_columns:
    encoder_columns = OrdinalEncoder(categories=time_categories)
    codes = encoder_columns.fit_transform(df[[col]])
    time_encoded.append(codes)

# "Attendance" -> ordinal codes following the explicit bucket order below.
attendance_categories = [["Below 40%", "40%-59%", "60%-79%", "80%-100%"]]
encoder_Attendance = OrdinalEncoder(categories=attendance_categories)
attendance_encoded = encoder_Attendance.fit_transform(df[["Attendance"]])

# Columns passed through without encoding.
numeric_features = df[["Computer", "Last"]].values

# Final design matrix: all encoded pieces side by side, column-wise, in the
# order income, nominal, attendance, time buckets, numeric.
feature_blocks = [
    income_encoded,
    nominal_encoded,
    attendance_encoded,
    *time_encoded,
    numeric_features,
]
x = np.concatenate(feature_blocks, axis=1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, random_state=42, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise features: scaling statistics are learned from the training rows
# only, then the same fitted scaler is applied to the held-out rows.
scaler = StandardScaler()
x_train, x_test = scaler.fit_transform(x_train), scaler.transform(x_test)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Candidate regressors, keyed by the label used in later result tables.
# The tree-based models get a fixed random_state so that refitting on the same
# split reproduces the same model (the split itself is already seeded with 42).
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(max_depth=3, random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=3, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, max_depth=2, learning_rate=0.1, random_state=42),
}

# Fit every model on the training split and keep its training-set score
# (R^2 for these regressors) instead of discarding it, so it can be compared
# with the test metrics to spot over- or under-fitting.
train_scores = {}
for name, model in models.items():
    model.fit(x_train, y_train)
    train_score = model.score(x_train, y_train)
    train_scores[name] = train_score
    print(f"{name}: train R2 = {train_score:.3f}")



In [None]:
# Test-set predictions of every fitted model, keyed by model label.
predictions = {name: model.predict(x_test) for name, model in models.items()}


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def _regression_metrics(y_true, y_hat):
    """Return R2, MAE, MSE and RMSE of `y_hat` against `y_true` as a dict."""
    mse = mean_squared_error(y_true, y_hat)
    return {
        "R2": r2_score(y_true, y_hat),
        "MAE": mean_absolute_error(y_true, y_hat),
        "MSE": mse,
        "RMSE": np.sqrt(mse),  # root of the MSE computed above
    }


# One record per model: its label followed by the test-set metrics.
results = []
for name, y_pred in predictions.items():
    results.append({"Model": name, **_regression_metrics(y_test, y_pred)})

In [None]:
# Keep the unrounded metrics for model selection; round a copy for display only.
results_full = pd.DataFrame(results)
results_df = np.round(results_full, 2)
print(f"\nPerformance Comparison:\n {results_df}")

# Pick best/worst from the unrounded R2. Rounding to two decimals first can
# create artificial ties, and idxmax/idxmin then return the first tied row,
# which may not be the true best/worst model.
the_best_model = results_full.loc[results_full['R2'].idxmax(), 'Model']
print(f"\nthe best model: {the_best_model}")

the_worst_model = results_full.loc[results_full['R2'].idxmin(), 'Model']
print(f"the worst model: {the_worst_model}")

In [None]:
metrics = ['R2', 'MAE', 'MSE','RMSE']

# One bar chart per metric, with each bar annotated with its value.
for metric in metrics:
    fig, ax = plt.subplots()
    metric_values = results_df[metric]
    ax.bar(results_df['Model'], metric_values, color='skyblue')
    ax.set_title(f'{metric} Comparison of Models')
    ax.set_ylabel(metric)
    ax.set_xlabel('Models')
    # Bars sit at integer x positions 0..n-1, so the row position is the label's x.
    for position, value in enumerate(metric_values):
        ax.text(position, value, f'{value:.2f}', ha='center')
    plt.show()