In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
import pandas as pd
# Raw CSV hosted on GitHub; it is read straight from the URL, so this cell needs network access.
url = "https://raw.githubusercontent.com/PStarH/student-performance-factor-analysis/main/StudentPerformanceFactors.csv"
df = pd.read_csv(url)
# Preview the first rows of the freshly loaded frame.
df.head()

# Data Understanding

In [None]:
 # Column labels of the loaded frame.
 df.columns

In [None]:
 # (rows, columns) of the frame.
 df.shape

In [None]:
# Dtype and non-null count for every column.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics (describe() covers the numeric columns by default).
df.describe()

Data Cleaning 

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()  # author's note: no duplicated rows were found

In [None]:
# Missing values per column, re-checked before imputing.
df.isnull().sum()

In [None]:
# Category frequencies for Teacher_Quality (NaN is excluded by value_counts by default).
df['Teacher_Quality'].value_counts()  # author's note: fill with the mode ("78 vs 3925" — presumably missing rows vs. the top category; verify against the output)

In [None]:
# Impute missing Teacher_Quality values with the most frequent category
# (the original author notes that this is "Medium").
#
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the df[...] selection: that chained in-place
# form emits a FutureWarning on pandas 2.x and leaves df untouched once
# Copy-on-Write is active.
mode_value = df['Teacher_Quality'].mode()[0]
df['Teacher_Quality'] = df['Teacher_Quality'].fillna(mode_value)

In [None]:
# Number of Teacher_Quality values that are still missing.
df['Teacher_Quality'].isnull().sum()

In [None]:
# Category frequencies for Parental_Education_Level (NaN is excluded by default).
df['Parental_Education_Level'].value_counts()

In [None]:
values = ['High School', 'College', 'Postgraduate']
ratios = [3223, 1989, 1305]
ratios = np.array(ratios) / sum(ratios)

In [None]:
imputed_values = np.random.choice(values, size=90, p=ratios)

In [None]:
df.loc[df['Parental_Education_Level'].isnull(), 'Parental_Education_Level'] = imputed_values

In [None]:
# Number of Parental_Education_Level values that are still missing.
df['Parental_Education_Level'].isnull().sum()

In [None]:
# Category frequencies for Distance_from_Home (NaN is excluded by default).
df['Distance_from_Home'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of Distance_from_Home with a KDE overlay, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.histplot(df['Distance_from_Home'], kde=True, ax=ax)
ax.set_title("Distribution of Distance_from_Home")
plt.show()


In [None]:
# Impute missing Distance_from_Home values with the most frequent category.
#
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the df[...] selection: that chained in-place
# form emits a FutureWarning on pandas 2.x and leaves df untouched once
# Copy-on-Write is active.
mode_value = df['Distance_from_Home'].mode()[0]
df['Distance_from_Home'] = df['Distance_from_Home'].fillna(mode_value)

In [None]:
# Number of Distance_from_Home values that are still missing.
df['Distance_from_Home'].isnull().sum()

In [None]:
# Final per-column missing-value check after all imputations.
df.isnull().sum()

In [None]:
# One box per numeric column, all sharing a single axis (the columns are not
# rescaled here, so boxes on very different scales are drawn side by side).
numeric_columns = df.select_dtypes(include='number')
fig, ax = plt.subplots()
ax.boxplot(
    numeric_columns.to_numpy(),
    labels=numeric_columns.columns,
    vert=True,
)
plt.show()

In [None]:
# Rescale every numeric column to [0, 1] and stitch the result back together
# with the non-numeric columns.
# NOTE(review): numeric_columns is selected by dtype only, so it presumably
# includes the Exam_Score target, and the scaler is fitted on all rows before
# any train/test split. The modelling cells visible in this notebook train on
# `df`, not on `df_final` — confirm which frame is intended.
scaler = MinMaxScaler()
normalized = scaler.fit_transform(numeric_columns)
# Carry the original row index over: fit_transform returns a bare ndarray, and
# concat(axis=1) aligns on the index, so a fresh RangeIndex would misalign rows
# whenever df's index is not the default 0..n-1.
df_normalized = pd.DataFrame(
    normalized,
    columns=numeric_columns.columns,
    index=numeric_columns.index,
)
df_final = pd.concat([df_normalized, df.drop(columns=numeric_columns.columns)], axis=1)

In [None]:
# Dtypes and non-null counts after imputation (this inspects df, not df_final).
df.info()

In [None]:
# Preview the rows before the categorical columns are encoded.
df.head()

In [None]:
order = {'Low': 0, 'Medium': 1, 'High': 2}
df['Parental_Involvement'] = df['Parental_Involvement'].map(order)

In [None]:
df['Access_to_Resources'].value_counts()

In [None]:
order={'Low':0, 'Medium':1, 'High':2}
df['Access_to_Resources']=df['Access_to_Resources'].map(order)

In [None]:
df['Extracurricular_Activities'] = df['Extracurricular_Activities'].map({'Yes':1, 'No':0})

In [None]:
order={'Low':0, 'Medium':1, 'High':2}
df['Motivation_Level']=df['Motivation_Level'].map(order)

In [None]:
df['Internet_Access']=df['Internet_Access'].map({'Yes':1, 'No':0})

In [None]:
order={'Low':0, 'Medium':1, 'High':2}
df['Family_Income']=df['Family_Income'].map(order)

In [None]:
order={'Low':0, 'Medium':1, 'High':2}
df['Teacher_Quality']=df['Teacher_Quality'].map(order)

In [None]:
df['School_Type'].value_counts()

In [None]:
df['School_Type']=df['School_Type'].map({'Public':1, 'Private':0})

In [None]:
df['Peer_Influence'].value_counts()

In [None]:
order={'Positive':0,'Neutral':1,'Negative':2}
df['Peer_Influence']=df['Peer_Influence'].map(order)

In [None]:
df['Learning_Disabilities']=df['Learning_Disabilities'].map({'Yes':1, 'No':0})

In [None]:
df['Parental_Education_Level'].value_counts()

In [None]:
ord={'High School':0, 'College':1, 'Postgraduate':2}
df['Parental_Education_Level']=df['Parental_Education_Level'].map(ord)

In [None]:
df['Distance_from_Home'].value_counts()

In [None]:
ord={'Near':0,'Moderate':1,'Far':2}
df['Distance_from_Home']=df['Distance_from_Home'].map(ord)

In [None]:
df['Gender']=df['Gender'].map({'Female':1,'Male':0})

Modeling

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [None]:
x=df.drop(columns=['Exam_Score'])
y=df['Exam_Score']

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
model=LinearRegression()

In [None]:
model.fit(x_train,y_train)

In [None]:
y_pred=model.predict(x_test)

In [None]:
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

In [None]:
r2 = r2_score(y_test, y_pred)
print("R² Score:", r2)

In [None]:
# Side-by-side view of true and predicted exam scores for the held-out rows.
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})
print(results_df.head())

# Error metrics for the baseline linear model.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n model value :")
print(f"Mean Squared Error (MSE): {mse:.4f}")
# RMSE was computed above but never shown; report it with the other metrics.
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")
print('********************************************')

# Actual vs. predicted scatter; the dashed red diagonal marks perfect predictions.
plt.figure(figsize=(8,5))
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted')
plt.show()

In [None]:
# ===== BONUS PART 1: polynomial regression (degree 2) =====

In [None]:
from sklearn.preprocessing import PolynomialFeatures  #call it

In [None]:
poly=PolynomialFeatures(degree=2,include_bias=False)   #2 make it ^2

In [None]:
x_train_ploy=poly.fit_transform(x_train)
x_test_poly=poly.transform(x_test)

In [None]:
poly_model=LinearRegression()

In [None]:
poly_model.fit(x_train_ploy,y_train)

In [None]:
y_pred_poly=poly_model.predict(x_test_poly)

In [None]:
r2_poly=r2_score(y_test,y_pred_poly)
r2_poly

In [None]:
# Compare the two models on R². R² is a unit-less score (at most 1), so it is
# printed as a plain 4-decimal number, matching the baseline report; the former
# "{:.2f}%" format made 0.73 read as "0.73%".
print("\n Linear Regression (Original Model)")
print(f"R²: {r2:.4f}")
print('*************************************')
print(" Polynomial Regression Results (degree=2)")
print(f"R²: {r2_poly:.4f}")

In [None]:
# ===== BONUS PART 2: feature-subset comparison =====

In [None]:
df.head()

In [None]:
feature_sets = {
    "All Features": list(x_train.columns),
    "Without Sleep": [col for col in x_train.columns if col != 'Sleep_Hours'],
    "Without Participation": [col for col in x_train.columns if col != 'Extracurricular_Activities'],
    "Only Sleep & Participation": ['Sleep_Hours', 'Extracurricular_Activities'],
    "Without Sleep & Participation": [col for col in x_train.columns if col not in ['Sleep_Hours', 'Extracurricular_Activities']]
}
results = []

In [None]:
# Fit and score one linear model per feature subset.
# The whole fit -> predict -> score sequence must live inside the loop body.
# Previously only the column selection was in the loop and the rest sat in
# later cells, so just the last subset was evaluated and its row reused the
# stale `rmse` left over from the baseline report.
results = []  # start clean so re-running this cell does not append duplicate rows
for name, features in feature_sets.items():
    X_train_sel = x_train[features]
    X_test_sel = x_test[features]

    # A fresh estimator per subset leaves the baseline `model` (all features) intact.
    subset_model = LinearRegression().fit(X_train_sel, y_train)
    subset_pred = subset_model.predict(X_test_sel)

    subset_mse = mean_squared_error(y_test, subset_pred)
    subset_rmse = np.sqrt(subset_mse)  # derived from this subset's own MSE
    subset_r2 = r2_score(y_test, subset_pred)

    # Row layout: [feature-set name, MSE, RMSE, R²].
    results.append([name, subset_mse, subset_rmse, subset_r2])

In [None]:
# Tabulate the collected metric rows, one row per feature set.
summary_columns = ["Feature Set", "MSE", "RMSE", "R²"]
results_df = pd.DataFrame(data=results, columns=summary_columns)
print(results_df)