# In [None]:
# Student Performance Analyzer
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Read the raw student records from disk and wrap them in a DataFrame
DATASET_PATH = 'StudentsPerformance.csv'
data = pd.read_csv(DATASET_PATH)
df = pd.DataFrame(data)

# Quick sanity check: show the first ten rows.
# (df.info() and df.describe() give a fuller overview when needed.)
print(df.head(n=10))

# Target: each student's mean over the three exam scores. The value is stored
# as the 'Average Score' column on df and then bound to y exactly once.
score_columns = ['math score', 'reading score', 'writing score']
df['Average Score'] = df[score_columns].mean(axis=1)
y = df['Average Score']

# Features: three categorical columns, one-hot encoded into indicator columns.
# drop_first=True keeps k-1 indicator columns for a feature with k levels.
feature_columns = ['parental level of education', 'lunch', 'test preparation course']
X = pd.get_dummies(df[feature_columns], drop_first=True)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Build the linear regression model and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Predict the average score for every row of the held-out test split
y_pred = model.predict(X_test)

# Evaluate the model on the held-out test split.
# - MAE (Mean Absolute Error): the average magnitude of the prediction errors,
#   without considering their direction.
# - MSE (Mean Squared Error): the average of the squared differences between
#   the predicted values and the actual values.
# - R² Score (Coefficient of Determination): the proportion of the variance in
#   the dependent variable that is predictable from the independent variables.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


# Plot actual against predicted average scores on the current axes
scatter_ax = plt.gca()
scatter_ax.scatter(y_test, y_pred, color='blue')
scatter_ax.set(
    xlabel='Actual Average Scores',
    ylabel='Predicted Average Scores',
    title='Actual vs Predicted Average Scores',
)

# Dashed diagonal: points on this line are predicted exactly right
scatter_ax.plot([0, 100], [0, 100], color='red', linestyle='--')

# Pin both axes to the same 0-100 range
scatter_ax.set(xlim=(0, 100), ylim=(0, 100))
plt.show()

# Analyze the impact of the test preparation course on average scores:
# mean 'Average Score' per 'test preparation course' group.
prep_course = df.groupby('test preparation course')['Average Score'].mean()

# Draw on explicitly created axes. Without ax=..., Series.plot uses the axes
# of the current figure, so the bars could be drawn over an earlier plot when
# plt.show() has not closed it (e.g. under a non-interactive backend).
prep_fig, prep_ax = plt.subplots()
prep_course.plot(kind='bar', color=['orange', 'green'], ax=prep_ax)
prep_ax.set_xlabel('Test Preparation Course')
prep_ax.set_ylabel('Average Score')
prep_ax.set_title('Impact of Test Preparation Course on Average Scores')
plt.show()
