In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from prettytable import PrettyTable

In [None]:
# DATASET

# Load the raw student-performance records from the CSV file
df = pd.read_csv("Student_performance_data _.csv")

# Rescale GPA from the 0-4 scale onto a 0-5 scale:
# first express it as a fraction of the old maximum, then stretch to the new one
gpa_fraction = df['GPA'].div(4)
df['GPA'] = gpa_fraction.mul(5)

# Preview the first rows of the rescaled frame
print(df.head())

In [None]:
# MISSING DATA

# Boolean mask of the frame: True wherever a cell is null
null_mask = df.isnull()

# Number of missing entries in each column
missing_data = null_mask.sum()
print(missing_data)

# Draw the mask itself so that gaps show up as contrasting cells
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Data Heatmap')
plt.show()

In [None]:
# CATEGORICAL DATA HANDLING

# Columns that get expanded into indicator (dummy) variables
categorical_columns = ['Gender', 'Ethnicity', 'ParentalEducation']

# One-hot encode them, dropping the first level of each column
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Every boolean column in the encoded frame is recast to 1/0 integers
boolean_columns = df_encoded.select_dtypes(include=['bool']).columns
df_encoded = df_encoded.astype({column: int for column in boolean_columns})

# Preview the encoded frame
print(df_encoded.head())


In [None]:
# DISTRIBUTIONS FOR NUMERICAL FEATURES

# Columns whose individual distributions are inspected below
numerical_features = ['Age', 'StudyTimeWeekly', 'Absences', 'GPA']

# Histograms: one panel per feature (DataFrame.hist builds its own figure from `figsize`)
df_encoded[numerical_features].hist(bins=15, figsize=(12, 8), color='skyblue', edgecolor='black')
plt.suptitle('Histograms for Numerical Features', fontsize=16)
plt.show()

# Box plots: one panel per feature in a 2x2 grid with independent axes.
# No plt.figure() call precedes this on purpose: with subplots=True pandas
# creates its own figure from `figsize`, so a manually opened figure would stay
# empty and only show up as a blank extra output.
df_encoded[numerical_features].plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False, figsize=(12, 8), color='skyblue')
plt.suptitle('Box Plots for Numerical Features', fontsize=16)
plt.show()

# Pairwise correlations across every column of the encoded frame
# (DataFrame.corr defaults to the Pearson coefficient)
corr_matrix = df_encoded.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix for Numerical Features', fontsize=16)
plt.show()

In [None]:
#REGRESSION MODEL

# Columns left out of the modelling frame entirely
excluded_columns = ['StudentID', 'Age', 'Volunteering', 'Ethnicity', 'ParentalEducation']
df_updated = df.drop(columns=excluded_columns)

# Target is GPA; GradeClass is removed from the predictors together with it
y = df_updated['GPA']
X = df_updated.drop(columns=['GPA', 'GradeClass'])

# First split: 70% for training, 30% held back
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Second split: halve the held-back part into validation and test (15% each overall)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Baseline: linear regression trained on the raw (unscaled) features,
# evaluated on the validation split
model_no_scaling = LinearRegression()
model_no_scaling.fit(X_train, y_train)
y_val_pred_no_scaling = model_no_scaling.predict(X_val)

# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to the validation split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Second model: linear regression trained on the standardised features
model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_val_pred = model.predict(X_val_scaled)

# Validation RMSE of each model
rmse_no_scaling = root_mean_squared_error(y_val, y_val_pred_no_scaling)
rmse = root_mean_squared_error(y_val, y_val_pred)

# Report both scores
print(f"Root Mean Squared Error (Scaled Model): {rmse}")
print(f"Root Mean Squared Error (Non-Scaled Model): {rmse_no_scaling}")

# Labels and RMSE values in plotting order (non-scaled model first)
labels = ['Non-Scaled Model', 'Scaled Model']
rmse_values = [rmse_no_scaling, rmse]

# Side-by-side frame of the actual GPA and both models' validation predictions
comparison_df = pd.DataFrame(
    {
        'Actual GPA': y_val,
        'Predicted (Non-Scaled)': y_val_pred_no_scaling,
        'Predicted (Scaled)': y_val_pred,
    }
)

# Text table showing the first ten comparison rows
column_titles = ["Actual GPA", "Predicted (Non-Scaled)", "Predicted (Scaled)"]
table = PrettyTable()
table.field_names = column_titles

# Walk the preview rows as NumPy records, in the same column order as the header
for record in comparison_df.head(10)[column_titles].to_numpy():
    table.add_row(list(record))

print(table)

# Bar chart comparing the validation RMSE of the two models
fig, ax = plt.subplots()
ax.bar(labels, rmse_values, color=['blue', 'red'])
ax.set_ylabel('Root Mean Squared Error (RMSE)')
ax.set_title('Comparison of RMSE for Different Models')
ax.set_ylim(0, max(rmse_values) + 1)  # y-axis runs from 0 to one unit above the larger RMSE
plt.show()


# Predicted-vs-actual scatter for the non-scaled model
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_val, y_val_pred_no_scaling, color='blue', label='GPA Predictions', alpha=0.6)

# Diagonal reference line: points on it are predicted exactly
gpa_bounds = [y_val.min(), y_val.max()]
ax.plot(gpa_bounds, gpa_bounds, color='black', lw=2, label='Perfect Prediction')

ax.set_xlabel('Actual GPA')
ax.set_ylabel('Predicted GPA')
ax.set_title('Predicted vs Actual GPA (Non-Scaled)')
ax.legend()
plt.show()

