# 🎓 Student Performance Prediction

**Objective:** Predict student performance index based on study hours, previous scores, and other factors.

**Dataset:** Student Performance Dataset from Kaggle

**ML Algorithm:** Linear Regression

---

## 1. Import Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Silence library warnings so the notebook output stays readable.
# NOTE: this hides every warning category, including deprecation notices.
import warnings
warnings.filterwarnings('ignore')

# Set visualization style.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*';
# older releases only know the un-versioned name. Pick whichever this
# installation provides, and fall back to the preferred name so a missing
# style still raises the same error as before.
_style_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
_plot_style = next(
    (name for name in _style_candidates if name in plt.style.available),
    _style_candidates[0],
)
plt.style.use(_plot_style)
sns.set_palette('husl')

print('‚úÖ All libraries imported successfully!')

## 2. Load Dataset

In [None]:
# Read the student-performance CSV (relative path) into a DataFrame
dataset_path = '../data/Student_Performance.csv'
df = pd.read_csv(dataset_path)

# Preview the top of the table; the bare expression is the cell's output
print('üìä First 5 rows of the dataset:')
df.head()

## 3. Data Exploration (EDA)

In [None]:
# Report the table dimensions: df.shape is (rows, columns)
row_count, column_count = df.shape
print(f'üìè Dataset Shape: {row_count} rows, {column_count} columns')
print()

In [None]:
# Column names and data types.
# DataFrame.info() prints its report itself, so no print() wrapper is needed.
print('üìã Column Information:')
df.info()

In [None]:
# Statistical summary of numerical columns.
# describe() is left as the final bare expression so the notebook renders
# the resulting summary table as the cell output.
print('üìà Statistical Summary:')
df.describe()

In [None]:
# Count missing entries per column, then report the overall total
missing = df.isna().sum()
total_missing = missing.sum()

print('‚ùì Missing Values:')
print(missing)
print(f'\n‚úÖ Total missing values: {total_missing}')

In [None]:
# Flag rows that exactly repeat an earlier row, then count the flags
repeated_rows = df.duplicated()
duplicates = repeated_rows.sum()
print(f'üîÑ Duplicate rows: {duplicates}')

## 4. Data Visualization

In [None]:
# 4.1 Distribution of Performance Index (Target Variable)
# Histogram with a KDE overlay of the target column, shown inline and saved
# as a PNG.
import os

# savefig() does not create missing folders; make sure the output directory
# exists so a fresh checkout does not fail on the first saved figure.
os.makedirs('../images', exist_ok=True)

plt.figure(figsize=(10, 6))
sns.histplot(df['Performance Index'], kde=True, color='#3498db', bins=30)
plt.title('Distribution of Student Performance Index', fontsize=14, fontweight='bold')
plt.xlabel('Performance Index')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig('../images/performance_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print('‚úÖ Saved: images/performance_distribution.png')

In [None]:
# 4.2 Correlation Heatmap
# Encode the Yes/No column on a copy so the original frame is left untouched.
df_encoded = df.copy()
if 'Extracurricular Activities' in df_encoded.columns:
    df_encoded['Extracurricular Activities'] = df_encoded['Extracurricular Activities'].map({'Yes': 1, 'No': 0})

# Correlate numeric columns only: with pandas >= 2.0, corr() raises on any
# remaining text column instead of silently skipping it.
correlation = df_encoded.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, 
            square=True, linewidths=0.5, fmt='.2f')
plt.title('Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../images/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()
print('‚úÖ Saved: images/correlation_heatmap.png')

In [None]:
# 4.3 Hours Studied vs Performance Index
# Scatter of study hours against the target, shown inline and saved as a PNG.
plt.figure(figsize=(10, 6))
hours_ax = sns.scatterplot(
    x='Hours Studied',
    y='Performance Index',
    data=df,
    color='#e74c3c',
    alpha=0.6,
)
hours_ax.set_title('Hours Studied vs Performance Index', fontsize=14, fontweight='bold')
hours_ax.set_xlabel('Hours Studied')
hours_ax.set_ylabel('Performance Index')
plt.tight_layout()
plt.savefig('../images/hours_vs_performance.png', dpi=300, bbox_inches='tight')
plt.show()
print('‚úÖ Saved: images/hours_vs_performance.png')

In [None]:
# 4.4 Previous Scores vs Performance Index
# Scatter of earlier scores against the target, shown inline and saved as a PNG.
plt.figure(figsize=(10, 6))
scores_ax = sns.scatterplot(
    x='Previous Scores',
    y='Performance Index',
    data=df,
    color='#2ecc71',
    alpha=0.6,
)
scores_ax.set_title('Previous Scores vs Performance Index', fontsize=14, fontweight='bold')
scores_ax.set_xlabel('Previous Scores')
scores_ax.set_ylabel('Performance Index')
plt.tight_layout()
plt.savefig('../images/previous_scores_vs_performance.png', dpi=300, bbox_inches='tight')
plt.show()
print('‚úÖ Saved: images/previous_scores_vs_performance.png')

In [None]:
# 4.5 Extracurricular Activities Impact
# Box plot of the target split by the Yes/No activity column; skipped when
# the column is absent from the frame.
if 'Extracurricular Activities' in df.columns:
    box_colors = ['#3498db', '#e74c3c']
    plt.figure(figsize=(8, 6))
    activity_ax = sns.boxplot(
        x='Extracurricular Activities',
        y='Performance Index',
        data=df,
        palette=box_colors,
    )
    activity_ax.set_title('Performance by Extracurricular Activities', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('../images/extracurricular_impact.png', dpi=300, bbox_inches='tight')
    plt.show()
    print('‚úÖ Saved: images/extracurricular_impact.png')

## 5. Data Preprocessing

In [None]:
# Work on a separate copy so the raw frame stays untouched
df_model = df.copy()

# Turn the Yes/No activity column into 1/0 when the column is present
activity_col = 'Extracurricular Activities'
yes_no_codes = {'Yes': 1, 'No': 0}
if activity_col in df_model.columns:
    df_model[activity_col] = df_model[activity_col].map(yes_no_codes)
    print('‚úÖ Encoded Extracurricular Activities: Yes=1, No=0')

# Bare expression: the notebook renders the first rows of the encoded frame
df_model.head()

In [None]:
# Separate the target column (y) from the remaining predictor columns (X)
target_column = 'Performance Index'
y = df_model[target_column]
X = df_model.drop(columns=[target_column])

feature_names = X.columns.tolist()
print(f'üìä Features shape: {X.shape}')
print(f'üéØ Target shape: {y.shape}')
print(f'\nüìã Features used: {feature_names}')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f'üìö Training set size: {len(X_train)} samples')
print(f'üß™ Testing set size: {len(X_test)} samples')

## 6. Model Training

In [None]:
# Build the linear regression estimator and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting can chain)
model = LinearRegression().fit(X_train, y_train)

print('‚úÖ Linear Regression model trained successfully!')

In [None]:
# List each feature next to its fitted weight, followed by the intercept
print('üìä Model Coefficients:')
print('-' * 40)
coefficient_lines = [
    f'{name}: {weight:.4f}' for name, weight in zip(X.columns, model.coef_)
]
for line in coefficient_lines:
    print(line)
print(f'\nIntercept: {model.intercept_:.4f}')

## 7. Model Evaluation

In [None]:
# Score the fitted model on the held-out test rows
y_pred = model.predict(X_test)

# Error metrics (lower is better) and R^2 (closer to 1 is better)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_test, y_pred)

divider = '=' * 40
print('üìà Model Performance Metrics:')
print(divider)
print(f'Mean Absolute Error (MAE): {mae:.4f}')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
print(f'R¬≤ Score: {r2:.4f} ({r2*100:.2f}%)')
print(divider)

In [None]:
# 7.1 Actual vs Predicted Values Plot
# Points on the dashed diagonal are predictions that match the actual value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.6, color='#3498db')

# Diagonal reference line spanning the observed range of the target
lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest],
        'r--', lw=2, label='Perfect Prediction')

ax.set_xlabel('Actual Performance Index')
ax.set_ylabel('Predicted Performance Index')
ax.set_title('Actual vs Predicted Performance Index', fontsize=14, fontweight='bold')
ax.legend()
fig.tight_layout()
fig.savefig('../images/actual_vs_predicted.png', dpi=300, bbox_inches='tight')
plt.show()
print('‚úÖ Saved: images/actual_vs_predicted.png')

In [None]:
# 7.2 Residual Plot
# Residual = actual minus predicted; the histogram shows how the errors
# spread around the zero line.
residuals = y_test - y_pred

plt.figure(figsize=(10, 6))
residual_ax = sns.histplot(residuals, kde=True, color='#9b59b6', bins=30)
residual_ax.axvline(x=0, color='red', linestyle='--', linewidth=2)
residual_ax.set_xlabel('Residuals')
residual_ax.set_ylabel('Frequency')
residual_ax.set_title('Distribution of Residuals', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../images/residuals_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print('‚úÖ Saved: images/residuals_distribution.png')

## 8. Make Predictions

In [None]:
# Example: Predict performance for a new student
print('üîÆ Sample Prediction:')
print('-' * 40)

# Single source of truth for the sample student: both the model input and the
# printed summary below are derived from this dict, so they cannot drift apart.
# Adjust these values (and keys) based on your dataset columns.
sample_values = {
    'Hours Studied': 7,
    'Previous Scores': 75,
    'Extracurricular Activities': 1,  # 1 = Yes, 0 = No
    'Sleep Hours': 7,
    'Sample Question Papers Practiced': 5,
}

# One-row frame, with columns put in the same order as the training features
# (scikit-learn rejects input whose feature names are ordered differently).
sample_student = pd.DataFrame([sample_values])[list(X.columns)]

prediction = model.predict(sample_student)

# Show the encoded activity flag in its original Yes/No form
activity_label = 'Yes' if sample_values['Extracurricular Activities'] == 1 else 'No'

print('Student Details:')
print(f"  - Hours Studied: {sample_values['Hours Studied']}")
print(f"  - Previous Scores: {sample_values['Previous Scores']}")
print(f"  - Extracurricular Activities: {activity_label}")
print(f"  - Sleep Hours: {sample_values['Sleep Hours']}")
print(f"  - Sample Papers Practiced: {sample_values['Sample Question Papers Practiced']}")
print(f'\nüéØ Predicted Performance Index: {prediction[0]:.2f}')

## 9. Save Model (Optional)

In [None]:
# Save the trained model using pickle
import pickle
from pathlib import Path

model_path = Path('../models/linear_regression_model.pkl')

# open(..., 'wb') does not create missing folders; make sure the target
# directory exists so saving works on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: pickle files can execute code when loaded; only unpickle this file
# from a trusted location.
with model_path.open('wb') as file:
    pickle.dump(model, file)

print('‚úÖ Model saved to: models/linear_regression_model.pkl')

## 10. Conclusion

In [None]:
# Final recap of the dataset, the model, and its headline test metrics.
print('=' * 50)
print('üìä PROJECT SUMMARY')
print('=' * 50)
# Count features from X (the predictor matrix): df still contains the target
# column, so df.shape[1] would over-report the feature count by one.
print(f'\nüìÅ Dataset: {df.shape[0]} students, {X.shape[1]} features')
print('\nüéØ Target Variable: Performance Index')
print('\nüìà Model: Linear Regression')
print(f'\n‚úÖ Model Accuracy (R¬≤ Score): {r2*100:.2f}%')
print(f'\nüìâ Average Error (MAE): {mae:.2f} points')
print('\n' + '=' * 50)
# The insights below are fixed text, not computed from the data in this cell.
print('\nüîë Key Insights:')
print('  1. Hours Studied has strong positive correlation with performance')
print('  2. Previous Scores are a good predictor of future performance')
print('  3. Extracurricular activities show positive impact')
print('\n' + '=' * 50)