In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import os
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below (e.g. pandas, scikit-learn) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# For reproducibility: seed NumPy's global random number generator
np.random.seed(42)

def analyze_and_preprocess(df, target_column='math score'):
    """
    Print an overview of the dataset and fill in its missing values.

    The overview (shape, dtypes, missing-value counts, ``describe()``) is
    printed to stdout. Missing values are then filled column by column:
    numeric columns with their median, all other columns with their most
    frequent value. Columns that contain no observed value at all are left
    untouched because there is nothing to derive a fill value from.

    Args:
        df: DataFrame to inspect. It is modified in place (columns with
            missing values are reassigned) and also returned.
        target_column: Name of the column to predict; it is excluded from
            the returned numeric feature list.

    Returns:
        A tuple ``(df, numeric_features, categorical_features)`` where the
        two lists hold the names of the int64/float64 columns (without the
        target) and of the object/category columns respectively.
    """
    print("\n==== DATASET OVERVIEW ====")
    print(f"Shape: {df.shape}")
    print("\nColumn data types:")
    print(df.dtypes)

    print("\nMissing values:")
    missing_values = df.isnull().sum()
    print(missing_values[missing_values > 0] if missing_values.sum() > 0 else "No missing values")

    # Basic statistics for numerical columns
    print("\nBasic statistics for numerical features:")
    print(df.describe())

    # Identify numerical and categorical columns
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if target_column in numeric_features:
        numeric_features.remove(target_column)

    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"\nNumeric features: {len(numeric_features)}")
    print(f"Categorical features: {len(categorical_features)}")

    # Handle any missing values. The per-column counts computed above are
    # reused so only the affected columns are visited.
    for col in missing_values.index[missing_values > 0]:
        column = df[col]

        # Decide by dtype rather than by membership in numeric_features, so a
        # numeric target column is also filled with its median.
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.median()
            if pd.isna(fill_value):
                continue  # column is entirely missing
        else:
            modes = column.mode()
            if modes.empty:
                continue  # column is entirely missing
            fill_value = modes.iloc[0]

        # Assign the result back instead of calling fillna(inplace=True) on
        # the df[col] selection: the chained in-place form is deprecated and
        # does not update df when pandas Copy-on-Write is active.
        df[col] = column.fillna(fill_value)

    return df, numeric_features, categorical_features

def create_visualizations(df, numeric_features, categorical_features, target_column):
    """
    Create exploratory plots and save them as PNG files in 'visualizations'.

    The following files are written (the folder is created if needed):
      * correlation_heatmap.png      - correlations of numeric features + target
      * <target>_distribution.png    - histogram with KDE of the target
      * feature_relationships.png    - scatter plot of each numeric feature
                                       against the target (only if any exist)
      * categorical_boxplots.png     - box plot of the target per categorical
                                       feature (only if any exist)

    Args:
        df: DataFrame holding the features and the target column.
        numeric_features: Names of the numeric feature columns.
        categorical_features: Names of the categorical feature columns.
        target_column: Name of the column being predicted.

    Returns:
        None. The function only prints progress and writes image files.
    """
    print("\n==== CREATING VISUALIZATIONS ====")
    os.makedirs('visualizations', exist_ok=True)

    # Correlation heatmap for numerical features
    plt.figure(figsize=(10, 8))
    correlation_features = list(numeric_features) + [target_column]

    correlation_matrix = df[correlation_features].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap of Numeric Features')
    plt.tight_layout()
    plt.savefig('visualizations/correlation_heatmap.png')
    plt.close()

    # Distribution of the target variable
    plt.figure(figsize=(10, 6))
    sns.histplot(df[target_column], kde=True)
    plt.title(f'Distribution of {target_column}')
    plt.xlabel(target_column)
    # Path separators in the column name would otherwise be read as
    # directories; names without separators produce the same file as before.
    safe_target = str(target_column).replace('/', '_').replace('\\', '_')
    plt.savefig(f'visualizations/{safe_target}_distribution.png')
    plt.close()

    # Visualizing relationships between key numeric features and the target
    if len(numeric_features) > 0:
        # Two plots per row. Keep at least the original 2 rows, and add rows
        # (5 inches each) when there are more than 4 features so that
        # plt.subplot never receives an out-of-range index.
        n_rows = max(2, (len(numeric_features) + 1) // 2)
        plt.figure(figsize=(15, 5 * n_rows))
        for i, feature in enumerate(numeric_features, 1):
            plt.subplot(n_rows, 2, i)
            sns.scatterplot(x=feature, y=target_column, data=df)
            plt.title(f'{feature} vs {target_column}')
        plt.tight_layout()
        plt.savefig('visualizations/feature_relationships.png')
        plt.close()

    # Boxplots for categorical features
    if len(categorical_features) > 0:
        # Same idea as above: at least the original 3 rows, growing as needed
        # once there are more than 6 categorical features.
        n_rows = max(3, (len(categorical_features) + 1) // 2)
        plt.figure(figsize=(15, 5 * n_rows))
        for i, feature in enumerate(categorical_features, 1):
            plt.subplot(n_rows, 2, i)
            sns.boxplot(x=feature, y=target_column, data=df)
            plt.title(f'{target_column} by {feature}')
            plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig('visualizations/categorical_boxplots.png')
        plt.close()

    print("Visualizations created and saved in 'visualizations' folder")

def _build_preprocessor(numeric_features, categorical_features):
    """
    Return a new, unfitted ColumnTransformer for the given feature lists.

    Numeric columns are median-imputed and standardised; categorical columns
    are imputed with the most frequent value and one-hot encoded (categories
    unseen during fitting are ignored at prediction time).
    """
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    return ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])


def build_and_train_models(X, y, numeric_features, categorical_features):
    """
    Build, train and evaluate models

    The data is split 70/30 into train and test sets, four regression
    pipelines are fitted on the training part, and MSE/MAE (train and test)
    plus the test R² are printed and collected. A bar chart of the test MSE
    is saved to 'visualizations/model_comparison.png'.

    Args:
        X: Feature DataFrame containing the columns named in the two lists.
        y: Target values aligned with ``X``.
        numeric_features: Names of the numeric feature columns.
        categorical_features: Names of the categorical feature columns.

    Returns:
        A tuple ``(models, results, best_model_name)``: the fitted pipelines
        keyed by model name, a dict of metric dicts keyed by model name
        ('train_mse', 'test_mse', 'train_mae', 'test_mae', 'r2'), and the name
        of the model with the lowest test MSE.
    """
    print("\n==== BUILDING AND TRAINING MODELS ====")

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    regressors = {
        'SGD Linear Regression': SGDRegressor(max_iter=1000, tol=1e-3, random_state=42),
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }

    # Every pipeline gets its own preprocessor instance. Pipeline does not
    # clone its steps, so a single shared ColumnTransformer would be refitted
    # by each model and later refits of one pipeline would affect the others.
    models = {
        name: Pipeline([
            ('preprocessor', _build_preprocessor(numeric_features, categorical_features)),
            ('regressor', regressor)
        ])
        for name, regressor in regressors.items()
    }

    # Dictionary to store results
    results = {}

    # Train and evaluate each model
    for name, model in models.items():
        print(f"\nTraining {name}...")

        # Fit the model
        model.fit(X_train, y_train)

        # Predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Calculate metrics
        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        train_mae = mean_absolute_error(y_train, y_train_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        r2 = r2_score(y_test, y_test_pred)

        # Store results
        results[name] = {
            'train_mse': train_mse,
            'test_mse': test_mse,
            'train_mae': train_mae,
            'test_mae': test_mae,
            'r2': r2
        }

        print(f"{name} - Train MSE: {train_mse:.2f}, Test MSE: {test_mse:.2f}")
        print(f"{name} - Train MAE: {train_mae:.2f}, Test MAE: {test_mae:.2f}")
        print(f"{name} - R² Score: {r2:.2f}")

    # Plot model comparison
    plt.figure(figsize=(12, 6))
    models_list = list(results.keys())
    test_mse_list = [results[model_name]['test_mse'] for model_name in models_list]

    plt.bar(models_list, test_mse_list)
    plt.xlabel('Models')
    plt.ylabel('Test MSE')
    plt.title('Model Comparison (Lower is Better)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    # Make sure the output folder exists so this function also works when it
    # is called without the visualization step having run first.
    os.makedirs('visualizations', exist_ok=True)
    plt.savefig('visualizations/model_comparison.png')
    plt.close()

    # Find the best model (lowest test MSE)
    best_model_name = min(results, key=lambda k: results[k]['test_mse'])
    print(f"\nBest model based on test MSE: {best_model_name}")

    return models, results, best_model_name

def save_best_model(best_model, numeric_features, categorical_features, target_column):
    """
    Write the chosen model and its feature metadata to the 'models' folder.

    Two joblib files are produced: the model itself, and a dict holding the
    numeric feature names, the categorical feature names and the target
    column name.

    Args:
        best_model: The model object to serialise.
        numeric_features: Names of the numeric feature columns.
        categorical_features: Names of the categorical feature columns.
        target_column: Name of the predicted column.

    Returns:
        None.
    """
    print("\n==== SAVING BEST MODEL ====")

    # Output locations, both inside the models directory
    model_file = './models/best_student_performance_model.pkl'
    feature_file = './models/model_features.pkl'

    # Metadata stored next to the model
    feature_info = dict(
        numeric_features=numeric_features,
        categorical_features=categorical_features,
        target_column=target_column,
    )

    # The directory must exist before anything is dumped into it
    os.makedirs('models', exist_ok=True)

    joblib.dump(best_model, model_file)
    joblib.dump(feature_info, feature_file)

    print(f"Best model saved as '{model_file}'")

# Script entry point: load the data, explore it, train the models and
# persist the best one.
if __name__ == "__main__":
    print("==== STUDENT PERFORMANCE PREDICTION MODEL ====")

    # Read the raw CSV from the current working directory
    print("Loading data...")
    df = pd.read_csv('StudentsPerformance.csv')

    # Column the regressors are trained to predict
    target_column = 'math score'

    # Print the dataset overview, fill gaps and split the columns by kind
    df, numeric_features, categorical_features = analyze_and_preprocess(
        df, target_column=target_column
    )

    # Save the exploratory plots
    create_visualizations(
        df=df,
        numeric_features=numeric_features,
        categorical_features=categorical_features,
        target_column=target_column,
    )

    # Separate the target from the feature columns
    y = df[target_column]
    X = df.drop(columns=[target_column])

    # Fit every candidate model and pick the one with the lowest test MSE
    models, results, best_model_name = build_and_train_models(
        X=X,
        y=y,
        numeric_features=numeric_features,
        categorical_features=categorical_features,
    )

    # Persist the winning pipeline together with its feature metadata
    save_best_model(
        best_model=models[best_model_name],
        numeric_features=numeric_features,
        categorical_features=categorical_features,
        target_column=target_column,
    )

    print("\n==== MODEL TRAINING COMPLETE ====")

==== STUDENT PERFORMANCE PREDICTION MODEL ====
Loading data...

==== DATASET OVERVIEW ====
Shape: (1000, 8)

Column data types:
gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

Missing values:
No missing values

Basic statistics for numerical features:
       math score  reading score  writing score
count  1000.00000    1000.000000    1000.000000
mean     66.08900      69.169000      68.054000
std      15.16308      14.600192      15.195657
min       0.00000      17.000000      10.000000
25%      57.00000      59.000000      57.750000
50%      66.00000      70.000000      69.000000
75%      77.00000      79.000000      79.000000
max     100.00000     100.000000     100.000000

Numeric features: 2
Categorical features: 5
