In [None]:
# Data preprocessing - Feature Selection and Target Variable
# Selected Features: the survey columns used as model inputs
selected_features = [
    'study_year',
    'assignment_delay_frequency',
    'procrastination_reasons',
    'last_minute_exam_preparation',
    'study_hours_per_week',
    'use_of_time_management',
    'procrastination_management_training',
    'procrastination_recovery_strategies',
    'hours_spent_on_mobile_non_academic',
    'study_session_distractions'
]

# Target variable
target_variable = 'procrastination_and_grade_outcome'

# Fail fast when the target column is missing. Merely printing a message and
# continuing would either crash later on df['target'] with an unrelated
# KeyError or, worse, silently reuse a stale 'target' column left over from
# an earlier run of the notebook.
if target_variable not in df.columns:
    raise KeyError(
        f"Error: The target variable '{target_variable}' is not in the DataFrame."
    )

# Validate the feature columns up front so df is not mutated before failing.
missing_features = [col for col in selected_features if col not in df.columns]
if missing_features:
    raise KeyError(
        f"Error: Selected features missing from the DataFrame: {missing_features}"
    )

print(f"Target variable: {target_variable}")
# Display distribution of the target variable
# NOTE(review): value_counts() omits missing values by default, and a NaN in
# the target would make the {'Yes', 'No'} check below fail and route the
# column to the LabelEncoder branch -- confirm the target has no NaNs.
print("\nDistribution of the target variable:")
print(df[target_variable].value_counts())

# Encode the target into the numeric column df['target'] (if needed)
if df[target_variable].dtype == 'object':
    print("\nUnique values of the target variable:", df[target_variable].unique())

    # If the target variable is exactly 'Yes' / 'No', convert to 1 and 0
    if set(df[target_variable].unique()) == {'Yes', 'No'}:
        df['target'] = (df[target_variable] == 'Yes').astype(int)
        print("Target variable converted to binary format (Yes=1, No=0)")
    else:
        # For any other set of labels use a LabelEncoder
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        df['target'] = le.fit_transform(df[target_variable])
        print("Target variable encoded with LabelEncoder:")
        for i, label in enumerate(le.classes_):
            print(f"  {label} -> {i}")
else:
    # Non-object dtype: use the column directly
    df['target'] = df[target_variable]

# Select features and encoded target variable
X = df[selected_features]
y = df['target']

print(f"\nSelected Features for the model ({len(selected_features)}):")
for feature in selected_features:
    print(f"- {feature}")

# Columns with object dtype are treated as the categorical inputs
categorical_cols = list(X.select_dtypes(include=['object']).columns)
print("\nCategorical columns:", categorical_cols)

# Hold out 20% of the rows for testing; the split is seeded for
# reproducibility and stratified on y.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining data: {X_train.shape[0]} samples")
print(f"Test data: {X_test.shape[0]} samples")

# Data preprocessing - Pipeline for categorical variables
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Numerical inputs are the int64 / float64 columns of X.
# NOTE(review): columns of any other dtype (e.g. bool, category, int32) match
# neither this list nor categorical_cols and would not be passed to either
# transformer below -- confirm that no selected feature has such a dtype.
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Categorical branch: impute with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch (if any such columns exist): median imputation followed
# by standard scaling.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols),
])

# Model Training and Evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Create and train the Logistic Regression model with Grid Search
print("Training the Logistic Regression model with Grid Search...")
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Hyper-parameters explored: regularisation strength and solver (L2 only)
lr_param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['liblinear', 'lbfgs']
}

# 5-fold cross-validated search, selecting by accuracy, using all CPU cores
lr_grid_search = GridSearchCV(
    lr_pipeline,
    lr_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

lr_grid_search.fit(X_train, y_train)

# Print the best parameters
print("\nBest parameters for Logistic Regression:")
print(lr_grid_search.best_params_)

# Make predictions and evaluate the model
y_pred_lr = lr_grid_search.predict(X_test)
print("\nLogistic Regression - Classification Report:")
print(classification_report(y_test, y_pred_lr))

# Create and plot confusion matrix.
# Tick labels are derived from the fitted classes instead of a hard-coded
# ['No', 'Yes'], which is wrong whenever the target is not a Yes/No column
# (LabelEncoder or numeric branch) and mismatched in size for multiclass
# targets. Each encoded class is mapped back to its original label through
# the df['target'] / df[target_variable] pairing.
lr_class_ids = lr_grid_search.classes_
lr_label_lookup = dict(zip(df['target'], df[target_variable]))
lr_class_labels = [
    str(lr_label_lookup.get(class_id, class_id)) for class_id in lr_class_ids
]
# Passing labels= pins the matrix rows/columns to the same order as the ticks,
# even if some class is absent from y_test.
cm = confusion_matrix(y_test, y_pred_lr, labels=lr_class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=lr_class_labels, yticklabels=lr_class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

# Calculate and plot ROC curve
if len(np.unique(y_test)) == 2:  # Only for binary classification
    # Column 1 of predict_proba corresponds to classes_[1]; name it as the
    # positive class explicitly so labels other than {0, 1} also work.
    y_pred_proba = lr_grid_search.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba, pos_label=lr_class_ids[1])
    auc = roc_auc_score(y_test, y_pred_proba)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - Logistic Regression')
    plt.legend(loc='lower right')
    plt.show()

# Train and evaluate Random Forest model
print("\nTraining Random Forest model...")
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyper-parameters explored: forest size, tree depth and split threshold
rf_param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

# 5-fold cross-validated search, selecting by accuracy, using all CPU cores
rf_grid_search = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

rf_grid_search.fit(X_train, y_train)

# Print the best parameters
print("\nBest parameters for Random Forest:")
print(rf_grid_search.best_params_)

# Make predictions and evaluate the model
y_pred_rf = rf_grid_search.predict(X_test)
print("\nRandom Forest - Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Create and plot confusion matrix.
# Tick labels are derived from the fitted classes instead of a hard-coded
# ['No', 'Yes'], which is wrong whenever the target is not a Yes/No column
# (LabelEncoder or numeric branch) and mismatched in size for multiclass
# targets. Each encoded class is mapped back to its original label through
# the df['target'] / df[target_variable] pairing.
rf_class_ids = rf_grid_search.classes_
rf_label_lookup = dict(zip(df['target'], df[target_variable]))
rf_class_labels = [
    str(rf_label_lookup.get(class_id, class_id)) for class_id in rf_class_ids
]
# Passing labels= pins the matrix rows/columns to the same order as the ticks,
# even if some class is absent from y_test.
cm = confusion_matrix(y_test, y_pred_rf, labels=rf_class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=rf_class_labels, yticklabels=rf_class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Random Forest')
plt.show()

# Calculate and plot ROC curve
if len(np.unique(y_test)) == 2:  # Only for binary classification
    # Column 1 of predict_proba corresponds to classes_[1]; name it as the
    # positive class explicitly so labels other than {0, 1} also work.
    y_pred_proba_rf = rf_grid_search.predict_proba(X_test)[:, 1]
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf, pos_label=rf_class_ids[1])
    auc_rf = roc_auc_score(y_test, y_pred_proba_rf)

    # Overlay both models; fpr, tpr and auc are the Logistic Regression
    # results computed earlier under the same binary-target condition.
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {auc:.2f})')
    plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {auc_rf:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve Comparison')
    plt.legend(loc='lower right')
    plt.show()

# Feature importance analysis for Random Forest
if hasattr(rf_grid_search.best_estimator_.named_steps['classifier'], 'feature_importances_'):
    # Rebuild the post-preprocessing feature names, in output column order
    feature_names = []

    # Get the fitted column transformer
    ct = rf_grid_search.best_estimator_.named_steps['preprocessor']

    for name, trans, cols in ct.transformers_:
        # A dropped remainder contributes no output columns, and a transformer
        # given no columns is never fitted, so neither yields feature names.
        if isinstance(trans, str) and trans == 'drop':
            continue
        if len(cols) == 0:
            continue
        if name == 'cat':
            # The encoder was fitted on all categorical columns together, so
            # it must be queried once with the full column list; it already
            # returns names of the form "<column>_<category>". Querying it
            # per column fails on the length mismatch (or duplicates the
            # column prefix when there is only one categorical column).
            onehot = trans.named_steps['onehot']
            feature_names.extend(onehot.get_feature_names_out(cols))
        else:
            # Numerical features keep their column names (one output each)
            feature_names.extend(cols)

    # Get feature importances
    importances = rf_grid_search.best_estimator_.named_steps['classifier'].feature_importances_

    # Create a DataFrame for visualization
    if len(feature_names) == len(importances):
        feature_importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values('Importance', ascending=False)

        # Plot feature importances
        plt.figure(figsize=(12, 8))
        sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
        plt.title('Top 20 Feature Importances (Random Forest)')
        plt.tight_layout()
        plt.show()

        print("\nTop 10 Most Important Features:")
        print(feature_importance_df.head(10))
    else:
        # Make the mismatch visible instead of silently skipping the analysis
        print(
            f"\nWarning: recovered {len(feature_names)} feature names but the model "
            f"reports {len(importances)} importances; skipping the importance plot."
        )
