In [2]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

# --- 0. Load Data and Initial Preprocessing (as per your existing notebook) ---
# The CSV stores the weekly series and graded items as stringified Python
# lists/dicts; parse each of those columns back into real objects.
df = pd.read_csv("/content/CS101_Student_Behavior.csv")
df = df.assign(**{
    column_name: df[column_name].apply(ast.literal_eval)
    for column_name in (
        'Weekly_Logins',
        'Weekly_Attendance',
        'Assignments',
        'Quizzes',
        'Programming_Projects',
    )
})

# Letter grade -> numeric value used when averaging grades.
grade_map = {
    'A': 4,
    'B': 3,
    'C': 2,
    'D': 1,
    'E': 0.5,
    'F': 0,
}

# --- 1. Data Preprocessing for Machine Learning ---

# Feature Engineering Helper Functions
def _reduce_or_zero(values, reducer=np.mean):
    """Apply *reducer* to *values*, returning 0 when *values* is empty.

    NumPy reducers yield ``nan`` (plus a RuntimeWarning) on empty input; 0 is
    the fallback this module already uses for missing assignments/projects.
    """
    return reducer(values) if len(values) > 0 else 0


def calculate_weekly_metrics(df_row, early_weeks=5, late_start=10):
    """Summarise one student's semester into a flat feature vector.

    Args:
        df_row: Mapping/row with the keys ``Weekly_Attendance`` and
            ``Weekly_Logins`` (sequences of numbers), ``Assignments``
            (sequence of dicts with a ``Submission_Status`` key), and
            ``Quizzes`` / ``Programming_Projects`` (sequences of letter
            grades that are keys of the module-level ``grade_map``).
        early_weeks: Number of leading entries that form the "early" window.
        late_start: Index at which the "late" window starts (runs to the end).

    Returns:
        pd.Series with overall averages, early-window averages, late-minus-early
        changes, standard deviations and cumulative counts. Any statistic
        taken over an empty sequence or window is reported as 0.

    Raises:
        KeyError: if a quiz or project grade is not present in ``grade_map``.
    """
    attendance = np.array(df_row['Weekly_Attendance'])
    logins = np.array(df_row['Weekly_Logins'])
    assignments = df_row['Assignments']

    # 1 when the assignment was handed in on time, else 0.
    on_time_flags = [1 if a['Submission_Status'] == 'On Time' else 0 for a in assignments]
    quiz_points = [grade_map[q] for q in df_row['Quizzes']]
    project_points = [grade_map[p] for p in df_row['Programming_Projects']]

    # Whole-semester averages.
    avg_attendance = _reduce_or_zero(attendance)
    avg_logins = _reduce_or_zero(logins)
    pct_on_time_assignments = _reduce_or_zero(on_time_flags)
    avg_quiz_grade = _reduce_or_zero(quiz_points)
    avg_project_grade = _reduce_or_zero(project_points)

    # Early window: the first `early_weeks` entries of each series.
    avg_attendance_early = _reduce_or_zero(attendance[:early_weeks])
    avg_logins_early = _reduce_or_zero(logins[:early_weeks])
    # Divide by the number of entries actually in the window rather than a
    # fixed constant, so shorter or longer semesters are not mis-scaled.
    pct_on_time_assignments_early = _reduce_or_zero(on_time_flags[:early_weeks])
    avg_quiz_grade_early = _reduce_or_zero(quiz_points[:early_weeks])

    # Late window: everything from index `late_start` onwards
    # (6 entries for a 16-week series with the default late_start=10).
    avg_attendance_late = _reduce_or_zero(attendance[late_start:])
    avg_logins_late = _reduce_or_zero(logins[late_start:])
    pct_on_time_assignments_late = _reduce_or_zero(on_time_flags[late_start:])
    avg_quiz_grade_late = _reduce_or_zero(quiz_points[late_start:])

    # Trend = late average minus early average (negative means decline).
    attendance_change = avg_attendance_late - avg_attendance_early
    logins_change = avg_logins_late - avg_logins_early
    assignments_change = pct_on_time_assignments_late - pct_on_time_assignments_early
    quiz_change = avg_quiz_grade_late - avg_quiz_grade_early

    # Population standard deviation over the full period as a consistency measure.
    std_attendance = _reduce_or_zero(attendance, np.std)
    std_logins = _reduce_or_zero(logins, np.std)
    std_quiz_grade = _reduce_or_zero(quiz_points, np.std)

    # Cumulative counts.
    total_logins = np.sum(logins)
    total_assignments_submitted = sum(
        1 for a in assignments
        if a['Submission_Status'] in ['On Time', 'Submitted', 'Late']
    )
    total_assignments_never_submitted = sum(
        1 for a in assignments
        if a['Submission_Status'] == 'Never Submitted'
    )

    return pd.Series({
        'Avg_Attendance': avg_attendance,
        'Avg_Logins': avg_logins,
        'Pct_OnTime_Assignments': pct_on_time_assignments,
        'Avg_Quiz_Grade': avg_quiz_grade,
        'Avg_Project_Grade': avg_project_grade,
        'Avg_Attendance_Early': avg_attendance_early,
        'Avg_Logins_Early': avg_logins_early,
        'Pct_OnTime_Assignments_Early': pct_on_time_assignments_early,
        'Avg_Quiz_Grade_Early': avg_quiz_grade_early,
        'Attendance_Change': attendance_change,
        'Logins_Change': logins_change,
        'Assignments_Change': assignments_change,
        'Quiz_Change': quiz_change,
        'Std_Attendance': std_attendance,
        'Std_Logins': std_logins,
        'Std_Quiz_Grade': std_quiz_grade,
        'Total_Logins': total_logins,
        'Total_Assignments_Submitted': total_assignments_submitted,
        'Total_Assignments_Never_Submitted': total_assignments_never_submitted
    })

# Apply feature engineering: one row of engineered metrics per student,
# joined column-wise with the identifier, demographic and label columns.
engineered_features_df = df.apply(calculate_weekly_metrics, axis=1)
df_ml = pd.concat([df[['Student_ID', 'Gender', 'Ethnicity', 'Scholarship', 'Label']], engineered_features_df], axis=1)

print("--- Engineered Features Sample ---")
print(df_ml.head())
print("\n")

# Categorical Feature Encoding
categorical_features = ['Gender', 'Ethnicity', 'Scholarship']
# Everything that is not an identifier, the label, or categorical is numeric.
# NOTE: this list is built before 'Label_Encoded' is added to df_ml below,
# so the encoded target is not part of it.
numerical_features = [col for col in df_ml.columns if col not in ['Student_ID', 'Label'] + categorical_features]

# Create a column transformer for one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep numerical features as they are
)

# Define Target Variable
# The model predicts 'Label' directly, so encode it as integers.
label_encoder = LabelEncoder()
df_ml['Label_Encoded'] = label_encoder.fit_transform(df_ml['Label'])
# Encoded integer -> original label string, for mapping predictions back.
label_mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))
print(f"Label Encoding Map: {label_mapping}\n")

# Feature matrix: drop the identifier and both forms of the target.
X = df_ml.drop(['Student_ID', 'Label', 'Label_Encoded'], axis=1)
y = df_ml['Label_Encoded']

# --- 2. Machine Learning Model Implementation ---

# Train-Test Split (using a random split for simplicity, but time-based is better for real EWS)
# Stratified on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the preprocessor on the training split only, then reuse it for the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Get feature names after one-hot encoding for interpretability
ohe_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
# Assumes the transformer emits the one-hot columns first and the passthrough
# (numerical) columns afterwards in their original order; the names below are
# only valid while that ordering holds.
all_feature_names = list(ohe_feature_names) + numerical_features

print(f"Shape of processed training data: {X_train_processed.shape}")
print(f"Shape of processed testing data: {X_test_processed.shape}")

# Model Instantiation and Training
# Four baseline classifiers, all with a fixed seed for reproducibility.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42)
}

# Model name -> dict of metrics plus the fitted estimator.
results = {}

for name, model in models.items():
    print(f"--- Training {name} ---")
    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_test_processed)
    y_proba = model.predict_proba(X_test_processed)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
    cm = confusion_matrix(y_test, y_pred)

    # Macro-averaged one-vs-rest ROC AUC over all classes.
    # If scikit-learn rejects the inputs, fall back to a placeholder string,
    # so "roc_auc" is not guaranteed to be a float.
    try:
        roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='macro')
    except ValueError:
        roc_auc = "N/A (requires binary or specific multi-class setup)"

    results[name] = {
        "accuracy": accuracy,
        "classification_report": report,
        "confusion_matrix": cm,
        "roc_auc": roc_auc,
        "model": model # Store the trained model
    }
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", cm)
    print(f"ROC AUC (macro-avg OvR): {roc_auc}\n")

# Hyperparameter Tuning (Example for Random Forest)
# Exhaustive grid search with 3-fold CV on the training split, ranked by macro F1.
print("--- Hyperparameter Tuning for Random Forest ---")
rf_model = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train_processed, y_train)

print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf_tuned = best_rf_model.predict(X_test_processed)
# Build the report once and reuse it for both the printout and the results entry.
report_rf_tuned = classification_report(y_test, y_pred_rf_tuned, target_names=label_encoder.classes_)
print("Tuned Random Forest Classification Report:\n", report_rf_tuned)

# Same ROC AUC computation (and fallback) as the baseline models, so every
# entry in `results` exposes the same keys.
try:
    roc_auc_rf_tuned = roc_auc_score(
        y_test,
        best_rf_model.predict_proba(X_test_processed),
        multi_class='ovr',
        average='macro',
    )
except ValueError:
    roc_auc_rf_tuned = "N/A (requires binary or specific multi-class setup)"

results["Random Forest (Tuned)"] = {
    "accuracy": accuracy_score(y_test, y_pred_rf_tuned),
    "classification_report": report_rf_tuned,
    "confusion_matrix": confusion_matrix(y_test, y_pred_rf_tuned),
    "roc_auc": roc_auc_rf_tuned,
    "model": best_rf_model
}

# --- 3. Predictive Model Integration into the System ---

# Choose the best performing model for integration (e.g., Tuned Random Forest or LightGBM)
# For demonstration, let's pick the tuned Random Forest
best_model_for_deployment = results["Random Forest (Tuned)"]["model"]
print(f"\n--- Deploying {best_model_for_deployment.__class__.__name__} for predictions ---")

# Example: Predict for the entire original dataset (simulating new data coming in).
# NOTE: this includes the rows the model was trained on, so these predictions
# are a demonstration only and not an estimate of out-of-sample performance.
# Select the feature columns explicitly (same columns and order as the training
# matrix) so this section can be re-run after prediction columns were added.
X_full = df_ml[categorical_features + numerical_features]
X_full_processed = preprocessor.transform(X_full) # Use the *fitted* preprocessor

df_ml['Predicted_Label_Encoded'] = best_model_for_deployment.predict(X_full_processed)
df_ml['Predicted_Label'] = df_ml['Predicted_Label_Encoded'].map(label_mapping)

# Get prediction probabilities for risk scoring.
# Score once and reuse the matrix for all three columns. predict_proba columns
# follow the model's own `classes_` order, so look each encoded label up there
# instead of assuming encoded label == column position.
class_probabilities = best_model_for_deployment.predict_proba(X_full_processed)
proba_column = {
    encoded_label: position
    for position, encoded_label in enumerate(best_model_for_deployment.classes_)
}
df_ml['Prob_Crisis'] = class_probabilities[:, proba_column[label_encoder.transform(['Crisis'])[0]]]
df_ml['Prob_Drift'] = class_probabilities[:, proba_column[label_encoder.transform(['Drift'])[0]]]
df_ml['Prob_Normal'] = class_probabilities[:, proba_column[label_encoder.transform(['Normal'])[0]]]


# Dynamic Intervention Triggering (Modify the original get_recommendation function)
def get_ml_recommendation(student_row):
    """Return ``(recommendation, instructor_note)`` for one scored student.

    ``student_row`` must provide 'Predicted_Label', 'Prob_Crisis' and
    'Prob_Drift'; 'Prob_Normal' is read only for a "Normal" prediction.
    A "Drift" prediction with probability above 0.7 gets the escalated
    recommendation. Any other label yields a pair of empty strings.
    """
    label = student_row['Predicted_Label']
    p_crisis = student_row['Prob_Crisis']
    p_drift = student_row['Prob_Drift']

    if label == "Crisis":
        action = "Immediate intervention: schedule meeting, connect with advisor or counselor."
        note = f"Student is predicted to be in Crisis (Prob: {p_crisis:.2f}). Disengaged early. May need academic and emotional support."
        return (action, note)

    if label == "Drift":
        # Above this probability the drift intervention is escalated.
        if p_drift > 0.7:
            action = "Urgent mid-semester check-in; offer time management tips and study group options, consider academic coaching."
            note = f"Student is predicted to be in Drift (Prob: {p_drift:.2f}). Performance starts strong but shows significant decline. Prevent further drop."
        else:
            action = "Mid-semester check-in; offer time management tips and study group options."
            note = f"Student is predicted to be in Drift (Prob: {p_drift:.2f}). Performance starts strong but declines. Prevent further drop."
        return (action, note)

    if label == "Normal":
        p_normal = student_row['Prob_Normal']
        action = "Congratulate on consistent performance."
        note = f"Student is predicted to be Normal (Prob: {p_normal:.2f}). Stable performance. Encourage continued success or leadership roles."
        return (action, note)

    # Unrecognised label: no recommendation.
    return ("", "")

# Apply the new ML-driven recommendations.
# get_ml_recommendation returns a 2-tuple per row; zip(*...) transposes those
# tuples into one sequence of recommendations and one of instructor notes.
df_ml["ML_Recommendation"], df_ml["ML_Instructor_Note"] = zip(*df_ml.apply(get_ml_recommendation, axis=1))

# Student-Facing Risk Reports Enhancement (Feature Importance)
print("\n--- Feature Importance from Best Model (Random Forest) ---")
# The importances are labelled positionally with all_feature_names, so that
# list must be in the same order as the columns the model was trained on
# (one-hot columns first, passthrough numerical columns after).
feature_importances = pd.Series(best_model_for_deployment.feature_importances_, index=all_feature_names)
print(feature_importances.sort_values(ascending=False).head(10))

# To make this actionable for student reports, you'd need to map these features
# back to understandable language.
# Example: If 'Logins_Change' is very negative, suggest "Your recent decrease in platform logins..."

# Select relevant columns for the new intervention table
intervention_df_ml = df_ml[["Student_ID", "Gender", "Ethnicity", "Scholarship", "Label",
                            "Predicted_Label", "Prob_Crisis", "Prob_Drift", "Prob_Normal",
                            "ML_Recommendation", "ML_Instructor_Note"]]

# Save to CSV in the current working directory, without the index column.
intervention_df_ml.to_csv("CS101_Student_Intervention_Table_ML.csv", index=False)
print("\n ML-driven Intervention table saved as 'CS101_Student_Intervention_Table_ML.csv'")

print("\n--- Sample of ML-driven Intervention Table ---")
print(intervention_df_ml.head())

# --- 4. Iterative Refinement (Conceptual) ---
# This part is more about system design and less about direct code.
# To implement a feedback loop:
# 1. Store intervention outcomes: After an intervention, record if the student's behavior improved.
# 2. Data collection: Continuously collect new student data.
# 3. Retraining pipeline: Periodically retrain the ML models with the updated, larger dataset
#    including intervention outcomes (if you want to model intervention effectiveness).
# 4. Model monitoring: Set up dashboards to track model performance (accuracy, F1-score) over time
#    and trigger alerts if performance degrades.



--- Engineered Features Sample ---
  Student_ID  Gender Ethnicity Scholarship   Label  Avg_Attendance  \
0       S001    Male     Other         Yes   Drift           1.375   
1       S002    Male     White          No  Normal           2.000   
2       S003    Male  Hispanic         Yes  Normal           2.000   
3       S004  Female  Hispanic         Yes  Normal           2.000   
4       S005  Female  Hispanic          No  Normal           2.000   

   Avg_Logins  Pct_OnTime_Assignments  Avg_Quiz_Grade  Avg_Project_Grade  ...  \
0      3.9375                   0.375          3.3750               3.50  ...   
1      8.0625                   1.000          3.3750               3.75  ...   
2      8.0625                   1.000          3.5625               3.75  ...   
3      7.4375                   1.000          3.5000               3.50  ...   
4      8.0625                   1.000          3.3125               3.50  ...   

   Attendance_Change  Logins_Change  Assignments_Change  



Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Tuned Random Forest Classification Report:
               precision    recall  f1-score   support

      Crisis       1.00      1.00      1.00         6
       Drift       1.00      1.00      1.00        10
      Normal       1.00      1.00      1.00        24

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


--- Deploying RandomForestClassifier for predictions ---

--- Feature Importance from Best Model (Random Forest) ---
Avg_Attendance                  0.136124
Avg_Attendance_Early            0.104974
Std_Attendance                  0.098839
Pct_OnTime_Assignments          0.094976
Total_Logins                    0.087685
Logins_Change                   0.067271
Avg_Logins_Early                0.064514
Avg_Logins                      0.060489
Pct_OnTime_Assignments_Early   

In [3]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

# --- 0. Load Data and Initial Preprocessing (as per your existing notebook) ---
# The CSV stores the weekly series and graded items as stringified Python
# lists/dicts; parse each of those columns back into real objects.
df = pd.read_csv("CS101_Student_Behavior.csv")
df = df.assign(**{
    column_name: df[column_name].apply(ast.literal_eval)
    for column_name in (
        'Weekly_Logins',
        'Weekly_Attendance',
        'Assignments',
        'Quizzes',
        'Programming_Projects',
    )
})

# Letter grade -> numeric value used when averaging grades.
grade_map = {
    'A': 4,
    'B': 3,
    'C': 2,
    'D': 1,
    'E': 0.5,
    'F': 0,
}

# --- 1. Data Preprocessing for Machine Learning ---

# Feature Engineering Helper Functions
def _reduce_or_zero(values, reducer=np.mean):
    """Apply *reducer* to *values*, returning 0 when *values* is empty.

    NumPy reducers yield ``nan`` (plus a RuntimeWarning) on empty input; 0 is
    the fallback this module already uses for missing assignments/projects.
    """
    return reducer(values) if len(values) > 0 else 0


def calculate_weekly_metrics(df_row, early_weeks=5, late_start=10):
    """Summarise one student's semester into a flat feature vector.

    Args:
        df_row: Mapping/row with the keys ``Weekly_Attendance`` and
            ``Weekly_Logins`` (sequences of numbers), ``Assignments``
            (sequence of dicts with a ``Submission_Status`` key), and
            ``Quizzes`` / ``Programming_Projects`` (sequences of letter
            grades that are keys of the module-level ``grade_map``).
        early_weeks: Number of leading entries that form the "early" window.
        late_start: Index at which the "late" window starts (runs to the end).

    Returns:
        pd.Series with overall averages, early-window averages, late-minus-early
        changes, standard deviations and cumulative counts. Any statistic
        taken over an empty sequence or window is reported as 0.

    Raises:
        KeyError: if a quiz or project grade is not present in ``grade_map``.
    """
    attendance = np.array(df_row['Weekly_Attendance'])
    logins = np.array(df_row['Weekly_Logins'])
    assignments = df_row['Assignments']

    # 1 when the assignment was handed in on time, else 0.
    on_time_flags = [1 if a['Submission_Status'] == 'On Time' else 0 for a in assignments]
    quiz_points = [grade_map[q] for q in df_row['Quizzes']]
    project_points = [grade_map[p] for p in df_row['Programming_Projects']]

    # Whole-semester averages.
    avg_attendance = _reduce_or_zero(attendance)
    avg_logins = _reduce_or_zero(logins)
    pct_on_time_assignments = _reduce_or_zero(on_time_flags)
    avg_quiz_grade = _reduce_or_zero(quiz_points)
    avg_project_grade = _reduce_or_zero(project_points)

    # Early window: the first `early_weeks` entries of each series.
    avg_attendance_early = _reduce_or_zero(attendance[:early_weeks])
    avg_logins_early = _reduce_or_zero(logins[:early_weeks])
    # Divide by the number of entries actually in the window rather than a
    # fixed constant, so shorter or longer semesters are not mis-scaled.
    pct_on_time_assignments_early = _reduce_or_zero(on_time_flags[:early_weeks])
    avg_quiz_grade_early = _reduce_or_zero(quiz_points[:early_weeks])

    # Late window: everything from index `late_start` onwards
    # (6 entries for a 16-week series with the default late_start=10).
    avg_attendance_late = _reduce_or_zero(attendance[late_start:])
    avg_logins_late = _reduce_or_zero(logins[late_start:])
    pct_on_time_assignments_late = _reduce_or_zero(on_time_flags[late_start:])
    avg_quiz_grade_late = _reduce_or_zero(quiz_points[late_start:])

    # Trend = late average minus early average (negative means decline).
    attendance_change = avg_attendance_late - avg_attendance_early
    logins_change = avg_logins_late - avg_logins_early
    assignments_change = pct_on_time_assignments_late - pct_on_time_assignments_early
    quiz_change = avg_quiz_grade_late - avg_quiz_grade_early

    # Population standard deviation over the full period as a consistency measure.
    std_attendance = _reduce_or_zero(attendance, np.std)
    std_logins = _reduce_or_zero(logins, np.std)
    std_quiz_grade = _reduce_or_zero(quiz_points, np.std)

    # Cumulative counts.
    total_logins = np.sum(logins)
    total_assignments_submitted = sum(
        1 for a in assignments
        if a['Submission_Status'] in ['On Time', 'Submitted', 'Late']
    )
    total_assignments_never_submitted = sum(
        1 for a in assignments
        if a['Submission_Status'] == 'Never Submitted'
    )

    return pd.Series({
        'Avg_Attendance': avg_attendance,
        'Avg_Logins': avg_logins,
        'Pct_OnTime_Assignments': pct_on_time_assignments,
        'Avg_Quiz_Grade': avg_quiz_grade,
        'Avg_Project_Grade': avg_project_grade,
        'Avg_Attendance_Early': avg_attendance_early,
        'Avg_Logins_Early': avg_logins_early,
        'Pct_OnTime_Assignments_Early': pct_on_time_assignments_early,
        'Avg_Quiz_Grade_Early': avg_quiz_grade_early,
        'Attendance_Change': attendance_change,
        'Logins_Change': logins_change,
        'Assignments_Change': assignments_change,
        'Quiz_Change': quiz_change,
        'Std_Attendance': std_attendance,
        'Std_Logins': std_logins,
        'Std_Quiz_Grade': std_quiz_grade,
        'Total_Logins': total_logins,
        'Total_Assignments_Submitted': total_assignments_submitted,
        'Total_Assignments_Never_Submitted': total_assignments_never_submitted
    })

# Apply feature engineering: one row of engineered metrics per student,
# joined column-wise with the identifier, demographic and label columns.
engineered_features_df = df.apply(calculate_weekly_metrics, axis=1)
df_ml = pd.concat([df[['Student_ID', 'Gender', 'Ethnicity', 'Scholarship', 'Label']], engineered_features_df], axis=1)

# --- Set Pandas display options to show all rows and columns ---
# These are process-wide settings and stay in effect for later output too.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000) # Adjust display width for better readability

print("--- Engineered Features for ALL Students ---")
print(df_ml)
print("\n")

# Categorical Feature Encoding
categorical_features = ['Gender', 'Ethnicity', 'Scholarship']
# Everything that is not an identifier, the label, or categorical is numeric.
# NOTE: this list is built before 'Label_Encoded' is added to df_ml below,
# so the encoded target is not part of it.
numerical_features = [col for col in df_ml.columns if col not in ['Student_ID', 'Label'] + categorical_features]

# Create a column transformer for one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep numerical features as they are
)

# Define Target Variable
# The model predicts 'Label' directly, so encode it as integers.
label_encoder = LabelEncoder()
df_ml['Label_Encoded'] = label_encoder.fit_transform(df_ml['Label'])
# Encoded integer -> original label string, for mapping predictions back.
label_mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))
print(f"Label Encoding Map: {label_mapping}\n")

# Feature matrix: drop the identifier and both forms of the target.
X = df_ml.drop(['Student_ID', 'Label', 'Label_Encoded'], axis=1)
y = df_ml['Label_Encoded']

# --- 2. Machine Learning Model Implementation ---

# Train-Test Split (using a random split for simplicity, but time-based is better for real EWS)
# Stratified on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the preprocessor on the training split only, then reuse it for the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Get feature names after one-hot encoding for interpretability
ohe_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
# Assumes the order of features in X_train_processed is OHE features first,
# then the passthrough numerical features in their original order.
all_feature_names = list(ohe_feature_names) + numerical_features

print(f"Shape of processed training data: {X_train_processed.shape}")
print(f"Shape of processed testing data: {X_test_processed.shape}")

# Model Instantiation and Training
# Four baseline classifiers, all with a fixed seed for reproducibility.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42)
}

# Model name -> dict of metrics plus the fitted estimator.
results = {}

for name, model in models.items():
    print(f"--- Training {name} ---")
    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_test_processed)
    y_proba = model.predict_proba(X_test_processed)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
    cm = confusion_matrix(y_test, y_pred)

    # Macro-averaged one-vs-rest ROC AUC over all classes.
    # If scikit-learn rejects the inputs, fall back to a placeholder string,
    # so "roc_auc" is not guaranteed to be a float.
    try:
        roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='macro')
    except ValueError:
        roc_auc = "N/A (requires binary or specific multi-class setup)"

    results[name] = {
        "accuracy": accuracy,
        "classification_report": report,
        "confusion_matrix": cm,
        "roc_auc": roc_auc,
        "model": model # Store the trained model
    }
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", cm)
    print(f"ROC AUC (macro-avg OvR): {roc_auc}\n")

# Hyperparameter Tuning (Example for Random Forest)
# Exhaustive grid search with 3-fold CV on the training split, ranked by macro F1.
print("--- Hyperparameter Tuning for Random Forest ---")
rf_model = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train_processed, y_train)

print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf_tuned = best_rf_model.predict(X_test_processed)
# Build the report once and reuse it for both the printout and the results entry.
report_rf_tuned = classification_report(y_test, y_pred_rf_tuned, target_names=label_encoder.classes_)
print("Tuned Random Forest Classification Report:\n", report_rf_tuned)

# Same ROC AUC computation (and fallback) as the baseline models, so every
# entry in `results` exposes the same keys.
try:
    roc_auc_rf_tuned = roc_auc_score(
        y_test,
        best_rf_model.predict_proba(X_test_processed),
        multi_class='ovr',
        average='macro',
    )
except ValueError:
    roc_auc_rf_tuned = "N/A (requires binary or specific multi-class setup)"

results["Random Forest (Tuned)"] = {
    "accuracy": accuracy_score(y_test, y_pred_rf_tuned),
    "classification_report": report_rf_tuned,
    "confusion_matrix": confusion_matrix(y_test, y_pred_rf_tuned),
    "roc_auc": roc_auc_rf_tuned,
    "model": best_rf_model
}

# --- 3. Predictive Model Integration into the System ---

# Choose the best performing model for integration (e.g., Tuned Random Forest or LightGBM)
# For demonstration, let's pick the tuned Random Forest
best_model_for_deployment = results["Random Forest (Tuned)"]["model"]
print(f"\n--- Deploying {best_model_for_deployment.__class__.__name__} for predictions ---")

# Example: Predict for the entire original dataset (simulating new data coming in).
# NOTE: this includes the rows the model was trained on, so these predictions
# are a demonstration only and not an estimate of out-of-sample performance.
# Select the feature columns explicitly (same columns and order as the training
# matrix) so this section can be re-run after prediction columns were added.
X_full = df_ml[categorical_features + numerical_features]
X_full_processed = preprocessor.transform(X_full) # Use the *fitted* preprocessor

df_ml['Predicted_Label_Encoded'] = best_model_for_deployment.predict(X_full_processed)
df_ml['Predicted_Label'] = df_ml['Predicted_Label_Encoded'].map(label_mapping)

# Get prediction probabilities for risk scoring.
# Position of each label in label_encoder.classes_ (i.e. its encoded value).
crisis_idx = np.where(label_encoder.classes_ == 'Crisis')[0][0]
drift_idx = np.where(label_encoder.classes_ == 'Drift')[0][0]
normal_idx = np.where(label_encoder.classes_ == 'Normal')[0][0]

# Score once and reuse the matrix for all three columns. predict_proba columns
# follow the model's own `classes_` order, so translate each encoded label to
# its column there instead of assuming encoded label == column position.
class_probabilities = best_model_for_deployment.predict_proba(X_full_processed)
proba_column = {
    encoded_label: position
    for position, encoded_label in enumerate(best_model_for_deployment.classes_)
}
df_ml['Prob_Crisis'] = class_probabilities[:, proba_column[crisis_idx]]
df_ml['Prob_Drift'] = class_probabilities[:, proba_column[drift_idx]]
df_ml['Prob_Normal'] = class_probabilities[:, proba_column[normal_idx]]


# Dynamic Intervention Triggering (Modify the original get_recommendation function)
def get_ml_recommendation(student_row, *, drift_urgent_threshold=0.7):
    """Map a student's predicted label and class probabilities to an intervention.

    Args:
        student_row: Mapping-like row (e.g. a DataFrame row) providing
            'Predicted_Label', 'Prob_Crisis' and 'Prob_Drift'; 'Prob_Normal'
            is read only when the predicted label is "Normal".
        drift_urgent_threshold: Drift probability strictly above which a
            "Drift" prediction gets the urgent recommendation. Defaults to 0.7.

    Returns:
        A ``(recommendation, instructor_note)`` tuple of strings, or
        ``("", "")`` when the label is not "Crisis", "Drift" or "Normal".
    """
    predicted_label = student_row['Predicted_Label']
    prob_crisis = student_row['Prob_Crisis']
    prob_drift = student_row['Prob_Drift']

    if predicted_label == "Crisis":
        return (
            "Immediate intervention: schedule meeting, connect with advisor or counselor.",
            f"Student is predicted to be in Crisis (Prob: {prob_crisis:.2f}). Disengaged early. May need academic and emotional support."
        )

    if predicted_label == "Drift":
        # Threshold tuning: a very confident Drift prediction escalates the intervention
        if prob_drift > drift_urgent_threshold:
            return (
                "Urgent mid-semester check-in; offer time management tips and study group options, consider academic coaching.",
                f"Student is predicted to be in Drift (Prob: {prob_drift:.2f}). Performance starts strong but shows significant decline. Prevent further drop."
            )
        return (
            "Mid-semester check-in; offer time management tips and study group options.",
            f"Student is predicted to be in Drift (Prob: {prob_drift:.2f}). Performance starts strong but declines. Prevent further drop."
        )

    if predicted_label == "Normal":
        return (
            "Congratulate on consistent performance.",
            f"Student is predicted to be Normal (Prob: {student_row['Prob_Normal']:.2f}). Stable performance. Encourage continued success or leadership roles."
        )

    # Unrecognised label: no recommendation and no note
    return ("", "")

# Run the recommendation function on every row, then split the (recommendation, note) pairs into two columns
ml_recommendation_pairs = df_ml.apply(get_ml_recommendation, axis=1)
df_ml["ML_Recommendation"], df_ml["ML_Instructor_Note"] = zip(*ml_recommendation_pairs)

# Student-Facing Risk Reports Enhancement (Feature Importance)
print("\n--- Feature Importance from Best Model (Random Forest) ---")
# all_feature_names is defined elsewhere in the notebook; it must list the columns
# of the preprocessed matrix in the same order the model was trained on
feature_importances = pd.Series(
    best_model_for_deployment.feature_importances_,
    index=all_feature_names,
)
top_feature_importances = feature_importances.sort_values(ascending=False).head(10)
print(top_feature_importances)

# To make this actionable for student reports, you'd need to map these features
# back to understandable language.
# Example: If 'Logins_Change' is very negative, suggest "Your recent decrease in platform logins..."

# Columns that make up the ML-driven intervention table: identity and demographics,
# true label, model prediction with per-class probabilities, and the generated texts
intervention_columns = [
    "Student_ID", "Gender", "Ethnicity", "Scholarship", "Label",
    "Predicted_Label", "Prob_Crisis", "Prob_Drift", "Prob_Normal",
    "ML_Recommendation", "ML_Instructor_Note",
]
intervention_df_ml = df_ml[intervention_columns]

# Persist the table without the DataFrame index
intervention_df_ml.to_csv("CS101_Student_Intervention_Table_ML.csv", index=False)
print("\n ML-driven Intervention table saved as 'CS101_Student_Intervention_Table_ML.csv'")

# Show the table in the cell output as well
print("\n--- Full ML-driven Intervention Table for ALL Students ---")
print(intervention_df_ml)

# --- Reset Pandas display options to default if needed for subsequent code ---
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')
# pd.reset_option('display.width')


--- Engineered Features for ALL Students ---
    Student_ID      Gender Ethnicity Scholarship   Label  Avg_Attendance  Avg_Logins  Pct_OnTime_Assignments  Avg_Quiz_Grade  Avg_Project_Grade  Avg_Attendance_Early  Avg_Logins_Early  Pct_OnTime_Assignments_Early  Avg_Quiz_Grade_Early  Attendance_Change  Logins_Change  Assignments_Change  Quiz_Change  Std_Attendance  Std_Logins  Std_Quiz_Grade  Total_Logins  Total_Assignments_Submitted  Total_Assignments_Never_Submitted
0         S001        Male     Other         Yes   Drift          1.3750      3.9375                  0.3750         3.37500              3.500                   2.0               5.8                           0.4                   3.4          -1.000000      -3.300000           -0.066667     0.100000        0.484123    1.477699        0.780625          63.0                         16.0                                0.0
1         S002        Male     White          No  Normal          2.0000      8.0625                  1.0



Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Tuned Random Forest Classification Report:
               precision    recall  f1-score   support

      Crisis       1.00      1.00      1.00         6
       Drift       1.00      1.00      1.00        10
      Normal       1.00      1.00      1.00        24

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


--- Deploying RandomForestClassifier for predictions ---

--- Feature Importance from Best Model (Random Forest) ---
Avg_Attendance                  0.136124
Avg_Attendance_Early            0.104974
Std_Attendance                  0.098839
Pct_OnTime_Assignments          0.094976
Total_Logins                    0.087685
Logins_Change                   0.067271
Avg_Logins_Early                0.064514
Avg_Logins                      0.060489
Pct_OnTime_Assignments_Early   

In [4]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import matplotlib.pyplot as plt

# --- Load Data and Initial Preprocessing ---
df = pd.read_csv("CS101_Student_Behavior.csv")

# These columns hold stringified lists/dicts; parse each one back into Python objects
for literal_column in (
    'Weekly_Logins',
    'Weekly_Attendance',
    'Assignments',
    'Quizzes',
    'Programming_Projects',
):
    df[literal_column] = df[literal_column].apply(ast.literal_eval)

# Letter grade -> numeric value used when grades are converted to numbers
grade_map = {'A': 4, 'B': 3, 'C': 2, 'D': 1, 'E': 0.5, 'F': 0}

# --- Feature Engineering ---
def calculate_features_up_to_week(df_row, current_week):
    """Aggregate one student's engagement over the first ``current_week`` entries.

    Args:
        df_row: Mapping-like row with 'Weekly_Attendance' and 'Weekly_Logins'
            sequences and an 'Assignments' sequence of dicts that each carry
            a 'Submission_Status' key.
        current_week: Number of leading entries of each sequence to include.

    Returns:
        pd.Series with 'Avg_Attendance', 'Avg_Logins' and
        'Pct_OnTime_Assignments'; each value is 0 when its slice is empty.
    """
    attendance_slice = np.array(df_row['Weekly_Attendance'])[:current_week]
    logins_slice = np.array(df_row['Weekly_Logins'])[:current_week]
    # Only the submission status feeds a returned feature, so assignment grades
    # are not converted here (no grade lookup that could fail on an unmapped grade)
    assignment_on_time = [1 if a['Submission_Status'] == 'On Time' else 0 for a in df_row['Assignments']][:current_week]

    # Guard each aggregate against an empty slice (e.g. current_week == 0)
    avg_attendance = np.mean(attendance_slice) if len(attendance_slice) > 0 else 0
    avg_logins = np.mean(logins_slice) if len(logins_slice) > 0 else 0
    pct_on_time_assignments = sum(assignment_on_time) / len(assignment_on_time) if assignment_on_time else 0

    return pd.Series({
        'Avg_Attendance': avg_attendance,
        'Avg_Logins': avg_logins,
        'Pct_OnTime_Assignments': pct_on_time_assignments,
    })

# Build one feature row per student using only the first PREDICTION_WEEK weeks of activity
PREDICTION_WEEK = 8
# Columns copied straight from the source row; 'Label' is the final label we want to predict
carried_over_columns = ['Student_ID', 'Gender', 'Ethnicity', 'Scholarship', 'Label']

weekly_data = []
for _, row in df.iterrows():
    features = calculate_features_up_to_week(row, PREDICTION_WEEK)
    for column_name in carried_over_columns:
        features[column_name] = row[column_name]
    weekly_data.append(features)

df_ml_weekly = pd.DataFrame(weekly_data)

# Split the engineered columns into categorical and numerical groups
categorical_features = ['Gender', 'Ethnicity', 'Scholarship']
non_numerical_columns = ['Student_ID', 'Label', *categorical_features]
numerical_features = [col for col in df_ml_weekly.columns if col not in non_numerical_columns]

# One-hot encode the categorical columns; every other column passes through unchanged
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

# Encode the target and keep an encoded-value -> label lookup for decoding predictions
label_encoder = LabelEncoder()
df_ml_weekly['Label_Encoded'] = label_encoder.fit_transform(df_ml_weekly['Label'])
encoded_class_values = label_encoder.transform(label_encoder.classes_)
label_mapping = dict(zip(encoded_class_values, label_encoder.classes_))

# Features are everything except the identifier and the two label columns
y = df_ml_weekly['Label_Encoded']
X = df_ml_weekly.drop(columns=['Student_ID', 'Label', 'Label_Encoded'])

# --- Train-Test Split ---
# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the preprocessor on the training rows only, then reuse it for the test rows
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# --- Model Training and Evaluation ---
# Candidate classifiers, all seeded with random_state=42
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting Machine": GradientBoostingClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
}

# Per-model metrics plus the fitted estimator, keyed by display name
results = {}

for name, model in models.items():
    print(f"--- Training {name} ---")
    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_test_processed)
    y_proba = model.predict_proba(X_test_processed)

    # Macro-averaged one-vs-rest AUC; fall back to a placeholder text if sklearn rejects the inputs
    try:
        roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='macro')
    except ValueError:
        roc_auc = "N/A (requires binary or specific multi-class setup)"

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
    cm = confusion_matrix(y_test, y_pred)

    # Store the trained model alongside its metrics
    results[name] = dict(
        accuracy=accuracy,
        classification_report=report,
        confusion_matrix=cm,
        roc_auc=roc_auc,
        model=model,
    )

    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", cm)
    print(f"ROC AUC (macro-avg OvR): {roc_auc}\n")

# --- Hyperparameter Tuning for Gradient Boosting Machine ---
print("--- Hyperparameter Tuning for Gradient Boosting Machine ---")

# Search space: 2 x 3 x 3 combinations, each scored with 3-fold CV on macro F1
param_grid_gbm = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 5, 7],
)
gbm_model = GradientBoostingClassifier(random_state=42)
grid_search_gbm = GridSearchCV(
    gbm_model,
    param_grid_gbm,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
)
grid_search_gbm.fit(X_train_processed, y_train)

print(f"Best parameters for Gradient Boosting Machine: {grid_search_gbm.best_params_}")
best_gbm_model = grid_search_gbm.best_estimator_

# Score the tuned estimator on the held-out test split
y_pred_gbm_tuned = best_gbm_model.predict(X_test_processed)
gbm_tuned_report = classification_report(y_test, y_pred_gbm_tuned, target_names=label_encoder.classes_)
print("Tuned Gradient Boosting Machine Classification Report:\n", gbm_tuned_report)

# --- Dynamic Intervention System ---
def get_ml_recommendation(student_row, *, drift_urgent_threshold=0.7):
    """Return the intervention text pair for one scored student.

    Args:
        student_row: Mapping-like row (e.g. a DataFrame row) providing
            'Predicted_Label', 'Prob_Crisis' and 'Prob_Drift'; 'Prob_Normal'
            is read only when the predicted label is "Normal".
        drift_urgent_threshold: Drift probability strictly above which a
            "Drift" prediction gets the urgent recommendation. Defaults to 0.7.

    Returns:
        A ``(recommendation, instructor_note)`` tuple of strings, or
        ``("", "")`` when the label is not "Crisis", "Drift" or "Normal".
    """
    predicted_label = student_row['Predicted_Label']
    prob_crisis = student_row['Prob_Crisis']
    prob_drift = student_row['Prob_Drift']

    if predicted_label == "Crisis":
        return (
            "Immediate intervention: schedule meeting, connect with advisor or counselor.",
            f"Student is predicted to be in Crisis (Prob: {prob_crisis:.2f}). Disengaged early. May need academic and emotional support."
        )

    if predicted_label == "Drift":
        # Custom threshold for more urgent drift intervention
        if prob_drift > drift_urgent_threshold:
            return (
                "Urgent mid-semester check-in; offer time management tips and study group options, consider academic coaching.",
                f"Student is predicted to be in Drift (Prob: {prob_drift:.2f}). Performance starts strong but shows significant decline. Prevent further drop."
            )
        return (
            "Mid-semester check-in; offer time management tips and study group options.",
            f"Student is predicted to be in Drift (Prob: {prob_drift:.2f}). Performance starts strong but declines. Prevent further drop."
        )

    if predicted_label == "Normal":
        return (
            "Congratulate on consistent performance.",
            f"Student is predicted to be Normal (Prob: {student_row['Prob_Normal']:.2f}). Stable performance. Encourage continued success or leadership roles."
        )

    # Unrecognised label: no recommendation and no note
    return ("", "")

# Score every student with the tuned Gradient Boosting model.
# The preprocessor is only transformed here, never re-fitted.
X_full = df_ml_weekly.drop(['Student_ID', 'Label', 'Label_Encoded'], axis=1)
X_full_processed = preprocessor.transform(X_full)

df_ml_weekly["Predicted_Label_Encoded"] = best_gbm_model.predict(X_full_processed)
df_ml_weekly["Predicted_Label"] = df_ml_weekly["Predicted_Label_Encoded"].map(label_mapping)

# Get prediction probabilities for risk scoring
# Probabilities are ordered by label_encoder.classes_, so look up each label's column position there
crisis_idx = np.where(label_encoder.classes_ == 'Crisis')[0][0]
drift_idx = np.where(label_encoder.classes_ == 'Drift')[0][0]
normal_idx = np.where(label_encoder.classes_ == 'Normal')[0][0]

# Run predict_proba once and slice the columns, instead of re-scoring the full matrix per class
full_probabilities = best_gbm_model.predict_proba(X_full_processed)
df_ml_weekly['Prob_Crisis'] = full_probabilities[:, crisis_idx]
df_ml_weekly['Prob_Drift'] = full_probabilities[:, drift_idx]
df_ml_weekly['Prob_Normal'] = full_probabilities[:, normal_idx]

# Apply recommendations: each row yields a (recommendation, note) pair, split into two columns
df_ml_weekly["ML_Recommendation"], df_ml_weekly["ML_Instructor_Note"] = zip(*df_ml_weekly.apply(get_ml_recommendation, axis=1))

# Columns exported to the weekly intervention table
intervention_columns = [
    "Student_ID", "Gender", "Ethnicity", "Scholarship", "Label",
    "Predicted_Label", "Prob_Crisis", "Prob_Drift", "Prob_Normal",
    "ML_Recommendation", "ML_Instructor_Note",
]
intervention_df_ml = df_ml_weekly[intervention_columns]

# Save to CSV without the DataFrame index
intervention_df_ml.to_csv("CS101_Student_Intervention_Table_ML_Weekly.csv", index=False)
print("\n✅ ML-driven Intervention table saved as 'CS101_Student_Intervention_Table_ML_Weekly.csv'")

# Lift the row/column display limits so the whole table is printed
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
print("\n--- Full ML-driven Intervention Table for ALL Students (Weekly Prediction Context) ---")
print(intervention_df_ml)

# --- Reset Pandas display options to default if needed for subsequent code ---
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.reset_option(display_option)

--- Training Logistic Regression ---
Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

      Crisis       1.00      1.00      1.00         6
       Drift       1.00      1.00      1.00        10
      Normal       1.00      1.00      1.00        24

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

Confusion Matrix:
 [[ 6  0  0]
 [ 0 10  0]
 [ 0  0 24]]
ROC AUC (macro-avg OvR): 1.0

--- Training Decision Tree ---
Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

      Crisis       1.00      1.00      1.00         6
       Drift       1.00      1.00      1.00        10
      Normal       1.00      1.00      1.00        24

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

Co



Best parameters for Gradient Boosting Machine: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Tuned Gradient Boosting Machine Classification Report:
               precision    recall  f1-score   support

      Crisis       1.00      1.00      1.00         6
       Drift       1.00      1.00      1.00        10
      Normal       1.00      1.00      1.00        24

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


✅ ML-driven Intervention table saved as 'CS101_Student_Intervention_Table_ML_Weekly.csv'

--- Full ML-driven Intervention Table for ALL Students (Weekly Prediction Context) ---
    Student_ID      Gender Ethnicity Scholarship   Label Predicted_Label  Prob_Crisis  Prob_Drift  Prob_Normal                                  ML_Recommendation                                 ML_Instructor_Note
0         S001        Male     Other         Yes   Drift        