# In [None]:
# Student Improvement Prediction Model (Operational Notebook)
# File: student improvement prediction model.ipynb
# Purpose: Predict whether a student will improve given a finalized task list
# and estimate how much and how soon improvement will occur across timelines.

"""
USAGE NOTES (Operational-ready):
- This notebook expects the following inputs to be provided (from the ML Student Improvement Model
  or your backend) before running the prediction cells:

  1) student_history: pandas.DataFrame
     - columns: ['student_id','date','subject','score','task_type','time_spent_minutes','completed']
     - date must be a datetime or a value parseable by pandas.to_datetime

  2) final_suggestions: dict or pandas.DataFrame
     - If dict per student: {
         student_id: {
           'tasks': [{ 'task': str, 'xp': int, 'subject': str, 'estimated_minutes': int } , ...],
           'summary': str
         }, ... }
     - Or a DataFrame with columns: ['student_id','task','xp','subject','estimated_minutes']

  3) student_attributes: pandas.DataFrame (hardwork/determination/focus/creativity/discipline)
     - columns: ['student_id','hardwork','determination','focus','creativity','discipline']
     - values in range [0,1] or [0,100] (the notebook normalizes)

- The notebook produces:
  - per-student, per-subject predictions for probability of improvement (classification)
  - predicted mark increase (regression)
  - predicted time horizons for improvement across multiple timelines (1w,3w,1m,2m,6m,1y)
  - justification/analysis by comparing predictions to historical improvement

- This notebook is intentionally implementation-ready but DOES NOT include any hard-coded sample data.
  Provide your real dataframes into the environment and run the cells.

"""

# 0. Imports
import numpy as np
import pandas as pd
from datetime import timedelta
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

# --------------------------- Helper functions ---------------------------

def normalize_attributes(df, cols):
    """
    Return a copy of ``df`` with every column in ``cols`` rescaled to [0, 1].

    Attributes are documented as arriving either on a [0, 1] scale or on a
    [0, 100] scale. A column whose maximum exceeds 1 is treated as a
    percentage and divided by 100; every column is then clipped to [0, 1].

    The scaling is deliberately independent of the other rows in the batch:
    min-max scaling over the batch would turn a constant column (or a batch
    containing a single student) into all zeros and would make one student's
    value depend on which other students happen to be present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the attribute columns. It is not modified.
    cols : iterable of str
        Names of the columns to normalize.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns in [0, 1]. NaN values stay NaN.
    """
    df2 = df.copy()
    for c in cols:
        values = df2[c]
        # An all-NaN column has a NaN max, so the comparison is False and the
        # column is only clipped (which leaves the NaNs untouched).
        if values.max() > 1:
            values = values / 100.0
        df2[c] = values.clip(0, 1)
    return df2


def aggregate_student_history(history_df):
    """
    Aggregate raw history rows into one feature row per (student_id, subject).

    Parameters
    ----------
    history_df : pandas.DataFrame
        Must contain the columns 'student_id', 'subject', 'date', 'score',
        'time_spent_minutes' and 'completed'. 'date' is converted with
        ``pandas.to_datetime``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns: student_id, subject, mean_score, last_score, first_score,
        improvement_total, sessions, avg_time_spent, completion_rate,
        improvement_per_session.
    """
    h = history_df.copy()
    h['date'] = pd.to_datetime(h['date'])
    # 'first' / 'last' pick rows by position inside each group, so the rows
    # must be in chronological order. A stable sort keeps the original order
    # of rows that share the same date.
    h = h.sort_values('date', kind='mergesort')

    # Named aggregation takes (column, aggfunc) pairs.
    agg = h.groupby(['student_id', 'subject']).agg(
        mean_score=('score', 'mean'),
        last_score=('score', 'last'),
        first_score=('score', 'first'),
        sessions=('score', 'count'),
        avg_time_spent=('time_spent_minutes', 'mean'),
        completion_rate=('completed', 'mean'),
    ).reset_index()

    # Total change between the earliest and the latest recorded score, placed
    # just before 'sessions' in the column order.
    agg.insert(
        agg.columns.get_loc('sessions'),
        'improvement_total',
        agg['last_score'] - agg['first_score'],
    )

    # Average change per recorded session; the epsilon avoids division by zero
    # when a group has no non-null scores.
    agg['improvement_per_session'] = agg['improvement_total'] / (agg['sessions'] + 1e-9)
    return agg


def prepare_features(agg_df, final_suggestions_df, attributes_df):
    """
    Merge aggregated history, task suggestions and student attributes into a
    single feature frame with one row per (student_id, subject) of ``agg_df``.

    Parameters
    ----------
    agg_df : pandas.DataFrame
        Per student/subject history features; must contain 'student_id' and
        'subject'.
    final_suggestions_df : pandas.DataFrame or None
        Expected columns: ['student_id','task','xp','subject','estimated_minutes'].
        Suggestions are summed per student/subject into total_xp, n_tasks and
        est_minutes. ``None`` or a frame without rows (including one that has
        no columns at all) means "no suggestions"; the three columns are then
        set to 0 for every row.
    attributes_df : pandas.DataFrame
        Per-student attributes; must contain 'student_id'. Any of hardwork,
        determination, focus, creativity and discipline that are present are
        passed through ``normalize_attributes``.

    Returns
    -------
    pandas.DataFrame
        ``agg_df`` columns, then total_xp / n_tasks / est_minutes, then the
        attribute columns.
    """
    suggestion_feature_cols = ['total_xp', 'n_tasks', 'est_minutes']

    if final_suggestions_df is None or final_suggestions_df.empty:
        # Nothing to aggregate. Grouping a column-less frame would raise a
        # KeyError, so the suggestion features are set to zero directly.
        m = agg_df.copy()
        for col in suggestion_feature_cols:
            m[col] = 0.0
    else:
        sug_agg = final_suggestions_df.groupby(['student_id', 'subject']).agg(
            total_xp=('xp', 'sum'),
            n_tasks=('task', 'count'),
            est_minutes=('estimated_minutes', 'sum'),
        ).reset_index()
        m = agg_df.merge(sug_agg, how='left', on=['student_id', 'subject'])
        # Student/subject pairs without any suggestion get zero effort.
        m[suggestion_feature_cols] = m[suggestion_feature_cols].fillna(0)

    m = m.merge(attributes_df, how='left', on='student_id')

    # Normalize only the attribute columns that are actually present.
    attr_cols = ['hardwork', 'determination', 'focus', 'creativity', 'discipline']
    m = normalize_attributes(m, [c for c in attr_cols if c in m.columns])

    return m

# --------------------------- Prediction models ---------------------------

class StudentImprovementPredictor:
    """
    Pair of random-forest models trained on per student/subject features.

    - classifier: predicts the probability that the student WILL improve
    - regressor: predicts the expected increase in marks

    Both models share one StandardScaler and one ordered feature list
    (``feature_cols``), which is fixed by ``fit`` and reused by ``predict``.
    """
    def __init__(self):
        self.clf = RandomForestClassifier(n_estimators=150, random_state=42)
        self.reg = RandomForestRegressor(n_estimators=150, random_state=42)
        self.scaler = StandardScaler()
        # Ordered list of feature column names; None until fit() has run.
        self.feature_cols = None

    def fit(self, features_df):
        """
        Fit the scaler, classifier and regressor.

        features_df must contain:
          - 'student_id' and 'subject' (identifiers, not used as features)
          - target columns: 'will_improve' (0/1), 'mark_increase' (float)
          - feature columns; only numeric/boolean columns are used

        Rows with a missing target are dropped. Missing feature values are
        replaced by 0, matching what predict() does.

        Returns self. Raises ValueError when no usable rows or no usable
        feature columns remain.
        """
        df = features_df.dropna(subset=['will_improve', 'mark_increase'])
        if df.empty:
            raise ValueError('No rows with both will_improve and mark_increase available for training.')

        X = df.drop(columns=['student_id', 'subject', 'will_improve', 'mark_increase'])
        # Non-numeric columns (e.g. free-text attributes) cannot be scaled.
        X = X.select_dtypes(include=['number', 'bool'])
        if X.shape[1] == 0:
            raise ValueError('No numeric feature columns available for training.')
        # Same missing-value policy as predict(), so both see the same encoding.
        X = X.astype(float).fillna(0)

        y_clf = df['will_improve'].astype(int)
        y_reg = df['mark_increase'].astype(float)

        self.feature_cols = X.columns.tolist()
        Xs = self.scaler.fit_transform(X)
        self.clf.fit(Xs, y_clf)
        self.reg.fit(Xs, y_reg)
        return self

    def predict(self, features_df):
        """
        Predict improvement probability and mark increase.

        features_df must contain 'student_id', 'subject' and every column in
        ``feature_cols``. Returns a DataFrame with the columns student_id,
        subject, improve_prob and predicted_mark_increase.

        Raises RuntimeError if the predictor has not been fitted.
        """
        if self.feature_cols is None:
            raise RuntimeError('StudentImprovementPredictor must be fitted before calling predict().')

        df = features_df.copy()
        X = df[self.feature_cols].astype(float).fillna(0)
        Xs = self.scaler.transform(X)

        # predict_proba has one column per class seen during training. When
        # training contained a single class there is no column 1, so the
        # column for class 1 is looked up explicitly.
        proba = self.clf.predict_proba(Xs)
        classes = list(self.clf.classes_)
        if 1 in classes:
            prob = proba[:, classes.index(1)]
        else:
            # The model never saw an improving example.
            prob = np.zeros(len(df))

        mark_inc = self.reg.predict(Xs)
        out = df[['student_id', 'subject']].copy()
        out['improve_prob'] = prob
        out['predicted_mark_increase'] = mark_inc
        return out

# --------------------------- Time-horizon forecasting ---------------------------

def horizon_predictions(base_features_df, predictor, timelines=('1w','3w','1m','2m','6m','1y'), multipliers=None):
    """
    Produce predictions across different timelines.

    Simple operational approach: for each horizon the task-effort features
    ('total_xp' and 'est_minutes') are scaled by a horizon multiplier and the
    predictor is run on the scaled features. A more advanced approach would
    train timeline-specific models.

    Parameters
    ----------
    base_features_df : pandas.DataFrame
        Feature frame containing 'student_id' and 'subject'. Not modified.
    predictor : object
        Fitted predictor exposing ``feature_cols`` (list of column names) and
        ``predict(df)``.
    timelines : iterable of str
        Horizon labels to evaluate, in output order.
    multipliers : dict or None
        Optional ``{label: factor}`` overrides or additions to the built-in
        table (1w=0.15, 3w=0.35, 1m=0.5, 2m=0.7, 6m=0.95, 1y=1.0).

    Returns
    -------
    pandas.DataFrame
        The per-timeline outputs of ``predictor.predict`` stacked on top of
        each other, with an added 'timeline' column. An empty ``timelines``
        yields an empty frame with the columns student_id, subject,
        improve_prob, predicted_mark_increase and timeline.

    Raises
    ------
    KeyError
        If a requested timeline has no multiplier.
    """
    scale = {
        '1w': 0.15,
        '3w': 0.35,
        '1m': 0.5,
        '2m': 0.7,
        '6m': 0.95,
        '1y': 1.0
    }
    if multipliers is not None:
        scale.update(multipliers)

    timelines = list(timelines)
    # Validate up front so a typo fails before any prediction work is done.
    unknown = [t for t in timelines if t not in scale]
    if unknown:
        raise KeyError(f'Unknown timeline(s) {unknown}; known timelines: {list(scale)}')

    if not timelines:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=['student_id', 'subject', 'improve_prob',
                                     'predicted_mark_increase', 'timeline'])

    feature_cols = list(predictor.feature_cols)
    effort_cols = ('total_xp', 'est_minutes')

    results = []
    for t in timelines:
        factor = scale[t]
        df = base_features_df.copy()

        # Horizon-scaled copies of the effort features.
        for col in effort_cols:
            if col in df.columns:
                df[col + '_scaled'] = df[col] * factor

        # Any feature the model was trained on but that is absent here is fed as 0.
        for col in feature_cols:
            if col not in df.columns:
                df[col] = 0

        # The model was trained on the unscaled column names, so the scaled
        # values are written over them for this horizon.
        for col in effort_cols:
            if col + '_scaled' in df.columns and col in df.columns:
                df[col] = df[col + '_scaled']

        pred = predictor.predict(df[feature_cols + ['student_id', 'subject']])
        pred['timeline'] = t
        results.append(pred)

    return pd.concat(results, ignore_index=True)

# --------------------------- Evaluation & Justification ---------------------------

def evaluate_predictions(pred_df, actual_df):
    """
    Compare predicted mark increases to the realized ones.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Must contain 'student_id', 'subject' and 'predicted_mark_increase'.
    actual_df : pandas.DataFrame
        Must contain 'student_id', 'subject' and 'future_mark_increase'.

    Returns
    -------
    (summary, merged)
        summary : dict with 'MAE_mark_increase' and 'R2'. Both are computed
            only over rows that have a ground-truth value and a prediction;
            treating missing ground truth as 0 would bias both metrics.
            MAE is NaN when there is no such row, R2 is NaN when there are
            fewer than two.
        merged : ``pred_df`` left-joined with ``actual_df`` plus a
            'justification' text per row.
    """
    merged = pred_df.merge(actual_df, on=['student_id', 'subject'], how='left')
    actual = merged['future_mark_increase']
    predicted = merged['predicted_mark_increase']

    # Score only the rows where both values exist.
    scored = actual.notna() & predicted.notna()
    n_scored = int(scored.sum())
    mae = mean_absolute_error(actual[scored], predicted[scored]) if n_scored >= 1 else float('nan')
    # R2 is not defined for fewer than two samples.
    r2 = r2_score(actual[scored], predicted[scored]) if n_scored >= 2 else float('nan')

    # Per-row explanation. A difference of exactly +2 counts as optimistic and
    # exactly -2 as conservative; only |diff| < 2 is "close".
    diff = predicted - actual
    merged['justification'] = np.select(
        [
            actual.isna().to_numpy(),
            (diff.abs() < 2).to_numpy(),
            (diff >= 2).to_numpy(),
        ],
        [
            'No ground truth available for this student-subject; cannot fully validate.',
            'Prediction close to actual (within 2 marks).',
            'Prediction optimistic — predicted higher improvement than realized. Check attribute overestimation or low completion_rate.',
        ],
        default='Prediction conservative — model underpredicted improvement; consider unusual high effort in follow-up data.',
    )

    summary = {'MAE_mark_increase': mae, 'R2': r2}
    return summary, merged

# --------------------------- Operational Entry-Point ---------------------------

def run_prediction_pipeline(student_history, final_suggestions, student_attributes, timelines=('1w','3w','1m','2m','6m','1y'), retrain=True, predictor=None):
    """
    Run the full prediction pipeline.

    1) Aggregate history
    2) Prepare features by merging suggestions and attributes
    3) Choose / fit the model:
       - if retrain is True, the features contain 'will_improve' and
         'mark_increase', and there are more than 20 rows, fit on those targets;
       - otherwise, if an already fitted ``predictor`` was supplied, use it as is;
       - otherwise fit on a proxy target derived from history:
         mark_increase = improvement_per_session * n_tasks,
         will_improve = mark_increase > 0. Any explicit target columns in the
         features are overwritten by the proxy in this branch.
    4) Produce timeline predictions
    5) Return predictions dataframe

    Parameters
    ----------
    student_history : pandas.DataFrame
        Raw history rows; must include 'student_id'.
    final_suggestions : pandas.DataFrame or dict
        Either a frame with columns
        ['student_id','task','xp','subject','estimated_minutes'] or a dict
        ``{student_id: {'tasks': [{'task','xp','subject','estimated_minutes'}, ...]}}``.
    student_attributes : pandas.DataFrame
        Per-student attributes; must include 'student_id'.
    timelines : iterable of str
        Horizon labels forwarded to ``horizon_predictions``.
    retrain : bool
        Whether fitting on explicit target columns is allowed.
    predictor : StudentImprovementPredictor or None
        Optional pre-trained predictor. A new one is created when omitted.

    Returns
    -------
    pandas.DataFrame
        Output of ``horizon_predictions`` plus 'will_improve_label'
        (1 when improve_prob >= 0.5).

    Raises
    ------
    TypeError
        If final_suggestions is neither a DataFrame nor a dict.
    ValueError
        If a required 'student_id' column is missing or no feature rows exist.
    """
    # Explicit raises instead of assert: asserts are stripped under `python -O`.
    if 'student_id' not in student_history.columns:
        raise ValueError('student_history must include student_id')
    if not isinstance(final_suggestions, (pd.DataFrame, dict)):
        raise TypeError('final_suggestions must be DataFrame or dict')
    if 'student_id' not in student_attributes.columns:
        raise ValueError('student_attributes must include student_id')

    # 1. Aggregate history
    agg = aggregate_student_history(student_history)

    # 2. Normalize suggestions into a DataFrame if dict supplied
    if isinstance(final_suggestions, dict):
        rows = [
            {
                'student_id': sid,
                'task': t.get('task'),
                'xp': t.get('xp', 0),
                'subject': t.get('subject'),
                'estimated_minutes': t.get('estimated_minutes', 0),
            }
            for sid, payload in final_suggestions.items()
            for t in payload.get('tasks', [])
        ]
        # Passing the columns explicitly keeps the schema even when no student
        # has any task (otherwise the frame would have no columns at all).
        final_suggestions_df = pd.DataFrame(
            rows, columns=['student_id', 'task', 'xp', 'subject', 'estimated_minutes']
        )
    else:
        final_suggestions_df = final_suggestions.copy()

    # 3. Prepare feature matrix
    features = prepare_features(agg, final_suggestions_df, student_attributes)
    if features.empty:
        raise ValueError('No student/subject rows available after aggregating student_history.')

    has_targets = 'will_improve' in features.columns and 'mark_increase' in features.columns
    if predictor is None:
        predictor = StudentImprovementPredictor()

    if retrain and has_targets and len(features) > 20:
        predictor.fit(features)
    elif getattr(predictor, 'feature_cols', None) is None:
        # Operational default: no usable explicit targets and no fitted model,
        # so fit on a proxy target derived from history.
        feats = features.copy()
        # Remaining sessions are approximated by the number of suggested tasks
        # (4 when that information is unavailable). This is kept in a local
        # variable rather than a column so it does not become a training
        # feature that is missing at prediction time.
        if 'n_tasks' in feats.columns:
            sessions_remaining = feats['n_tasks'].fillna(4)
        else:
            sessions_remaining = 4
        feats['mark_increase'] = feats['improvement_per_session'] * sessions_remaining
        feats['will_improve'] = (feats['mark_increase'] > 0).astype(int)
        predictor.fit(feats)

    # 4. produce horizon predictions
    preds = horizon_predictions(features, predictor, timelines=timelines)

    # 5. Format subject-wise and mark-wise predictions
    preds_subject_wise = preds.copy()
    # Convert probabilities to labels for convenience
    preds_subject_wise['will_improve_label'] = (preds_subject_wise['improve_prob'] >= 0.5).astype(int)

    return preds_subject_wise

# --------------------------- Example of call (do NOT run without providing data) ---------------------------
# preds = run_prediction_pipeline(student_history, final_suggestions, student_attributes)
# print(preds.head())

# --------------------------- Save/Export helpers ---------------------------

def export_predictions_to_csv(predictions_df, path):
    """
    Write ``predictions_df`` to ``path`` as CSV, omitting the index column.

    Returns ``path`` unchanged so the call can be chained or logged.
    """
    destination = path
    predictions_df.to_csv(path_or_buf=destination, index=False)
    return destination


# --------------------------- End of Notebook ---------------------------
# Provide this notebook with your operational dataframes and call run_prediction_pipeline(...) to get predictions.
