In [None]:
# --- Cell 1: Environment Setup ---
# Notebook-only cell: lines starting with `!` are shell commands executed by the
# notebook runtime, not Python statements.
print("‚öôÔ∏è Installing Dependencies (This takes ~5-8 mins for vLLM)...")
# Uninstall standard torch to allow vLLM to install its specific version if needed
# NOTE(review): `torch` is imported further down, so this relies on the vllm
# install pulling a torch build back in -- confirm on a fresh runtime.
!pip uninstall -y torch
!pip install -q vllm
# Modelling / retrieval stack, upgraded to latest (-U) after vLLM is in place.
!pip install -q -U transformers accelerate sentence-transformers faiss-cpu json_repair textstat
# Data handling, classic ML and plotting libraries.
!pip install -q pandas numpy scikit-learn matplotlib seaborn

# Re-import essential libraries
import pandas as pd
import numpy as np
import faiss
import json
import textstat
import os
import gc
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score, mean_absolute_error
from sentence_transformers import SentenceTransformer
from json_repair import repair_json
from tqdm.auto import tqdm
from google.colab import files

# Check GPU
# Reaching this point means every import above succeeded.
print(f"‚úÖ Environment Ready.")
# Shell command (notebook magic): prints the NVIDIA driver/GPU status table.
!nvidia-smi

# --- Cell 2: SMART CONTROL PANEL ---

# 1. USER SELECTION
# Which essay set to run (keys of ESSAY_CONFIGS below) and an optional cap on
# the number of test essays; None means the whole test split is processed.
SELECTED_ESSAY_SET = 6  # <--- CHANGE THIS (1-8)
SAMPLE_SIZE = None      # Set to None for FULL RUN

# 2. CONFIG DATABASE
# One row per essay set: (set id, per-essay character limit used when building
# prompts, max generated tokens, vLLM batch size, essay genre label).
ESSAY_CONFIGS = {
    set_id: dict(ctx=ctx, tok=tok, batch=batch, type=genre)
    for set_id, ctx, tok, batch, genre in (
        (1, 3000, 200, 8, 'Persuasive'),
        (2, 3000, 200, 8, 'Persuasive'),
        (3, 1500, 150, 16, 'Source-Based'),
        (4, 1500, 150, 16, 'Source-Based'),
        (5, 1500, 150, 16, 'Source-Based'),
        (6, 1500, 150, 16, 'Source-Based'),
        (7, 2000, 200, 10, 'Narrative'),
        (8, 6000, 300, 2, 'Narrative'),
    )
}

# 3. AUTO-CONFIGURE
# Pick the profile for the selected set and echo it so the run log records it.
cfg = ESSAY_CONFIGS[SELECTED_ESSAY_SET]
print(f"üîß CONFIGURING FOR SET {SELECTED_ESSAY_SET} ({cfg['type']})")
print(f"   -> Context: {cfg['ctx']} | Batch: {cfg['batch']} | Tokens: {cfg['tok']}")

# Single settings dictionary read by every later cell.
CONTROL_PANEL = dict(
    essay_set=SELECTED_ESSAY_SET,
    target_col='domain1_score',
    pilot_sample_size=SAMPLE_SIZE,
    random_state=42,
    checkpoint_path=f"thesis_checkpoint_set{SELECTED_ESSAY_SET}.csv",

    # vLLM Settings (Dynamic) -- taken from the per-set profile above
    llm_model="casperhansen/deepseek-r1-distill-llama-8b-awq",
    max_new_tokens=cfg['tok'],
    context_char_limit=cfg['ctx'],
    vllm_batch_size=cfg['batch'],
    gpu_memory_utilization=0.7,

    # RAG Settings
    embedding_model='all-MiniLM-L6-v2',
    top_k_numeric=5,
)

# --- Cell 3: Data Loading ---
def load_and_prep_data(path=None):
    """Load the essay table, keep the selected essay set, and split train/test.

    Args:
        path: Optional path to the dataset. When ``None`` the Colab upload
            dialog is shown and the first uploaded file is used. The file
            format is chosen from the (case-insensitive) extension:
            ``.xlsx`` -> Excel, ``.tsv`` -> tab-separated latin-1 text,
            anything else -> CSV.

    Returns:
        ``(train_df, test_df)``: an 80/20 split (seeded with
        ``CONTROL_PANEL['random_state']``) containing the columns
        ``essay_id``, ``essay`` and ``CONTROL_PANEL['target_col']``, each with
        a fresh 0..n-1 index.

    Raises:
        ValueError: If nothing was uploaded, or no rows remain for the
            selected essay set after dropping missing values.
    """
    if path is None:
        print("üìÇ Upload your dataset (ASAP 'training_set_rel3.tsv' or .xlsx)...")
        uploaded = files.upload()
        if not uploaded:
            # Previously surfaced as a bare StopIteration from next(iter(...)).
            raise ValueError("No file was uploaded.")
        fn = next(iter(uploaded))
        ext_probe = fn.lower()
        if ext_probe.endswith('.xlsx'):
            df = pd.read_excel(fn)
        else:
            df = pd.read_csv(fn, encoding='latin-1', sep='\t' if ext_probe.endswith('.tsv') else ',')
    else:
        # Mirror the upload branch so an explicit path to the .tsv / .xlsx
        # dataset is parsed with the right reader; plain CSV paths keep the
        # original default read_csv behaviour.
        ext_probe = str(path).lower()
        if ext_probe.endswith('.xlsx'):
            df = pd.read_excel(path)
        elif ext_probe.endswith('.tsv'):
            df = pd.read_csv(path, encoding='latin-1', sep='\t')
        else:
            df = pd.read_csv(path)

    df = df[df['essay_set'] == CONTROL_PANEL['essay_set']].copy()
    df = df[['essay_id', 'essay', CONTROL_PANEL['target_col']]].dropna()
    if df.empty:
        raise ValueError(
            f"No usable rows for essay_set={CONTROL_PANEL['essay_set']} "
            f"with column '{CONTROL_PANEL['target_col']}'."
        )

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=CONTROL_PANEL['random_state'])
    print(f"‚úÖ Data Prepared: {len(train_df)} Train, {len(test_df)} Test.")
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

def generate_embeddings(train_df, test_df):
    """Encode the ``essay`` column of both splits with the configured model.

    The SentenceTransformer named by ``CONTROL_PANEL['embedding_model']`` is
    instantiated on the CPU and applied to the training essays first, then
    the test essays.

    Returns:
        ``(train_emb, test_emb)`` as NumPy arrays, one row per essay.
    """
    print("üß† Generating Semantic Embeddings...")
    encoder = SentenceTransformer(CONTROL_PANEL['embedding_model'], device='cpu')

    def _embed(frame):
        # Same encode options for both splits.
        return encoder.encode(frame['essay'].tolist(), convert_to_numpy=True, show_progress_bar=True)

    train_vectors = _embed(train_df)
    test_vectors = _embed(test_df)
    return train_vectors, test_vectors

# --- Cell 4: Feature Engineering (Linguistic) ---
def extract_linguistic_features(df):
    """Compute hand-crafted linguistic features for every essay in ``df``.

    The input frame is not modified. Essay text is coerced to ``str`` once up
    front so that every feature sees the same text (previously only some of
    the features coerced, so a non-string cell crashed ``char_count``).

    Args:
        df: DataFrame with an ``essay`` column.

    Returns:
        A float array of shape ``(len(df), 7)`` whose columns are, in order:
        ``char_count``, ``word_count``, ``sentence_count``,
        ``avg_sentence_len``, ``flesch_kincaid``, ``gunning_fog``, ``ttr``.
    """
    print("   -> Extracting Linguistic Features (Readability, Counts)...")
    essays = df['essay'].astype(str)
    feats = pd.DataFrame(index=df.index)

    # 1. Length & Structure
    feats['char_count'] = essays.apply(len)
    feats['word_count'] = essays.apply(lambda text: len(text.split()))
    feats['sentence_count'] = essays.apply(textstat.sentence_count)
    # The +1 in the denominator keeps the ratio finite when no sentence is found.
    feats['avg_sentence_len'] = feats['word_count'] / (feats['sentence_count'] + 1)

    # 2. Complexity / Readability
    feats['flesch_kincaid'] = essays.apply(textstat.flesch_kincaid_grade)
    feats['gunning_fog'] = essays.apply(textstat.gunning_fog)

    # 3. Vocabulary Richness
    def get_ttr(text):
        # Type-token ratio: distinct lower-cased words / total words.
        words = text.lower().split()
        if not words:
            return 0
        return len(set(words)) / len(words)

    feats['ttr'] = essays.apply(get_ttr)

    feature_cols = ['char_count', 'word_count', 'sentence_count', 'avg_sentence_len', 'flesch_kincaid', 'gunning_fog', 'ttr']
    return feats[feature_cols].to_numpy(dtype=float)

# --- Cell 5: EXPERT 1 & 2 (Math Specialists - ENHANCED) ---
def run_math_specialists(test_df, train_df, test_emb, train_emb):
    """Add the two numeric expert features to ``test_df``.

    * ``feat_rag_numeric``: mean training score of the nearest training
      essays (L2 distance over the embeddings) for each test essay.
    * ``feat_rf``: prediction of a random forest trained on the embeddings
      concatenated with the linguistic features.

    Args:
        test_df: Test essays; the two feature columns are written onto it.
        train_df: Training essays with ``CONTROL_PANEL['target_col']``.
        test_emb: Test embeddings, row-aligned with ``test_df``.
        train_emb: Training embeddings, row-aligned with ``train_df``.

    Returns:
        ``(test_df, index)`` where ``index`` is the FAISS index built over the
        training embeddings (reused by the LLM expert).

    Raises:
        ValueError: If there are no training essays to retrieve from.
    """
    print("\n--- Running Math Specialists (RAG + Enhanced RF) ---")

    # 1. Numeric RAG
    train_scores = train_df[CONTROL_PANEL['target_col']].values
    # FAISS operates on contiguous float32 matrices.
    train_vecs = np.ascontiguousarray(train_emb, dtype=np.float32)
    test_vecs = np.ascontiguousarray(test_emb, dtype=np.float32)
    index = faiss.IndexFlatL2(train_vecs.shape[1])
    index.add(train_vecs)

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with label -1, which would silently index the *last* training score.
    k = min(CONTROL_PANEL['top_k_numeric'], len(train_scores))
    if k < 1:
        raise ValueError("Numeric RAG needs at least one training essay and top_k_numeric >= 1.")
    _, neighbours = index.search(test_vecs, k=k)
    test_df['feat_rag_numeric'] = train_scores[neighbours].mean(axis=1)

    # 2. Enhanced Random Forest
    print("   -> Extracting Features & Training RF...")
    train_feats = extract_linguistic_features(train_df)
    test_feats = extract_linguistic_features(test_df)

    X_train = np.hstack([train_emb, train_feats])
    X_test = np.hstack([test_emb, test_feats])

    # Seed comes from the control panel so one setting governs the whole run.
    rf = RandomForestRegressor(n_estimators=100, random_state=CONTROL_PANEL['random_state'])
    rf.fit(X_train, train_scores)
    test_df['feat_rf'] = rf.predict(X_test)

    return test_df, index

# --- Cell 6: EXPERT 3 (DeepSeek via vLLM) ---

def _build_delta_prompt(ref_text, ref_score, student_text, limit):
    """Return the comparison prompt for one (reference, student) essay pair.

    Both essays are truncated to ``limit`` characters.
    """
    # NOTE(review): the <|user|>/<|end|>/<|assistant|> markers are hard-coded;
    # confirm they match the chat template of CONTROL_PANEL['llm_model'].
    return f"""<|user|>
Reference Essay (Score: {ref_score}):
"{str(ref_text)[:limit]}"

Student Essay (Target):
"{str(student_text)[:limit]}"

Compare the Student Essay to the Reference Essay.
Think step-by-step about the differences in quality.
Is the Student Essay better, worse, or equal?
Output the Final Score.

Format:
<think>
... reasoning ...
</think>
JSON: {{"score": int}}
<|end|>
<|assistant|>"""


def _parse_llm_score(generated_text, fallback_score):
    """Extract the ``score`` field from raw model output, else the fallback.

    Only the text after the last ``</think>`` marker (if any) is parsed.
    """
    answer = generated_text.split("</think>")[-1]
    try:
        data = repair_json(answer, return_objects=True)
        if isinstance(data, list):
            data = data[0]
        return float(data.get('score', fallback_score))
    except Exception:
        # Deliberate best-effort: any unparsable answer falls back to the
        # reference score. Unlike a bare `except:`, Ctrl-C still propagates.
        return float(fallback_score)


def run_delta_llm_vllm(test_df, train_df, test_emb, index):
    """Score each test essay with the LLM relative to its nearest training essay.

    For every test essay without a ``feat_delta_llm`` value, the single
    nearest training essay (via ``index``) and its score are placed in a
    prompt together with the student essay; the model's JSON ``score`` is
    stored in ``feat_delta_llm``. Unparsable answers fall back to the
    reference essay's score. Progress is written to
    ``CONTROL_PANEL['checkpoint_path']`` after every batch and reloaded on the
    next call, so an interrupted run resumes where it stopped.

    Args:
        test_df: Test essays with ``essay_id`` and ``essay``. Rows are
            addressed by the integers 0..n-1, so a default RangeIndex is
            assumed, row-aligned with ``test_emb``.
        train_df: Training essays with ``essay`` and the target column.
        test_emb: Test embeddings used for the nearest-neighbour lookup.
        index: FAISS index built over the training embeddings.

    Returns:
        ``test_df`` with the ``feat_delta_llm`` column. Essays whose batch
        failed during generation keep NaN there.
    """
    print("\n--- Running Expert 3: DeepSeek Reasoning (vLLM Engine) ---")

    target_col = CONTROL_PANEL['target_col']
    checkpoint_file = CONTROL_PANEL['checkpoint_path']

    # 1. Checkpoint Logic
    if os.path.exists(checkpoint_file):
        print("   -> Loading checkpoint...")
        saved_df = pd.read_csv(checkpoint_file)
        if {'essay_id', 'feat_delta_llm'} <= set(saved_df.columns):
            # One row per essay_id, otherwise the left merge would duplicate
            # test rows and break the positional alignment with test_emb.
            saved_scores = saved_df[['essay_id', 'feat_delta_llm']].drop_duplicates(subset='essay_id', keep='last')
            test_df = test_df.merge(saved_scores, on='essay_id', how='left', suffixes=('', '_saved'))
            if 'feat_delta_llm_saved' in test_df.columns:
                test_df['feat_delta_llm'] = test_df['feat_delta_llm'].fillna(test_df['feat_delta_llm_saved'])
                test_df.drop(columns=['feat_delta_llm_saved'], inplace=True)
        else:
            print("   -> Checkpoint is missing expected columns; ignoring it.")

    if 'feat_delta_llm' not in test_df.columns:
        test_df['feat_delta_llm'] = np.nan

    # 2. Identify Work
    indices_to_process = [i for i in range(len(test_df)) if pd.isna(test_df.at[i, 'feat_delta_llm'])]
    print(f"   -> Processing {len(indices_to_process)} essays.")

    # Nothing left to score: return before paying for the model load.
    if not indices_to_process:
        return test_df

    # 3. Initialize vLLM (only now that there is work to do)
    from vllm import LLM, SamplingParams

    print(f"   -> Loading vLLM Engine: {CONTROL_PANEL['llm_model']}")
    llm = LLM(
        model=CONTROL_PANEL['llm_model'],
        quantization="awq",
        dtype="half",
        gpu_memory_utilization=CONTROL_PANEL['gpu_memory_utilization'],
        max_model_len=8192,           # Increased max_len to support Set 8 context
        trust_remote_code=True,
        enforce_eager=True
    )

    # Greedy decoding.
    # NOTE(review): the prompt asks for step-by-step reasoning before the JSON
    # while max_tokens is 150-300 in the visible config -- confirm answers are
    # not cut off before the JSON line.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=CONTROL_PANEL['max_new_tokens']
    )

    try:
        _, nearest = index.search(test_emb, k=1)
        limit = CONTROL_PANEL['context_char_limit']
        batch_size = CONTROL_PANEL['vllm_batch_size']

        # 4. Batch Processing Loop
        with tqdm(total=len(indices_to_process)) as pbar:
            for start in range(0, len(indices_to_process), batch_size):
                batch_idx = indices_to_process[start : start + batch_size]

                # Prepare Batch Prompts (reference score kept for the fallback)
                prompts = []
                ref_scores = []
                for idx in batch_idx:
                    ref_row = train_df.iloc[nearest[idx][0]]
                    ref_scores.append(ref_row[target_col])
                    prompts.append(
                        _build_delta_prompt(ref_row['essay'], ref_row[target_col], test_df.at[idx, 'essay'], limit)
                    )

                # Run vLLM Inference
                try:
                    outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
                except Exception as e:
                    # Skip this batch but keep everything scored so far.
                    print(f"Error in batch: {e}")
                    test_df.to_csv(checkpoint_file, index=False)
                    continue

                for idx, ref_score, output in zip(batch_idx, ref_scores, outputs):
                    test_df.at[idx, 'feat_delta_llm'] = _parse_llm_score(output.outputs[0].text, ref_score)

                pbar.update(len(batch_idx))

                # Checkpoint after every batch: the CSV write is negligible
                # next to generation, and the old `i % 10` test fired at a
                # cadence that depended on the batch size by accident.
                test_df.to_csv(checkpoint_file, index=False)
    finally:
        # Runs on success, error and Ctrl-C alike: persist progress, then
        # release the engine and cached GPU memory.
        try:
            test_df.to_csv(checkpoint_file, index=False)
        finally:
            del llm
            gc.collect()
            torch.cuda.empty_cache()

    return test_df

# --- Cell 7: DUAL STACKING ---
def run_dual_stacking(df):
    """Blend the three expert features into ``pred_final`` with two meta-models.

    A linear regression and a shallow random forest are trained on
    ``feat_rag_numeric``, ``feat_rf`` and ``feat_delta_llm``; ``pred_final``
    is the average of their predictions.

    The predictions are produced out-of-fold (K-fold cross-validation), so no
    row is predicted by a model that saw that row's label. Fitting and
    predicting on the same rows, as before, yields in-sample predictions and
    inflated downstream metrics.

    Args:
        df: Frame holding the three feature columns and
            ``CONTROL_PANEL['target_col']``. Missing ``feat_delta_llm`` values
            are filled from ``feat_rf`` in place, and ``pred_final`` is added.

    Returns:
        ``(df, coef)`` where ``coef`` holds the linear meta-model's weights
        (fit on all rows, for reporting) in the order RAG, RF, LLM.
    """
    # Local import: the file's top-level imports only bring in train_test_split.
    from sklearn.model_selection import KFold, cross_val_predict

    print("\n--- Final Dual Stacking ---")
    df['feat_delta_llm'] = df['feat_delta_llm'].fillna(df['feat_rf'])
    X = df[['feat_rag_numeric', 'feat_rf', 'feat_delta_llm']]
    y = df[CONTROL_PANEL['target_col']]

    meta_lin = LinearRegression()
    meta_rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)

    n_splits = min(5, len(df))
    if n_splits >= 2:
        # Same seeded splitter for both learners so their folds line up.
        folds = KFold(n_splits=n_splits, shuffle=True, random_state=CONTROL_PANEL['random_state'])
        pred_lin = cross_val_predict(meta_lin, X, y, cv=folds)
        pred_rf = cross_val_predict(meta_rf, X, y, cv=folds)
        # cross_val_predict fits clones; fit once on everything for the weights.
        meta_lin.fit(X, y)
    else:
        # Too few rows to cross-validate (tiny pilot run): in-sample fallback.
        meta_lin.fit(X, y)
        pred_lin = meta_lin.predict(X)
        meta_rf.fit(X, y)
        pred_rf = meta_rf.predict(X)

    df['pred_final'] = (pred_lin + pred_rf) / 2
    return df, meta_lin.coef_

# --- Cell 8: EXECUTION LOOP ---
# Driver: load data, build the three expert features, stack them, then report,
# plot and download the results.
train_df, test_df = load_and_prep_data()

pilot_n = CONTROL_PANEL['pilot_sample_size']
if pilot_n:
    # Cap at the test-set size: DataFrame.sample raises when n exceeds the
    # number of rows (sampling without replacement).
    pilot_n = min(pilot_n, len(test_df))
    print(f"‚ö†Ô∏è PILOT MODE: Sampling {pilot_n} essays.")
    test_df = test_df.sample(n=pilot_n, random_state=CONTROL_PANEL['random_state']).reset_index(drop=True)
else:
    print("üöÄ FULL RUN: Processing entire Test Set.")

train_emb, test_emb = generate_embeddings(train_df, test_df)

# Specialists (Enhanced RF)
test_df, faiss_index = run_math_specialists(test_df, train_df, test_emb, train_emb)

# DeepSeek (vLLM)
test_df = run_delta_llm_vllm(test_df, train_df, test_emb, faiss_index)

# Stacking
test_df, lin_weights = run_dual_stacking(test_df)

# Evaluation
# QWK is computed on predictions rounded to the nearest integer; MAE on the
# raw predictions.
y_true = test_df[CONTROL_PANEL['target_col']]
metrics = []
for col, name in [
    ('feat_rag_numeric', '1. Numeric RAG'),
    ('feat_rf', '2. Random Forest (Enhanced)'),
    ('feat_delta_llm', '3. DeepSeek Delta'),
    ('pred_final', '4. DUAL ENSEMBLE')
]:
    qwk = cohen_kappa_score(y_true, np.rint(test_df[col]), weights='quadratic')
    mae = mean_absolute_error(y_true, test_df[col])
    metrics.append({"Model": name, "QWK": qwk, "MAE": mae})

metrics_df = pd.DataFrame(metrics)
# Look the ensemble row up by column rather than by its position in the list.
ensemble_qwk = metrics_df.loc[metrics_df["Model"] == '4. DUAL ENSEMBLE', "QWK"].iloc[0]
print("\n" + "="*60)
print(f"üèÜ FINAL RESULTS (SET {CONTROL_PANEL['essay_set']})")
print("="*60)
print(metrics_df.to_string(index=False))
print("-" * 60)
print(f"Linear Weights: RAG={lin_weights[0]:.2f}, RF={lin_weights[1]:.2f}, DS={lin_weights[2]:.2f}")
print("="*60)

# Visualization
# Scatter of human vs. ensemble score, a fitted trend line, and the y = x
# diagonal for reference.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_true, y=test_df['pred_final'], alpha=0.6, color='blue', label='Predictions')
sns.regplot(x=y_true, y=test_df['pred_final'], scatter=False, color='red', label='Trend Line')
plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'k--', label='Perfect Fit')
plt.title(f"Human vs AI Scores (QWK: {ensemble_qwk:.4f})")
plt.xlabel("Human Score")
plt.ylabel("AI Prediction")
plt.legend()
plt.grid(True, alpha=0.3)
# Save before show(): the figure is written while it is still current.
plt.savefig("thesis_correlation_plot.png")
plt.show()

# Download
print("Downloading Final Results...")
test_df.to_csv("FINAL_THESIS_RESULTS.csv", index=False)
files.download("FINAL_THESIS_RESULTS.csv")
files.download("thesis_correlation_plot.png")