In [None]:
# Load the DataFrame from Parquet
import pandas as pd

# Input location of the parquet export (path under /kaggle/input)
parquet_source = "/kaggle/input/parquet-df-audit-opinions"
df_all = pd.read_parquet(parquet_source)
print("Loaded DataFrame with shape:", df_all.shape)

In [None]:
# Raise pandas' row-display cap to 3000 so long frames are rendered in full
pd.options.display.max_rows = 3000

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report,  confusion_matrix

from sklearn.exceptions import UndefinedMetricWarning
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import warnings
import numpy as np

In [None]:
# df_all must already be loaded at this point; it is expected to carry the
# columns ['year', 'text', 'label', 'source', 'llm'].

# Keep only the years used for training/testing: 1998 through 2021, both inclusive
year_in_scope = df_all['year'].between(1998, 2021)
df_all = df_all[year_in_scope].reset_index(drop=True)

In [None]:
# Show df_all as this cell's output (row rendering is limited by display.max_rows)
df_all

In [None]:
# Show only the rows whose 'source' column equals 'original'
df_all.loc[df_all['source'].eq('original')]

In [None]:
# Summary statistics of df_all as produced by pandas' describe() defaults
df_all.describe()

## STACKED MODEL 1 (BEST OF ALL WITH REGARD TO F1 SCORE)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support

# ---------------------------
# Helper: compute F-beta
# ---------------------------
def fbeta_score_custom(y_true, y_pred, beta=2):
    """Return the binary F-beta score of ``y_pred`` measured against ``y_true``.

    Precision and recall are taken from scikit-learn's
    ``precision_recall_fscore_support`` (binary averaging, undefined ratios
    reported as 0) and then combined by hand with the F-beta formula.

    Returns 0 when precision and recall are both zero.
    """
    precision, recall = precision_recall_fscore_support(
        y_true, y_pred, beta=beta, average="binary", zero_division=0
    )[:2]
    if precision + recall == 0:
        return 0
    beta_sq = beta**2
    return (1 + beta_sq) * (precision * recall) / ((beta_sq * precision) + recall)

# ---------------------------
# Validation curve table
# ---------------------------
def validation_curve_table(y_true, scores, beta=2):
    """Find the score cut-off that maximizes F-beta for ``scores >= cut-off``.

    Samples are visited from the highest score to the lowest while running
    TP/FP/FN counts are maintained. A cut-off is only evaluated once every
    sample sharing that score has been counted: the rule
    ``score >= threshold`` cannot separate tied samples, so scoring in the
    middle of a tie group would report an F-beta that no threshold achieves.

    Parameters
    ----------
    y_true : array-like
        0/1 labels, where 1 is the positive class.
    scores : array-like
        One score per sample; higher means "more positive".
    beta : float, default 2
        Weight of recall relative to precision in the F-beta formula.

    Returns
    -------
    (best_thr, best_fbeta)
        The score value to use as an inclusive threshold and the F-beta it
        attains. When several cut-offs tie on F-beta the highest one wins.
        ``(None, -1)`` is returned when there are no samples.
    """
    # Accept lists / pandas Series too; everything below is positional indexing.
    y_true = np.asarray(y_true)
    scores = np.asarray(scores)

    sorted_idx = np.argsort(scores)[::-1]
    y_true_sorted = y_true[sorted_idx]
    scores_sorted = scores[sorted_idx]

    best_thr, best_fbeta = None, -1
    # Before any sample is predicted positive, every true positive is a miss.
    tp, fp, fn = 0, 0, int(np.sum(y_true_sorted))

    n_samples = len(scores_sorted)
    for i in range(n_samples):
        if y_true_sorted[i] == 1:
            tp += 1
            fn -= 1
        else:
            fp += 1

        # Still inside a group of identical scores: keep counting, a threshold
        # at this score would also admit the remaining members of the group.
        if i + 1 < n_samples and scores_sorted[i + 1] == scores_sorted[i]:
            continue

        prec = tp / (tp + fp) if (tp + fp) > 0 else 0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0
        fbeta = (1 + beta**2) * prec * rec / ((beta**2 * prec) + rec) if (prec + rec) > 0 else 0
        if fbeta > best_fbeta:
            best_fbeta = fbeta
            best_thr = scores_sorted[i]
    return best_thr, best_fbeta

# ---------------------------
# Base model trainer with class weights
# ---------------------------
def train_base_model(X_train, y_train, model_type='sgd', param_grid=None, beta=2):
    """Fit one classifier per grid point and keep the one with the best F-beta.

    Every configuration is fitted on ``(X_train, y_train)`` and scored on the
    *same* training data via ``validation_curve_table``; the selection is
    therefore in-sample, not cross-validated.

    Parameters
    ----------
    X_train, y_train
        Training features and 0/1 labels.
    model_type : {'sgd', 'linsvc', 'rbf', 'nb'}
        Which estimator family to build. 'nb' (MultinomialNB) is scored with
        ``predict_proba``; the others with ``decision_function``.
    param_grid : dict or list of dicts, optional
        Passed to ``ParameterGrid``. ``None`` means "a single configuration
        with the estimator defaults".
    beta : float, default 2
        F-beta weight used for the selection.

    Returns
    -------
    (best_model, best_params, best_threshold, best_fbeta)

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.
    """
    if param_grid is None:
        # ParameterGrid rejects None; an empty dict yields exactly one
        # (empty) parameter set, i.e. the estimator defaults.
        param_grid = {}

    best_model, best_params, best_threshold, best_fbeta = None, None, 0.5, -1
    # Loop-invariant: convert the labels once instead of once per grid point.
    y_train_arr = np.array(y_train)

    for params in ParameterGrid(param_grid):
        if model_type == 'sgd':
            clf = SGDClassifier(random_state=42, class_weight='balanced', **params)
        elif model_type == 'linsvc':
            clf = LinearSVC(random_state=42, class_weight='balanced', **params)
        elif model_type == 'rbf':
            clf = SVC(kernel='rbf', probability=False, random_state=42, class_weight='balanced', **params)
        elif model_type == 'nb':
            # MultinomialNB is built without class_weight, unlike the others.
            clf = MultinomialNB(**params)
        else:
            raise ValueError("Unknown model_type")

        clf.fit(X_train, y_train)

        if model_type == 'nb':
            y_scores = clf.predict_proba(X_train)[:, 1]
        else:
            y_scores = clf.decision_function(X_train)
        thr, fbeta = validation_curve_table(y_train_arr, y_scores, beta=beta)

        # Strict '>' keeps the first configuration when several tie.
        if fbeta > best_fbeta:
            best_fbeta, best_model, best_params, best_threshold = fbeta, clf, params, thr

    return best_model, best_params, best_threshold, best_fbeta

# ---------------------------
# Stacked evaluation per year
# ---------------------------
def evaluate_year_stacked(df, test_year, beta=2):
    """Train the stacked model on the four preceding years and score ``test_year``.

    Pipeline:
      1. Rows from years ``test_year-4 .. test_year-1`` are split 75/25
         (stratified on 'label') into a training and a validation part.
      2. TF-IDF (1-2 grams, 30000 features) is fitted on the training text.
      3. Four base models (sgd, linsvc, rbf, nb) are selected by
         ``train_base_model`` on the training part.
      4. A logistic-regression meta-learner is fitted on the base-model
         scores of the validation part, and its decision threshold is chosen
         to maximize F-beta on that same validation part.
      5. The 'original' rows of ``test_year`` are predicted.

    Parameters
    ----------
    df : DataFrame with columns 'year', 'text', 'label', 'source', 'firm_id'.
    test_year : int, the year to predict.
    beta : float, default 2. F-beta weight used both for base-model selection
        and for the meta threshold search.

    Returns
    -------
    DataFrame with columns 'year', 'text', 'firm_id', 'true_label',
    'pred_label' for the test rows.
    """
    train_years = list(range(test_year - 4, test_year))
    df_trainval = df[df['year'].isin(train_years)].copy()

    # Keep the original validation split using train_test_split
    df_train, df_val = train_test_split(
        df_trainval, test_size=0.25, stratify=df_trainval['label'], random_state=42
    )

    df_test = df[(df['year'] == test_year) & (df['source'] == 'original')].copy()

    # Training rows are ordered original-first, synthetic-second.
    df_train_orig = df_train[df_train['source'] == 'original']
    df_train_syn = df_train[df_train['source'] != 'original']
    train_text = pd.concat([df_train_orig['text'], df_train_syn['text']])
    y_train_vec = pd.concat([df_train_orig['label'], df_train_syn['label']])

    # Fit and transform the training text in a single pass.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=30000)  # Increased from 10000
    X_train_vec = vectorizer.fit_transform(train_text)

    X_val_vec = vectorizer.transform(df_val['text'])
    y_val_vec = df_val['label'].values
    X_test_vec = vectorizer.transform(df_test['text'])
    y_test_vec = df_test['label'].values

    # Base model grids with less regularization for more aggressive approach
    param_grid_sgd = {'loss': ['hinge', 'log_loss'], 'alpha': [1e-4, 1e-3], 'penalty': ['l2', 'l1']}
    param_grid_linsvc = {'C': [0.1, 0.5, 1.0]}  # Higher C values for less regularization
    param_grid_rbf = {'C': [0.5, 1.0], 'gamma': ['scale']}  # Added back RBF SVM
    param_grid_nb = {'alpha': [0.5, 1.0, 2.0]}

    # Train base models
    base_models = {}
    for name, grid in [('sgd', param_grid_sgd), ('linsvc', param_grid_linsvc), ('rbf', param_grid_rbf), ('nb', param_grid_nb)]:
        print(f"[{test_year}] Processing base model: {name}")
        model, best_params, best_thr, best_fbeta = train_base_model(
            X_train_vec, y_train_vec, model_type=name, param_grid=grid, beta=beta
        )
        base_models[name] = (model, best_thr)
        # NOTE: train_base_model scores on the training data itself, so the
        # value labelled "OOF" here is an in-sample figure.
        print(f"[{test_year}] -> base {name} chosen setting: {best_params}, OOF-F{beta}(approx)={best_fbeta:.3f}")

    # Meta features: one column of raw scores per base model
    def get_decision_matrix(models, X):
        meta_features = []
        for name, (model, _thr) in models.items():
            if name == 'nb':
                scores = model.predict_proba(X)[:, 1]
            else:
                scores = model.decision_function(X)
            meta_features.append(scores.reshape(-1, 1))
        return np.hstack(meta_features)

    X_meta_val = get_decision_matrix(base_models, X_val_vec)
    X_meta_test = get_decision_matrix(base_models, X_test_vec)

    # Train meta model
    meta_clf = LogisticRegression(
        random_state=42,
        max_iter=200,
        class_weight='balanced',
        C=0.5,
        penalty='l2'
    )
    meta_clf.fit(X_meta_val, y_val_vec)

    # Tune the meta threshold on the validation part using the requested beta.
    # The search runs on the rows the meta model was fitted on, so the
    # reported value is optimistic.
    val_scores = meta_clf.predict_proba(X_meta_val)[:, 1]
    thresholds = np.linspace(0.2, 0.8, 61)
    best_fbeta, best_thr = -1, 0.5
    for thr in thresholds:
        y_pred_val = (val_scores >= thr).astype(int)
        fbeta_val = fbeta_score_custom(y_val_vec, y_pred_val, beta=beta)
        if fbeta_val > best_fbeta:
            best_fbeta, best_thr = fbeta_val, thr
    print(f"[{test_year}] Meta OOF best threshold={best_thr:.4f}, OOF-F{beta}={best_fbeta:.3f}")

    # Predict test
    test_scores = meta_clf.predict_proba(X_meta_test)[:, 1]
    y_pred_test = (test_scores >= best_thr).astype(int)

    tn, fp, fn, tp = confusion_matrix(y_test_vec, y_pred_test, labels=[0, 1]).ravel()
    # zero_division=0 keeps the same 0.0 result for years without positives
    # but avoids the UndefinedMetricWarning.
    f1 = f1_score(y_test_vec, y_pred_test, zero_division=0)

    print(f"\n=== Year {test_year} Results ===")
    print(f"TP={tp}, FP={fp}, FN={fn}, F1={f1:.3f}")
    print(f"[{test_year}] Stored {len(df_test)} test rows, predicted 1s: {int(np.sum(y_pred_test))}\n")

    df_test_result = df_test[['year', 'text', 'firm_id']].copy()  # Include firm_id
    df_test_result['true_label'] = y_test_vec
    df_test_result['pred_label'] = y_pred_test

    return df_test_result

# ---------------------------
# Iterate all years with cumulative metrics
# ---------------------------
# Per-year metric rows, per-year prediction frames, and running TP/FP/FN totals
all_years_results = []
df_all_predicted = []
cumulative_metrics = {'TP': 0, 'FP': 0, 'FN': 0}

for test_year in range(2000, 2022):
    print(f"\n########### START YEAR {test_year} ###########")
    result = evaluate_year_stacked(df_all, test_year, beta=2)

    df_all_predicted.append(result)

    tn, fp, fn, tp = confusion_matrix(result['true_label'], result['pred_label'], labels=[0, 1]).ravel()
    # zero_division=0 gives the same 0.0 for years without positives, minus the warning
    f1 = f1_score(result['true_label'], result['pred_label'], zero_division=0)
    all_years_results.append({'year': test_year, 'TP': tp, 'FP': fp, 'FN': fn, 'F1': f1})

    # Update cumulative metrics
    cumulative_metrics['TP'] += tp
    cumulative_metrics['FP'] += fp
    cumulative_metrics['FN'] += fn

    # Cumulative (micro) F1 = 2TP / (2TP + FP + FN), defined as 0 when nothing was counted
    cumulative_denominator = 2 * cumulative_metrics['TP'] + cumulative_metrics['FP'] + cumulative_metrics['FN']
    cumulative_F1 = (2 * cumulative_metrics['TP']) / cumulative_denominator if cumulative_denominator > 0 else 0

    print(f"=== End YEAR {test_year} ===")
    print(f"Year {test_year}: TP={tp}, FP={fp}, FN={fn}, F1={f1:.3f}")
    print(f"Cumulative (2000-{test_year}): TP={cumulative_metrics['TP']}, FP={cumulative_metrics['FP']}, FN={cumulative_metrics['FN']}, F1={cumulative_F1:.3f}")
    print(f"########### END YEAR {test_year} ###########\n")

# Combine predictions
df_all_predicted = pd.concat(df_all_predicted, ignore_index=True)

# Overall metrics (same zero-denominator guard as the cumulative figure above)
overall_TP = sum(r['TP'] for r in all_years_results)
overall_FP = sum(r['FP'] for r in all_years_results)
overall_FN = sum(r['FN'] for r in all_years_results)
overall_denominator = 2 * overall_TP + overall_FP + overall_FN
overall_F1 = (2 * overall_TP) / overall_denominator if overall_denominator > 0 else 0

print("\n=== Overall Summary across all years ===")
print(f"Overall TP={overall_TP}, FP={overall_FP}, FN={overall_FN}, F1={overall_F1:.3f}")

# Store predicted 1s with firm_id
df_predicted_1s = df_all_predicted[df_all_predicted['pred_label'] == 1][['year', 'firm_id', 'text', 'true_label']]
print(f"\nTotal predicted 1s across all years: {len(df_predicted_1s)}")

In [None]:
import pickle

# --- Persist the predicted-positive rows as a pickle file ---
pickle_target = "df_predicted_1s.pkl"
with open(pickle_target, "wb") as pickle_handle:
    pickle.dump(df_predicted_1s, pickle_handle)

print("✅ df_predicted_1s saved to df_predicted_1s.pkl")



In [None]:
# Write the predicted-positive rows to CSV (UTF-8, without the index column)
csv_target = "df_predicted_1s.csv"
df_predicted_1s.to_csv(csv_target, encoding='utf-8', index=False)
print("✅ df_predicted_1s saved to df_predicted_1s.csv")

## MODEL 2 (SECOND BEST): SINGLE LOGISTIC REGRESSION, NOT STACKED

In [None]:
# =====================================================
# Document Classification Pipeline (Yearly Rolling)
# =====================================================
# Requirements:
#   pip install scikit-learn pandas numpy tqdm
#
# Expected DataFrame: df_all with columns:
#   - 'year' (int)
#   - 'text' (str)
#   - 'label' (0/1)
#   - 'source' ('original' or 'synthetic')
#   - 'firm_id' (copied into the per-year prediction output)
# =====================================================

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, fbeta_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

SEED = 42
np.random.seed(SEED)

# =====================================================
# Helper Functions
# =====================================================

def choose_threshold_by_f2(y_true, scores, beta=2):
    """
    Pick the probability cut-off with the highest F-beta.

    Fifty evenly spaced thresholds between 0.05 and 0.95 are tried; a sample
    is predicted positive when its score is >= the threshold. When several
    thresholds reach the same F-beta, the lowest of them is returned.

    Returns (best_threshold, best_fbeta).
    """
    candidates = np.linspace(0.05, 0.95, 50)
    fbeta_per_candidate = [
        fbeta_score(y_true, (scores >= cut).astype(int), beta=beta, zero_division=0)
        for cut in candidates
    ]
    # argmax returns the first maximum, i.e. the lowest winning threshold
    winner = int(np.argmax(fbeta_per_candidate))
    return candidates[winner], fbeta_per_candidate[winner]


def evaluate_year(df, test_year, beta=2, ngram_range=(1,2), max_features=20000):
    """
    Rolling evaluation of a single logistic regression for one test year.

    Training uses years t-5 .. t-2, the decision threshold is tuned on year
    t-1, and the 'original' rows of year t are predicted.

    Returns None (after printing a notice) when any of the three partitions is
    empty; otherwise a tuple ``(predictions_df, metrics_dict)`` where the
    frame holds 'year', 'text', 'firm_id', 'true_label', 'pred_label', 'score'
    and the dict holds 'year', 'TP', 'FP', 'FN' and ``f"F{beta}"``.
    """
    val_year = test_year - 1
    train_years = list(range(test_year - 5, val_year))

    year_col = df["year"]
    train_part = df[year_col.isin(train_years)]
    val_part = df[year_col == val_year]
    test_part = df[(year_col == test_year) & (df["source"] == "original")]
    partitions = (train_part, val_part, test_part)

    if any(part.empty for part in partitions):
        print(f"[{test_year}] Skipped due to insufficient data.")
        return None

    # Vocabulary/IDF are learned from train+val text only; test text is never seen in fitting
    tfidf = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    tfidf.fit(pd.concat([train_part["text"], val_part["text"]]))

    X_train, X_val, X_test = (tfidf.transform(part["text"]) for part in partitions)
    y_train, y_val, y_test = (part["label"].values for part in partitions)

    # Class-balanced logistic regression
    logreg_options = dict(
        solver="saga",
        max_iter=200,
        class_weight="balanced",
        C=1.0,
        random_state=SEED,
    )
    clf = LogisticRegression(**logreg_options)
    clf.fit(X_train, y_train)

    # Threshold is chosen on the validation year only
    thr, best_f2 = choose_threshold_by_f2(y_val, clf.predict_proba(X_val)[:, 1], beta=beta)

    # Score and label the test year
    test_scores = clf.predict_proba(X_test)[:, 1]
    y_pred = (test_scores >= thr).astype(int)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    f2 = fbeta_score(y_test, y_pred, beta=beta, zero_division=0)

    print(f"\n=== Year {test_year} Results ===")
    print(f"Threshold={thr:.3f}, Val-F{beta}={best_f2:.3f}, Test-F{beta}={f2:.3f}")
    print(f"TP={tp}, FP={fp}, FN={fn}, Total Test={len(test_part)}")

    df_test_result = test_part[["year", "text","firm_id"]].copy().assign(
        true_label=y_test,
        pred_label=y_pred,
        score=test_scores,
    )
    metrics = {"year": test_year, "TP": tp, "FP": fp, "FN": fn, f"F{beta}": f2}

    return df_test_result, metrics


# =====================================================
# Execution Loop
# =====================================================

all_results = []
df_all_preds = []
# Running totals replace rebuilding a metrics DataFrame on every iteration
cum_TP = cum_FP = cum_FN = 0

for year in tqdm(range(2000, 2022), desc="Processing years"):
    res = evaluate_year(df_all, year, beta=2)
    if res is not None:
        df_preds, metrics = res
        df_all_preds.append(df_preds)
        all_results.append(metrics)

        # --- Cumulative metrics up to this year ---
        cum_TP += metrics["TP"]
        cum_FP += metrics["FP"]
        cum_FN += metrics["FN"]
        cum_F1 = (2 * cum_TP) / (2 * cum_TP + cum_FN + cum_FP) if (cum_TP + cum_FP + cum_FN) > 0 else 0

        print(f"\n--- Cumulative up to {year} ---")
        print(f"Total TP={cum_TP}, FP={cum_FP}, FN={cum_FN}, Cum-F1={cum_F1:.3f}")


# =====================================================
# Final Summary
# =====================================================

if df_all_preds:
    df_all_preds = pd.concat(df_all_preds, ignore_index=True)
else:
    # Same columns evaluate_year produces, so downstream selections still work
    df_all_preds = pd.DataFrame(columns=["year", "text", "firm_id", "true_label", "pred_label", "score"])

# Explicit columns keep df_metrics["TP"] etc. valid even when every year was skipped
df_metrics = pd.DataFrame(all_results, columns=["year", "TP", "FP", "FN", "F2"])

overall_TP = df_metrics["TP"].sum()
overall_FP = df_metrics["FP"].sum()
overall_FN = df_metrics["FN"].sum()
# Micro F2 = 5TP / (5TP + 4FN + FP), defined as 0 when nothing was counted
overall_F2 = (5 * overall_TP) / (5 * overall_TP + 4 * overall_FN + overall_FP) if (overall_TP + overall_FP + overall_FN) > 0 else 0

df_predicted_1s=df_all_preds[df_all_preds['pred_label']==1]

print("\n=== Overall Summary ===")
print(f"Overall TP={overall_TP}, FP={overall_FP}, FN={overall_FN}, F2={overall_F2:.3f}")

# Save outputs if needed
# df_all_preds.to_pickle("df_all_preds.pkl")
# df_metrics.to_csv("per_year_metrics.csv", index=False)


In [None]:
# --- Save results ---
import pickle

# Save as pickle
with open("df_predicted_1s_2nd_best.pkl", "wb") as f:
    pickle.dump(df_predicted_1s, f)
print("✅ df_predicted_1s_2nd_best saved to df_predicted_1s_2nd_best.pkl")

# Save as CSV (file name now matches the message below; it previously ended in ".csv.csv")
df_predicted_1s.to_csv("df_predicted_1s_2nd_best.csv", index=False, encoding='utf-8')
print("✅ df_predicted_1s_2nd_best saved to df_predicted_1s_2nd_best.csv")

## STACKED MODEL 3 (THIRD BEST)

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import fbeta_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42

# ------------------------------
# TF-IDF Vectorizer (train-only fitting)
# ------------------------------
def get_vectorizer(max_features=60000, ngram_range=(1,3)):
    """Build an unfitted word-level TF-IDF vectorizer.

    Text is lower-cased and scikit-learn's built-in English stop-word list is
    applied; ``max_features`` and ``ngram_range`` are forwarded unchanged.
    """
    tfidf_options = {
        "analyzer": "word",
        "lowercase": True,
        "stop_words": "english",
        "ngram_range": ngram_range,
        "max_features": max_features,
    }
    return TfidfVectorizer(**tfidf_options)

# ------------------------------
# Base models (aggressive, recall-focused)
# ------------------------------
def get_base_models():
    """Return fresh, unfitted base estimators keyed by name.

    Keys, in order: 'logreg' (elastic-net logistic regression), 'linsvc'
    (LinearSVC wrapped in 3-fold CalibratedClassifierCV so that
    ``predict_proba`` is available) and 'sgd' (L1-penalized log-loss SGD).
    All use balanced class weights and ``RANDOM_STATE``.
    """
    shared = {"class_weight": "balanced", "random_state": RANDOM_STATE}

    logreg = LogisticRegression(
        penalty="elasticnet", solver="saga", l1_ratio=0.7,
        C=0.5, max_iter=3000, **shared
    )
    svc_core = LinearSVC(C=0.5, **shared)
    linsvc = CalibratedClassifierCV(svc_core, cv=3)
    sgd = SGDClassifier(loss="log_loss", alpha=1e-6, penalty="l1", **shared)

    return {"logreg": logreg, "linsvc": linsvc, "sgd": sgd}

# ------------------------------
# Threshold tuning (validation only)
# ------------------------------
def pick_threshold(y_val, val_scores, beta=2.0):
    """Return ``(threshold, fbeta)`` for the cut-off maximizing F-beta.

    Thresholds 0.01, 0.02, ..., 0.99 are tried with the rule
    ``score >= threshold``; on ties the lowest threshold wins.
    """
    grid = np.linspace(0.01, 0.99, 99)
    scored = [
        (fbeta_score(y_val, (val_scores >= cut).astype(int), beta=beta, zero_division=0), cut)
        for cut in grid
    ]
    # max() keeps the first maximal entry, i.e. the lowest winning threshold
    top_fbeta, top_cut = max(scored, key=lambda pair: pair[0])
    return top_cut, top_fbeta

# ------------------------------
# Evaluate a single year
# ------------------------------
def evaluate_year(df, test_year):
    """Fit the stacked ensemble on all earlier years and score ``test_year``.

    All rows with ``year < test_year`` are split 80/20 (stratified on 'label')
    into a training and a validation part. Base models are fitted on the
    training part; a logistic-regression meta-learner is fitted on their
    validation probabilities, and the decision threshold is tuned for F2 on
    those same validation rows. Only 'original' rows of ``test_year`` are
    predicted.

    Returns ``(None, None)`` when there is no earlier data or no test rows;
    otherwise ``(test_frame, metrics)`` where ``test_frame`` is the test rows
    plus 'pred_score', 'pred_label', 'true_label', 'used_threshold'.
    """
    history = df[df["year"] < test_year]
    holdout = df[(df["year"] == test_year) & (df["source"] == "original")]

    if history.empty or holdout.empty:
        return None, None

    # Stratified train/validation split of everything before the test year
    fit_part, val_part = train_test_split(
        history, test_size=0.2,
        stratify=history["label"], random_state=RANDOM_STATE
    )

    # TF-IDF vocabulary is learned from the training part only
    tfidf = get_vectorizer()
    X_train = tfidf.fit_transform(fit_part["text"])
    X_val = tfidf.transform(val_part["text"])
    X_test = tfidf.transform(holdout["text"])

    y_train = fit_part["label"].values
    y_val = val_part["label"].values
    y_test = holdout["label"].values

    # One probability column per base model, for validation and for test
    val_columns, test_columns = [], []
    for base in get_base_models().values():
        base.fit(X_train, y_train)
        val_columns.append(base.predict_proba(X_val)[:, 1])
        test_columns.append(base.predict_proba(X_test)[:, 1])

    # Meta-learner on the stacked base-model probabilities
    X_val_stack = np.vstack(val_columns).T
    X_test_stack = np.vstack(test_columns).T
    meta = LogisticRegression(solver="liblinear", class_weight="balanced", random_state=RANDOM_STATE)
    meta.fit(X_val_stack, y_val)
    val_scores = meta.predict_proba(X_val_stack)[:, 1]
    test_scores = meta.predict_proba(X_test_stack)[:, 1]

    # Threshold comes from the validation part only, then is applied to test
    used_thr, best_f2 = pick_threshold(y_val, val_scores, beta=2.0)
    y_pred_test = (test_scores >= used_thr).astype(int)

    # Confusion counts
    predicted_pos = y_pred_test == 1
    actual_pos = y_test == 1
    TP = (predicted_pos & actual_pos).sum()
    FP = (predicted_pos & (y_test == 0)).sum()
    FN = ((y_pred_test == 0) & actual_pos).sum()

    prec = precision_score(y_test, y_pred_test, zero_division=0)
    rec = recall_score(y_test, y_pred_test, zero_division=0)
    f1 = fbeta_score(y_test, y_pred_test, beta=1.0, zero_division=0)

    print(f"=== {test_year} Results ===")
    print(f"TP={TP}, FP={FP}, FN={FN}, Prec={prec:.3f}, Rec={rec:.3f}, F1={f1:.3f}, thr={used_thr:.3f}")

    metrics = {
        "year": test_year,
        "TP": TP, "FP": FP, "FN": FN,
        "Precision": prec, "Recall": rec, "F1": f1, "thr": used_thr
    }
    scored_test = holdout.assign(
        pred_score=test_scores,
        pred_label=y_pred_test,
        true_label=y_test,
        used_threshold=used_thr,
    )
    return scored_test, metrics

# ------------------------------
# Run multiple years with cumulative metrics
# ------------------------------
def run_all_years(df, start_year, end_year):
    """Run ``evaluate_year`` for every year in ``[start_year, end_year]``.

    Years for which ``evaluate_year`` returns ``(None, None)`` are skipped.
    After each evaluated year the cumulative micro-F1 is printed, and an
    overall summary is printed at the end.

    Returns
    -------
    (predictions, metrics)
        ``predictions`` is the concatenation of the per-year test frames and
        ``metrics`` a DataFrame with one row per evaluated year. When every
        year is skipped, both are empty; ``predictions`` then still carries
        the columns of ``df`` plus the four prediction columns, so callers
        can select from it without errors.
    """
    preds_all, metrics_all = [], []
    cumulative = {"TP": 0, "FP": 0, "FN": 0}

    for year in range(start_year, end_year + 1):
        print(f"\n#################### START YEAR {year} ####################")
        df_out, metrics = evaluate_year(df, year)
        if df_out is not None:
            preds_all.append(df_out)
            metrics_all.append(metrics)

            # Update cumulative metrics
            cumulative["TP"] += metrics["TP"]
            cumulative["FP"] += metrics["FP"]
            cumulative["FN"] += metrics["FN"]

            cum_denominator = 2 * cumulative["TP"] + cumulative["FP"] + cumulative["FN"]
            cum_f1 = (2 * cumulative["TP"]) / cum_denominator if cum_denominator > 0 else 0

            print(f"=== End YEAR {year} === Cumulative F1 ({start_year}-{year}) = {cum_f1:.3f}")
            print(f"#################### END YEAR {year} ####################")

    # Overall summary
    overall_TP = sum(m["TP"] for m in metrics_all)
    overall_FP = sum(m["FP"] for m in metrics_all)
    overall_FN = sum(m["FN"] for m in metrics_all)
    overall_denominator = 2 * overall_TP + overall_FP + overall_FN
    overall_F1 = (2 * overall_TP) / overall_denominator if overall_denominator > 0 else 0

    print(f"\n=== Overall Summary across all years ===")
    print(f"Overall TP={overall_TP}, FP={overall_FP}, FN={overall_FN}, F1={overall_F1:.3f}")

    if preds_all:
        preds_df = pd.concat(preds_all, ignore_index=True)
    else:
        # pd.concat raises on an empty list; return an empty frame with the
        # columns a non-empty run would have (deduplicated, order preserved).
        empty_columns = list(dict.fromkeys(
            [*df.columns, "pred_score", "pred_label", "true_label", "used_threshold"]
        ))
        preds_df = pd.DataFrame(columns=empty_columns)

    return preds_df, pd.DataFrame(metrics_all)

# ------------------------------
# Main
# ------------------------------
if __name__ == "__main__":
    # Expect df_all present in environment with columns: year, firm_id, text, label, source
    try:
        df_all
    except NameError:
        raise RuntimeError("Load df_all first (pandas DataFrame) with columns: year, firm_id, text, label, source")

    # Run pipeline
    preds_df, yearly_metrics = run_all_years(df_all, start_year=2000, end_year=2021)

    # Extract predicted 1s
    df_predicted_1s = preds_df[preds_df['pred_label'] == 1][['year','firm_id','text','true_label','pred_score','used_threshold']]
    print(f"\nTotal predicted 1s across all years: {len(df_predicted_1s)}")

    # Save outputs under a model-specific name: "df_predicted_1s.csv" is the
    # file the Model 1 cell writes, and reusing it here would overwrite those
    # predictions with this model's.
    df_predicted_1s.to_csv("df_predicted_1s_3rd_best.csv", index=False, encoding="utf-8")
    print("✅ Predictions saved to df_predicted_1s_3rd_best.csv")
    yearly_metrics.to_csv("metrics_summary.csv", index=False, encoding="utf-8")
    print("✅ Metrics saved to metrics_summary.csv")


In [None]:
import pickle

# --- Persist the predicted-positive rows as a pickle file ---
pickle_target = "df_predicted_1s_3rd_best.pkl"
with open(pickle_target, "wb") as pickle_handle:
    pickle.dump(df_predicted_1s, pickle_handle)

print("✅ df_predicted_1s_3rd_best saved to df_predicted_1s_3rd_best.pkl")



In [None]:
# Write the predicted-positive rows to CSV (UTF-8, without the index column)
csv_target = "df_predicted_1s_3rd_best.csv"
df_predicted_1s.to_csv(csv_target, encoding='utf-8', index=False)
print("✅ df_predicted_1s_3rd_best saved to df_predicted_1s_3rd_best.csv")

## STACKED MODEL 4 (FOURTH BEST)

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support

RANDOM_STATE = 42

# ---------------------------
# Helpers
# ---------------------------
def fbeta_score_custom(y_true, y_pred, beta=2):
    """Binary F-beta of ``y_pred`` versus ``y_true``.

    Precision and recall are obtained from scikit-learn (binary averaging,
    undefined ratios reported as 0) and combined with the F-beta formula.
    Returns 0.0 when both precision and recall are zero.
    """
    precision, recall = precision_recall_fscore_support(
        y_true, y_pred, beta=beta, average="binary", zero_division=0
    )[:2]
    if precision + recall == 0:
        return 0.0
    beta_sq = beta**2
    return (1 + beta_sq) * (precision * recall) / ((beta_sq * precision) + recall)

def pick_threshold(y_val, val_scores, beta=2.0):
    """Return ``(threshold, fbeta)`` maximizing F-beta on validation.

    Candidates are 0.01, 0.02, ..., 0.99 with the rule ``score >= threshold``;
    when several candidates tie, the lowest one is returned.
    """
    grid = np.linspace(0.01, 0.99, 99)
    fbeta_per_cut = [
        fbeta_score_custom(y_val, (val_scores >= cut).astype(int), beta=beta)
        for cut in grid
    ]
    # argmax returns the first maximum, i.e. the lowest winning threshold
    winner = int(np.argmax(fbeta_per_cut))
    return grid[winner], fbeta_per_cut[winner]

# ---------------------------
# Base models for Version B2
# ---------------------------
def get_base_models_B2():
    """Return fresh, unfitted base estimators for Version B2, keyed by name.

    Insertion order is 'logreg', 'linsvc', 'sgd'. The LinearSVC is wrapped in
    a 3-fold CalibratedClassifierCV so it exposes ``predict_proba``. Every
    estimator uses balanced class weights and ``RANDOM_STATE``.
    """
    models = {}
    models["logreg"] = LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        l1_ratio=0.7,
        C=0.5,
        class_weight="balanced",
        max_iter=3000,
        random_state=RANDOM_STATE,
    )
    uncalibrated_svc = LinearSVC(C=0.5, class_weight="balanced", random_state=RANDOM_STATE)
    models["linsvc"] = CalibratedClassifierCV(uncalibrated_svc, cv=3)
    models["sgd"] = SGDClassifier(
        loss="log_loss",
        alpha=1e-6,
        penalty="l1",
        class_weight="balanced",
        random_state=RANDOM_STATE,
    )
    return models

# ---------------------------
# Evaluate single year (Version B2)
# ---------------------------
def evaluate_year_version_b2(df, test_year, beta=2):
    """Train the Version B2 stacked model on the four prior years and score ``test_year``.

    Rows from years ``test_year-4 .. test_year-1`` are split 75/25
    (stratified on 'label'). TF-IDF and the base models are fitted on the
    training part; a logistic-regression meta-learner is fitted on the base
    models' validation probabilities, and the decision threshold maximizing
    F-beta on those same validation rows is applied to the 'original' rows of
    ``test_year``.

    Parameters
    ----------
    df : DataFrame with columns 'year', 'text', 'label', 'source', 'firm_id'.
    test_year : int, the year to predict.
    beta : float, default 2. F-beta weight used for the threshold search.

    Returns
    -------
    DataFrame with 'year', 'text', 'firm_id', 'true_label', 'pred_label',
    'pred_score', 'used_threshold'; or None (after printing a notice) when
    there is no training data or no test rows.
    """
    train_years = list(range(test_year - 4, test_year))
    df_trainval = df[df['year'].isin(train_years)].copy()
    if df_trainval.empty:
        print(f"[{test_year}] Not enough training data (train_years={train_years}) -> skip")
        return None

    # stratified train/val split
    df_train, df_val = train_test_split(
        df_trainval, test_size=0.25, stratify=df_trainval['label'], random_state=RANDOM_STATE
    )

    # test set (original only)
    df_test = df[(df['year'] == test_year) & (df['source'] == 'original')].copy()
    if df_test.empty:
        print(f"[{test_year}] No test rows (original) -> skip")
        return None

    # TF-IDF fit on training only
    vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=60000, stop_words="english")
    vectorizer.fit(df_train["text"])

    X_train = vectorizer.transform(df_train["text"])
    y_train = df_train["label"].values

    X_val = vectorizer.transform(df_val["text"])
    y_val = df_val["label"].values

    X_test = vectorizer.transform(df_test["text"])
    y_test = df_test["label"].values

    # Base models (Version B2 params): one probability column per model
    models = get_base_models_B2()
    val_preds, test_preds = [], []

    for name, model in models.items():
        print(f"[{test_year}] Training base model: {name}")
        model.fit(X_train, y_train)
        val_probs = model.predict_proba(X_val)[:, 1]
        test_probs = model.predict_proba(X_test)[:, 1]
        val_preds.append(val_probs)
        test_preds.append(test_probs)

    # Meta learner (stacking)
    X_val_stack = np.vstack(val_preds).T
    X_test_stack = np.vstack(test_preds).T
    meta = LogisticRegression(solver="liblinear", class_weight="balanced", random_state=RANDOM_STATE)
    meta.fit(X_val_stack, y_val)

    val_scores = meta.predict_proba(X_val_stack)[:, 1]
    test_scores = meta.predict_proba(X_test_stack)[:, 1]

    # Threshold tuning on validation, honouring the caller's beta. The search
    # runs on the rows the meta model was fitted on, so the value printed as
    # "OOF" is an in-sample figure.
    best_thr, best_fbeta = pick_threshold(y_val, val_scores, beta=beta)
    print(f"[{test_year}] Meta OOF best threshold={best_thr:.4f}, OOF-F{beta}={best_fbeta:.3f}")

    # Apply to test
    y_pred_test = (test_scores >= best_thr).astype(int)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test, labels=[0, 1]).ravel()
    # zero_division=0 keeps the 0.0 result for years without positives, minus the warning
    f1 = f1_score(y_test, y_pred_test, zero_division=0)

    print(f"\n=== Year {test_year} Results ===")
    print(f"TP={tp}, FP={fp}, FN={fn}, F1={f1:.3f}")
    print(f"[{test_year}] Stored {len(df_test)} test rows, predicted 1s: {int(np.sum(y_pred_test))}\n")

    df_test_result = df_test[['year', 'text', 'firm_id']].copy()
    df_test_result['true_label'] = y_test
    df_test_result['pred_label'] = y_pred_test
    df_test_result['pred_score'] = test_scores
    df_test_result['used_threshold'] = best_thr

    return df_test_result

# ---------------------------
# Runner across years
# ---------------------------
def run_all_years_version_b2(df, start_year=2000, end_year=2021):
    """Run ``evaluate_year_version_b2`` for each year in ``[start_year, end_year]``.

    Years for which the evaluation returns None are reported as skipped.
    The cumulative micro-F1 is printed after each evaluated year and an
    overall summary at the end.

    Returns
    -------
    (df_all_predicted, df_yearly)
        Concatenated per-year predictions and a per-year metrics frame with
        columns 'year', 'TP', 'FP', 'FN', 'F1' sorted by year. Both are empty
        frames with their usual columns when every year was skipped.
    """
    all_results = []
    df_all_predicted = []
    cumulative = {'TP': 0, 'FP': 0, 'FN': 0}

    for year in range(start_year, end_year + 1):
        print(f"\n########### START YEAR {year} ###########")
        res = evaluate_year_version_b2(df, year, beta=2)
        if res is None:
            print(f"[{year}] skipped.")
            continue

        df_all_predicted.append(res)
        tn, fp, fn, tp = confusion_matrix(res['true_label'], res['pred_label'], labels=[0, 1]).ravel()
        # zero_division=0 keeps the 0.0 result for years without positives, minus the warning
        f1 = f1_score(res['true_label'], res['pred_label'], zero_division=0)
        all_results.append({'year': year, 'TP': tp, 'FP': fp, 'FN': fn, 'F1': f1})

        cumulative['TP'] += tp
        cumulative['FP'] += fp
        cumulative['FN'] += fn
        cum_denominator = 2 * cumulative['TP'] + cumulative['FP'] + cumulative['FN']
        cumF1 = (2 * cumulative['TP']) / cum_denominator if cum_denominator > 0 else 0.0
        print(f"=== End YEAR {year} === Cumulative F1 ({start_year}-{year}) = {cumF1:.3f}")
        print(f"########### END YEAR {year} ###########\n")

    df_all_predicted = pd.concat(df_all_predicted, ignore_index=True) if df_all_predicted else pd.DataFrame(columns=['year','text','firm_id','true_label','pred_label','pred_score','used_threshold'])
    # Explicit columns: without them an empty result list produces a frame
    # with no 'year' column and sort_values('year') raises KeyError.
    df_yearly = (
        pd.DataFrame(all_results, columns=['year', 'TP', 'FP', 'FN', 'F1'])
        .sort_values('year')
        .reset_index(drop=True)
    )

    # Overall
    overall_TP = df_yearly['TP'].sum() if not df_yearly.empty else 0
    overall_FP = df_yearly['FP'].sum() if not df_yearly.empty else 0
    overall_FN = df_yearly['FN'].sum() if not df_yearly.empty else 0
    overall_denominator = 2 * overall_TP + overall_FP + overall_FN
    overall_F1 = (2 * overall_TP) / overall_denominator if overall_denominator > 0 else 0.0

    print("\n=== Overall Summary across all years ===")
    print(f"Overall TP={overall_TP}, FP={overall_FP}, FN={overall_FN}, F1={overall_F1:.3f}")

    return df_all_predicted, df_yearly

# ---------------------------
# Example usage (execution)
# ---------------------------
if __name__ == "__main__":
    # df_all must already exist in the session with columns:
    # year, firm_id, text, label, source
    try:
        df_all
    except NameError:
        raise RuntimeError("Load df_all first (pandas DataFrame) with columns: year, firm_id, text, label, source")

    # Evaluate every year from 2000 through 2021
    preds_df, yearly_metrics = run_all_years_version_b2(df_all, start_year=2000, end_year=2021)

    # Keep only the rows predicted as class 1, restricted to the export columns
    export_columns = ['year','firm_id','text','true_label','pred_score','used_threshold']
    predicted_positive = preds_df['pred_label'] == 1
    df_predicted_1s = preds_df[predicted_positive][export_columns]
    print(f"\nTotal predicted 1s across all years: {len(df_predicted_1s)}")


In [None]:
import pickle

# --- Persist the predicted-positive rows as a pickle file ---
pickle_target = "df_predicted_1s_4th_best.pkl"
with open(pickle_target, "wb") as pickle_handle:
    pickle.dump(df_predicted_1s, pickle_handle)

print("✅ df_predicted_1s_4th_best saved to df_predicted_1s_4th_best.pkl")



In [None]:
# Write the predicted-positive rows to CSV (UTF-8, without the index column)
csv_target = "df_predicted_1s_4th_best.csv"
df_predicted_1s.to_csv(csv_target, encoding='utf-8', index=False)
print("✅ df_predicted_1s_4th_best saved to df_predicted_1s_4th_best.csv")