<a href="https://colab.research.google.com/github/ma2070-spec/Outamation_AI_Externship/blob/main/Project_6_Final_RAG_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === Project 6: Cold-Start Master Cell ===
# Runs end-to-end on a tiny eval set for speed.

# 1) Install deps
!pip -q install "transformers>=4.42.0" "accelerate>=0.30.0" bitsandbytes \
                 "scikit-learn>=1.3" sentencepiece pymupdf reportlab

# 2) Imports & constants
from google.colab import drive; drive.mount('/content/drive')
import os, glob, re, time, textwrap
import pandas as pd, numpy as np, torch, fitz
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

ROOT = "/content/drive/MyDrive/MA_Project6"
CSV_PATH = f"{ROOT}/ma_p6_labeling_template.csv"
DATA_DIR = Path("/content/data"); DATA_DIR.mkdir(parents=True, exist_ok=True)

ALLOWED = ["RATES_FEES","ESCROW","LATE_FEE","INSURANCE_MIP","OTHER"]
ALLOWED_SET = set(ALLOWED)
ALLOWED_STR = ", ".join(ALLOWED)
LABEL_RE = re.compile(r"(RATES_FEES|ESCROW|LATE_FEE|INSURANCE_MIP|OTHER)")

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip().lower() for c in df.columns]
    for col in ["id","file","page","text","gold_label","split","notes"]:
        if col not in df.columns: df[col] = None
    return df[["id","file","page","text","gold_label","split","notes"]]

# 3) Ensure CSV exists (convert from Excel if needed)
if not os.path.exists(CSV_PATH):
    cands = glob.glob(f"{ROOT}/**/*.csv", recursive=True) + glob.glob(f"{ROOT}/**/*.xlsx", recursive=True)
    used = None
    # prefer any CSV under MA_Project6
    for c in cands:
        if c.lower().endswith(".csv"):
            df = normalize_columns(pd.read_csv(c))
            df.to_csv(CSV_PATH, index=False); used = c; break
    if used is None:
        # try the Excel from your earlier file name; otherwise first .xlsx found
        xl = f"{ROOT}/ma_p6_loan_mortgage data.xlsx"
        if os.path.exists(xl):
            df = normalize_columns(pd.read_excel(xl)); df.to_csv(CSV_PATH, index=False); used = xl
        else:
            for c in cands:
                if c.lower().endswith(".xlsx"):
                    df = normalize_columns(pd.read_excel(c)); df.to_csv(CSV_PATH, index=False); used = c; break
    if used is None:
        raise FileNotFoundError("No CSV/XLSX found. Export your sheet as CSV to My Drive/MA_Project6 as ma_p6_labeling_template.csv.")

print("USING CSV:", CSV_PATH)

# 4) Load & normalize dataset
df = pd.read_csv(CSV_PATH)
df["file"] = df["file"].astype(str).str.strip()
df["page"] = pd.to_numeric(df["page"], errors="coerce").astype("Int64")
df["text"] = df["text"].astype(str).str.strip()
df["gold_label"] = df["gold_label"].astype(str).str.upper().str.replace(" ", "_")

# map common synonyms to your 5-label schema
MAP = {
    "INSURANCE":"INSURANCE_MIP","ANNUAL_MIP":"INSURANCE_MIP","MIP":"INSURANCE_MIP","PMI":"INSURANCE_MIP",
    "DISCOUNT_FEES":"RATES_FEES","RATE_FEES":"RATES_FEES","RATES__FEES":"RATES_FEES",
    "STATE_LAWS":"OTHER","TRANSFER_PROPERTY":"OTHER","PREPAYMENT_PENALTY":"OTHER"
}
df["gold_label"] = df["gold_label"].map(lambda x: MAP.get(x, x))
df = df[df["text"].str.len() > 0]
df = df[df["gold_label"].isin(ALLOWED_SET)].copy()
df["file"] = df["file"].apply(lambda s: s if s.lower().endswith(".pdf") else s + ".pdf")
df["page"] = df["page"].fillna(1).astype(int)

print("Label counts:", df["gold_label"].value_counts().to_dict())

# 5) Ensure PDFs in /content/data (copy from Drive if present; else generate)
drive_pdfs = {Path(p).name: p for p in glob.glob(f"{ROOT}/*.pdf")}
needed = sorted(df["file"].unique())
missing = []
for fname in needed:
    src = drive_pdfs.get(fname)
    dst = DATA_DIR / fname
    if src and not dst.exists():
        os.system(f'cp "{src}" "{dst}"')
    if not dst.exists():
        missing.append(fname)

if missing:
    # generate synthetic PDFs exactly matching file/page from your CSV
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter
    from reportlab.lib.units import inch
    by_file = {}
    for r in df.itertuples(index=False):
        by_file.setdefault(r.file, {}).setdefault(int(r.page), []).append(str(r.text))
    def write_pdf(out_path, pages_dict):
        c = canvas.Canvas(out_path, pagesize=letter)
        w, h = letter; left, top = 0.85*inch, h-1.0*inch; lh = 12
        maxp = max(pages_dict.keys())
        for p in range(1, maxp+1):
            c.setFont("Helvetica-Bold", 12); c.drawString(left, top+6, f"Page {p}")
            c.setFont("Helvetica", 11); y = top-20
            blocks = pages_dict.get(p, ["(blank page for alignment)"])
            body = "\n\n".join(blocks)
            for para in body.split("\n"):
                for line in (textwrap.wrap(para, width=95) or [""]):
                    if y < 0.9*inch:
                        c.showPage()
                        c.setFont("Helvetica-Bold",12); c.drawString(left, top+6, f"Page {p} (cont.)")
                        c.setFont("Helvetica",11); y = top-20
                    c.drawString(left, y, line); y -= lh
            c.showPage()
        c.save()
    DATA_DIR.mkdir(exist_ok=True, parents=True)
    os.makedirs(f"{ROOT}/generated_pdfs", exist_ok=True)
    for f, pages in by_file.items():
        out_pdf = str(DATA_DIR / f)
        write_pdf(out_pdf, pages)
        os.system(f'cp "{out_pdf}" "{ROOT}/generated_pdfs/{Path(f).name}"')
    print("Generated PDFs for missing files:", len(missing))

# 6) Verify page ranges
bad_pages = []
for f in df["file"].unique():
    p = str(DATA_DIR / f)
    if not os.path.exists(p): continue
    n = len(fitz.open(p))
    for pg in df.loc[df["file"]==f, "page"]:
        if int(pg) < 1 or int(pg) > n:
            bad_pages.append((f, int(pg), n))
print("Out-of-range pages:", bad_pages)

# 7) Build a small stratified test slice (fast) + robust FEWSHOT
def small_test(xdf, n_per_label=5):
    return (xdf.groupby("gold_label", group_keys=False)
              .apply(lambda g: g.sample(min(n_per_label, len(g)), random_state=42))
              .reset_index(drop=True))

tiny = small_test(df, n_per_label=5)
X_test = tiny["text"].tolist()
y_test = tiny["gold_label"].tolist()
print("Tiny test size:", len(y_test), "| label counts:", tiny["gold_label"].value_counts().to_dict())

# FEWSHOT: one strong example per label (safe .loc usage)
FEWSHOT, missing_labels = [], []
for label in ALLOWED:
    c = df.loc[df["gold_label"]==label, ["text"]].copy()
    c["len"] = c["text"].str.len()
    c = c[c["len"] > 0]
    if c.empty:
        missing_labels.append(label)
        continue
    idx = c["len"].idxmax()       # label-based index
    FEWSHOT.append((c.loc[idx, "text"], label))
print("FEWSHOT built:", len(FEWSHOT), "| missing:", missing_labels)

# 8) Prompt + runner + eval (one small model for speed)
def build_prompt(user_text, fewshots):
    lines = [
        "You are a document classifier for mortgage text.",
        f"Output EXACTLY one label from: {ALLOWED_STR}.",
        "Format: LABEL=<LABEL> and nothing else.", ""
    ]
    for i,(t,lbl) in enumerate(fewshots, 1):
        lines += [f"Example {i}:", t, f"Answer: LABEL={lbl}", ""]
    lines += ["Classify the following text:", user_text, "Answer: LABEL="]
    return "\n".join(lines)

def run_model(model_id, texts, fewshots, max_new_tokens=6):
    print(f"\nLoading {model_id} …")
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id, trust_remote_code=True, device_map="auto",
            torch_dtype=torch.float16, load_in_8bit=True
        )
    except Exception:
        model = AutoModelForCausalLM.from_pretrained(
            model_id, trust_remote_code=True, device_map="auto",
            torch_dtype=torch.float16
        )
    gen = pipeline("text-generation", model=model, tokenizer=tok,
                   device_map="auto", torch_dtype=torch.float16)

    preds, lat = [], []
    for x in texts:
        t0 = time.time()
        out = gen(build_prompt(x, FEWSHOT),
                  max_new_tokens=max_new_tokens, do_sample=False, temperature=0.0)[0]["generated_text"]
        lat.append(time.time()-t0)
        m = LABEL_RE.search(out)
        preds.append(m.group(1) if m else "OTHER")
    return preds, lat

def evaluate(name, y_true, y_pred, latency_s):
    acc = accuracy_score(y_true, y_pred)
    f1  = f1_score(y_true, y_pred, average="macro")
    med = float(np.median(latency_s))
    print(f"\n=== {name} ===")
    print(f"Accuracy: {acc:.3f} | Macro-F1: {f1:.3f} | Median latency: {med:.2f}s")
    print("\nPer-class report:\n", classification_report(y_true, y_pred, digits=3, labels=ALLOWED))
    cm = confusion_matrix(y_true, y_pred, labels=ALLOWED)
    print(pd.DataFrame(cm, index=[f"T_{l}" for l in ALLOWED], columns=[f"P_{l}" for l in ALLOWED]))
    return {"model": name, "accuracy": acc, "macro_f1": f1, "median_latency_s": med}

results = []
mid = "Qwen/Qwen2-0.5B-Instruct"   # small & quick
preds, lat = run_model(mid, X_test, FEWSHOT)
res = evaluate(mid, y_test, preds, lat)
results.append(res)

# 9) Save summary + detailed predictions for slides
summary = pd.DataFrame(results)
sum_path = f"{ROOT}/p6_results_summary.csv"; summary.to_csv(sum_path, index=False)
print("\nSaved summary to:", sum_path); display(summary)

detail = pd.DataFrame({"text": X_test, "true": y_test, "pred": preds})
detail["correct"] = (detail["true"] == detail["pred"])
det_path = f"{ROOT}/p6_predictions_last_model.csv"; detail.to_csv(det_path, index=False)
print("Saved details to:", det_path)
print("\nTop 5 errors for slides:")
display(detail[~detail["correct"]].head(5))


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
USING CSV: /content/drive/MyDrive/MA_Project6/ma_p6_labeling_template.csv
Label counts: {'OTHER': 4, 'INSURANCE_MIP': 4, 'RATES_FEES': 3, 'LATE_FEE': 1, 'ESCROW': 1}
Generated PDFs for missing files: 12
Out-of-range pages: []
Tiny test size: 13 | label counts: {'INSURANCE_MIP': 4, 'OTHER': 4, 'RATES_FEES': 3, 'ESCROW': 1, 'LATE_FEE': 1}
FEWSHOT built: 5 | missing: []

Loading Qwen/Qwen2-0.5B-Instruct …


  .apply(lambda g: g.sample(min(n_per_label, len(g)), random_state=42))
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: [


=== Qwen/Qwen2-0.5B-Instruct ===
Accuracy: 0.231 | Macro-F1: 0.075 | Median latency: 117.90s

Per-class report:
                precision    recall  f1-score   support

   RATES_FEES      0.231     1.000     0.375         3
       ESCROW      0.000     0.000     0.000         1
     LATE_FEE      0.000     0.000     0.000         1
INSURANCE_MIP      0.000     0.000     0.000         4
        OTHER      0.000     0.000     0.000         4

     accuracy                          0.231        13
    macro avg      0.046     0.200     0.075        13
 weighted avg      0.053     0.231     0.087        13

                 P_RATES_FEES  P_ESCROW  P_LATE_FEE  P_INSURANCE_MIP  P_OTHER
T_RATES_FEES                3         0           0                0        0
T_ESCROW                    1         0           0                0        0
T_LATE_FEE                  1         0           0                0        0
T_INSURANCE_MIP             4         0           0                0        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Saved summary to: /content/drive/MyDrive/MA_Project6/p6_results_summary.csv


Unnamed: 0,model,accuracy,macro_f1,median_latency_s
0,Qwen/Qwen2-0.5B-Instruct,0.230769,0.075,117.904083


Saved details to: /content/drive/MyDrive/MA_Project6/p6_predictions_last_model.csv

Top 5 errors for slides:


Unnamed: 0,text,true,pred,correct
0,Escrow for taxes and insurance is required unl...,ESCROW,RATES_FEES,False
1,Annual MIP is 0.55% of the outstanding princip...,INSURANCE_MIP,RATES_FEES,False
2,Flood insurance is required if the property is...,INSURANCE_MIP,RATES_FEES,False
3,Upfront MIP equals 1.75% of the base loan amou...,INSURANCE_MIP,RATES_FEES,False
4,Private Mortgage Insurance (PMI) is required w...,INSURANCE_MIP,RATES_FEES,False


In [2]:
# Confusion matrix image + short errors CSV for slides
import pandas as pd
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

ROOT = "/content/drive/MyDrive/MA_Project6"
ALLOWED = ["RATES_FEES","ESCROW","LATE_FEE","INSURANCE_MIP","OTHER"]

detail = pd.read_csv(f"{ROOT}/p6_predictions_last_model.csv")

cm = confusion_matrix(detail["true"], detail["pred"], labels=ALLOWED)

plt.figure(figsize=(6,5))
plt.imshow(cm, aspect='auto')
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(range(len(ALLOWED)), ALLOWED, rotation=45, ha='right')
plt.yticks(range(len(ALLOWED)), ALLOWED)
for i in range(len(ALLOWED)):
    for j in range(len(ALLOWED)):
        plt.text(j, i, cm[i, j], ha='center', va='center')
plt.tight_layout()
plt.savefig(f"{ROOT}/p6_confusion_matrix.png", dpi=180)
plt.close()
print("Saved:", f"{ROOT}/p6_confusion_matrix.png")

# Top errors (first 8) for a quick slide table
errors = detail[detail["true"] != detail["pred"]].copy()
errors["snippet"] = errors["text"].astype(str).str.replace(r"\s+", " ", regex=True).str.slice(0, 180) + "…"
path = f"{ROOT}/p6_top_errors.csv"
errors[["snippet","true","pred"]].head(8).to_csv(path, index=False)
print("Saved:", path)


Saved: /content/drive/MyDrive/MA_Project6/p6_confusion_matrix.png
Saved: /content/drive/MyDrive/MA_Project6/p6_top_errors.csv


In [3]:
import os, zipfile
ROOT = "/content/drive/MyDrive/MA_Project6"
bundle = [
  "p6_results_summary.csv",
  "p6_predictions_last_model.csv",
  "p6_confusion_matrix.png",
  "p6_top_errors.csv",
  "ma_p6_labeling_template.csv",
]
zip_path = "/content/p6_artifacts.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
    for name in bundle:
        p = f"{ROOT}/{name}"
        if os.path.exists(p): z.write(p, arcname=name)
print("Zipped:", zip_path)

from google.colab import files
files.download(zip_path)


Zipped: /content/p6_artifacts.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
import pandas as pd, os

ROOT = "/content/drive/MyDrive/MA_Project6"

# 1) Results summary (send me these rows)
summary = pd.read_csv(f"{ROOT}/p6_results_summary.csv")
print("=== p6_results_summary.csv ===")
print(summary.to_string(index=False))

# 2) Top errors: use existing file if present, else derive from predictions
top_err_path = f"{ROOT}/p6_top_errors.csv"
if os.path.exists(top_err_path):
    err = pd.read_csv(top_err_path)
else:
    detail = pd.read_csv(f"{ROOT}/p6_predictions_last_model.csv")
    err = detail[detail["true"] != detail["pred"]].copy()
    err["snippet"] = err["text"].astype(str).str.replace(r"\s+", " ", regex=True).str.slice(0, 180) + "…"
    err = err[["snippet","true","pred"]].head(3)

print("\n=== top_errors (first 3) ===")
print(err.head(3).to_string(index=False))


=== p6_results_summary.csv ===
                   model  accuracy  macro_f1  median_latency_s
Qwen/Qwen2-0.5B-Instruct  0.230769     0.075        117.904083

=== top_errors (first 3) ===
                                                                                             snippet          true       pred
                                       Escrow for taxes and insurance is required unless LTV < 80%.…        ESCROW RATES_FEES
Annual MIP is 0.55% of the outstanding principal and is collected monthly with the mortgage payment… INSURANCE_MIP RATES_FEES
                Flood insurance is required if the property is in a FEMA Special Flood Hazard Area.… INSURANCE_MIP RATES_FEES


In [5]:
import pandas as pd
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

ROOT = "/content/drive/MyDrive/MA_Project6"
ALLOWED = ["RATES_FEES","ESCROW","LATE_FEE","INSURANCE_MIP","OTHER"]

detail = pd.read_csv(f"{ROOT}/p6_predictions_last_model.csv")
cm = confusion_matrix(detail["true"], detail["pred"], labels=ALLOWED)

plt.figure(figsize=(6,5))
plt.imshow(cm, aspect='auto')
plt.title('Confusion matrix')
plt.xlabel('Predicted'); plt.ylabel('True')
plt.xticks(range(len(ALLOWED)), ALLOWED, rotation=45, ha='right')
plt.yticks(range(len(ALLOWED)), ALLOWED)
for i in range(len(ALLOWED)):
    for j in range(len(ALLOWED)):
        plt.text(j, i, cm[i, j], ha='center', va='center')
plt.tight_layout()
plt.savefig(f"{ROOT}/p6_confusion_matrix.png", dpi=180)
plt.close()
print("Saved:", f"{ROOT}/p6_confusion_matrix.png")


Saved: /content/drive/MyDrive/MA_Project6/p6_confusion_matrix.png


In [6]:
!unzip -o /content/p6_artifacts.zip -d /content/p6_artifacts > /dev/null
!find /content/p6_artifacts -maxdepth 2 -type f -printf "%f\n" | sort


ma_p6_labeling_template.csv
p6_confusion_matrix.png
p6_predictions_last_model.csv
p6_results_summary.csv
p6_top_errors.csv


In [7]:
import os, shutil

SRC = "/content/p6_artifacts/p6_confusion_matrix.png"  # adjust if your filename differs
DST_DIR = "/content/drive/MyDrive/MA_Project6"
os.makedirs(DST_DIR, exist_ok=True)
DST = os.path.join(DST_DIR, "p6_confusion_matrix.png")

shutil.copy(SRC, DST)
print("Copied to:", DST)


Copied to: /content/drive/MyDrive/MA_Project6/p6_confusion_matrix.png


In [8]:
import pandas as pd
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import os

ROOT = "/content/drive/MyDrive/MA_Project6"
ALLOWED = ["RATES_FEES","ESCROW","LATE_FEE","INSURANCE_MIP","OTHER"]

detail = pd.read_csv(f"{ROOT}/p6_predictions_last_model.csv")
cm = confusion_matrix(detail["true"], detail["pred"], labels=ALLOWED)

plt.figure(figsize=(6,5))
plt.imshow(cm, aspect='auto', cmap="Blues")
plt.title('Confusion matrix'); plt.xlabel('Predicted'); plt.ylabel('True')
plt.xticks(range(len(ALLOWED)), ALLOWED, rotation=45, ha='right')
plt.yticks(range(len(ALLOWED)), ALLOWED)
for i in range(len(ALLOWED)):
    for j in range(len(ALLOWED)):
        plt.text(j, i, cm[i, j], ha='center', va='center', color="black")
plt.tight_layout()
out_path = f"{ROOT}/p6_confusion_matrix.png"
plt.savefig(out_path, dpi=180)
plt.close()
print("Saved:", out_path)


Saved: /content/drive/MyDrive/MA_Project6/p6_confusion_matrix.png
