In [None]:
from pathlib import Path
import re
import json

# Load the whole corpus into memory as a single UTF-8 string.
raw_text = Path("equity_analyses.txt").read_text(encoding="utf-8")

# Cut the text at every closing </analysis> tag (together with any whitespace
# that follows it) and keep only the pieces that contain something other than
# whitespace; the piece after the final closing tag is typically empty.
raw_blocks = [
    piece
    for piece in re.split(r"</analysis>\s*", raw_text)
    if piece.strip()
]

print(f"Found {len(raw_blocks)} raw analyses")


In [None]:
def extract_tag(block: str, tag: str, required: bool = False):
    """
    Return the stripped text between the first <tag>...</tag> pair in `block`.

    Args:
        block: Text to search.
        tag: Tag name without angle brackets. It is matched literally, so
            names containing spaces or regex metacharacters (for example
            "fund name") are safe to pass.
        required: Accepted for backward compatibility; it does not change the
            result. A missing tag yields None either way, and the caller
            decides whether to drop such examples.

    Returns:
        The content of the first match with surrounding whitespace removed,
        or None when the tag pair does not occur in `block`.
    """
    # Escape the tag so characters such as '.' or '+' cannot act as regex syntax.
    literal = re.escape(tag)
    # Non-greedy group + DOTALL: stop at the first closing tag while still
    # allowing the content to span multiple lines.
    m = re.search(fr"<{literal}>(.*?)</{literal}>", block, flags=re.DOTALL)
    return m.group(1).strip() if m else None


In [None]:
# Output key -> tag name as it appears in the source text. Only "fund name"
# differs from its key, because the tag itself contains a space.
field_tags = {
    "fund_name": "fund name",
    "asset_class": "asset_class",
    "category": "category",
    "date": "date",
    "author": "author",
    "people_rating": "people_rating",
    "process_rating": "process_rating",
    "summary": "summary",
    "people": "people",
    "process": "process",
    "portfolio": "portfolio",
    "performance": "performance",
}

# Build one dict per analysis. Any opening <analysis> tag is left in place;
# extraction only looks at the inner <tag>...</tag> pairs, and a tag that is
# absent simply produces None for that field.
records = []
for chunk in raw_blocks:
    text = chunk.strip()
    records.append(
        {field: extract_tag(text, tag) for field, tag in field_tags.items()}
    )

len(records)


In [None]:
# Fields that must be present and non-blank for a record to be usable.
required_fields = ["summary", "people", "process", "portfolio", "performance"]

# Keep a record only when none of its required fields is missing (None) or
# made up solely of whitespace.
clean_records = [
    rec
    for rec in records
    if not any(
        rec[name] is None or not rec[name].strip()
        for name in required_fields
    )
]

print(f"Total records: {len(records)}")
print(f"Clean records with all required fields: {len(clean_records)}")


In [None]:
import json
from pathlib import Path

out_path = Path("equity_analyses_structured.jsonl")

# JSON Lines output: one serialized record per line. ensure_ascii=False
# writes non-ASCII characters as-is instead of \u escapes.
with out_path.open("w", encoding="utf-8") as sink:
    sink.writelines(
        json.dumps(rec, ensure_ascii=False) + "\n" for rec in clean_records
    )

print(f"Saved {len(clean_records)} records to {out_path}")


In [None]:
!pip install -q transformers datasets accelerate

In [None]:
import json
from pathlib import Path
from json import JSONDecodeError

DATA_PATH = Path("equity_analyses_structured.jsonl")

records = []
bad_count = 0

# Parse the JSONL file one line at a time so that a single corrupt line is
# reported and skipped instead of aborting the whole load.
with DATA_PATH.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        # Whitespace-only lines (e.g. a trailing blank line) carry no record;
        # skip them quietly rather than counting them as malformed.
        if not line.strip():
            continue
        try:
            obj = json.loads(line)
        except JSONDecodeError as e:
            bad_count += 1
            print(f"Skipping malformed line {i}: {e}")
            continue
        records.append(obj)

print(f"Loaded {len(records)} valid records")
print(f"Skipped {bad_count} malformed lines")

# Guard the peek: indexing an empty list would raise IndexError and hide the
# real problem (an empty or fully malformed input file).
if records:
    print("Sample keys from first record:", records[0].keys())
else:
    print("No valid records were loaded; nothing to sample.")


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Checkpoint used for zero-shot summarization: a pretrained Flan-T5 model.
MODEL_NAME = "google/flan-t5-base"

# Tokenizer first, then the seq2seq model, both from the same checkpoint name.
tokenizer, model = (
    AutoTokenizer.from_pretrained(MODEL_NAME),
    AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME),
)

print("Model and tokenizer loaded.")


In [None]:
def build_input(rec):
    """
    Assemble the model prompt for one analysis record.

    The prompt starts with the "summarize_fund_analysis:" prefix line and is
    followed by four bracket-labelled sections (PEOPLE, PROCESS, PORTFOLIO,
    PERFORMANCE), each separated from the next by a blank line.

    Args:
        rec: Mapping with string values under the keys "people", "process",
            "portfolio" and "performance".

    Returns:
        The prompt as a single string.
    """
    sections = [
        ("[PEOPLE]", rec["people"]),
        ("[PROCESS]", rec["process"]),
        ("[PORTFOLIO]", rec["portfolio"]),
        ("[PERFORMANCE]", rec["performance"]),
    ]
    body = "\n\n".join(label + "\n" + text for label, text in sections)
    return "summarize_fund_analysis:\n" + body

# Pair each record's prompt with its reference summary; the fund name is
# carried along alongside the input/target pair.
dataset = []
for item in records:
    dataset.append(
        {
            "input": build_input(item),
            "target": item["summary"],
            "fund_name": item["fund_name"],
        }
    )

len(dataset)

In [None]:
import textwrap

def build_input_from_record(rec):
    """
    Turn one analysis record into the single prompt string given to the model.

    Args:
        rec: Mapping with string values under the keys "people", "process",
            "portfolio" and "performance".

    Returns:
        The task prefix followed by the four labelled sections, with a blank
        line between consecutive sections.
    """
    pieces = [
        "summarize_fund_analysis:\n",
        "[PEOPLE]\n", rec["people"], "\n\n",
        "[PROCESS]\n", rec["process"], "\n\n",
        "[PORTFOLIO]\n", rec["portfolio"], "\n\n",
        "[PERFORMANCE]\n", rec["performance"],
    ]
    return "".join(pieces)

# Record to inspect; change the index to look at a different analysis.
example = records[0]
input_text = build_input_from_record(example)

# Encode the prompt as PyTorch tensors, cutting it off at 1024 tokens.
inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

# Decoding settings for the zero-shot summary: beam search with 4 beams,
# no 3-gram repeated, at most 256 newly generated tokens.
generation_kwargs = {
    "max_new_tokens": 256,
    "num_beams": 4,
    "length_penalty": 1.0,
    "no_repeat_ngram_size": 3,
}
output_ids = model.generate(**inputs, **generation_kwargs)

generated_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print each section as a header line followed by its text, with one blank
# line between sections; long texts are wrapped at 100 columns.
report_sections = [
    ("=== FUND NAME ===", example["fund_name"]),
    (
        "=== GENERATED SUMMARY (Flan-T5, zero-shot) ===",
        textwrap.fill(generated_summary, width=100),
    ),
    (
        "=== HUMAN SUMMARY (Morningstar analyst) ===",
        textwrap.fill(example["summary"], width=100),
    ),
]
print("\n\n".join(f"{header}\n{text}" for header, text in report_sections))


In [None]:
from datasets import Dataset

# Wrap the list of input/target dicts in a Hugging Face Dataset.
full_ds = Dataset.from_list(dataset)

# Hold out 10% of the examples for validation; the seed is fixed at 42.
splits = full_ds.train_test_split(test_size=0.1, seed=42)
train_ds, val_ds = splits["train"], splits["test"]

for part in (train_ds, val_ds):
    print(part)


In [None]:
# Token budgets: prompts (People/Process/Portfolio/Performance text) are cut
# at 1024 tokens, multi-paragraph target summaries at 384.
max_input_length = 1024
max_target_length = 384

def tokenize_batch(batch):
    """
    Tokenize a batch of examples for seq2seq training.

    Args:
        batch: Mapping with an "input" entry (prompts) and a "target" entry
            (reference summaries).

    Returns:
        The tokenizer output for the prompts, truncated to max_input_length,
        with an extra "labels" entry holding the token ids of the targets,
        truncated to max_target_length.
    """
    encoded = tokenizer(batch["input"], max_length=max_input_length, truncation=True)

    # text_target= makes the tokenizer treat these strings as targets.
    target_encoding = tokenizer(
        text_target=batch["target"],
        max_length=max_target_length,
        truncation=True,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize both splits the same way (train first, then validation). The
# original columns are removed so only the tokenizer outputs and labels remain.
train_tok, val_tok = (
    split.map(tokenize_batch, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)

print(train_tok)
print(val_tok)



In [None]:
import os
# Set both Weights & Biases environment switches for this process in one go.
os.environ.update({"WANDB_MODE": "disabled", "WANDB_DISABLED": "true"})
print("W&B disabled for this session.")


In [None]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

# Collator built from the tokenizer and the model; it assembles the batches
# handed to the model during training.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Hyperparameters gathered in one place before being passed on.
train_config = {
    "output_dir": "./flan_t5_fund_summary",
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    # 2 examples per step x 4 accumulation steps -> effective batch size ~8
    "gradient_accumulation_steps": 4,
    # a single pass over the data keeps the run fast
    "num_train_epochs": 1,
    "learning_rate": 3e-5,
    "weight_decay": 0.01,
    "logging_steps": 50,
}
training_args = TrainingArguments(**train_config)

# eval_dataset is only consumed if trainer.evaluate()/predict is called later.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()



In [None]:
import textwrap
import torch

# Pick an index to inspect - 0 is fine, or change to another integer.
# NOTE(review): `records` is the full list that was split at random into
# train/validation, so this example may have been seen during training.
idx = 0
rec = records[idx]

def build_input(rec):
    """
    Format one analysis record into the prompt string fed to the model.

    Args:
        rec: Mapping with string values under the keys "people", "process",
            "portfolio" and "performance".

    Returns:
        The "summarize_fund_analysis:" prefix line followed by the four
        bracket-labelled sections, separated by blank lines.
    """
    return (
        "summarize_fund_analysis:\n"
        "[PEOPLE]\n" + rec["people"] + "\n\n"
        "[PROCESS]\n" + rec["process"] + "\n\n"
        "[PORTFOLIO]\n" + rec["portfolio"] + "\n\n"
        "[PERFORMANCE]\n" + rec["performance"]
    )

input_text = build_input(rec)

# Tokenize for generation (prompt truncated at 1024 tokens)
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)

# Move the model to the GPU if one is available
if torch.cuda.is_available():
    model.to("cuda")

# Put the inputs on whatever device the model is actually on. Following
# model.device (instead of hard-coding "cuda") also covers the case where
# training left the model on a different accelerator.
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Training runs the model in train mode; switch to eval mode so dropout is
# disabled and generation is deterministic for a given input.
model.eval()

# Generate with the fine-tuned model
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        num_beams=4,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
    )

generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("=== FUND NAME ===")
print(rec["fund_name"])
print()

print("=== GENERATED SUMMARY (fine-tuned Flan-T5) ===")
print(textwrap.fill(generated, width=100))
print()

print("=== HUMAN SUMMARY (Morningstar analyst) ===")
print(textwrap.fill(rec["summary"], width=100))
