In [6]:
from pathlib import Path
import re
import json

# Load the whole corpus into memory as one UTF-8 string.
raw_text = Path("equity_analyses.txt").read_text(encoding="utf-8")

# Each analysis is terminated by a closing </analysis> tag, optionally followed
# by whitespace, so splitting on that tag yields one chunk per analysis.
# Whitespace-only chunks (typically the remainder after the final closing tag)
# are discarded.
raw_blocks = [
    chunk
    for chunk in re.split(r"</analysis>\s*", raw_text)
    if chunk.strip()
]

print(f"Found {len(raw_blocks)} raw analyses")


Found 2393 raw analyses


In [8]:
def extract_tag(block: str, tag: str, required: bool = False):
    """
    Return the stripped text between the first <tag>...</tag> pair in ``block``.

    Args:
        block: Raw text of a single analysis.
        tag: Tag name, matched literally. It may contain spaces or punctuation
            (for example, the fund name is stored as <fund name>...</fund name>).
        required: Accepted for interface compatibility; it does not change the
            result. A missing tag yields None whether or not it is set, so the
            caller can drop incomplete examples afterwards.

    Returns:
        The tag content with surrounding whitespace removed, or None when no
        matching open/close pair exists in ``block``.
    """
    # Escape the tag so regex metacharacters in its name (".", "+", "(", ...)
    # are matched literally instead of being interpreted as pattern syntax.
    tag_literal = re.escape(tag)
    # DOTALL lets the content span several lines; the lazy quantifier stops at
    # the first closing tag.
    match = re.search(fr"<{tag_literal}>(.*?)</{tag_literal}>", block, flags=re.DOTALL)
    if match is None:
        return None
    return match.group(1).strip()


In [9]:
# Output key -> tag name as it appears in the raw text. The two differ only for
# the fund name, whose tag contains a space.
field_tags = {
    "fund_name": "fund name",
    "asset_class": "asset_class",
    "category": "category",
    "date": "date",
    "author": "author",
    "people_rating": "people_rating",
    "process_rating": "process_rating",
    "summary": "summary",
    "people": "people",
    "process": "process",
    "portfolio": "portfolio",
    "performance": "performance",
}

records = []

for raw_block in raw_blocks:
    # Trim surrounding whitespace defensively. A leading <analysis> open tag,
    # if present, is left in place: every field is pulled out by its own tag.
    trimmed = raw_block.strip()

    # Fields whose tag is absent come back as None.
    records.append(
        {key: extract_tag(trimmed, tag_name) for key, tag_name in field_tags.items()}
    )

len(records)


2393

In [10]:
required_fields = ["summary", "people", "process", "portfolio", "performance"]

# Keep a record only when none of its required sections is missing (None) or
# made up solely of whitespace.
clean_records = [
    rec
    for rec in records
    if not any(
        rec[name] is None or not rec[name].strip()
        for name in required_fields
    )
]

print(f"Total records: {len(records)}")
print(f"Clean records with all required fields: {len(clean_records)}")


Total records: 2393
Clean records with all required fields: 2393


In [11]:
import json
from pathlib import Path

out_path = Path("equity_analyses_structured.jsonl")

# JSON Lines output: one record per line. ensure_ascii=False writes non-ASCII
# characters as-is instead of \u escapes.
with out_path.open("w", encoding="utf-8") as sink:
    sink.writelines(
        json.dumps(rec, ensure_ascii=False) + "\n"
        for rec in clean_records
    )

print(f"Saved {len(clean_records)} records to {out_path}")


Saved 2393 records to equity_analyses_structured.jsonl


In [None]:
!pip install -q transformers datasets accelerate

In [12]:
import json
from pathlib import Path
from json import JSONDecodeError

DATA_PATH = Path("equity_analyses_structured.jsonl")

records = []
bad_count = 0

# Parse the JSON Lines file one line at a time. A line that is not valid JSON
# is reported with its 1-based line number and counted, not treated as fatal.
with DATA_PATH.open("r", encoding="utf-8") as source:
    for line_no, raw_line in enumerate(source, start=1):
        try:
            parsed = json.loads(raw_line)
        except JSONDecodeError as err:
            bad_count += 1
            print(f"Skipping malformed line {line_no}: {err}")
        else:
            records.append(parsed)

print(f"Loaded {len(records)} valid records")
print(f"Skipped {bad_count} malformed lines")

print("Sample keys from first record:", records[0].keys())


Loaded 2393 valid records
Skipped 0 malformed lines
Sample keys from first record: dict_keys(['fund_name', 'asset_class', 'category', 'date', 'author', 'people_rating', 'process_rating', 'summary', 'people', 'process', 'portfolio', 'performance'])


In [13]:
# Imported in this cell because no earlier visible cell brings these names into
# scope; without it the cell raises NameError on a fresh kernel.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load a pretrained Flan-T5 model for zero-shot summarization
MODEL_NAME = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

print("Model and tokenizer loaded.")


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model and tokenizer loaded.


In [14]:
import textwrap

def build_input_from_record(rec):
    """Render one analysis record as a single prompt string for the model.

    The prompt starts with the "summarize_fund_analysis:" task line, followed
    by the people, process, portfolio and performance sections of ``rec``, each
    under a bracketed upper-case header and separated by a blank line.
    """
    # Insertion order of this dict fixes the order of sections in the prompt.
    section_headers = {
        "people": "[PEOPLE]\n",
        "process": "[PROCESS]\n",
        "portfolio": "[PORTFOLIO]\n",
        "performance": "[PERFORMANCE]\n",
    }
    rendered = [header + rec[field] for field, header in section_headers.items()]
    return "summarize_fund_analysis:\n" + "\n\n".join(rendered)

# Pick one example (change the index to inspect others)
example = records[0]
input_text = build_input_from_record(example)

# Tokenize; analyses longer than 1024 tokens are truncated for now.
inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

# Zero-shot generation settings: beam search, with repeated trigrams blocked.
generation_kwargs = {
    "max_new_tokens": 256,
    "num_beams": 4,
    "length_penalty": 1.0,
    "no_repeat_ngram_size": 3,
}
output_ids = model.generate(**inputs, **generation_kwargs)

generated_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Show the model output next to the reference summary; long text is wrapped
# at 100 columns and sections are separated by a blank line.
report_sections = [
    ("=== FUND NAME ===", example["fund_name"]),
    (
        "=== GENERATED SUMMARY (Flan-T5, zero-shot) ===",
        textwrap.fill(generated_summary, width=100),
    ),
    (
        "=== HUMAN SUMMARY (Morningstar analyst) ===",
        textwrap.fill(example["summary"], width=100),
    ),
]
print("\n\n".join(f"{title}\n{body}" for title, body in report_sections))


=== FUND NAME ===
4D Global Infrastructure AUD Hedged

=== GENERATED SUMMARY (Flan-T5, zero-shot) ===
4D Global Infrastructure underperformed during the 2024 calendar year, in part due to an underweight
exposure to the US and allocations in Brazil

=== HUMAN SUMMARY (Morningstar analyst) ===
4D Global Infrastructure’s exposure to emerging markets presents potential upsides, yet its
dependence on portfolio manager Sarah Shaw tempers our confidence.  Shaw brings multidecade
experience and a distinctive lens on emerging-markets infrastructure, honed during her tenure
managing RARE Infrastructure’s Emerging Markets Fund from 2006 to 2010. Shaw’s deep familiarity with
these regions positions her well to navigate their complexities, though the strategy’s reliance on
her expertise introduces key-person risk.   Additionally, supporting internal macroeconomic
forecasting and country risk assessments is Tim Snelgrove, head of markets and trading. Having
completed a yearlong transition with his p

In [15]:
def build_input(rec):
    """Assemble the model input string for one analysis record.

    The result is the "summarize_fund_analysis:" task line followed by the
    people, process, portfolio and performance sections of ``rec``, each
    introduced by a bracketed header and separated by a blank line.
    """
    pieces = [
        "summarize_fund_analysis:\n",
        "[PEOPLE]\n", rec["people"], "\n\n",
        "[PROCESS]\n", rec["process"], "\n\n",
        "[PORTFOLIO]\n", rec["portfolio"], "\n\n",
        "[PERFORMANCE]\n", rec["performance"],
    ]
    return "".join(pieces)

# One training example per record: the formatted body sections as input and
# the analyst-written summary as target. fund_name is optional and is carried
# along only to make later debugging easier.
dataset = [
    {
        "input": build_input(rec),
        "target": rec["summary"],
        "fund_name": rec["fund_name"],
    }
    for rec in records
]

len(dataset)


2393

In [16]:
import random

# Fixed seed so the shuffle, and therefore the split, is reproducible.
random.seed(42)

indices = list(range(len(dataset)))
random.shuffle(indices)

# 80% train, 10% validation; whatever remains after rounding down goes to test.
n_total = len(indices)
n_train = int(0.8 * n_total)
n_val   = int(0.1 * n_total)
n_test  = n_total - n_train - n_val

val_end = n_train + n_val
train_indices, val_indices, test_indices = (
    indices[:n_train],
    indices[n_train:val_end],
    indices[val_end:],
)

train_data, val_data, test_data = (
    [dataset[idx] for idx in split_indices]
    for split_indices in (train_indices, val_indices, test_indices)
)

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")
print("Example train record keys:", train_data[0].keys())


Train: 1914, Val: 239, Test: 240
Example train record keys: dict_keys(['input', 'target', 'fund_name'])


In [17]:
from datasets import Dataset
from transformers import DataCollatorForSeq2Seq

# Wrap the plain Python lists of examples as Hugging Face datasets.
train_ds, val_ds = (Dataset.from_list(rows) for rows in (train_data, val_data))

# Token budgets applied during tokenization.
max_input_length = 1024      # body sections are truncated to this many tokens
max_target_length = 384      # leaves room for multi-paragraph summaries

def preprocess_batch(batch):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Args:
        batch: Mapping with "input" and "target" entries, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs (truncated to ``max_input_length``)
        with an added "labels" entry holding the target token ids (truncated
        to ``max_target_length``).
    """
    # Tokenize inputs
    model_inputs = tokenizer(
        batch["input"],
        max_length=max_input_length,
        truncation=True,
    )

    # Tokenize targets (labels). The text_target argument replaces the
    # deprecated tokenizer.as_target_tokenizer() context manager.
    labels = tokenizer(
        text_target=batch["target"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches. The original text columns are dropped, so
# only the tokenizer outputs and the labels remain.
train_tokenized, val_tokenized = (
    split.map(preprocess_batch, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

for tokenized_split in (train_tokenized, val_tokenized):
    print(tokenized_split)


Map:   0%|          | 0/1914 [00:00<?, ? examples/s]



Map:   0%|          | 0/239 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1914
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 239
})


In [None]:
from transformers import TrainingArguments, Trainer

output_dir = "./flan_t5_fund_summary"

training_args = TrainingArguments(
    output_dir=output_dir,

    # Recent transformers releases name this "eval_strategy"; older releases
    # called it "evaluation_strategy".
    eval_strategy="epoch",
    save_strategy="epoch",

    logging_steps=50,

    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,

    # Gradients are accumulated over 4 steps, i.e. 2 * 4 = 8 examples per
    # optimizer update on each device.
    gradient_accumulation_steps=4,

    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,

    # predict_with_generate is an option of Seq2SeqTrainingArguments, not of
    # TrainingArguments, so it is not passed here.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    # `tokenizer=` is deprecated for Trainer.__init__; `processing_class` is
    # its replacement (requires a transformers release that accepts it).
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
