In [1]:
import pickle
from datetime import datetime

from sentence_transformers import SentenceTransformer

from config import DEVICE, STATEMENT_FILE
from graphprompt import gprompt_analyze
from models import Observation
import polars as pl
import json
import os


In [2]:
# Settings forwarded verbatim to the checkpoint's config. `is_text_encoder`
# is switched on, so the notebook only ever feeds text to the model.
# NOTE(review): `vector_dim` presumably selects the embedding width; confirm
# against the model card.
jasper_config = {
    "is_text_encoder": True,
    "vector_dim": 1024,
    "use_memory_efficient_attention": False,
    "unpad_inputs": False,
}

# Embedding model shared by every later cell that calls `model.encode`.
model = SentenceTransformer(
    "infgrad/jasper_en_vision_language_v1",
    trust_remote_code=True,
    device=DEVICE,
    config_kwargs=jasper_config,
)

# Upper bound on the sequence length the encoder will accept.
model.max_seq_length = 1024

Some weights of the model checkpoint at infgrad/jasper_en_vision_language_v1 were not used when initializing JasperVL: {'vision_model.vision_model.encoder.layers.5.self_attn.out_proj.weight', 'vision_model.vision_model.encoder.layers.10.mlp.fc1.bias', 'vision_model.vision_model.encoder.layers.21.self_attn.k_proj.bias', 'vision_model.vision_model.encoder.layers.7.layer_norm2.weight', 'vision_model.vision_model.encoder.layers.17.self_attn.k_proj.bias', 'vision_model.vision_model.encoder.layers.18.layer_norm2.bias', 'vision_model.vision_model.encoder.layers.10.self_attn.v_proj.weight', 'vision_model.vision_model.encoder.layers.11.self_attn.out_proj.bias', 'vision_model.vision_model.encoder.layers.13.self_attn.v_proj.weight', 'vision_model.vision_model.encoder.layers.17.layer_norm1.weight', 'vision_model.vision_model.encoder.layers.25.self_attn.out_proj.weight', 'vision_model.vision_model.encoder.layers.9.layer_norm1.weight', 'vision_model.vision_model.encoder.layers.7.self_attn.k_proj.wei

In [3]:
# Raw FOMC communications; the file holds both "Minute" and "Statement" rows
# (see the `Type` column in the rendered table).
fomc_csv_path = "./data/raw/communications.csv"
raw_fomc = pl.read_csv(fomc_csv_path)
raw_fomc

Date,Release Date,Type,Text
str,str,str,str
"""2024-12-18""","""2025-01-08""","""Minute""","""Minutes of the Federal Open Ma…"
"""2024-12-18""","""2024-12-18""","""Statement""","""Recent indicators suggest that…"
"""2024-11-07""","""2024-11-26""","""Minute""","""Minutes of the Federal Open Ma…"
"""2024-11-07""","""2024-11-07""","""Statement""","""Recent indicators suggest that…"
"""2024-09-18""","""2024-09-18""","""Statement""","""Recent indicators suggest that…"
…,…,…,…
"""2000-05-16""","""2000-05-16""","""Statement""","""The Federal Open Market Commit…"
"""2000-03-21""","""2000-05-18""","""Minute""","""Minutes of the Federal Open Ma…"
"""2000-03-21""","""2000-03-21""","""Statement""","""The Federal Open Market Commit…"
"""2000-02-02""","""2000-03-23""","""Minute""","""Minutes of the Federal Open Ma…"


In [4]:
# Keep only the policy statements, keyed by the day they were released and
# ordered chronologically.
fomc = (
    raw_fomc.filter(pl.col("Type") == "Statement")
    .select(
        pl.col("Release Date").str.to_datetime().alias("date"),
        pl.col("Text").alias("text"),
    )
    .sort("date")
)
fomc

date,text
datetime[μs],str
2000-02-02 00:00:00,"""The Federal Open Market Commit…"
2000-03-21 00:00:00,"""The Federal Open Market Commit…"
2000-05-16 00:00:00,"""The Federal Open Market Commit…"
2000-06-28 00:00:00,"""The Federal Open Market Commit…"
2000-08-22 00:00:00,"""The Federal Open Market Commit…"
…,…
2024-06-12 00:00:00,"""Recent indicators suggest that…"
2024-07-31 00:00:00,"""Recent indicators suggest that…"
2024-09-18 00:00:00,"""Recent indicators suggest that…"
2024-11-07 00:00:00,"""Recent indicators suggest that…"


In [5]:
import glob
from pathlib import Path

# Earliest report year to keep.
BEIGE_BOOK_MIN_YEAR = 2000

# Get all summary files. Sorted so the processing order does not depend on
# the order in which the filesystem happens to list them.
beige_book_files = sorted(
    glob.glob("./data/raw/beige-book/**/*-su.txt", recursive=True)
)

beige_book_data = []
for file_path in beige_book_files:
    # Extract date from filename (e.g. "1971-01-su.txt" -> "1971-01")
    date_str = Path(file_path).stem.replace("-su", "")

    # Only the date parse is guarded here, so that read/decoding problems are
    # not mistaken for a badly named file.
    try:
        date = datetime.strptime(date_str, "%Y-%m")
    except ValueError:
        # Skip files that don't match expected date format
        continue

    # Only include summaries from BEIGE_BOOK_MIN_YEAR onwards
    if date.year < BEIGE_BOOK_MIN_YEAR:
        continue

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    except UnicodeDecodeError as exc:
        # UnicodeDecodeError is a ValueError subclass; a single broad handler
        # would drop such files without a trace. They are still skipped, but
        # the skip is now visible in the cell output.
        print(f"Skipping {file_path}: not valid UTF-8 ({exc})")
        continue

    beige_book_data.append({"date": date, "text": text, "filename": file_path})

if not beige_book_data:
    # Without rows the DataFrame has no "date" column and the sort below
    # would fail with a far less helpful error.
    raise FileNotFoundError(
        "No beige book summaries found under ./data/raw/beige-book"
    )

# Convert to polars DataFrame and sort by date
beige_book_df = pl.DataFrame(beige_book_data).sort("date")
beige_book_df

date,text,filename
datetime[μs],str,str
2000-01-01 00:00:00,"""Reports from most Federal Rese…","""./data/raw/beige-book/2000/01/…"
2000-03-01 00:00:00,"""Reports from the twelve Federa…","""./data/raw/beige-book/2000/03/…"
2000-05-01 00:00:00,"""Reports from the twelve Federa…","""./data/raw/beige-book/2000/05/…"
2000-06-01 00:00:00,"""Reports from the Federal Reser…","""./data/raw/beige-book/2000/06/…"
2000-08-01 00:00:00,"""The information collected for …","""./data/raw/beige-book/2000/08/…"
…,…,…
2019-11-01 00:00:00,"""This report was prepared at th…","""./data/raw/beige-book/2019/11/…"
2020-01-01 00:00:00,"""This report was prepared at th…","""./data/raw/beige-book/2020/01/…"
2020-03-01 00:00:00,"""This report was prepared at th…","""./data/raw/beige-book/2020/03/…"
2020-04-01 00:00:00,"""This report was prepared at th…","""./data/raw/beige-book/2020/04/…"


In [6]:
import threading
from concurrent.futures import ThreadPoolExecutor

# Serialises appends to the shared JSONL file: several worker threads write
# to it, and a record larger than the I/O buffer could otherwise be split
# and interleaved with another thread's record.
_jsonl_write_lock = threading.Lock()


def process_row(row):
    """Analyse one beige book row and append the result to the JSONL file.

    Args:
        row: Mapping with the keys ``"text"`` (document body), ``"filename"``
            (stored as the analysis ``"source"``) and ``"date"`` (an object
            exposing ``year``, ``month`` and ``day``).

    Returns:
        None. The analysis is written as one JSON line to
        ``./data/analyses-bb.jsonl``.

    Note:
        The result of ``gprompt_analyze`` is assumed to be a JSON-serialisable
        dict; verify against ``graphprompt``.
    """
    g_analysis = gprompt_analyze({"document": row["text"]})
    g_analysis["source"] = row['filename']  # type: ignore
    g_analysis["month"] = row["date"].month
    g_analysis["day"] = row["date"].day
    g_analysis["year"] = row["date"].year

    # Serialise outside the lock so only the actual append is exclusive.
    record = json.dumps(g_analysis) + "\n"
    with _jsonl_write_lock:
        with open("./data/analyses-bb.jsonl", "a") as f:
            f.write(record)


rows = list(beige_book_df.iter_rows(named=True))
with ThreadPoolExecutor(max_workers=8) as executor:
    # submit() rather than map(): every row is attempted and each outcome can
    # be inspected individually afterwards.
    futures = [(row["filename"], executor.submit(process_row, row)) for row in rows]

# Leaving the `with` block waits for all workers, so every future is done
# here and `exception()` returns immediately.
failures = [
    (filename, future.exception())
    for filename, future in futures
    if future.exception() is not None
]
if failures:
    for filename, exc in failures:
        print(f"Analysis failed for {filename}: {exc!r}")
    raise RuntimeError(
        f"{len(failures)} of {len(rows)} beige book analyses failed"
    ) from failures[0][1]

In [7]:
# Resume from the previously pickled observations when they exist; a missing
# file simply means this is the first run.
try:
    with open(STATEMENT_FILE, mode="rb") as statement_fh:
        statements = pickle.load(statement_fh)
except FileNotFoundError:
    statements = []


In [8]:
# Load analyses from JSONL file (one JSON document per line).
with open("./data/analyses-bb.jsonl", "r") as f:
    # Blank lines carry no record; skipping them avoids a JSONDecodeError.
    analyses = [json.loads(line) for line in f if line.strip()]


# NOTE(review): `statements` is extended in place and re-saved below, so
# re-running this cell on its own appends the same observations again.
for g_analysis in analyses:
    observations = g_analysis["observations"]
    if not observations:
        # Nothing to embed or store for this document.
        continue

    # One batched encode call per field; row i of each result belongs to
    # observations[i].
    what_embeddings = model.encode(
        [obs["what"] for obs in observations],
        device=DEVICE,
    )
    how_embeddings = model.encode(
        [obs["how"] for obs in observations],
        device=DEVICE,
    )

    # Every observation of a document shares the document's date, so it is
    # built once here rather than once per observation.
    observed_on = datetime(
        g_analysis["year"], g_analysis["month"], g_analysis["day"]
    )

    # Convert observations to Observation objects and add to statements
    for obs, what_embedding, how_embedding in zip(
        observations, what_embeddings, how_embeddings
    ):
        statements.append(
            Observation(
                id=None,
                what=obs["what"],
                what_embedding=what_embedding,
                how=obs["how"],
                how_embedding=how_embedding,
                citations=obs["citations"],
                source=g_analysis["source"],
                date=observed_on,
            )
        )


# Save updated statements
with open(STATEMENT_FILE, "wb") as f:
    pickle.dump(statements, f)