In [37]:
import sys
import os
import gc
import time

# Parent directory of the notebook, added to the import path (presumably so the
# project's own `newsies` package can be imported from here — confirm layout).
project_root = os.path.abspath("..")  # Adjust if needed
# Only append once: re-running this cell must not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import pickle
from newsies.ap_news.article import Article


In [324]:
from huggingface_hub import snapshot_download

In [515]:
# Hugging Face repo id of the base model. It is passed to snapshot_download
# and to both from_pretrained calls in this notebook.
# The project-level constant is left commented out in favour of the literal below:
#from newsies.llm.specs import _BASE_MODEL_NAME as model_name
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

In [326]:
import torch
from pathlib import Path

In [6]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    logging as hf_logging,
)

In [3]:
# Load one pickled article into `a` (presumably a newsies Article — confirm).
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# only open pickles that were produced by a trusted pipeline.
with open("../daily_news/apnews.com/79008acf2e1641fda3b05f644903d73c.pkl","rb") as fh:
    a = pickle.load(fh)

In [4]:
# Show the tagged text of the article; this same string is spliced into the
# summarization prompt further down.
print(a.formatted)

[|ITEM ID START|]79008acf2e1641fda3b05f644903d73c[|ITEM ID END|]
[|PUBLISH DATE START|]2025-05-07T17:20:02+00:00[|PUBLISH DATE END|]
[|SECTION START|]u.s. news[|SECTION END|]: [|TITLE START|]Washington governor signs rent-control bill into law[|TITLE END|]
[|SECTION START|]politics[|SECTION END|]: [|TITLE START|]Washington governor signs rent-control bill into law[|TITLE END|]
[|SECTION START|]business[|SECTION END|]: [|TITLE START|]Washington governor signs rent-control bill into law[|TITLE END|]
[|AUTHOR START|]MARTHA BELLISLE[|AUTHOR END|]
[|ARTICLE START|]Washington state Gov. Bob Ferguson signed a bill into law Wednesday that sets limits on rent increases, making the state among the first in the nation to provide protections for tenants.
The rent stabilization measure, House Bill 1217, adds Washington to states like Oregon and California that have sought new ways to curb homelessness. 
Bill sponsor Sen. Emily Alvarado, a West Seattle Democrat, said the measure sets common-sense gu

In [516]:
# Local directory for the raw checkpoint files: ~/mistral_models/<repo id>.
mistral_models_path = Path.home().joinpath("mistral_models", model_name)
# Files requested from the repo; the same list decides whether the download
# has already completed.
mistral_model_files = [
    "params.json",
    "consolidated.safetensors",
    "tokenizer.model.v3",
]
# Check for the files themselves rather than the directory: the directory is
# created before the download starts, so an interrupted download would
# otherwise be reported as "already downloaded" on every later run.
if all(mistral_models_path.joinpath(name).exists() for name in mistral_model_files):
    print(
        f"mistral models already downloaded to {mistral_models_path}"
    )
else:
    print(f"downloading {model_name} to {mistral_models_path}")
    mistral_models_path.mkdir(parents=True, exist_ok=True)

    snapshot_download(
        repo_id=model_name,
        allow_patterns=mistral_model_files,
        local_dir=mistral_models_path,
    )

mistral models already downloaded to /home/mpeters/mistral_models/mistralai/Mistral-7B-Instruct-v0.3


In [517]:
# Drop any model reference left over from an earlier run of the loading cell
# before collecting garbage. The order matters: the collector can only reclaim
# the old object once `model` no longer points at it (and nothing else does).
model = None
gc.collect()
if torch.cuda.is_available():
    # With the old model unreferenced, ask PyTorch to release the CUDA memory
    # it is still holding on to.
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [518]:
import re

def extract_summary_blocks(text, tag_str:str="SUMMARY"):
    """
    Extracts all content between [|<tag_str> START|] and [|<tag_str> END|] tags.

    Args:
        text: the string to search; a tagged block may span several lines.
        tag_str: the tag name, matched literally (default "SUMMARY").

    Returns:
        A list holding the whitespace-stripped content of every complete
        block, in order of appearance. An opening tag that is never closed
        contributes nothing.
    """
    # re.escape keeps regex metacharacters in tag_str from being interpreted,
    # and the raw f-string avoids the invalid "\|" / "\]" escape sequences
    # that a plain string literal would contain.
    tag = re.escape(tag_str)
    pattern = rf"\[\|{tag} START\|\](.*?)\[\|{tag} END\|\]"
    # DOTALL lets "." cross newlines so multi-line blocks are captured whole.
    matches = re.findall(pattern, text, re.DOTALL)
    return [match.strip() for match in matches]


In [519]:
# Load the tokenizer and the half-precision causal LM for `model_name`.
# The two loads do not depend on each other.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    torch_dtype=torch.float16,
)


In [520]:
# Summarization prompt: instructions and a worked example, then the article's
# tagged text, then the closing [/INST] and an opening summary tag for the
# model to continue from.
# Every literal below is exactly one line of the prompt, so the trailing
# spaces that are part of the prompt text are visible inside the quotes.
prompt = "".join(
    (
        "\n".join(
            (
                "",
                "<s>",
                "[INST]",
                "You are a summarization system that produces two- to three-sentence summaries of articles.  ",
                "",
                "# Instructions",
                "Summaries should be two- to -three sentences (inside [|SUMMARY START|] ... [|SUMMARY END|]).  The summaries",
                "should include one or two significant facts from the article, but should still be terse.  Summaries should use proper names and titles",
                "whenever possible and minimize the use of pronouns",
                "",
                "1. A factual summary of the main events",
                "2. A factual summary of the participating individuals and affected places or objects. Always include names and titles.",
                "3. A summary of the societal, emotional, or possible future impact",
                "",
                "## An example is below:",
                "----------------------------",
                "    Article Context:",
                "    [|ITEM ID START|]abcdef0123456789 [|ITEM ID END|]",
                "    [|ARTICLE START|]",
                "    There is a toybox in my mother's basement that was a gift from my grandfather.  ",
                "    The toybox contains three blue balls and two brown balls, and a contingent of plastic army men from Company Alpha. ",
                "    All the balls are different sizes, but all of the soldiers are the same size.  The toybox is red painted ",
                "    wood with brass hinges and leather handles.",
                "    [|ARTICLE END|]",
                "",
                "    ",
                "    [|SUMMARY START|] There are three blue balls and two brown balls in the basement toybox. ",
                "    The toybox also contains Company Alpha army men.[|SUMMARY END|]",
                "    [|SUMMARY START|]The article mentions the author's mother and grandfather.[|SUMMARY END|]",
                "    [|SUMMARY START|]The article about the author's toybox is rather terse and emotionless.[|SUMMARY END|]",
                "----------------------------    ",
                "",
                "",
                "Article Context:",
                "----------------------------",
                "",
            )
        ),
        a.formatted,
        "\n".join(
            (
                "",
                "[/INST]",
                "    ",
                "[|SUMMARY START|]",
                "",
            )
        ),
    )
)

In [521]:
def generate_summary(prompt:str, qty:int):
    """
    Sample `qty` completions of `prompt` and return them as decoded text.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        prompt: the full prompt text to continue.
        qty: number of sequences to sample for this prompt.

    Returns:
        A list of `qty` strings, one per sampled sequence, each produced by
        tokenizer.decode on a row of the generate() output.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    # Follow the model's own device instead of assuming "cuda", and keep the
    # attention mask so generate() does not have to guess it.
    device = model.device
    resp = model.generate(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_length=2048,
        temperature=0.7,
        top_p=0.98,
        do_sample=True,
        num_return_sequences=qty,
    )
    # One row per returned sequence.
    return [tokenizer.decode(sequence) for sequence in resp]

In [522]:
q=1
# The decoded output contains the prompt as well as the generated text, and the
# prompt's instructions and worked example already hold complete SUMMARY
# blocks. Skip those first, so that a generation with fewer than three
# summaries cannot hand back prompt text as if it were a result.
prompt_block_count = len(extract_summary_blocks(prompt, "SUMMARY"))
summaries = [
    s
    for o in generate_summary(prompt, q)
    # Keep at most the last three generated blocks per sampled output.
    for s in extract_summary_blocks(o, "SUMMARY")[prompt_block_count:][-3:]
]

In [523]:
# Inspect the extracted summaries (a flat list of strings).
print(summaries)

['Washington governor Bob Ferguson has signed a rent-control bill into law, setting limits on rent increases. This makes Washington one of the first states to provide tenant protections, following in the footsteps of Oregon and California. The bill caps rent increases at 7% plus inflation or 10%, whichever is lower, and includes protections for single-family home renters. The measure aims to prevent excessive rent increases for hardworking families and older adults, addressing the ongoing homelessness issue.', "The article was authored by Martha Bellisle and discusses the recent signing of a rent-control bill in Washington state by Governor Bob Ferguson. The bill aims to provide tenant protections, making Washington one of the first states to do so. The bill's sponsor, Sen. Emily Alvarado, emphasized that housing is a basic human need and everyone deserves a stable and affordable home.", "The successful passage of the rent-control bill has been met with praise from Governor Ferguson an

In [497]:
def qa_prompt(summary:str)->str:
    """
    Build the question/answer generation prompt for one summary.

    The prompt consists of the instructions with one worked example, then
    `summary` wrapped in SUMMARY tags, and finally an opening QUESTION tag for
    the model to continue from.

    Args:
        summary: the summary text to generate a question and answer about.

    Returns:
        The complete prompt string.
    """
    # One literal per prompt line; trailing spaces are part of the prompt text.
    head_lines = (
        "",
        "<s>",
        "[INST]",
        "You are a question-and-answer generation system.  Generate a specific, detailed question regarding the events, entities and impacts",
        "of the provided summary.  Generate answers that leverage as many meaningful details from the provided summary as necessary to fully ",
        "answer the question.",
        "",
        "An example follows:",
        "----------------------------    ",
        "",
        "[|SUMMARY START|]",
        "There are three blue balls and two brown balls in the basement toybox. The toybox also contains plastic army men from Company Alpha.",
        "[|SUMMARY END|]",
        "",
        "[|QUESTION START|]",
        "How many balls are in the basement toybox?",
        "[|QUESTION END|]",
        "",
        "[|ANSWER START|]",
        "There are five balls in the basement toybox: two brown and three blue.",
        "[|ANSWER END|]",
        "",
        "[/INST]",
        "",
        "[|SUMMARY START|]",
        "",
    )
    tail_lines = (
        "",
        "[|SUMMARY END|]",
        "    ",
        "[|QUESTION START|]",
        "",
    )
    return "\n".join(head_lines) + summary + "\n".join(tail_lines)

In [498]:
# Empty accumulators for generated questions and answers.
# NOTE(review): nothing in the visible cells appends to or reads these lists —
# confirm they are still needed.
questions=[]
answers=[]

In [499]:
def gen_qa(summary:str, qty:int=1):
    """
    Sample `qty` continuations of a question/answer prompt.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        summary: the text to continue. Despite the name, the caller in this
            notebook passes the full prompt built by qa_prompt(), not a bare
            summary.
        qty: number of sequences to sample (default 1).

    Returns:
        The result of model.generate, left undecoded; the caller iterates over
        it and decodes each row with tokenizer.decode.
    """
    encoded = tokenizer(summary, return_tensors="pt")
    # Follow the model's own device instead of assuming "cuda", and keep the
    # attention mask so generate() does not have to guess it.
    device = model.device
    return model.generate(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_length=2048,
        temperature=0.40,
        top_p=0.98,
        do_sample=True,
        num_return_sequences=qty,
    )

In [500]:
# Generate and print one question/answer pair per summary.
for s in summaries:
    qa_text = qa_prompt(s)
    # The prompt carries a worked example with its own QUESTION/ANSWER blocks,
    # and the decoded output contains the prompt. Count the prompt's blocks so
    # they can be skipped; otherwise a generation that never closes its tag
    # would print the example as if the model had produced it.
    prompt_question_count = len(extract_summary_blocks(qa_text, "QUESTION"))
    prompt_answer_count = len(extract_summary_blocks(qa_text, "ANSWER"))
    response=gen_qa(qa_text)
    for r in response:
        rr = tokenizer.decode(r)
        generated_questions = extract_summary_blocks(rr, "QUESTION")[prompt_question_count:]
        generated_answers = extract_summary_blocks(rr, "ANSWER")[prompt_answer_count:]
        print("SUMMARY:",s)
        print("Q:",generated_questions[-1] if generated_questions else "<no question generated>")
        print("A:",generated_answers[-1] if generated_answers else "<no answer generated>")
        print()
        

SUMMARY: Washington Governor Bob Ferguson signed a bill into law that sets limits on rent increases, making it one of the first states to provide rent protection for tenants. The rent stabilization measure caps increases at 7% plus inflation or 10%, whichever is lower, and includes single-family homes. The bill was sponsored by Senator Emily Alvarado, who emphasized the importance of stable and affordable housing for all residents.
Q: Why did Senator Emily Alvarado emphasize the importance of the rent stabilization measure?
A: Senator Emily Alvarado emphasized the importance of the rent stabilization measure because she believes in providing stable and affordable housing for all residents.

SUMMARY: The article was written by Martha Bellisle and published on May 7, 2025, in the U.S. news, politics, and business sections. The article discusses the signing of a rent-control bill in Washington state by Governor Bob Ferguson.
Q: What significant event did the article discuss?
A: The articl