# Proof-of-Concept for Fine-Tuning Model with PEFT for Daily News

In [None]:
!pip install peft datasets
!pip install mistral_inference
#!pip install accelerate


In [13]:
import chromadb
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from tqdm import tqdm
from datetime import datetime
import spacy

In [16]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
#from accelerate import dispatch_model

# Load spaCy model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_sm")

In [4]:
import sys
import os

project_root = os.path.abspath("..")  # Adjust if needed
sys.path.append(project_root)

from newsies.chromadb_client import ChromaDBClient, collections, get_all_headlines, find_ordinal
from newsies import targets


['.pytest_cache', 'newsies.log', 'docs', 'dist', 'newsies.egg-info', '.git', '.newsies.pid', 'nohup.out', 'scripts', 'build', 'LICENSE', 'newsies_err.log', 'daily_news', 'notebooks', '.gitignore', 'docker', 'requirements.txt', 'setup.py', 'tests', 'newsies', 'junit', '.vscode', 'README.md']


In [5]:
# Step 1: Connect to ChromaDB and Retrieve Data
def fetch_news_data():
    client = ChromaDBClient()  # Update path
    client.collection_name=f"ap_news_{datetime.now().strftime(r'%Y-%m-%d')}"
    print(f"collection name: {client.collection.name}")
    collection = client.collection
    n  = collection.count()
    print(f"there are {n} stories in the collection")
    results = collection.get(where={"target":{"$eq":targets.DOCUMENT}}, limit=n)  
    return results["documents"], results["metadatas"]

news_docs, news_metadata = fetch_news_data()

collection name: ap_news_2025-03-12
there are 3727 stories in the collection


In [6]:
news_docs[0]

'NEW YORK (AP) — Harvey Weinstein ‘s #MeToo retrial next month will largely be an abridged version of the original, with one big addition: a charge based on an allegation from a woman who wasn’t a part of the first case.\nJust how the reprise of the disgraced movie mogul’s prosecution plays out is coming into focus at a hearing Wednesday, where a judge is set to issue rulings on a variety of issues, including the scope of accuser testimony and potential expert witnesses.\nWeinstein, 72, was in court for the hearing, which started more than a hour late after Judge Curtis Farber met with the prosecution and defense behind closed doors to discuss matters still under seal.\nThose included a prosecution request that two of the three accusers in the case be allowed to testify about other alleged encounters with Weinstein. They also discussed evidence of the accusers’ sexual history, which prosecutors say should be barred under New York’s Rape Shield Law.'

In [7]:
news_metadata[0]

{'chunk_index': 0,
 'collection': 'ap_news_2025-03-12',
 'date': '2025-03-12',
 'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
 'headline0': 'Harvey Weinstein appears in court  as judge weighs key rulings for his looming #MeToo retrial',
 'headline1': 'Harvey Weinstein appears in court as judge weighs key rulings for his looming #MeToo retrial',
 'headline2': 'Harvey Weinstein due in court for key rulings as his #MeToo retrial nears',
 'section0': '',
 'section1': 'politics',
 'section2': 'technology',
 'target': 'DOCUMENT',
 'text': 'NEW YORK (AP) — Harvey Weinstein ‘s #MeToo retrial next month will largely be an abridged version of the original, with one big addition: a charge based on an allegation from a woman who wasn’t a part of the first case.\nJust how the reprise of the disgraced movie mogul’s prosecution plays out is coming into focus at a hearing Wednesday, where a judge is set to issue rulings on a variety of issues, including the scope of accuser testimony an

## Use Flan-T5-large to generate questions for each article and for the named entities in it

In [36]:
# Step 2: Generate Question-Answer Pairs using an LLM
device = 0 if torch.cuda.is_available() else -1  # Use GPU if available
qa_generator = pipeline("text2text-generation", model="google/flan-t5-large", device=device)

def extract_named_entities(text):
    doc = nlp(text)
    entities = list(set(ent.text for ent in doc.ents if ent.label_ in {"PERSON", "ORG", "GPE"}))
    return entities

def generate_qa_pairs(news_docs, news_metadata):
    qa_data = []
    question_prompts = []
    entity_prompts = []
    
    for doc, meta in tqdm(zip(news_docs, news_metadata), total=len(news_docs), desc="Generating QA Prompts"):
        context = f"{meta['section0'] or 'front-page'}: {doc}" # in the front-page section is ''
        if meta["section1"] != "N/A":
            context += f"\n{meta['section1']}: {doc}"
        if meta["section2"] != "N/A":
            context += f"\n{meta['section2']}: {doc}"
        
        # Extract named entities from article
        entities = extract_named_entities(doc)
        
        # Generate 3 diverse questions about the article
        question_prompts.append(
            f"For the following question, return the section, headline, and URI: Generate 3 different questions about the following news article. "
            f"Include questions that focus on key details, impacts, and reasons. "
            f"News: {context}"
        )
        
        # Generate one question per named entity
        for entity in entities:
            entity_prompts.append(
                f"For the following question, return the section, headline, and URI: Generate a question about {entity} in relation to the following news article. "
                f"News: {context}"
            )
    
    # Generate questions
    print(datetime.now(), "generate article questions")
    article_questions = qa_generator(question_prompts, max_length=50, truncation=True)
    print(datetime.now(), "generate entity questions")
    entity_questions = qa_generator(entity_prompts, max_length=50, truncation=True) if entity_prompts else []
    print(datetime.now(), "prompt generation complete")
    entity_idx = 0
    for (doc, meta), article_question_output in tqdm(zip(zip(news_docs, news_metadata), article_questions), total=len(news_docs), desc="Processing QA Pairs"):
        questions = article_question_output["generated_text"].split("\n")
        
        # Store article questions
        qa_data.append({
            "questions": questions,
            "context": doc,
            "answer": [{"headline": meta["headline0"], "uri": meta["uri"]}]
        })
        
        # Store entity-based questions
        entities = extract_named_entities(doc)
        for entity in entities:
            if entity_idx < len(entity_questions):
                qa_data.append({
                    "questions": [entity_questions[entity_idx]["generated_text"]],
                    "context": doc,
                    "answer": [{"headline": meta["headline0"], "uri": meta["uri"]}]
                })
                entity_idx += 1
    
    return qa_data

Device set to use cuda:0


### Generate the training data

In [None]:
qa_data = generate_qa_pairs(news_docs, news_metadata)

Generating QA Prompts: 100%|████████████████████████████████████████████████████████████████████████| 2994/2994 [00:41<00:00, 71.45it/s]


2025-03-12 13:54:31.306997 generate article questions


In [None]:
qa_dataset[2]

In [30]:
# Step 3: Serialize and Deserialize QA Dataset using Parquet
def save_qa_to_parquet(qa_data, file_path=f"qa_dataset_{datetime.now().strftime(r'%Y-%m-%d')}.parquet"):
    df = pd.DataFrame(qa_data)
    df.to_parquet(file_path, index=False)

def load_qa_from_parquet(file_path=f"qa_dataset_{datetime.now().strftime(r'%Y-%m-%d')}.parquet"):
    df = pd.read_parquet(file_path)
    return df.to_dict(orient="records")

# Save dataset
save_qa_to_parquet(qa_dataset)

# Load dataset
qa_dataset = load_qa_from_parquet()

FileNotFoundError: [Errno 2] No such file or directory: 'qa_dataset_2025-03-12.parquet'

In [7]:
qa_dataset[0]

{'question': 'What is the main reason for the rise in food poisoning in November and December?',
 'context': 'health: Ready or not, the holidays are here. It’s a time when many Americans accustomed to preparing simple meals find themselves responsible for safely serving multi-dish feasts.\nIt’s no easy task. Outbreaks of some types of food poisoning tend to rise in November and December, according to the U.S. Centers for Disease Control and Prevention. Tainted turkey, undercooked stuffing and germ-laced gravy from holiday buffets have all led to past illnesses — and even deaths — CDC investigators have found.\nIt can be tricky for occasional cooks to prepare big meals in a way that avoids the common hazards that can make people sick, said Donald Schaffner, a food science expert at Rutgers University.\n“Cooking takes longer with big masses of food. Cooling takes longer with big masses of food,” said Schaffner, who co-hosts the food-safety podcast “Risky or Not?”\nAP Washington correspon

## Remove the Flan-T5 model from GPU

In [None]:
del qa_generator
torch.cuda.empty_cache()

In [10]:
from huggingface_hub import snapshot_download
from pathlib import Path

mistral_models_path = Path.home().joinpath('mistral_models', '7B-v0.3')
mistral_models_path.mkdir(parents=True, exist_ok=True)

snapshot_download(repo_id="mistralai/Mistral-7B-v0.3", allow_patterns=["params.json", "consolidated.safetensors", "tokenizer.model.v3"], local_dir=mistral_models_path)


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

params.json:   0%|          | 0.00/202 [00:00<?, ?B/s]

tokenizer.model.v3:   0%|          | 0.00/587k [00:00<?, ?B/s]

consolidated.safetensors:   0%|          | 0.00/14.5G [00:00<?, ?B/s]

'/home/mpeters/mistral_models/7B-v0.3'

In [24]:
# Step 4: Prepare Data for Fine-Tuning
def format_dataset(qa_dataset):
    dataset = Dataset.from_pandas(pd.DataFrame([{ "input_text": item["question"], "output_text": str(item["answer"]) } for item in qa_dataset]))
    return dataset.train_test_split(test_size=0.2)

split_dataset = format_dataset(qa_dataset)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]


In [33]:
# Step 5: Load Model and Apply LoRA Fine-Tuning
base_model_name = "mistralai/Mistral-7B-v0.3"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [34]:
# LoRA Configuration
lora_config = LoraConfig(
    r=8, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none"
)
model = get_peft_model(model, lora_config)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./news_finetune_model",
    per_device_train_batch_size=1,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    fp16=True,
    optim="adamw_torch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


[2025-03-11 14:40:46,004] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


df: /home/mpeters/.triton/autotune: No such file or directory
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [output_text, input_text]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [None]:
# Step 5: Evaluate the Fine-Tuned Model
def evaluate_model(sample_question):
    inputs = tokenizer(sample_question, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=50)
    return tokenizer.decode(output[0], skip_special_tokens=True)

sample_question = qa_dataset[0]["question"]
response = evaluate_model(sample_question)
print(f"Q: {sample_question}\nA: {response}")
