# Proof-of-Concept for Fine-Tuning Model with PEFT for Daily News

In [1]:
#!pip install peft datasets
#!pip install mistral_inference

import json
import chromadb
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline
import torch
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from tqdm.notebook import tqdm
from datetime import datetime
import spacy
from datasets import Dataset

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
#from accelerate import dispatch_model

# Load spaCy model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_sm")

import sys
import os

project_root = os.path.abspath("..")  # Adjust if needed
sys.path.append(project_root)

from newsies.chromadb_client import ChromaDBClient, collections, get_all_headlines, find_ordinal
from newsies import targets


! mkdir -p ./training_data

In [None]:
# Step 1: Connect to ChromaDB and Retrieve Data
def fetch_news_data():
    client = ChromaDBClient()  # Update path
    client.collection_name=f"ap_news_2025-03-12"
    print(f"collection name: {client.collection.name}")
    collection = client.collection
    n  = collection.count()
    print(f"there are {n} stories in the collection")
    results = collection.get(where={"target":{"$eq":targets.DOCUMENT}}, limit=n)
    return results["documents"], results["metadatas"]

news_docs, news_metadata = fetch_news_data()

news_docs[0]


collection name: ap_news_2025-03-12
there are 4142 stories in the collection


'NEW YORK (AP) — Harvey Weinstein ‘s #MeToo retrial next month will largely be an abridged version of the original, with one big addition: a charge based on an allegation from a woman who wasn’t a part of the first case.\nJust how the reprise of the disgraced movie mogul’s prosecution plays out is coming into focus at a hearing Wednesday, where a judge is set to issue rulings on a variety of issues, including the scope of accuser testimony and potential expert witnesses.\nWeinstein, 72, was in court for the hearing, which started more than a hour late after Judge Curtis Farber met with the prosecution and defense behind closed doors to discuss matters still under seal.\nThose included a prosecution request that two of the three accusers in the case be allowed to testify about other alleged encounters with Weinstein. They also discussed evidence of the accusers’ sexual history, which prosecutors say should be barred under New York’s Rape Shield Law.'

In [3]:
news_metadata[1]

{'chunk_index': 0,
 'collection': 'ap_news_2025-03-12',
 'date': '2025-03-12',
 'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
 'headline0': 'India’s official Oscar entry, which failed to make the cut, wins big at major Bollywood awards show',
 'headline1': 'N/A',
 'headline2': 'N/A',
 'section0': 'entertainment',
 'section1': 'N/A',
 'section2': 'N/A',
 'target': 'DOCUMENT',
 'text': 'JAIPUR, India (AP) — The film that was submitted as India’s official Oscar entry but failed to make the final list of nominees has swept the International Indian Film Academy Awards, which recognize outstanding work in the country’s film industry.\nDirector Kiran Rao’s critically acclaimed “Laapataa Ladies” — renamed “Lost Ladies” for its Oscar campaign — emerged as the biggest winner at the 2025 IIFA Awards, bagging 10 wins, including best picture and best direction.\nThe 2023 comedy is about two veiled brides who are accidentally swapped during a train ride, and tackles issues of patriarc

In [11]:
# Step 2: Generate Question-Answer Pairs using an LLM


def extract_named_entities(text):
    doc = nlp(text)
    entities = list(set(ent.text for ent in doc.ents if ent.label_ in {"PERSON", "ORG", "GPE"}))
    return entities

def save_debug_output(prompt, results):
    debug_data=[]
    debug_data.append({"qty_prompts":len(prompt), "qty_results":len(results)})
    for q,a in zip(prompt, results):
        debug_data.append({"prompt":q, "question":a})
    with open("debug_missing_questions.jsonl","w",encoding="utf8") as fh:
        fh.write(json.dumps(debug_data, indent=4))


def save_qa_to_parquet(qa_data, file_path):
    df = pd.DataFrame(qa_data)
    df.to_parquet(file_path, index=False)

def load_qa_from_parquet(file_path):
    df = pd.read_parquet(file_path)
    return df.to_dict(orient="records")

In [None]:
df = pd.DataFrame({
    "doc":news_docs,
    "uri": [meta["uri"] for meta in news_metadata],
    "section0": [meta["section0"] or "front-page" for meta in news_metadata],
    "headline0":[meta["headline0"] for meta in news_metadata],
    "section1": [meta["section1"] for meta in news_metadata],
    "headline1":[meta["headline1"] for meta in news_metadata],
    "section2": [meta["section2"] for meta in news_metadata],
    "headline2":[meta["headline2"] for meta in news_metadata],
    "ne": [extract_named_entities(doc) for doc in news_docs]
})

df1 = df[ (df["section1"]!="N/A") & (df["headline1"] != "N/A")]
df1 = df1.drop(["section0","headline0"], axis=1)
df1 = df1.rename(columns={"section1":"section", "headline1":"headline"})

df2 = df1[ (df1["section2"]!="N/A") & (df1["headline2"] != "N/A")]
df2 = df2.drop(["section","headline"], axis=1)
df2 = df2.rename(columns={"section2":"section", "headline2":"headline"})

df = df.drop(["section1","headline1","section2","headline2"],axis=1)
df = df.rename(columns={"section0":"section", "headline0":"headline"})
df1 = df1.drop(["section2","headline2"],axis=1)
df = pd.concat([df,df1,df2],ignore_index=True, sort=False)

dfnoents=df[df["ne"].apply(len)==0].copy()

dfne = df[df["ne"].apply(len)>0].copy()
dfne = dfne.explode("ne")



In [79]:

# add prompt
dfnoents["prompt"] = dfnoents.apply( lambda row: (
                "Generate three different questions that a reader might ask about the "
                "following news article. Focus on specific facts, key events, or "
                "important themes in the article. Each of the three questions should be clear, "
                "meaningful, and relevant to the article's details. The questions "
                "should avoid generic inquiries. Ensure the questions cannot be "
                "answered without reading the article.\n"
                f"news article: {row['doc']}"
            ), axis=1)


In [None]:
# add prompt
dfne["prompt"] = dfne.apply(lambda row: (
                    f"Generate a question about '{row['ne']}' that requires knowledge "
                    "of the following news article. Focus on specific facts, key "
                    "events, or important themes in the article. The questions should be "
                    "clear, meaningful, and relevant to the article's details. The questions "
                    "should avoid generic inquiries. Ensure the question cannot be "
                    "answered without reading the article.\n"
                    f"news article: {row['doc']}"

), axis=1)

In [81]:
df = pd.concat([dfnoents, dfne], ignore_index=True, sort=False)
df = df.drop(["ne","doc"],axis=1)
df

Unnamed: 0,uri,section,headline,prompt
0,./daily_news/20250312/gardening-spring-checkli...,front-page,Spring’s official start is nearly here and the...,Generate three different questions that a read...
1,./daily_news/20250312/movie-review-last-breath...,entertainment,Movie Review: A gripping deep-sea rescue missi...,Generate three different questions that a read...
2,./daily_news/20250312/health-insurance-deducti...,health,How to deal with fresh health insurance deduct...,Generate three different questions that a read...
3,./daily_news/20250312/masting-trees-acorns-wal...,science,If it seems like there are a lot of acorns thi...,Generate three different questions that a read...
4,./daily_news/20250312/china-economy-sluggish-c...,business,Consumer prices fell in China in February and ...,Generate three different questions that a read...
...,...,...,...,...
21713,./daily_news/20250312/ultraprocessed-foods-nih...,technology,A National Institutes of Health study aims to ...,Generate a question about 'HHS' that requires ...
21714,./daily_news/20250312/ultraprocessed-foods-nih...,technology,A National Institutes of Health study aims to ...,Generate a question about 'the Robert Wood Joh...
21715,./daily_news/20250312/ultraprocessed-foods-nih...,technology,A National Institutes of Health study aims to ...,Generate a question about 'The Associated Pres...
21716,./daily_news/20250312/ultraprocessed-foods-nih...,technology,A National Institutes of Health study aims to ...,Generate a question about 'AP' that requires k...


In [None]:
import torch
import pandas as pd
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import Dataset
from tqdm import tqdm
from datetime import datetime

# Model & Tokenizer Initialization
base_model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name, torch_dtype=torch.float16, device_map="auto")

# Enable torch.compile() for speedup (if available)
if torch.__version__ >= "2.0":
    model = torch.compile(model)

# Initialize HF Pipeline
qa_generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Convert Pandas DataFrame to Hugging Face Dataset
def convert_df_to_hf_dataset(df):
    return Dataset.from_pandas(df)

# Save to Parquet
def save_qa_to_parquet(dataset, batch_id):
    batch_file = f"training_data/qa_dataset_batch_{batch_id}.parquet"
    dataset.to_parquet(batch_file)

# Load Pandas DataFrame (Assuming `df` already exists)
batch_size = 1000  # Adjust batch size if needed
df["batch"] = df.index // batch_size
dataset = convert_df_to_hf_dataset(df)

# Define processing function for batch generation
def generate_questions(batch, batch_id):
    batch_prompts = batch["prompt"]

    # Generate Questions using Hugging Face Pipeline (Batch Mode)
    batch_questions = qa_generator(
        batch_prompts,
        max_length=100,
        truncation=True,
        num_return_sequences=3,  # Generate 3 questions per prompt
        do_sample=True,  # Introduce randomness for variation
        temperature=0.7,  # Adjust temperature for diversity
        top_p=0.9,  # Nucleus sampling for more natural responses
    )

    # Format questions properly
    formatted_questions = [
        [
            (
                "for the next question, return the 'section', "
                "the 'headline', and the 'URI'\n"
                f"question: '{v}'"
            )
            for d in qs
            for v in d.values()
        ]
        for qs in batch_questions
    ]

    batch["question"] = formatted_questions

    # Save Each Batch Immediately to Parquet
    save_qa_to_parquet(batch, batch_id)

    return batch

# Apply Efficient Batch Processing
dataset = dataset.map(generate_questions, batched=True, batch_size=batch_size, with_indices=True)

# Free GPU Memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print(datetime.now(), "All batches processed")

In [31]:
# Modify the QA generation function
def generate_qa_pairs(news_docs, news_metadata, batch_size=1000, entity_batch_size=1000):
    qa_generator = pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        device=0 if torch.cuda.is_available() else -1,
    )

    total_batches = (len(news_docs) + batch_size - 1) // batch_size

    # Initialize the DataFrames for question and entity prompts
    question_prompts = pd.DataFrame(
        columns=["batch", "doc", "meta", "prompt", "answer"]
    )
    entity_prompts = pd.DataFrame(
        columns=["batch", "doc", "meta", "entity", "prompt", "answer"]
    )
    print(datetime.now(), "start processing generated questions for training")

    for batch_start in range(0, len(news_docs), batch_size):
        entity_idx = 0
        batch_docs = news_docs[batch_start : batch_start + batch_size]
        batch_meta = news_metadata[batch_start : batch_start + batch_size]

        # Fill in question prompts and entity prompts
        for doc, meta in zip(batch_docs, batch_meta):
            context = (
                f"'section': {meta['section0'] or 'front-page'}\t"
                f"'headline':{meta['headline0']}\n"
            )
            if meta["section1"] != "N/A":
                context += (
                    f" 'section': {meta['section1']}\t'headline': {meta['headline1']}\n"
                )
            if meta["section2"] != "N/A":
                context += (
                    f" 'section': {meta['section2']}\t'headline': {meta['headline2']}\n"
                )
            context += f"'URI': {meta['uri']}\n"
            context += f"'article': {doc}"

            # Add question prompt for the document
            question_prompt = (
                "Generate 3 different questions that a reader might ask about the "
                "following news article. Focus on specific facts, key events, or "
                "important themes in the article. The questions should be clear, "
                "meaningful, and relevant to the article's details. The questions "
                "should avoid generic inquiries. Ensure the question cannot be "
                "answered without reading the article.\n"
                f"news article: {doc}"
            )

            answer = f"'URI'  {meta['uri']}\n"
            for i in range(3):
                if meta[f"section{i}"] != "N/A":
                    answer += (
                        f"'section' {meta[f"section{i}"] or "front-page"}\t"
                        f"'headline' {meta[f'headline{i}']}\n"
                    )

            question_prompts.loc[len(question_prompts)] = {
                "batch": f"{batch_start}",
                "doc": doc,
                "meta": meta,
                "prompt": question_prompt,
                "answer": answer,
            }

            # Entities can number an order of magnitude more than questions -
            # so they have to be batched on their own
            # Collect entities in the story
            entities = [e for e in extract_named_entities(doc) if e != "AP"]
            for entity in entities:
                entity_prompt = (
                    f"Generate a question about '{entity}' that requires knowledge "
                    "of the following news article. Focus on specific facts, key "
                    "events, or important themes in the article. The questions should be "
                    "clear, meaningful, and relevant to the article's details. The questions "
                    "should avoid generic inquiries. Ensure the question cannot be "
                    "answered without reading the article.\n"
                    f"news article: {doc}"
                )

                answer = f"'URI'  {meta['uri']}\n"
                for i in range(3):
                    if meta[f"section{i}"] != "N/A":
                        answer += (
                            f"'section' {meta[f"section{i}"] or "front-page"}\t"
                            f"'headline' {meta[f'headline{i}']}\n"
                        )

                entity_prompts.loc[len(entity_prompts)] = {
                    "batch": f"{batch_start}-{entity_idx // batch_size }",
                    "doc": doc,
                    "meta": meta,
                    "entity": entity,
                    "prompt": entity_prompt,
                    "answer": answer,
                }
                entity_idx += 1

        # At this point, question_prompts and entity_prompts have been built.
        # Use the 'batch' field to group the data by batch for efficient processing

        # Group question_prompts by the 'batch' field
        question_prompts_grouped = question_prompts.groupby("batch")
        entity_prompts_grouped = entity_prompts.groupby("batch")

        questions = []
        # Run qa_generator on question_prompt_ds - this should be batched to batch_size
        for batch, batch_data in tqdm(
            question_prompts_grouped,
            desc="Generating questions for question prompts",
            position=2,
        ):

            # Process the batch separately by extracting 'prompt' field
            batch_prompts = batch_data["prompt"].tolist()

            # Call qa_generate with the batch of prompts
            batch_questions = qa_generator(
                batch_prompts, max_length=50, truncation=True
            )
            batch_questions = [
                (
                    "for the next question, return the 'section', "
                    "the 'headline', and the 'URI'\n"
                    f"question: '{q}'"
                )
                for q in batch_questions
            ]

            batch_data["question"] = batch_questions

            batch_file = f"training_data/qa_dataset_batch_{batch}.parquet"
            save_qa_to_parquet(batch_data, batch_file)

            questions.extend(batch_questions)

        questions = []
        for batch, batch_data in tqdm(
            entity_prompts_grouped,
            desc="Generating questions for entity prompts",
            position=2,
        ):

            # Process the batch separately by extracting 'prompt' field
            batch_prompts = batch_data["prompt"].tolist()

            # Call qa_generate with the batch of prompts
            batch_questions = qa_generator(
                batch_prompts, max_length=50, truncation=True
            )
            batch_questions = [
                (
                    "for the next question, return the 'section', "
                    "the 'headline', and the 'URI'\n"
                    f"question: '{q}'"
                )
                for q in batch_questions
            ]
            batch_data["question"] = batch_questions

            batch_file = f"training_data/qa_dataset_batch_{batch}.parquet"
            save_qa_to_parquet(batch_data, batch_file)

            questions.extend(batch_questions)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(datetime.now(), "All batches processed")

In [35]:
generate_qa_pairs(news_docs, news_metadata)

Device set to use cuda:0


2025-03-15 15:09:47.347008 start processing generated questions for training


Generating questions for question prompts:   0%|          | 0/1 [00:00<?, ?it/s]

Generating questions for entity prompts:   0%|          | 0/7 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [16]:
train_df=pd.read_parquet(project_root+"/notebooks/training_data")

In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21253 entries, 0 to 21252
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  21253 non-null  object
 1   context   21253 non-null  object
 2   answer    21253 non-null  object
dtypes: object(3)
memory usage: 498.2+ KB


In [18]:
print(train_df.dtypes)  # Check column types

# Check if all values are strings
print(train_df.map(lambda x: isinstance(x, str)).all())

question    object
context     object
answer      object
dtype: object
question    True
context     True
answer      True
dtype: bool


In [19]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
from IPython.display import display
train_df.head(5)[["question","context","answer"]]

Unnamed: 0,question,context,answer
0,"for the next question, return the 'section', the 'headline', and the 'URI'question: 'What is the name of the movie mogul?'","[section]: front-page\t'headline':Court asked to intervene after email tells USAID workers to destroy classified documents\n [section]: politics\t'headline': N/A\n'URI': ./daily_news/20250312/usaid-trump-burn-order-shred-classified-documents-f042a51c0a9f74c96b0259b51a0d4a83.txt\n'article': Lawsuits are mounting over the abrupt shutdown of most U.S. foreign assistance and the targeting of the aid agency. In the latest court challenge, Personal Services Contractor Association, representing thousands of contractors now furloughed or fired from USAID, asked the judge to stop any document destruction to preserve evidence.\nThe email was sent under the name of Erica Carr — the acting executive secretary at USAID — and bears a USAID logo.\n“Thank you for your assistance in clearing our classified safes and personnel documents” at USAID headquarters in Washington, it begins.",'URI' ./daily_news/20250312/harvey-weinstein-sexual-misconduct-metoo-retrial-2e8f3c99224cf5ad068e7ef5b5907b8d.txt\n'section' front-page\t'headline' Harvey Weinstein appears in court as judge weighs key rulings for his looming #MeToo retrial\n'section' politics\t'headline' Harvey Weinstein appears in court as judge weighs key rulings for his looming #MeToo retrial\n'section' technology\t'headline' Harvey Weinstein due in court for key rulings as his #MeToo retrial nears\n
1,"for the next question, return the 'section', the 'headline', and the 'URI'question: 'What was the name of the film that was submitted as India's official Oscar entry but failed to make the final list of nominees?'","[section]: front-page\t'headline':Court asked to intervene after email tells USAID workers to destroy classified documents\n [section]: politics\t'headline': N/A\n'URI': ./daily_news/20250312/usaid-trump-burn-order-shred-classified-documents-f042a51c0a9f74c96b0259b51a0d4a83.txt\n'article': Lawsuits are mounting over the abrupt shutdown of most U.S. foreign assistance and the targeting of the aid agency. In the latest court challenge, Personal Services Contractor Association, representing thousands of contractors now furloughed or fired from USAID, asked the judge to stop any document destruction to preserve evidence.\nThe email was sent under the name of Erica Carr — the acting executive secretary at USAID — and bears a USAID logo.\n“Thank you for your assistance in clearing our classified safes and personnel documents” at USAID headquarters in Washington, it begins.","'URI' ./daily_news/20250312/india-iifa-bollywood-film-awards-6f827c8885563b258b4abadf3613baad.txt\n'section' entertainment\t'headline' India’s official Oscar entry, which failed to make the cut, wins big at major Bollywood awards show\n"
2,"for the next question, return the 'section', the 'headline', and the 'URI'question: 'What is the name of the Iranian journalist?'","[section]: front-page\t'headline':Court asked to intervene after email tells USAID workers to destroy classified documents\n [section]: politics\t'headline': N/A\n'URI': ./daily_news/20250312/usaid-trump-burn-order-shred-classified-documents-f042a51c0a9f74c96b0259b51a0d4a83.txt\n'article': Lawsuits are mounting over the abrupt shutdown of most U.S. foreign assistance and the targeting of the aid agency. In the latest court challenge, Personal Services Contractor Association, representing thousands of contractors now furloughed or fired from USAID, asked the judge to stop any document destruction to preserve evidence.\nThe email was sent under the name of Erica Carr — the acting executive secretary at USAID — and bears a USAID logo.\n“Thank you for your assistance in clearing our classified safes and personnel documents” at USAID headquarters in Washington, it begins.","'URI' ./daily_news/20250312/iran-murder-plot-trial-masih-alinejad-d17a4b4bad3205c705f4e61f5a288785.txt\n'section' politics\t'headline' At trial’s start, prosecutor blames Iran for plot to assassinate outspoken dissident\n"
3,"for the next question, return the 'section', the 'headline', and the 'URI'question: 'What was the name of the musician who performed at the memorial service?'","[section]: front-page\t'headline':Court asked to intervene after email tells USAID workers to destroy classified documents\n [section]: politics\t'headline': N/A\n'URI': ./daily_news/20250312/usaid-trump-burn-order-shred-classified-documents-f042a51c0a9f74c96b0259b51a0d4a83.txt\n'article': Lawsuits are mounting over the abrupt shutdown of most U.S. foreign assistance and the targeting of the aid agency. In the latest court challenge, Personal Services Contractor Association, representing thousands of contractors now furloughed or fired from USAID, asked the judge to stop any document destruction to preserve evidence.\nThe email was sent under the name of Erica Carr — the acting executive secretary at USAID — and bears a USAID logo.\n“Thank you for your assistance in clearing our classified safes and personnel documents” at USAID headquarters in Washington, it begins.",'URI' ./daily_news/20250312/roberta-flack-memorial-8b8b7151a5c603db8a87a7b1960c554e.txt\n'section' front-page\t'headline' Lauryn Hill and Stevie Wonder delight at Roberta Flack’s ‘Celebration of Life’ memorial\n'section' entertainment\t'headline' N/A\n
4,"for the next question, return the 'section', the 'headline', and the 'URI'question: 'What was the name of the painting that was attacked?'","[section]: front-page\t'headline':Court asked to intervene after email tells USAID workers to destroy classified documents\n [section]: politics\t'headline': N/A\n'URI': ./daily_news/20250312/usaid-trump-burn-order-shred-classified-documents-f042a51c0a9f74c96b0259b51a0d4a83.txt\n'article': Lawsuits are mounting over the abrupt shutdown of most U.S. foreign assistance and the targeting of the aid agency. In the latest court challenge, Personal Services Contractor Association, representing thousands of contractors now furloughed or fired from USAID, asked the judge to stop any document destruction to preserve evidence.\nThe email was sent under the name of Erica Carr — the acting executive secretary at USAID — and bears a USAID logo.\n“Thank you for your assistance in clearing our classified safes and personnel documents” at USAID headquarters in Washington, it begins.","'URI' ./daily_news/20250312/greece-art-gallery-vandalism-lawmaker-0ae9ec66bc25c7bf995f65c1d082bfa4.txt\n'section' entertainment\t'headline' Greek lawmaker attacks paintings in Athens’ National Gallery, claiming they are offensive\n"


### Tokenize and validate data before training

In [6]:
from huggingface_hub import snapshot_download
from pathlib import Path

mistral_models_path = Path.home().joinpath('mistral_models', '7B-v0.3')
mistral_models_path.mkdir(parents=True, exist_ok=True)

snapshot_download(repo_id="mistralai/Mistral-7B-v0.3", allow_patterns=["params.json", "consolidated.safetensors", "tokenizer.model.v3"], local_dir=mistral_models_path)

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

'/home/mpeters/mistral_models/7B-v0.3'

In [7]:
# Step 5: Load Model and Apply LoRA Fine-Tuning
base_model_name = "mistralai/Mistral-7B-v0.3"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import pandas as pd
import numpy as np

In [None]:

def validate_and_tokenize_data(df: pd.DataFrame, max_length: int = 512):
    """ Tokenizes the dataset and provides analytics to detect formatting issues. """

    # Ensure required columns exist
    required_columns = ["question", "context", "answer"]
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    # Convert everything to string
    df["question"] = df["question"].fillna("").astype(str)
    df["context"] = df["context"].fillna("").astype(str)
    df["answer"] = df["answer"].fillna("").astype(str)

    # Tokenize all fields separately
    tokenized_questions = tokenizer(df["question"].tolist(), padding=True, truncation=True, return_tensors="np")
    tokenized_contexts = tokenizer(df["context"].tolist(), padding=True, truncation=True, return_tensors="np")
    tokenized_answers = tokenizer(df["answer"].tolist(), padding=True, truncation=True, return_tensors="np")

    # Extract token lengths
    df["question_token_len"] = [len(q) for q in tokenized_questions["input_ids"]]
    df["context_token_len"] = [len(c) for c in tokenized_contexts["input_ids"]]
    df["answer_token_len"] = [len(a) for a in tokenized_answers["input_ids"]]

    # Summary statistics
    summary_stats = {
        "Question Length": df["question_token_len"].describe(),
        "Context Length": df["context_token_len"].describe(),
        "Answer Length": df["answer_token_len"].describe(),
    }

    # Detect potential issues
    def detect_anomalies(column_name):
        max_allowed = max_length  # Defined by model input size
        too_long = df[df[column_name] > max_allowed]
        too_short = df[df[column_name] < 5]  # Arbitrary min length threshold
        return too_long, too_short

    issues = {
        "long_questions": detect_anomalies("question_token_len"),
        "long_contexts": detect_anomalies("context_token_len"),
        "long_answers": detect_anomalies("answer_token_len"),
    }

    return df, summary_stats, issues



In [None]:
# Example usage
blessed_df, stats, anomalies = validate_and_tokenize_data(train_df)

# Print statistics
for key, value in stats.items():
    print(f"\n🔹 {key}:\n{value}\n")

# Print anomalies
for issue, (long, short) in anomalies.items():
    print(f"\n⚠️ {issue.replace('_', ' ').capitalize()}:")
    print(f" - {len(long)} entries are too long")
    print(f" - {len(short)} entries are too short")


### Format and Split Data into Train and Test Sets

In [None]:
def format_dataset(qa_dataset, tokenizer):
    """Ensure tokenizer has a padding token and tokenize dataset."""

    # Ensure the tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Use EOS token for padding

    def tokenize_sample(sample):
        """Tokenizes input and output text."""
        question = str(sample["question"]) if sample["question"] is not None else ""
        answer = str(sample["answer"]) if sample["answer"] is not None else ""

        inputs = tokenizer(question, padding=True, truncation=True, max_length=512)
        outputs = tokenizer(answer, padding=True, truncation=True, max_length=512)

        inputs["labels"] = outputs["input_ids"]  # Assign tokenized answers as labels
        return inputs

    # Drop rows where 'question' is NaN or empty
    qa_dataset = qa_dataset.dropna(subset=["question"])
    qa_dataset = qa_dataset[qa_dataset["question"].str.strip() != ""]  # Remove empty questions

    dataset = Dataset.from_pandas(qa_dataset)
    tokenized_dataset = dataset.map(tokenize_sample, remove_columns=["question", "answer"])

    return tokenized_dataset.train_test_split(test_size=0.2)

# Apply the function
split_dataset = format_dataset(train_df, tokenizer)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]


Map:   0%|          | 0/21253 [00:00<?, ? examples/s]

In [None]:
# LoRA Configuration
lora_config = LoraConfig(
    r=8, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none"
)
model = get_peft_model(model, lora_config)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./news_finetune_model",
    per_device_train_batch_size=1,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    fp16=True,
    optim="adamw_torch",
    remove_unused_columns=False,  # Ensure model gets correct inputs
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`context` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:


# Step 5: Evaluate the Fine-Tuned Model
def evaluate_model(sample_question):
    inputs = tokenizer(sample_question, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=50)
    return tokenizer.decode(output[0], skip_special_tokens=True)

sample_question = qa_dataset[0]["question"]
response = evaluate_model(sample_question)
print(f"Q: {sample_question}\nA: {response}")



import json

def save_debug_output(raw_outputs, file_path="debug_missing_questions.jsonl"):
    """ Save raw LLM responses where questions are missing """
    with open(file_path, "a", encoding="utf-8") as f:
        for entry in raw_outputs:
            f.write(json.dumps(entry) + "\n")

