In [None]:
import random
from typing import List

import pandas as pd
import spacy
import torch
from transformers import pipeline

In [None]:
class Llama33Model():
    """Chat-style text generation on top of a Hugging Face ``text-generation`` pipeline.

    Each prompt is wrapped as a single-turn user message, rendered with the
    tokenizer's chat template and generated with sampling, in batches of
    ``batch_size``.
    """

    # Header that precedes the assistant's reply in the rendered chat text;
    # everything after it is treated as the model's answer.
    ASSISTANT_HEADER = '<|start_header_id|>assistant<|end_header_id|>'

    def __init__(self, model_path="distilgpt2", batch_size=2):
        """Load the pipeline.

        Args:
            model_path: Model name or path handed to ``transformers.pipeline``.
            batch_size: Number of prompts generated per pipeline batch.
        """
        #model_path="meta-llama/Llama-3.3-70B-Instruct"
        self.model_path = model_path
        self.batch_size = batch_size
        self.pipeline = pipeline(
            "text-generation",
            model=self.model_path,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
        )

        tokenizer = self.pipeline.tokenizer

        # Token ids that end a reply: the regular EOS token plus the
        # end-of-turn token "<|eot_id|>" when the vocabulary actually has it.
        # An unknown token resolves to None or to the unk id; using that as a
        # stop token would cut generations short, so it is skipped.
        self.terminators = []
        if tokenizer.eos_token_id is not None:
            self.terminators.append(tokenizer.eos_token_id)
        eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
        unk_id = getattr(tokenizer, "unk_token_id", None)
        if eot_id is not None and eot_id != unk_id and eot_id not in self.terminators:
            self.terminators.append(eot_id)

        # Left padding keeps the generated continuation adjacent to the prompt
        # when prompts of different lengths are batched.
        tokenizer.padding_side = "left"

    def generate_prompts(self, prompts: List[str]):
        """Lazily yield each prompt rendered through the tokenizer's chat template.

        Args:
            prompts: Raw user prompts.

        Yields:
            The chat-formatted prompt string (not tokenized), ending with the
            generation prompt for the assistant turn.
        """
        for prompt in prompts:
            message = [{"role": "user", "content": prompt}]

            yield self.pipeline.tokenizer.apply_chat_template(
                message,
                tokenize=False,
                add_generation_prompt=True
            )

    @classmethod
    def _extract_answer(cls, generated_text: str) -> str:
        """Return the text after the first assistant header.

        Falls back to the full text when the header is absent (the chat
        template in use does not emit it) instead of raising ``IndexError``.
        """
        parts = generated_text.split(cls.ASSISTANT_HEADER)
        return parts[1] if len(parts) > 1 else generated_text

    def generate_responses(self, prompts: List[str], max_new_tokens: int = 5000, temperature: float = 1,
                          top_p: float = 0.9, return_answer_only: bool = True):
        """Generate one sampled completion per prompt.

        Args:
            prompts: Raw user prompts.
            max_new_tokens: Upper bound on generated tokens per prompt.
            temperature: Sampling temperature.
            top_p: Nucleus-sampling probability mass.
            return_answer_only: When True, return a list with only the
                assistant's reply for each prompt; when False, return the
                pipeline output unchanged.

        Returns:
            ``List[str]`` of answers in prompt order, or the raw pipeline
            output when ``return_answer_only`` is False.
        """
        generation_kwargs = dict(
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=self.pipeline.tokenizer.eos_token_id,
            batch_size=self.batch_size,
        )
        # Only override the stop tokens when at least one was resolved.
        if self.terminators:
            generation_kwargs["eos_token_id"] = self.terminators

        output = self.pipeline(self.generate_prompts(prompts), **generation_kwargs)

        if not return_answer_only:
            return output

        return [self._extract_answer(outputs[0]['generated_text']) for outputs in output]

    def generate_response(self, prompt: str, max_new_tokens: int = 5000, temperature: float = 1,
                          top_p: float = 0.9) -> str:
        """Generate the answer for a single prompt.

        Convenience wrapper around ``generate_responses`` for callers that
        process one prompt at a time.

        Returns:
            The assistant's reply with surrounding whitespace stripped.
        """
        answers = self.generate_responses(
            [prompt],
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            return_answer_only=True,
        )
        return answers[0].strip()

In [None]:
# Instantiate the generation wrapper with its defaults (model_path="distilgpt2", batch_size=2).
model3 = Llama33Model()

# Create chunks

In [None]:
# Number of filings to sample from the source table.
n_samples = 500 
# Minimum number of tokens a chunk must reach to be kept.
n_tokens_per_chunk = 500
# Hard cap on tokens per chunk; a sentence that would cross it is truncated.
n_token_upper_limit_per_chunk = 600

In [None]:
"""
Load file containing 10k documents info.
Put the file here that contains the source documents (filings raw text).
The file should be a pandas dataframe with a column named 'filing_text' that contains the text of the filings.
"""
# NOTE(review): the path below is an empty placeholder and must be filled in before this cell can run.
# pd.read_pickle unpickles arbitrary objects, so only load files from a trusted source.
df_data = pd.read_pickle("")

In [None]:
# Fixed seed so a fresh "Restart & Run All" draws the same filings and, via the
# stdlib `random` module used by get_random_chunk, the same chunk start positions.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# `sample` and `reset_index` both return new frames, so df_data is left untouched.
df_data_chunk = df_data.sample(n=n_samples, random_state=RANDOM_SEED).reset_index(drop=True)
df_data_chunk.head()

In [None]:
# Load spaCy's small English pipeline; it is passed to get_random_chunk for tokenization and sentence splitting.
nlp = spacy.load("en_core_web_sm")

def get_random_chunk(nlp, doc_text, token_count=500, token_limit=600):
    """Extract a random, sentence-aligned chunk of roughly ``token_count`` tokens.

    A sentence is picked at random and following sentences are appended until
    the chunk holds at least ``token_count`` tokens. If the sentence that
    crosses ``token_count`` would also cross ``token_limit``, it is truncated
    so the chunk holds exactly ``token_limit`` tokens. If the document ends
    before ``token_count`` is reached, sentences *preceding* the start
    sentence are prepended as long as the chunk stays within ``token_limit``.

    Args:
        nlp: Callable returning a parsed document that is iterable over
            tokens, supports ``len()`` and exposes ``.sents``.
        doc_text: Raw document text.
        token_count: Minimum number of tokens the chunk should reach.
        token_limit: Maximum number of tokens the chunk may contain.

    Returns:
        dict with keys
            ``chunk``: the chunk text (sentences joined by single spaces),
            ``start_sentence_idx``: index of the first sentence in the chunk
                (``None`` when the document has no sentences),
            ``chunk_token_count``: number of tokens in the chunk; may be below
                ``token_count`` when the document is too short,
            ``total_tokens``: number of tokens in the whole document.
    """
    doc = nlp(doc_text)
    total_tokens = len(doc)

    # Extract sentences
    sentences = list(doc.sents)

    # Nothing to sample from: return an empty chunk instead of letting
    # random.randint(0, -1) raise.
    if not sentences:
        return {
            "chunk": "",
            "start_sentence_idx": None,
            "chunk_token_count": 0,
            "total_tokens": total_tokens,
        }

    # Randomly select a starting sentence
    start_idx = random.randint(0, len(sentences) - 1)
    # Index of the first sentence actually in the chunk; moves left if the
    # chunk has to be extended backwards.
    chunk_starting_sentence_idx = start_idx

    # Collect sentences until the token count is reached
    chunk = []
    chunk_token_count = 0

    for sent in sentences[start_idx:]:
        chunk.append(sent.text)
        new_token_count = chunk_token_count + len(sent)
        if new_token_count >= token_count:
            if new_token_count > token_limit:
                # Keep only as many tokens of this sentence as still fit.
                remaining = token_limit - chunk_token_count
                chunk[-1] = " ".join(token.text for token in sent[:remaining])
                chunk_token_count = token_limit
            else:
                chunk_token_count = new_token_count
            break
        chunk_token_count = new_token_count

    # Ran off the end of the document: extend backwards, starting with the
    # sentence *before* start_idx (start_idx itself is already in the chunk).
    if chunk_token_count < token_count:
        for sentence_idx in range(start_idx - 1, -1, -1):
            sent = sentences[sentence_idx]
            new_token_count = chunk_token_count + len(sent)
            if new_token_count >= token_count and new_token_count > token_limit:
                # Whole sentence does not fit; leave the chunk short.
                break
            chunk.insert(0, sent.text)
            chunk_token_count = new_token_count
            chunk_starting_sentence_idx = sentence_idx
            if chunk_token_count >= token_count:
                break

    # Join the sentences to form the chunk
    chunk_text = " ".join(chunk)

    return {
        "chunk": chunk_text,
        "start_sentence_idx": chunk_starting_sentence_idx,
        "chunk_token_count": chunk_token_count,
        "total_tokens": total_tokens,
    }

In [None]:
# Output column -> key of the get_random_chunk result that fills it.
chunk_columns = {
    'context': "chunk",
    'chunk_token_count': "chunk_token_count",
    'document_token_count': "total_tokens",
    'chunk_starting_sentence_idx': "start_sentence_idx",
}

for i in range(len(df_data_chunk)):
    # Progress indicator every 20 rows.
    if not i % 20:
        print(i)

    # Filings of 1,000,000+ characters are cut to their last 999,999 characters
    # (presumably to stay under spaCy's default max_length - confirm), and the
    # truncated text is written back to the table.
    doc_text = df_data_chunk.at[i, 'filing_text']
    doc_text = doc_text[-999999:] if len(doc_text) >= 1000000 else doc_text
    df_data_chunk.at[i, 'filing_text'] = doc_text

    chunk_result = get_random_chunk(
        nlp,
        doc_text,
        token_count=n_tokens_per_chunk,
        token_limit=n_token_upper_limit_per_chunk,
    )
    for column, result_key in chunk_columns.items():
        df_data_chunk.at[i, column] = chunk_result[result_key]

In [None]:
# Drop rows whose chunk did not reach the minimum token count, then renumber.
has_enough_tokens = df_data_chunk['chunk_token_count'] >= n_tokens_per_chunk
df_data_chunk = df_data_chunk.loc[has_enough_tokens].copy().reset_index(drop=True)
df_data_chunk.head()

In [None]:
# Histogram of the tokens-per-chunk values of the retained chunks.
df_data_chunk['chunk_token_count'].hist()

In [None]:
# (rows, columns) of the filtered chunk table.
df_data_chunk.shape

# Generate queries

In [None]:
# Work on a copy so the generated columns are not added to df_data_chunk.
df_generated_data = df_data_chunk.copy()

In [None]:
# Ask the model for one analyst-style question per document chunk.
for i in range(df_generated_data.shape[0]):
    # Progress indicator every 20 rows.
    if i % 20 == 0:
        print(i)
    chunk = df_generated_data.at[i, 'context']

    PROMPT_QUERY_GENERATION = f"""
    You are a financial analyst. You are asked to write 1 question that can be answered by the information in the provided document chunk which is from 10-K filings. 
    ***[START OF DOCUMENT CHUNK]
    {chunk}
    ***[END OF DOCUMENT CHUNK]
    
    Guidelines:
        1. You are a financial analyst. Imagine you're given a tool that you can ask questions about a SEC filing so that it saves your time reading the document. Come up with a question that you want to ask. It needs to be financial meaningful.
        2. Try to avoid asking questions about small unmeaningful details in the SEC filings. Come up with realistic questions that a financial analyst might care about. 
        3. Make sure the question you generated can be answered with the information in the document chunk and does not require any other knowledge or information.
    
    You must return ONLY the question. Do not generate anything else other than the question.
    """
    # Use the batch API `generate_responses` with a one-element list and take
    # its single answer; strip any whitespace surrounding the reply.
    df_generated_data.at[i, 'query'] = model3.generate_responses([PROMPT_QUERY_GENERATION])[0].strip()

## Generate gold answer

In [None]:
# Answer each generated question using only its own document chunk.
for i in range(df_generated_data.shape[0]):
    # Progress indicator every 20 rows.
    if i % 20 == 0:
        print(i)
    query = df_generated_data.at[i, 'query']
    chunk = df_generated_data.at[i, 'context']

    PROMPT_ANSWER_GENERATION = f"""
    Answer the following question based on the information in the given document chunk.
    [QUESTION]
    {query}
    [DOCUMENT CHUNK]
    {chunk}

    Provide answer to the QUESTION only using information from the DOCUMENT CHUNK provided. Make sure your answer is consistent with the information in the document chunk.
    You answer should only include information that is supported by the document chunk.
    If the document chunk does not contain enough information for you to answer the question, output "Information is not available in the document."
    You must output ONLY the answer. Do not generate anything else other than the answer.
    """

    # Use the batch API `generate_responses` with a one-element list and take
    # its single answer; strip any whitespace surrounding the reply.
    df_generated_data.at[i, 'gold_answer'] = model3.generate_responses([PROMPT_ANSWER_GENERATION])[0].strip()

In [None]:
# Preview the first rows with the generated queries and gold answers.
df_generated_data.head()

## Generate hallucinated answer

In [None]:
"""
Replace [EXAMPLE 1] through [EXAMPLE 3] with custom few-shot examples of hallucinations in PROMPT_HALLUCINATION_GENERATION. Use the following guidelines in creating your examples:

• Each example should present a clear question paired with a document excerpt and demonstrates how even small alterations in phrasing can lead to a hallucination.
• The examples should showcase the importance of closely matching the document's context—e.g., misinterpreting price trends, omitting critical qualifiers, etc.
• Examples can reveal how inserting extra assumptions (e.g., implying a strategic shift) can distort the intended meaning of the original content.
• Examples can serve as instructive demonstrations for how automated summarization or extraction may deviate subtly from primary data, risking misinterpretations.
"""


# For each (question, gold answer, chunk) triple, ask the model for a subtly
# unsupported variant of the gold answer.
for i in range(df_generated_data.shape[0]):

    # Progress indicator every 20 rows.
    if i % 20 == 0:
        print(i)

    query = df_generated_data.at[i, 'query']
    chunk = df_generated_data.at[i, 'context']
    answer = df_generated_data.at[i, 'gold_answer']


    PROMPT_HALLUCINATION_GENERATION = f"""
    Given a question, a correct answer and a reference document chunk, write a HALLUCINATION ANSWER to the question.
    
    [QUESTION]
    {query}
    [A CORRECT ANSWER]
    {answer}
    [DOCUMENT CHUNK]
    ***
    {chunk}
    ***
    
    [INSTUCTION]
    - You're given a CORRECT ANSWER to the QUESTION. The CORRECT ANSWER provided is consistent with the information in the DOCUMENT CHUNK.
    - This HALLUCINATION ANSWER you need to write is mostly correct, but contains information that is not fully supported by the DOCUMENT CHUNK. The unsupported content in the HALLUCINATION ANSWER is minor and subtle.
    - Be creative with writing the HALLUCINATION ANSWER. You know the domain terminology and jargon very well. Make it realistic sounding and hard to catch even for a domain expert. 
    - For example, write an answer that is mostly correct, but contains a small detail that does not match the context in the document chunk, or one part of the answer talks about something that does not exist in the document chunk, or one part of the answer is missing some details that causes the answer to be misleading.

    [EXAMPLES]
    Here are some examples of HALLUCINATION ANSWERs that are hard to catch.

    [EXAMPLE 1]
    
    [EXAMPLE 2]
    
    [EXAMPLE 3] 
    
    
    [END OF EXAMPLES]
    
    You must output ONLY the HALLUCINATION ANSWER. Don't write anything other than the HALLUCINATION ANSWER itself.
    """
    # Use the batch API `generate_responses` with a one-element list and take
    # its single answer; strip any whitespace surrounding the reply.
    df_generated_data.at[i, 'hallucination_answer'] = model3.generate_responses([PROMPT_HALLUCINATION_GENERATION])[0].strip()

In [None]:
# Preview the first rows, now including the hallucinated answers.
df_generated_data.head()

In [None]:
# Persist the intermediate table to CSV in the working directory; `index` is left
# at its default, so the DataFrame index is written as the first column.
df_generated_data.to_csv('df_10k_llama3_3_v1.csv')

## Reshape

In [None]:
# Empty frame declaring the columns of the final labeled dataset.
df_final = pd.DataFrame(columns = ['query', 'context', 'answer', 'ground_truth_label'])

In [None]:
# Emit two labeled rows per generated example: the gold answer and the
# hallucinated answer for the same (query, context) pair.
# Rows are collected in a list and the frame is built once, so re-running this
# cell rebuilds df_final instead of appending duplicate rows to it.
final_rows = []
for query, context, gold_answer, hallucination_answer in zip(
    df_generated_data['query'],
    df_generated_data['context'],
    df_generated_data['gold_answer'],
    df_generated_data['hallucination_answer'],
):
    final_rows.append([query, context, gold_answer, 'not hallucination'])
    final_rows.append([query, context, hallucination_answer, 'hallucination'])

df_final = pd.DataFrame(final_rows, columns=['query', 'context', 'answer', 'ground_truth_label'])

In [None]:
# Write the final labeled dataset to CSV in the working directory (index column included by default).
df_final.to_csv('Phantom_10k_seed.csv')