In [None]:
import time

import matplotlib.pyplot as plt
import pandas as pd
import spacy

### Load existing data

In [None]:
# Load the seed dataset created in Phantom_generate_seed_hallucination_data.ipynb
# and cast the three bookkeeping columns to int with one astype mapping.
df_data_chunk = pd.read_csv('df_10k_llama3_3_v1.csv', index_col=0).astype(
    {
        'chunk_token_count': int,
        'document_token_count': int,
        'chunk_starting_sentence_idx': int,
    }
)

In [None]:
# Preview the first rows of the loaded seed dataset.
df_data_chunk.head()

In [None]:
# (rows, columns) of the seed dataset as loaded, before the document-length filter is applied.
df_data_chunk.shape

In [None]:
# Target size (in tokens) that each context is expanded to; also the minimum
# document length kept by the filter cell. Change per run: 2k, 5k, 10k, 20k, 30k.
expand_token_count = 10000 ## Context Limit update for 2k,5k,10k,20k,30k
# Upper bound (in tokens) handed to the expansion functions; a sentence that would
# push a context past this bound is truncated instead of being added whole.
token_limit = 10500

In [None]:
# Keep only rows whose source document has at least expand_token_count tokens,
# then renumber the rows 0..n-1 so the later loops can address them by position.
df_data_chunk = (
    df_data_chunk
    .loc[df_data_chunk["document_token_count"] >= expand_token_count]
    .reset_index(drop=True)
)
df_data_chunk.shape

In [None]:
# Preview the first rows after filtering and re-indexing.
df_data_chunk.head()

## Long context Expansion - END

In [None]:
# Work on an independent copy: the END expansion loop writes 'context',
# 'chunk_token_count' and 'chunk_starting_sentence_idx' in place, and a plain
# assignment would make those writes land in df_data_chunk as well, changing the
# data that the later sections start from.
df_generated_data_end = df_data_chunk.copy()

In [None]:
# Preview the frame that the END expansion will operate on.
df_generated_data_end.head()

In [None]:
# spaCy pipeline handed to expand_chunk_end, which calls it on the document text
# and reads the resulting sentences and their tokens.
nlp = spacy.load("en_core_web_sm")

def expand_chunk_end(nlp, chunk_context, chunk_token_count, doc_text, chunk_starting_sentence_idx, expand_token_count=2000, token_limit=2200):
    """Grow a chunk backwards through its document so the chunk ends the context.

    Sentences preceding ``chunk_starting_sentence_idx`` are prepended one at a
    time until the running token count reaches ``expand_token_count``. Sentence
    length is ``len(sent)``, i.e. the number of tokens produced by ``nlp``.
    Sentence indices are taken modulo the number of sentences, so walking past
    the first sentence continues from the last one.

    Args:
        nlp: Callable applied to ``doc_text``; its result must expose ``.sents``.
        chunk_context: Text of the chunk to extend; it stays at the end.
        chunk_token_count: Token count already attributed to ``chunk_context``.
        doc_text: Full text of the source document.
        chunk_starting_sentence_idx: Sentence index where the chunk starts.
        expand_token_count: Token count at which expansion stops.
        token_limit: Maximum token count; the final sentence is cut down to its
            trailing tokens when adding it whole would exceed this value.

    Returns:
        dict with ``"chunk"`` (expanded text, pieces joined by single spaces),
        ``"chunk_token_count"`` and ``"chunk_starting_sentence_idx"``. The index
        is that of the earliest sentence prepended whole; it is not moved when
        only a truncated sentence was added.

    Raises:
        ZeroDivisionError: if ``nlp`` yields no sentences for ``doc_text``.
    """
    sentences = list(nlp(doc_text).sents)
    n_sent = len(sentences)

    # Pieces are gathered nearest-first while walking backwards and put into
    # document order when the text is assembled.
    prepended = []
    cursor = chunk_starting_sentence_idx - 1
    reached_target = False
    while not reached_target:
        cursor %= n_sent
        sent = sentences[cursor]
        grown = chunk_token_count + len(sent)
        reached_target = grown >= expand_token_count

        if reached_target and grown > token_limit:
            # Only the last `room` tokens of this sentence still fit.
            room = token_limit - chunk_token_count
            prepended.append(" ".join(tok.text for tok in sent[len(sent) - room:]))
            chunk_token_count = token_limit
        else:
            prepended.append(sent.text)
            chunk_token_count = grown
            chunk_starting_sentence_idx = cursor
            cursor -= 1

    chunk_text = " ".join(prepended[::-1] + [chunk_context])

    return {
        "chunk": chunk_text,
        "chunk_token_count": chunk_token_count,
        "chunk_starting_sentence_idx": chunk_starting_sentence_idx
    }

In [None]:
## END
# Expand every row's context backwards with expand_chunk_end and write the
# expanded text, token count and starting sentence index back into
# df_generated_data_end in place.

start_time_total = time.time()
start_time = time.time()
for i in range(df_generated_data_end.shape[0]):
    if i % 50 == 0:
        # Progress report: current row and the time taken since the last report.
        print(i)
        print("Process time: ", round(time.time()-start_time, 2), " seconds.")
        start_time = time.time()

    # Read every input from df_generated_data_end, the same frame that is written
    # to below, so this cell does not rely on another frame sharing its rows.
    chunk_context = df_generated_data_end.at[i, 'context']
    chunk_token_count = df_generated_data_end.at[i, 'chunk_token_count']
    doc_text = df_generated_data_end.at[i, 'filing_text']
    chunk_starting_sentence_idx = df_generated_data_end.at[i, 'chunk_starting_sentence_idx']

    chunk_result = expand_chunk_end(nlp, chunk_context, chunk_token_count, doc_text, chunk_starting_sentence_idx,
                                expand_token_count, token_limit)

    df_generated_data_end.at[i, 'context'] = chunk_result["chunk"]
    df_generated_data_end.at[i, 'chunk_token_count'] = chunk_result["chunk_token_count"]
    df_generated_data_end.at[i, 'chunk_starting_sentence_idx'] = chunk_result["chunk_starting_sentence_idx"]
print("Total process time: ", round(time.time()-start_time_total, 2), " seconds.")

In [None]:
# Histogram of the token counts of the END-expanded contexts, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(df_generated_data_end['chunk_token_count'], bins=30)
ax.set_xlabel('Token count')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of chunk length (token count)')
plt.show()

In [None]:
# Persist the END-expanded frame with all of its columns.
df_generated_data_end.to_csv('Phantom_10k_10000tokens_end_full.csv')

In [None]:
# Empty frame defining the output schema: a query, its context, one candidate
# answer and that answer's ground-truth label.
df_final_end = pd.DataFrame(columns = ['query', 'context', 'answer', 'ground_truth_label'])

In [None]:
# Build the END evaluation set: each source row yields two rows that share the
# query and context, one with the gold answer ('not hallucination') and one with
# the hallucinated answer ('hallucination'), in that order.
# Rows are collected in a list and the frame is constructed once, which avoids
# growing a DataFrame row by row and keeps the cell idempotent on re-run.
final_rows_end = []
for query, context, gold_answer, hallucination_answer in zip(
    df_generated_data_end['query'],
    df_generated_data_end['context'],
    df_generated_data_end['gold_answer'],
    df_generated_data_end['hallucination_answer'],
):
    final_rows_end.append([query, context, gold_answer, 'not hallucination'])
    final_rows_end.append([query, context, hallucination_answer, 'hallucination'])

df_final_end = pd.DataFrame(final_rows_end, columns=['query', 'context', 'answer', 'ground_truth_label'])

In [None]:
# Persist the END evaluation set (query / context / answer / ground_truth_label).
df_final_end.to_csv('Phantom_10k_10000tokens_end.csv')

## Long context Expansion - BEGINNING

In [None]:
# Take an independent copy: the BEGINNING expansion loop overwrites 'context',
# 'chunk_token_count' and 'chunk_starting_sentence_idx' in place, and a plain
# assignment would apply those writes to df_data_chunk as well.
df_generated_data_begin = df_data_chunk.copy()

In [None]:
# spaCy pipeline handed to expand_chunk_begin, which calls it on the document
# text and reads the resulting sentences and their tokens.
nlp = spacy.load("en_core_web_sm")

def expand_chunk_begin(nlp, doc_text, chunk_starting_sentence_idx, expand_token_count=2000, token_limit=2200):
    """Build a context that starts at a given sentence and grows forward.

    Starting from sentence ``chunk_starting_sentence_idx``, whole sentences are
    appended until the running token count reaches ``expand_token_count``.
    Sentence length is ``len(sent)``, i.e. the number of tokens produced by
    ``nlp``. Indices are taken modulo the number of sentences, so running past
    the last sentence continues from the first one.

    Args:
        nlp: Callable applied to ``doc_text``; its result must expose ``.sents``.
        doc_text: Full text of the source document.
        chunk_starting_sentence_idx: Index of the first sentence of the context.
        expand_token_count: Token count at which expansion stops.
        token_limit: Maximum token count; the final sentence is cut down to its
            leading tokens when adding it whole would exceed this value.

    Returns:
        dict with ``"chunk"`` (pieces joined by single spaces),
        ``"chunk_token_count"`` and ``"chunk_starting_sentence_idx"`` (the value
        passed in, unchanged).

    Raises:
        ZeroDivisionError: if ``nlp`` yields no sentences for ``doc_text``.
    """
    sentences = list(nlp(doc_text).sents)
    n_sent = len(sentences)

    pieces = []
    total = 0
    position = chunk_starting_sentence_idx
    while True:
        position %= n_sent
        sent = sentences[position]
        candidate_total = total + len(sent)

        # Target not reached yet: take the whole sentence and move on.
        if candidate_total < expand_token_count:
            pieces.append(sent.text)
            total = candidate_total
            position += 1
            continue

        if candidate_total > token_limit:
            # Only the first `budget` tokens of this sentence still fit.
            budget = token_limit - total
            pieces.append(" ".join(tok.text for tok in sent[:budget]))
            total = token_limit
        else:
            pieces.append(sent.text)
            total = candidate_total
        break

    return {
        "chunk": " ".join(pieces),
        "chunk_token_count": total,
        "chunk_starting_sentence_idx": chunk_starting_sentence_idx
    }

In [None]:
# Rebuild every row's context by expanding forward from its starting sentence
# with expand_chunk_begin, writing the results back into df_generated_data_begin.
n_rows_begin = df_generated_data_begin.shape[0]
# Maps each key of the expansion result to the column it is stored in.
begin_result_columns = (
    ("chunk", 'context'),
    ("chunk_token_count", 'chunk_token_count'),
    ("chunk_starting_sentence_idx", 'chunk_starting_sentence_idx'),
)

start_time = time.time()
for i in range(n_rows_begin):
    if i % 50 == 0:
        # Progress report: current row and the time taken since the last report.
        print(i)
        elapsed = round(time.time()-start_time, 2)
        print("Process time: ", elapsed, " seconds.")
        start_time = time.time()

    doc_text = df_generated_data_begin.at[i, 'filing_text']
    chunk_starting_sentence_idx = df_generated_data_begin.at[i, 'chunk_starting_sentence_idx']

    chunk_result = expand_chunk_begin(
        nlp,
        doc_text,
        chunk_starting_sentence_idx,
        expand_token_count,
        token_limit,
    )

    for result_key, column in begin_result_columns:
        df_generated_data_begin.at[i, column] = chunk_result[result_key]

In [None]:
# Histogram of the token counts of the BEGINNING-expanded contexts, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(df_generated_data_begin['chunk_token_count'], bins=30)
ax.set_xlabel('Token count')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of chunk length (token count)')
plt.show()

In [None]:
# Persist the BEGINNING-expanded frame with all of its columns.
df_generated_data_begin.to_csv('Phantom_10k_10000tokens_beginning_full.csv')

In [None]:
# Empty frame defining the output schema: a query, its context, one candidate
# answer and that answer's ground-truth label.
df_final_begin = pd.DataFrame(columns = ['query', 'context', 'answer', 'ground_truth_label'])

In [None]:
# Build the BEGINNING evaluation set: each source row yields two rows that share
# the query and context, one with the gold answer ('not hallucination') and one
# with the hallucinated answer ('hallucination'), in that order.
# Rows are collected in a list and the frame is constructed once, which avoids
# growing a DataFrame row by row and keeps the cell idempotent on re-run.
final_rows_begin = []
for query, context, gold_answer, hallucination_answer in zip(
    df_generated_data_begin['query'],
    df_generated_data_begin['context'],
    df_generated_data_begin['gold_answer'],
    df_generated_data_begin['hallucination_answer'],
):
    final_rows_begin.append([query, context, gold_answer, 'not hallucination'])
    final_rows_begin.append([query, context, hallucination_answer, 'hallucination'])

df_final_begin = pd.DataFrame(final_rows_begin, columns=['query', 'context', 'answer', 'ground_truth_label'])

In [None]:
# Persist the BEGINNING evaluation set (query / context / answer / ground_truth_label).
df_final_begin.to_csv('Phantom_10k_10000tokens_beginning.csv')

## Long context Expansion - MIDDLE

In [None]:
# NOTE: plain assignment, not a copy. df_generated_data_middle and df_data_chunk
# are the same DataFrame object, so in-place writes made through either name are
# visible through both.
df_generated_data_middle = df_data_chunk

In [None]:
# spaCy pipeline handed to expand_chunk_middle_1, which calls it on the document
# text and reads the resulting sentences and their tokens.
nlp = spacy.load("en_core_web_sm")

def expand_chunk_middle_1(nlp, doc_text, chunk_starting_sentence_idx, expand_token_count=2000, token_limit=2200):
    """Forward half of the middle expansion: grow a context from a start sentence.

    Starting from sentence ``chunk_starting_sentence_idx``, whole sentences are
    appended until the running token count reaches ``expand_token_count``.
    Sentence length is ``len(sent)``, i.e. the number of tokens produced by
    ``nlp``. Indices are taken modulo the number of sentences, so running past
    the last sentence continues from the first one.

    Args:
        nlp: Callable applied to ``doc_text``; its result must expose ``.sents``.
        doc_text: Full text of the source document.
        chunk_starting_sentence_idx: Index of the first sentence of the context.
        expand_token_count: Token count at which expansion stops.
        token_limit: Maximum token count; the final sentence is cut down to its
            leading tokens when adding it whole would exceed this value.

    Returns:
        dict with ``"chunk"`` (pieces joined by single spaces),
        ``"chunk_token_count"``, ``"chunk_starting_sentence_idx"`` (the value
        passed in, unchanged) and ``"sentences"`` (the list of sentences, so a
        caller can reuse it without parsing the document again).

    Raises:
        ZeroDivisionError: if ``nlp`` yields no sentences for ``doc_text``.
    """
    sentences = list(nlp(doc_text).sents)
    n_sent = len(sentences)

    parts = []
    chunk_token_count = 0
    cursor = chunk_starting_sentence_idx
    done = False
    while not done:
        cursor %= n_sent
        sent = sentences[cursor]
        running_total = chunk_token_count + len(sent)
        done = running_total >= expand_token_count

        if done and running_total > token_limit:
            # Only the first `allowed` tokens of this sentence still fit.
            allowed = token_limit - chunk_token_count
            parts.append(" ".join(token.text for token in sent[:allowed]))
            chunk_token_count = token_limit
        else:
            parts.append(sent.text)
            chunk_token_count = running_total
            cursor += 1

    return {
        "chunk": " ".join(parts),
        "chunk_token_count": chunk_token_count,
        "chunk_starting_sentence_idx": chunk_starting_sentence_idx,
        "sentences": sentences
    }



def expand_chunk_middle_2(sentences, chunk_context, chunk_token_count, doc_text, chunk_starting_sentence_idx, 
                          expand_token_count=2000, token_limit=2200):
    """Backward half of the middle expansion: prepend earlier sentences to a chunk.

    Sentences preceding ``chunk_starting_sentence_idx`` are prepended one at a
    time until the running token count reaches ``expand_token_count``. Sentence
    length is ``len(sent)``. Indices are taken modulo ``len(sentences)``, so
    walking past the first sentence continues from the last one.

    Args:
        sentences: Already-segmented sentences of the source document.
        chunk_context: Text of the chunk to extend; it stays at the end.
        chunk_token_count: Token count already attributed to ``chunk_context``.
        doc_text: Not used by this function; kept in the signature for callers.
        chunk_starting_sentence_idx: Sentence index where the chunk starts.
        expand_token_count: Token count at which expansion stops.
        token_limit: Maximum token count; the final sentence is cut down to its
            trailing tokens when adding it whole would exceed this value.

    Returns:
        dict with ``"chunk"`` (pieces joined by single spaces),
        ``"chunk_token_count"`` and ``"chunk_starting_sentence_idx"``. The index
        is that of the earliest sentence prepended whole; it is not moved when
        only a truncated sentence was added.

    Raises:
        ZeroDivisionError: if ``sentences`` is empty.
    """
    total_sentences = len(sentences)

    # Pieces are gathered nearest-first while walking backwards and reversed
    # into document order when the text is assembled.
    leading_parts = []
    idx = chunk_starting_sentence_idx - 1
    while True:
        idx %= total_sentences
        sent = sentences[idx]
        projected = chunk_token_count + len(sent)

        if projected >= expand_token_count and projected > token_limit:
            # Only the last `tokens_left` tokens of this sentence still fit.
            tokens_left = token_limit - chunk_token_count
            tail_tokens = sent[len(sent) - tokens_left:]
            leading_parts.append(" ".join([token.text for token in tail_tokens]))
            chunk_token_count = token_limit
            break

        leading_parts.append(sent.text)
        chunk_token_count = projected
        chunk_starting_sentence_idx = idx
        if projected >= expand_token_count:
            break
        idx -= 1

    chunk_text = " ".join(list(reversed(leading_parts)) + [chunk_context])

    return {
        "chunk": chunk_text,
        "chunk_token_count": chunk_token_count,
        "chunk_starting_sentence_idx": chunk_starting_sentence_idx
    }

In [None]:
# Expand every row in two passes: first forward (expand_chunk_middle_1) up to an
# intermediate target, then backward (expand_chunk_middle_2) up to the full
# target. Results are written back into df_generated_data_middle.
n_rows_middle = df_generated_data_middle.shape[0]
start_time = time.time()
for i in range(n_rows_middle):
    if i % 50 == 0:
        # Progress report: current row and the time taken since the last report.
        print(i)
        elapsed = round(time.time()-start_time, 2)
        print("Process time: ", elapsed, " seconds.")
        start_time = time.time()

    chunk_context = df_generated_data_middle.at[i, 'context']
    chunk_token_count = df_generated_data_middle.at[i, 'chunk_token_count']
    doc_text = df_generated_data_middle.at[i, 'filing_text']
    chunk_starting_sentence_idx = df_generated_data_middle.at[i, 'chunk_starting_sentence_idx']

    # Forward pass target: the current chunk size plus half of the tokens still
    # missing; its limit keeps the same slack as token_limit has over expand_token_count.
    tokens_missing = expand_token_count - chunk_token_count
    temp_target_token_count = chunk_token_count + tokens_missing // 2
    limit_slack = token_limit - expand_token_count
    temp_token_limit = temp_target_token_count + limit_slack
    temp_chunk_result = expand_chunk_middle_1(
        nlp, doc_text, chunk_starting_sentence_idx, temp_target_token_count, temp_token_limit
    )

    # The forward result becomes the input of the backward pass; the parsed
    # sentences are reused so the document is not parsed a second time.
    sentences = temp_chunk_result['sentences']
    chunk_context = temp_chunk_result['chunk']
    chunk_token_count = temp_chunk_result['chunk_token_count']
    chunk_starting_sentence_idx = temp_chunk_result['chunk_starting_sentence_idx']

    chunk_result = expand_chunk_middle_2(
        sentences, chunk_context, chunk_token_count, doc_text, chunk_starting_sentence_idx,
        expand_token_count, token_limit
    )

    for result_key, column in (
        ("chunk", 'context'),
        ("chunk_token_count", 'chunk_token_count'),
        ("chunk_starting_sentence_idx", 'chunk_starting_sentence_idx'),
    ):
        df_generated_data_middle.at[i, column] = chunk_result[result_key]

In [None]:
# Histogram of the token counts of the MIDDLE-expanded contexts, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(df_generated_data_middle['chunk_token_count'], bins=30)
ax.set_xlabel('Token count')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of chunk length (token count)')
plt.show()

In [None]:
# Persist the MIDDLE-expanded frame with all of its columns. The frame is
# exported under its own name, df_generated_data_middle (the frame the expansion
# loop writes to), matching how the other sections export their
# df_generated_data_* frames, instead of going through df_data_chunk.
df_generated_data_middle.to_csv('Phantom_10k_10000tokens_middle_full.csv')

In [None]:
# Empty frame defining the output schema: a query, its context, one candidate
# answer and that answer's ground-truth label.
df_final_middle = pd.DataFrame(columns = ['query', 'context', 'answer', 'ground_truth_label'])

In [None]:
# Build the MIDDLE evaluation set: each source row yields two rows that share
# the query and context, one with the gold answer ('not hallucination') and one
# with the hallucinated answer ('hallucination'), in that order.
# Rows are collected in a list and the frame is constructed once, which avoids
# growing a DataFrame row by row and keeps the cell idempotent on re-run.
final_rows_middle = []
for query, context, gold_answer, hallucination_answer in zip(
    df_generated_data_middle['query'],
    df_generated_data_middle['context'],
    df_generated_data_middle['gold_answer'],
    df_generated_data_middle['hallucination_answer'],
):
    final_rows_middle.append([query, context, gold_answer, 'not hallucination'])
    final_rows_middle.append([query, context, hallucination_answer, 'hallucination'])

df_final_middle = pd.DataFrame(final_rows_middle, columns=['query', 'context', 'answer', 'ground_truth_label'])

In [None]:
# Persist the MIDDLE evaluation set (query / context / answer / ground_truth_label).
df_final_middle.to_csv('Phantom_10k_10000tokens_middle.csv')