# Read JSON file

In [7]:
import json

# Open and read the JSON file.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale encoding.
# Alternative input path: 'data_crawl/combined-newsqa-data-v1-format.json'
with open('./data/combined-newsqa-data-v1.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Function to extract text, questions (q), and answers, filtering out {'noAnswer': True} and duplicates
def _collect_answer_texts(text, question):
    """Return the unique, non-empty answer strings of one question, in first-seen order."""
    # A dict is used as an insertion-ordered set so the answer order is
    # reproducible between runs (a plain set of strings is not).
    seen = {}

    for answer in question.get("answers", []):
        for sourcer_answer in answer.get("sourcerAnswers", []):
            # Entries flagged with "noAnswer" carry no span to extract.
            if "noAnswer" in sourcer_answer:
                continue

            start = sourcer_answer.get("s")
            raw_end = sourcer_answer.get("e")
            # Skip entries that lack either offset instead of slicing with a
            # made-up sentinel value.
            if start is None or raw_end is None:
                continue

            # The end offset is reduced by one before slicing.
            # NOTE(review): presumably this drops a trailing separator
            # character included in the dataset's span - confirm against the
            # dataset format.
            end = raw_end - 1
            # Negative offsets would silently index from the end of the text.
            if start < 0 or end < 0:
                continue

            extracted_text = text[start:end].strip()
            if extracted_text:  # An empty span is not a usable answer
                seen.setdefault(extracted_text, None)

    return list(seen)


def extract_data(data, limit=None):
    """Extract each story's text together with its answered questions.

    Args:
        data: Parsed dataset dict. Stories are read from ``data["data"]``;
            each story provides ``"text"`` and ``"questions"``, each question
            provides ``"q"`` and ``"answers"``, and each answer provides
            ``"sourcerAnswers"`` entries with character offsets ``"s"``
            (start) and ``"e"`` (end) into the story text.
        limit: Maximum number of stories to process, or ``None`` for all.

    Returns:
        A list with one ``{"text": str, "qa": [...]}`` dict per processed
        story, where every ``qa`` item is
        ``{"question": str, "answers": [str, ...]}``. Questions without any
        usable answer are omitted; answers are unique, non-empty and kept in
        first-seen order.
    """
    extracted_data = []

    # Loop through each story, limited by the specified count
    for index, story in enumerate(data.get("data", [])):
        if limit is not None and index >= limit:
            break

        # The text is kept exactly as stored: the answer offsets index into
        # this raw string, so it must not be cleaned before slicing.
        text = story.get("text", "")

        questions_with_answers = []
        for question in story.get("questions", []):
            unique_answers = _collect_answer_texts(text, question)
            if unique_answers:  # Keep only questions that have an answer
                questions_with_answers.append({
                    "question": question.get("q", ""),
                    "answers": unique_answers
                })

        # Append the story data with text and its questions with answers
        extracted_data.append({
            "text": text,
            "qa": questions_with_answers
        })

    return extracted_data

# Extracted data
# Only the first 200 of the 12400 stories are used, because taking all the
# stories makes the txt files too big.
STORY_LIMIT = 200
extracted_data = extract_data(data, limit=STORY_LIMIT)


In [3]:
# Preview every story: a 50-character excerpt followed by each question and its answers
for story in extracted_data:
    excerpt = story['text'][:50]
    print(f"Text: {excerpt}")
    print("----------------")
    for pair in story['qa']:
        print(f"Question: {pair['question']}")
        for answer_text in pair['answers']:
            print(f"Answer: '{answer_text}'")
    print("#######################")

Text: NEW DELHI, India (CNN) -- A high court in northern
----------------
Question: What was the amount of children murdered?
Answer: '19'
Question: When was Pandher sentenced to death?
Answer: 'in February.'
Answer: 'February.'
Question: The court aquitted Moninder Singh Pandher of what crime?
Answer: 'A high'
Answer: 'rape and murder'
Question: who was acquitted
Answer: 'Moninder Singh Pandher'
Question: who was sentenced
Answer: 'Moninder Singh Pandher'
Answer: 'to death by a lower court in February.'
Question: What was Moninder Singh Pandher acquitted for?
Answer: 'the killing of a teen in a case dubbed "the house of horrors."'
Answer: 'the killing of a teen'
Answer: 'killing of a teen'
Question: Who was sentenced to death in February?
Answer: 'Moninder Singh Pandher'
Question: how many people died
Answer: '19'
Question: How many children and young women were murdered?
Answer: '19'
#######################
Text: (CNN) -- Fighting in the volatile Sudanese region 
----------------
Que

In [8]:
# Report how many stories were extracted
article_total = len(extracted_data)
print(f"Numbers of articles: {article_total}")

Numbers of articles: 200


# Save file

In [9]:
# Persist the evaluation set with one JSON-serialized story per line
with open('./data/evaluation_data.txt', 'w', encoding='utf-8') as jsonl_out:
    jsonl_out.writelines(json.dumps(story) + "\n" for story in extracted_data)

# Save individual stories

In [13]:
import os

def save_all_stories(save_location, extracted_data):
    """Write the stripped text of every story to its own UTF-8 txt file.

    Files are created as ``<save_location>/stories/url_<n>.txt`` with ``n``
    counting from 1; the ``stories`` directory is created when missing.
    """
    stories_dir = f"{save_location}/stories"
    os.makedirs(stories_dir, exist_ok=True)

    # Number the files from 1 rather than 0
    for number, record in enumerate(extracted_data, start=1):
        body = record.get("text", "").strip()
        target_path = f'{stories_dir}/url_{number}.txt'
        with open(target_path, 'w', encoding='utf-8') as story_file:
            story_file.write(body)
# Write every extracted story below ./data/stories
save_location = "./data"
save_all_stories(save_location=save_location, extracted_data=extracted_data)

# Load file

In [6]:
import json

# Path to the evaluation text file (one JSON object per line), i.e. the same
# path the save step of this notebook writes to
data_file = "./data/evaluation_data.txt"

# Function to read and extract the "text" value from each line
def extract_text_values(file_path):
    """Collect the "text" value of every JSON record in a line-delimited file.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON object per line.

    Returns:
        List of the "text" values in file order. Blank lines are skipped;
        lines that are not valid JSON or that lack a "text" key are reported
        on stdout and skipped.
    """
    text_values = []  # List to store the "text" values
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # A blank line (e.g. a trailing newline) is not a record
            if not line.strip():
                continue
            try:
                # Parse the JSON string and keep only its "text" value
                text_values.append(json.loads(line)["text"])
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
            except KeyError as e:
                print(f"Key error: {e}")
    return text_values

# Read the story texts back from the evaluation file
text_list = extract_text_values(data_file)

# Show the position of each story next to its first 20 characters
for position, story_text in enumerate(text_list):
    print(f"{position}: {story_text[:20]}")


0: NEW DELHI, India (CN
1: (CNN) -- Fighting in
2: Johannesburg (CNN) -
3: (CNN)  -- England in
4: BAGHDAD, Iraq (CNN) 
5: (CNN) -- Pope John P
6: CNN affiliates repor
7: WASHINGTON (CNN) -- 
8: TEHRAN, Iran (CNN) -
9: (CNN) -- Jewish orga
10: (CNN) -- A phone hac
11: ATLANTA, Georgia (CN
12: Berlin, Germany (CNN
13: SAVANNAH, Georgia (C
14: (CNN) -- Les Paul ne
15: United Nations (CNN)
16: BOSTON, Massachusett
17: (CNN Student News)  
18: UNITED NATIONS (CNN)
19: (CNN) -- Barcelona m
20: BAGHDAD, Iraq (CNN) 
21: (CNN) -- Two goals i
22: WASHINGTON (CNN) -- 
23: (CNN)  -- When Kello
24: (CNN) -- Kyrgyzstan'
25: Cleveland, Ohio (CNN
26: NEW YORK (CNN)  -- A
27: New York (CNN) -- De
28: (CNN) -- A former go
29: LONDON, England (CNN
30: London (CNN) -- Pers
31: (CNN)  -- Top Republ
32: WASHINGTON (CNN) -- 
33: LOS ANGELES, Califor
34: Washington (CNN) -- 
35: (CNN) -- Henry Josep
36: BOGOTA, Colombia (CN
37: (CNN) -- Authorities
38: LONDON, England (CNN
39: COLOMBO, Sri Lanka (
40: (CNN) 

In [4]:
import pickle

def convert_pkl_to_txt(pkl_file_path, txt_file_path):
    """Write the ``str()`` form of a pickled object to a UTF-8 text file.

    Args:
        pkl_file_path: Path of the pickle file to read.
        txt_file_path: Path of the text file to create or overwrite.
    """
    # NOTE(review): unpickling can execute arbitrary code - only use this on
    # pickle files from a trusted source.
    with open(pkl_file_path, 'rb') as source:
        payload = pickle.load(source)

    # Dump the object's string representation as-is
    with open(txt_file_path, 'w', encoding='utf-8') as sink:
        sink.write(str(payload))

# Example: convert the crawled QA evaluation dictionary to a readable text file
pkl_file_path = './data/eval_dict_qa_crawl_vn.pkl'
txt_file_path = './data/eval_dict_qa_crawl_vn.txt'
convert_pkl_to_txt(pkl_file_path=pkl_file_path, txt_file_path=txt_file_path)