In [13]:
from datasets import load_dataset
import ast

# Load the FRAMES benchmark and select a single test example to work with.
ds = load_dataset("google/frames-benchmark")

# Index of the test example used by the rest of this notebook.
SAMPLE_INDEX = 19

# Look the row up once instead of re-indexing the dataset for every field.
sample = ds['test'][SAMPLE_INDEX]

question = sample['Prompt']
answer = sample['Answer']
# 'wiki_links' is a string holding a Python literal; literal_eval parses it
# without executing arbitrary code. The cells below index and iterate the
# result as a sequence of Wikipedia URLs.
references_doc = ast.literal_eval(sample['wiki_links'])

In [2]:
from dataclasses import dataclass, field

@dataclass
class Document:
    """A piece of text together with a metadata dictionary.

    After construction, ``metadata`` is guaranteed to contain the keys
    ``'title'`` and ``'url'``; each defaults to an empty string when the
    caller did not supply it. The dictionary passed by the caller is used
    as-is (not copied), so the defaults are added to that same object.
    """

    page_content: str
    metadata: dict = field(default_factory=dict)

    def __post_init__(self):
        # Fill in the keys the rest of the notebook relies on, without
        # overwriting values that are already present.
        for required_key in ('title', 'url'):
            self.metadata.setdefault(required_key, '')

In [None]:
from langchain_community.document_loaders import WikipediaLoader
from urllib.parse import unquote
import json
import os



def get_wikipedia_title(url):
    """Extract a human-readable article title from a Wikipedia URL.

    The title is the part of ``url`` after the last ``/wiki/``, with any
    ``#section`` anchor or ``?query`` suffix removed, percent-escapes
    decoded, and underscores turned into spaces.

    Args:
        url: A Wikipedia article URL, e.g.
            ``https://en.wikipedia.org/wiki/Tom_Ridge#Early_life``.

    Returns:
        The article title, e.g. ``"Tom Ridge"``.
    """
    # Take everything after the last "/wiki/" segment.
    title_part = url.split('/wiki/')[-1]
    # Drop the section anchor and query string. This is done before decoding
    # so that an encoded "%23" or "%3F" that belongs to the title itself is
    # not mistaken for a delimiter.
    title_part = title_part.split('#', 1)[0].split('?', 1)[0]
    # Decode percent-encoded characters, e.g. "%20" -> " ".
    title = unquote(title_part)
    # Wikipedia URLs use underscores where the title has spaces.
    return title.replace('_', ' ')


def link_to_json_file(wiki_link: str, language: str):
    """Load the Wikipedia content for ``wiki_link``.

    The article title derived from the link (whitespace-stripped) is used
    both as the loader query and as the returned file name.

    Args:
        wiki_link: URL of the Wikipedia article.
        language: Language code passed to ``WikipediaLoader`` as ``lang``.

    Returns:
        A ``(file_name, result)`` tuple. On success ``result`` is whatever
        ``WikipediaLoader(...).load()`` returns. If loading raises any
        exception, ``result`` is instead a dict with the keys
        ``job_status``, ``message``, ``error`` and ``file_name``, so callers
        must check which of the two they received.
    """
    query = get_wikipedia_title(wiki_link).strip()
    try:
        loader = WikipediaLoader(
            query=query,
            lang=language,
            load_all_available_meta=False,
        )
        return query, loader.load()
    except Exception as exc:
        # Best-effort: report the failure to the caller instead of raising.
        failure = {
            "job_status": "Failed",
            "message": "Failed To Process Wikipedia Query",
            "error": str(exc),
            "file_name": query,
        }
        return query, failure


# Download every referenced Wikipedia article and store it as
# ./doc/<file_name>.json with the keys 'page_content' and 'metadata'.

# The output directory only needs to be created once, before the loop.
os.makedirs('./doc/', exist_ok=True)

for wiki_url in references_doc:
    print(wiki_url)
    file_name, pages = link_to_json_file(wiki_url, 'en')

    # link_to_json_file returns an error dict (not a list of pages) when the
    # lookup fails; skip that reference instead of crashing on it.
    if isinstance(pages, dict):
        print(f"Skipping {wiki_url}: {pages.get('error')}")
        continue
    # An empty result has no first page to take the title from.
    if not pages:
        print(f"Skipping {wiki_url}: no pages returned")
        continue

    # Keep the name a single path component: spaces become underscores and a
    # "/" in a title must not be interpreted as a directory separator.
    file_name = file_name.replace(" ", "_").replace("/", "_")
    combined_content = "\n".join(page.page_content for page in pages)
    title = pages[0].metadata['title']
    print(title)

    content = Document(
        page_content=combined_content,
        # Record the URL of *this* reference, not always the first one.
        metadata={'title': title, 'url': wiki_url},
    )

    # Save the document to a file. A dataclass instance is not JSON
    # serializable, so its fields are written out as a plain dict.
    with open(f'./doc/{file_name}.json', 'w') as f:
        json.dump(
            {'page_content': content.page_content, 'metadata': content.metadata},
            f,
        )

    print(f"Saved pages to /doc/{file_name}.json")



https://en.wikipedia.org/wiki/United_States_federal_executive_departments#Former_departments
United States federal executive departments
Saved pages to /doc/United States federal executive departments#Former departments.json
https://en.wikipedia.org/wiki/United_States_Secretary_of_Homeland_Security
United States Secretary of Homeland Security
Saved pages to /doc/United States Secretary of Homeland Security.json
https://en.wikipedia.org/wiki/Tom_Ridge




  lis = BeautifulSoup(html).find_all('li')


Tom Ridge
Saved pages to /doc/Tom Ridge.json


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
import hashlib


def split2chunks(text: str,chunk_size:int,chunk_overlap:int) -> List[str]:
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Passed to the splitter as ``chunk_size``; lengths are
            measured with ``len``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter's ``split_text``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)


def generate_md5_hash(input_string):
    """Return the MD5 digest of ``input_string`` as a hex string.

    The string is encoded as UTF-8 before hashing.
    """
    encoded = input_string.encode('utf-8')
    return hashlib.md5(encoded).hexdigest()

def docs_to_chunks_json(doc_data:Document,chunk_size:int,chunk_overlap:int):
    """Build a JSON-serializable record of a document and its chunks.

    Args:
        doc_data: Document whose ``metadata['title']`` and ``page_content``
            are read.
        chunk_size: Forwarded to ``split2chunks``.
        chunk_overlap: Forwarded to ``split2chunks``.

    Returns:
        A dict with the keys:
          - ``doc_id``: the title with spaces replaced by underscores,
          - ``original_uuid``: MD5 hex digest of ``doc_id``,
          - ``content``: the full page content,
          - ``chunks``: a list of dicts, one per chunk, each holding
            ``chunk_id`` (``<doc_id>_chunk_<index>``), ``original_index``
            and ``content``.
    """
    doc_id = doc_data.metadata['title'].replace(" ", "_")
    record = {
        "doc_id": doc_id,
        "original_uuid": generate_md5_hash(doc_id),
        "content": doc_data.page_content,
    }
    pieces = split2chunks(doc_data.page_content, chunk_size, chunk_overlap)
    record['chunks'] = [
        {
            "chunk_id": f"{doc_id}_chunk_{position}",
            "original_index": position,
            "content": piece,
        }
        for position, piece in enumerate(pieces)
    ]
    return record


In [21]:
import json
import os


# Chunk every document saved under 'doc/' and write the result to
# './chunked/chunks_<original file name>'.

# Ensure the output directory exists (once, not once per file).
os.makedirs('./chunked/', exist_ok=True)

# Traverse files under the 'doc/' directory
for file_name in os.listdir('doc/'):
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join('doc/', file_name)

    # Read the document and close the input before any output is written.
    with open(file_path, encoding='utf-8') as file:
        doc_data = json.load(file)

    document = Document(page_content=doc_data['page_content'], metadata=doc_data['metadata'])
    chunked_data = docs_to_chunks_json(document, chunk_size=1000, chunk_overlap=100)
    # Each output file holds a one-element list containing this document.
    codebase_chunks = [chunked_data]

    # Save the chunked data to a file. ensure_ascii=False emits raw non-ASCII
    # characters, so the encoding is pinned to UTF-8 instead of relying on
    # the platform default, which may not be able to encode them.
    with open(f"./chunked/chunks_{file_name}", 'w', encoding='utf-8') as f:
        json.dump(codebase_chunks, f, indent=4, ensure_ascii=False)