# Preppare data for the RAG App

In [1]:
import requests
from pathlib import Path

def download_file(url: str, filename: str) -> None:
    """
    Download a file from a URL and save it to a specified location.

    Parameters:
    url (str): The URL of the file to download.
    filename (str): The name to save the file as.

    Returns:
    None
    """
    # Construct the destination path
    destination = Path('./data') / filename
    destination.parent.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists

    response = requests.get(url)
    response.raise_for_status()  # Raise an exception if the request was unsuccessful

    with open(destination, 'wb') as f:
        f.write(response.content)

    print(f"File downloaded successfully to {destination}")

In [2]:
books = {
        "Pride and Prejudice":'https://www.gutenberg.org/cache/epub/1342/pg1342.txt',
         "The Adventures of Sherlock Holmes":'https://www.gutenberg.org/cache/epub/1661/pg1661.txt',
         "Alice’s Adventures in Wonderland":'https://www.gutenberg.org/cache/epub/11/pg11.txt',
         "The Picture of Dorian Gray":'https://www.gutenberg.org/cache/epub/174/pg174.txt',
         "Dracula":'https://www.gutenberg.org/cache/epub/345/pg345.txt'

}

for book in books:
    download_file(books[book], book)

File downloaded successfully to data/Pride and Prejudice
File downloaded successfully to data/The Adventures of Sherlock Holmes
File downloaded successfully to data/Alice’s Adventures in Wonderland
File downloaded successfully to data/The Picture of Dorian Gray
File downloaded successfully to data/Dracula


In [3]:
import reprlib
def display_pages(text):
    lines = text.splitlines()
    count = 0
    for line in lines:
        if line.strip(): 
            print(repr(line))
            count += 1
        if count >= 200:
            break

In [4]:
for book in books:
    with open(f'./data/{book}', 'r') as file:
        content = file.read()
        display_pages(content)
        print('\n')
        print('-'* 80)
        print('\n')

 

'\ufeffThe Project Gutenberg eBook of Pride and Prejudice'
'This ebook is for the use of anyone anywhere in the United States and'
'most other parts of the world at no cost and with almost no restrictions'
'whatsoever. You may copy it, give it away or re-use it under the terms'
'of the Project Gutenberg License included with this ebook or online'
'at www.gutenberg.org. If you are not located in the United States,'
'you will have to check the laws of the country where you are located'
'before using this eBook.'
'Title: Pride and Prejudice'
'Author: Jane Austen'
'Release date: June 1, 1998 [eBook #1342]'
'                Most recently updated: October 29, 2024'
'Language: English'
'Credits: Chuck Greif and the Online Distributed Proofreading Team at http://www.pgdp.net (This file was produced from images available at The Internet Archive)'
'*** START OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE ***'
'                            [Illustration:'
'                             GEORGE A