In [6]:
import os
import glob
import pandas as pd
from tqdm.notebook import tqdm
import shutil
import random

tqdm.pandas()

In [7]:
def get_top_n_books(file_paths, n=1000):
    """
    Given a list of file paths, select top 'n' books based on character count.
    """
    book_lengths = []
    for file_path in tqdm(file_paths, desc=f"Calculating character counts for {n} books"):
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                text = file.read()
                char_count = len(text)
                book_lengths.append((file_path, char_count))
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    
    # Sort books by character count in descending order
    sorted_books = sorted(book_lengths, key=lambda x: x[1], reverse=True)
    
    # Select top 'n' books
    top_n_books = sorted_books[:n]
    
    return [book[0] for book in top_n_books]  # Return file paths

In [8]:
# Define path to original text files
DATA_PATH = '../data/Gutenberg_original/Gutenberg/txt/'

# Get list of all text files
txt_files = glob.glob(os.path.join(DATA_PATH, '*.txt'))

# Select top 1,000 longest books
top_1000_files = get_top_n_books(txt_files, n=1000)
random.seed(42)
# Select random 100 books from the top 1,000
random_100_files = random.sample(top_1000_files, 100)

Calculating character counts for 1000 books:   0%|          | 0/3022 [00:00<?, ?it/s]

Error reading ../data/Gutenberg_original/Gutenberg/txt\Charles Darwin___On the Origin of Species by Means of Natural Selection or the Preservation of Favoured Races in the Struggle for Life. (2nd edition).txt: [Errno 2] No such file or directory: '../data/Gutenberg_original/Gutenberg/txt\\Charles Darwin___On the Origin of Species by Means of Natural Selection or the Preservation of Favoured Races in the Struggle for Life. (2nd edition).txt'


In [9]:
# ## 4. Saving the Selected 100 Books

# Define directory to store the selected 100 books
SELECTED_100_DIR = '../data/selected_100_books/'

# Create the directory if it doesn't exist
os.makedirs(SELECTED_100_DIR, exist_ok=True)

# Initialize list to store book titles and their paths
selected_books_info = []

# Copy each of the top 100 books to the selected_100_books directory
for file_path in tqdm(random_100_files, desc="Copying Selected 100 Books"):
    try:
        # Extract the book title from the file name
        book_title = os.path.basename(file_path).replace('.txt', '').replace('_', ' ')
        
        # Define the destination path
        destination_path = os.path.join(SELECTED_100_DIR, f"{book_title}.txt")
        
        # Copy the file
        shutil.copy(file_path, destination_path)
        
        # Append information to the list
        selected_books_info.append({'book_title': book_title, 'file_path': destination_path})
    except Exception as e:
        print(f"Error copying {file_path}: {e}")

# Convert the list to a DataFrame
df_selected_books = pd.DataFrame(selected_books_info)

# Save the list of selected books to a CSV file for future reference
df_selected_books.to_csv('../data/selected_100_books_list.csv', index=False)

# Display a confirmation message
print("\nSelected 100 books have been copied successfully and saved to 'selected_100_books_list.csv'.")


Copying Selected 100 Books:   0%|          | 0/100 [00:00<?, ?it/s]


Selected 100 books have been copied successfully and saved to 'selected_100_books_list.csv'.
