# Project Gutenberg Analysis

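This notebook reads the Project Gutenberg metadata, extracts the text of each book from its zip archive under `data/books`, and caches the combined texts in a gzipped pickle so later runs load quickly.

TODO: explain why and how.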

In [None]:
import numpy as np
import pandas as pd
import zipfile
import sys
import os
import pickle
import gzip
from datetime import datetime
from metainfo import readmetadata

In [None]:
# Root directory for all on-disk data used by this notebook: book archives
# are read from DATA_ROOT/books/<id>.zip and the pickled text cache is
# stored as DATA_ROOT/books.text.pkl.gz.
DATA_ROOT = 'data'

In [None]:
# Load the book metadata and report how many entries were read and how
# long the load took.
time_start = datetime.now()
meta_data = readmetadata()
book_count = str(len(meta_data))
elapsed = str(datetime.now() - time_start)
print('Read in ' + book_count + ' books in ' + elapsed)
print(meta_data.shape)

In [None]:
# Show the column labels of the metadata table (presumably a pandas
# DataFrame returned by readmetadata -- confirm against metainfo).
print(meta_data.columns)

In [None]:
def get_book_text(book_id=None, data_root=None):
    """Return the flattened text of one book, or None if it cannot be read.

    The book is read from ``<data_root>/books/<book_id>.zip``. The first
    member of the archive is decoded as UTF-8, everything up to and
    including the second ``***`` marker is dropped (this removes the
    Project Gutenberg header), and line breaks are replaced by spaces.

    Note that any later ``***`` markers are removed but the text around
    them is kept, so whatever follows the body in the file is still part
    of the returned string.

    Args:
        book_id: Identifier used to build the archive file name. A falsy
            value (None, 0, '') returns None immediately.
        data_root: Directory that contains the ``books`` folder. Defaults
            to the module-level ``DATA_ROOT``.

    Returns:
        The book text as a single string (empty if the file holds fewer
        than two ``***`` markers), or None when the archive is missing,
        empty, corrupt, or not valid UTF-8.
    """
    if not book_id:
        return None
    if data_root is None:
        data_root = DATA_ROOT
    file_path = os.path.join(data_root, 'books', str(book_id) + '.zip')
    if not os.path.isfile(file_path):
        return None

    try:
        with zipfile.ZipFile(file_path) as archive:
            members = archive.namelist()
            if not members:
                # An archive without members has no text to return.
                return None
            # Assuming we are after the only/first file
            with archive.open(members[0]) as member:
                raw_data = member.read().decode('utf-8')
    except Exception:
        # Best effort: a corrupt archive or a non-UTF-8 file is skipped
        # rather than aborting a long batch run. Catching Exception (not a
        # bare except) lets KeyboardInterrupt/SystemExit propagate.
        return None

    # This removes the Project Gutenberg Header
    book_text = ''.join(raw_data.split('***')[2:])
    # Removes new lines
    return book_text.replace('\n', ' ').replace('\r', ' ')

In [None]:
def get_all_books_text(meta_data):
    """Return a dict mapping book id to book text, using an on-disk cache.

    If ``DATA_ROOT/books.text.pkl.gz`` exists it is unpickled and returned
    as-is. Otherwise every id in ``meta_data['id']`` is passed to
    ``get_book_text``; books that yield no text are skipped, and the
    resulting dict is written to the cache file before being returned.

    Args:
        meta_data: Object supporting ``len()`` and ``meta_data['id']``,
            where the latter iterates over book ids.

    Returns:
        A dict of ``{book_id: book_text}`` containing only the books for
        which non-empty text was obtained.
    """
    file_name = os.path.join(DATA_ROOT, 'books.text.pkl.gz')
    if os.path.isfile(file_name):
        # NOTE: unpickling runs arbitrary code from the file; this is only
        # safe because the cache is produced locally by this function.
        with gzip.open(file_name, 'rb') as cache_file:
            return pickle.load(cache_file)

    text = {}
    num_books = len(meta_data)
    start_time = datetime.now()
    for counter, book_id in enumerate(meta_data['id']):
        if counter % 1000 == 0:
            print('Processing book %s of %s in %s' % (counter, num_books, datetime.now() - start_time))
        book_text = get_book_text(book_id=book_id)
        if book_text:
            text[book_id] = book_text

    # Close the gzip stream explicitly so the cache is fully flushed to
    # disk before returning, instead of relying on garbage collection.
    with gzip.open(file_name, 'wb') as cache_file:
        pickle.dump(text, cache_file, protocol=-1)
    return text

In [None]:
# Load the text of every book (from the cache when available) and report
# how many were loaded and how long it took.
time_start = datetime.now()
text = get_all_books_text(meta_data)
loaded_count = str(len(text))
elapsed = str(datetime.now() - time_start)
print('loaded ' + loaded_count + ' books in ' + elapsed)