In [3]:
 
import urllib.request
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
import scipy.io
import numpy as np
def process_ulysses(file_path):
    """Load the 'words' and 'counts' arrays of a .mat file into a DataFrame.

    Args:
        file_path: Path to a MATLAB file readable by ``scipy.io.loadmat``
            that contains the variables ``words`` and ``counts``.

    Returns:
        A DataFrame with columns ``Word`` and ``Count`` (one row per entry),
        or ``None`` when the file is missing, cannot be read, or the two
        arrays differ in length. The reason is printed in each failure case.
    """
    try:
        mat_contents = scipy.io.loadmat(file_path)
        # squeeze() drops the singleton axis loadmat adds to vectors, but it
        # also turns a one-entry array into a 0-d scalar on which len()
        # fails; atleast_1d keeps that case a length-1 vector.
        words = np.atleast_1d(mat_contents['words'].squeeze())
        counts = np.atleast_1d(mat_contents['counts'].squeeze())
        if len(words) != len(counts):
            print("The lengths of 'words' and 'counts' do not match.")
            return None
        return pd.DataFrame({
            'Word': words,
            'Count': counts,
        })
    except FileNotFoundError:
        print(f"The file {file_path} does not exist.")
    except Exception as e:
        # Deliberately best-effort: report the problem (missing key, corrupt
        # file, ...) and let the caller decide what to do with None.
        print(f"An error occurred: {e}")
    return None

# Text sources keyed by short title: (url, first token index, end token index).
# The two indices are positions in the token list built from the whole
# download and are used as a half-open slice, so they must be re-tuned by hand
# whenever the remote file changes.
_BOOK_SOURCES = {
    # Le Compte de Monte Cristo (in French)
    'compte': ("https://www.gutenberg.org/cache/epub/17989/pg17989.txt", 313, 131872),
    'fall': ("https://www.gutenberg.org/cache/epub/2148/pg2148-images.html#chap2.8", 36604, 44001),
    'tell': ("https://www.gutenberg.org/cache/epub/2148/pg2148-images.html#chap2.20", 90883, 93077),
    'pride': ("https://www.gutenberg.org/cache/epub/42671/pg42671.txt", 337, 123673),
}


def read_word_counts(book_title):
    """Download a book and count how often each token occurs in it.

    The whole download is decoded as UTF-8, split into ``\\w+`` tokens, and
    cut down to the token range configured for the book. The first 50 and
    last 10 tokens of that range are printed so the range can be checked by
    eye.

    Args:
        book_title: One of the keys of ``_BOOK_SOURCES``
            ('compte', 'fall', 'tell', 'pride').

    Returns:
        A DataFrame with columns ``Word`` and ``Count``, sorted by ``Count``
        in ascending order with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``book_title`` is not a known key.
    """
    try:
        url, word_start, word_end = _BOOK_SOURCES[book_title]
    except KeyError:
        known = ", ".join(sorted(_BOOK_SOURCES))
        raise ValueError(
            f"Unknown book title {book_title!r}; expected one of: {known}"
        ) from None

    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        content = response.read().decode('utf-8')

    # Token indices, not character offsets, select the body of the text.
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(content)[word_start:word_end]

    # NOTE(review): the label says 10 but 50 tokens are shown; the output is
    # left unchanged here so existing recorded runs still match.
    print("First 10 words:", words[:50])
    print("Last 10 words:", words[-10:])

    word_counts = pd.Series(words).value_counts().reset_index()
    word_counts.columns = ['Word', 'Count']
    return word_counts.sort_values(by='Count', ascending=True).reset_index(drop=True)
# Estimate the innovation rate from a word-frequency table
def estimate_innovation_rate(counts_df):
    """Rank words by frequency and estimate the innovation rate.

    Args:
        counts_df: DataFrame with a numeric ``Count`` column, one row per
            distinct word.

    Returns:
        A tuple ``(rho_est, ranked_df)``. ``rho_est`` is the number of rows
        divided by the sum of ``Count``. ``ranked_df`` is a re-indexed copy
        sorted by ``Count`` descending with an added 1-based ``Rank`` column.
    """
    ranked = counts_df.sort_values(by='Count', ascending=False).reset_index(drop=True)
    n_types = len(ranked)
    ranked = ranked.assign(Rank=np.arange(1, n_types + 1))
    total_tokens = ranked["Count"].sum()
    return n_types / total_tokens, ranked
# Theoretical fractions n_1, n_2, n_3 for a given innovation rate
def calculate_theoretical_values(rho_est):
    """Return the theoretical values ``(n_1, n_2, n_3)`` for ``rho_est``.

    ``n_1`` is ``1 / (2 - rho)``. Each following value is obtained from the
    previous one by the factor ``(k - 1)(1 - rho) / (1 + (1 - rho) k)`` for
    ``k = 2, 3``.

    Args:
        rho_est: Innovation rate estimate.

    Returns:
        The tuple ``(n_1, n_2, n_3)``.
    """
    keep = 1 - rho_est

    first = 1 / (2 - rho_est)
    second = first * keep / (1 + keep * 2)
    third = second * (2 - 2 * rho_est) / (1 + keep * 3)

    return first, second, third
def get_word_counts(df):
    """Print a summary of rare words and return the innovation rate estimate.

    Reports the total of the ``Count`` column, the number of rows, how many
    rows have a count of exactly 1, 2 and 3, and each of those three tallies
    as a fraction of the number of rows (rounded to 3 decimals).

    Args:
        df: DataFrame with a numeric ``Count`` column, one row per distinct
            word.

    Returns:
        The number of rows divided by the sum of ``Count``.
    """
    n_types = len(df)
    n_tokens = df["Count"].sum()

    # Tally rows whose count is exactly 1, 2 or 3.
    once, twice, thrice = (int((df['Count'] == k).sum()) for k in (1, 2, 3))

    print("The total number of words is: ", n_tokens, " and the number of unique words is: ", n_types)
    print(f"Unique Words that appear once: {once}")
    print(f"Unique Words that appear twice: {twice}")
    print(f"Unique Words that appear three times: {thrice}")
    print("\n-------\n")
    print("Thus our n_1^g estimate is: ", round(once / n_types, 3))
    print("Thus our n_2^g estimate is: ", round(twice / n_types, 3))
    print("Thus our n_3^g estimate is: ", round(thrice / n_types, 3))
    print("\n-------------\n")

    # Innovation rate estimate: distinct words over total words.
    return n_types / n_tokens
# Main execution function for each book
def analyze_book(book_title, ulysses_path='/Users/robinwoodfamily/Downloads/ulysses.mat'):
    """Print the word-frequency analysis for one book.

    For 'ulysses' the counts come from a local .mat file via
    ``process_ulysses``; every other title is downloaded and tokenized by
    ``read_word_counts``.

    Args:
        book_title: 'ulysses' or a title accepted by ``read_word_counts``.
        ulysses_path: Location of the Ulysses .mat file. Only used when
            ``book_title`` is 'ulysses'; defaults to the original hard-coded
            path.

    Returns:
        ``(innovation_rate, theoretical_values)`` where ``theoretical_values``
        is the ``(n_1, n_2, n_3)`` tuple, or ``None`` when the Ulysses data
        could not be loaded.
    """
    print(f"\nBook: {book_title}")

    if book_title == 'ulysses':
        mat_data = process_ulysses(ulysses_path)
        if mat_data is None:
            # process_ulysses has already printed why loading failed; without
            # this guard the next call would fail on None.
            return None
        innovation_rate, processed_counts_df = estimate_innovation_rate(mat_data)
        # NOTE(review): the theoretical values use a fixed rho of 0.1150
        # rather than the rate estimated above - confirm this is intended.
        theoretical_values_est = calculate_theoretical_values(0.1150)
        get_word_counts(processed_counts_df)
    else:
        word_counts_df = read_word_counts(book_title)
        get_word_counts(word_counts_df)
        innovation_rate, processed_counts_df = estimate_innovation_rate(word_counts_df)
        theoretical_values_est = calculate_theoretical_values(innovation_rate)

    n_1, n_2, n_3 = theoretical_values_est
    print(f"Estimated Innovation Rate (rho): {innovation_rate:.4f}")
    print(f"Theoretical n_1: {n_1:.3f}, n_2: {n_2:.3f}, n_3: {n_3:.3f}")
    return innovation_rate, theoretical_values_est

# Run the analysis for a single book. Other titles handled by the functions
# in this file are 'pride', 'fall', 'tell' and 'ulysses'; swap the argument
# to analyze one of those instead.
analyze_book('compte')
 


Book: compte
First 10 words: ['Le', '24', 'février', '1815', 'la', 'vigie', 'de', 'Notre', 'Dame', 'de', 'la', 'Garde', 'signala', 'le', 'trois', 'mâts', 'le', '_Pharaon_', 'venant', 'de', 'Smyrne', 'Trieste', 'et', 'Naples', 'Comme', 'd', 'habitude', 'un', 'pilote', 'côtier', 'partit', 'aussitôt', 'du', 'port', 'rasa', 'le', 'château', 'd', 'If', 'et', 'alla', 'aborder', 'le', 'navire', 'entre', 'le', 'cap', 'de', 'Morgion', 'et']
Last 10 words: ['les', 'enchantements', 'de', 'ce', 'rêve', 'inouï', 'FIN', 'DU', 'TOME', 'PREMIER']
The total number of words is:  131559  and the number of unique words is:  11630
Unique Words that appear once: 5737
Unique Words that appear twice: 1782
Unique Words that appear three times: 961

-------

Thus our n_1^g estimate is:  0.493
Thus our n_2^g estimate is:  0.153
Thus our n_3^g estimate is:  0.083

-------------

Estimated Innovation Rate (rho): 0.0884
Theoretical n_1: 0.523, n_2: 0.169, n_3: 0.082


(np.float64(0.08840140165249052),
 (np.float64(0.5231223756203079),
  np.float64(0.16891403297578708),
  np.float64(0.08245794637893936)))