In [5]:
!pip install py-gutenberg



In [1]:
# Imports
import pandas as pd
import numpy as np
import requests
import nltk
from tqdm import tqdm

nltk.download("punkt")  # Download the punkt tokenizer data
from nltk.tokenize import sent_tokenize

# Top 10 Books on Project Gutenberg; Can expand if needed
book_links = [
    "https://www.gutenberg.org/cache/epub/84/pg84.txt", # Frankenstein
    "https://www.gutenberg.org/cache/epub/2701/pg2701.txt", # Moby Dick; Or, The Whale
    "https://www.gutenberg.org/cache/epub/2641/pg2641.txt", # A Room with a View
    "https://www.gutenberg.org/cache/epub/145/pg145.txt", # Middlemarch
    "https://www.gutenberg.org/cache/epub/1342/pg1342.txt", # Pride and Prejudice
    "https://www.gutenberg.org/cache/epub/100/pg100.txt", # The Complete Works of William Shakespeare
    "https://www.gutenberg.org/cache/epub/37106/pg37106.txt", # Little Women
    "https://www.gutenberg.org/cache/epub/16389/pg16389.txt", # The Enchanted April
    "https://www.gutenberg.org/cache/epub/67979/pg67979.txt" # The Blue Castle
]


# Fuction that reads in project gutenberg books
def book_reader(book_links):
    corpus = ""

    # Loop through each book and add to corpus
    for book_url in tqdm(book_links):
        response = requests.get(book_url)
        book_text = response.text

        # Clean the text
        lowered_book = str(book_text).lower()
        cleaned_text = lowered_book.replace("\r", "").replace("\n", "")

        # Add to corpus
        corpus = corpus + cleaned_text

    return corpus


# Read in the books and tokenize them
all_text = book_reader(book_links)
sentences = sent_tokenize(all_text)
sentences[:10]


[nltk_data] Downloading package punkt to /Users/Austin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 9/9 [00:06<00:00,  1.50it/s]


['\ufeffthe project gutenberg ebook of frankenstein; or, the modern prometheus    this ebook is for the use of anyone anywhere in the united states andmost other parts of the world at no cost and with almost no restrictionswhatsoever.',
 'you may copy it, give it away or re-use it under the termsof the project gutenberg license included with this ebook or onlineat www.gutenberg.org.',
 'if you are not located in the united states,you will have to check the laws of the country where you are locatedbefore using this ebook.title: frankenstein; or, the modern prometheusauthor: mary wollstonecraft shelleyrelease date: october 1, 1993 [ebook #84]                most recently updated: december 2, 2022language: englishcredits: judith boss, christy phillips, lynn hanninen and david meltzer.',
 'html version by al haines.',
 'further corrections by menno de leeuw.',
 '*** start of the project gutenberg ebook frankenstein; or, the modern prometheus ***frankenstein;or, the modern prometheusby mary

In [2]:
# Chat GPT Prompt: Give me a list of lists in python of 100 sets of homophones
homophones_list = [
    ["ate", "eight"],
    ["bare", "bear"],
    ["brake", "break"],
    ["capital", "capitol"],
    ["cell", "sell"],
    ["cite", "site", "sight"],
    ["complement", "compliment"],
    ["desert", "dessert"],
    ["die", "dye"],
    ["flour", "flower"],
    ["hear", "here"],
    ["hour", "our"],
    ["knight", "night"],
    ["know", "no"],
    ["mail", "male"],
    ["meat", "meet"],
    ["morning", "mourning"],
    ["one", "won"],
    ["pair", "pear"],
    ["peace", "piece"],
    ["principal", "principle"],
    ["rain", "reign", "rein"],
    ["right", "write"],
    ["sea", "see"],
    ["serial", "cereal"],
    ["sole", "soul"],
    ["stationary", "stationery"],
    ["tail", "tale"],
    ["threw", "through"],
    ["to", "too", "two"],
    ["weather", "whether"],
    ["week", "weak"],
    ["wear", "where"],
    ["which", "witch"],
    ["your", "you're"],
    ["allowed", "aloud"],
    ["board", "bored"],
    ["brake", "break"],
    ["capital", "capitol"],
    ["compliment", "complement"],
    ["desert", "dessert"],
    ["dual", "duel"],
    ["fair", "fare"],
    ["genre", "jinja"],
    ["hare", "hair"],
    ["here", "hear"],
    ["hoard", "horde"],
    ["loan", "lone"],
    ["pail", "pale"],
    ["peak", "peek", "pique"],
    ["profit", "prophet"],
    ["role", "roll"],
    ["root", "route"],
    ["sail", "sale"],
    ["scene", "seen"],
    ["serial", "cereal"],
    ["so", "sow"],
    ["stare", "stair"],
    ["steal", "steel"],
    ["their", "there", "they're"],
    ["throne", "thrown"],
    ["vain", "vein", "vane"],
    ["weak", "week"],
    ["wood", "would"],
    ["yew", "you"],
    ["bridal", "bridle"],
    ["cereal", "serial"],
    ["chord", "cord"],
    ["compliment", "complement"],
    ["dew", "due"],
    ["foul", "fowl"],
    ["grate", "great"],
    ["groan", "grown"],
    ["heal", "heel"],
    ["him", "hymn"],
    ["lay", "lie"],
    ["main", "mane"],
    ["marry", "merry"],
    ["mite", "might"],
    ["moose", "mousse"],
    ["mourn", "morn"],
    ["peace", "piece"],
    ["plum", "plumb"],
    ["pour", "pore"],
    ["rap", "wrap"],
    ["scene", "seen"],
    ["scent", "cent", "sent"],
    ["serial", "cereal"],
    ["shear", "sheer"],
    ["soar", "sore"],
    ["sow", "sew"],
    ["stake", "steak"],
    ["tide", "tied"],
    ["toe", "tow"],
    ["there", "their", "they're"],
    ["waist", "waste"],
    ["week", "weak"],
    ["write", "right", "rite"],
]


In [3]:
import random
from tqdm import tqdm


# Function that adds homophone errors at some given probability
def error_creator(sentences, homophones_list, p=0.3):
    # Flatten the homophones list
    all_homophones = [
        word for homophone_set in homophones_list for word in homophone_set
    ]

    # Initialize lists for output DF
    final_sentences = []
    has_homophone_list = []
    is_error_list = []
    error_list = []
    error_idx_list = []
    correct_word_list = []
    correct_sentence_list = []

    # Loop through sentence by sentence
    for sentence in tqdm(sentences):
        # Split words
        sentence_words = sentence.split(" ")

        # Get homophones in sentence, if any
        sentence_homophones = [
            (word, idx)
            for idx, word in enumerate(sentence_words)
            if word in all_homophones
        ]

        # If no homophones, then move on
        if len(sentence_homophones) == 0:
            final_sentence = sentence
            is_error = False
            error = None
            error_idx = None
            correct_word = None
            correct_sentence = sentence
            has_homophone = False

        else:
            has_homophone = True

            # Randomly decide if we want to add an error given set probability
            if p > random.random():
                # Original sentence is assumed to be correct
                correct_sentence = sentence
                is_error = True

                # Randomly select a homophone
                homophone_tuple = random.sample(sentence_homophones, 1)[0]
                homophone = homophone_tuple[0]

                # Get the index of the homophone; important in case a sentence contains the same one multiple times (to, for example, could easily appear several times in a sentence)
                error_idx = homophone_tuple[1]

                correct_word = homophone

                # Get the alternate homophones
                homophone_alternates = [
                    word for word in homophones_list if homophone in word
                ][0]
                homophone_alternates = [
                    word for word in homophone_alternates if word != homophone
                ]

                # Randomly select an alternate homophone to insert
                error = random.sample(homophone_alternates, 1)[0]

                # Replace the homophone with the alternate homophone
                sentence_words[error_idx] = error

                final_sentence = " ".join(sentence_words)

            # If we don't want to add an error, then just keep the sentence as is
            else:
                final_sentence = sentence
                is_error = False
                error = None
                error_idx = None
                correct_word = None
                correct_sentence = sentence

        # Append to lists
        final_sentences.append(final_sentence)
        is_error_list.append(is_error)
        error_list.append(error)
        error_idx_list.append(error_idx)
        correct_word_list.append(correct_word)
        correct_sentence_list.append(correct_sentence)
        has_homophone_list.append(has_homophone)

    # Create output DF
    output_df = pd.DataFrame(
        {
            "sentences": final_sentences,
            "has_homophone": has_homophone_list,
            "is_error": is_error_list,
            "error_idx": error_idx_list,
            "error": error_list,
            "correct_word": correct_word_list,
            "correct_sentence": correct_sentence_list,
        }
    )
    return output_df


# Run function
error_df = error_creator(sentences=sentences, homophones_list=homophones_list, p=0.3)
error_df.head()


100%|██████████| 68573/68573 [00:03<00:00, 18127.98it/s]


Unnamed: 0,sentences,has_homophone,is_error,error_idx,error,correct_word,correct_sentence
0,﻿the project gutenberg ebook of frankenstein; ...,True,True,33.0,know,no,﻿the project gutenberg ebook of frankenstein; ...
1,"you may copy it, give it away or re-use it und...",True,False,,,,"you may copy it, give it away or re-use it und..."
2,"if you are not located in the united states,yo...",True,True,18.0,wear,where,"if you are not located in the united states,yo..."
3,html version by al haines.,False,False,,,,html version by al haines.
4,further corrections by menno de leeuw.,False,False,,,,further corrections by menno de leeuw.


In [4]:
error_df.shape

(68573, 7)

In [5]:
error_df["has_homophone"].value_counts()

True     45596
False    22977
Name: has_homophone, dtype: int64

In [6]:
error_df["is_error"].value_counts()

False    54927
True     13646
Name: is_error, dtype: int64

In [7]:
error_df["correct_word"].value_counts()

to       4370
you      1720
so        649
your      639
would     577
         ... 
sew         1
lone        1
site        1
tow         1
cent        1
Name: correct_word, Length: 127, dtype: int64

In [8]:
error_df["error"].value_counts()

too       2279
two       2229
yew       1720
sow        650
you're     639
          ... 
pair         1
sent         1
loan         1
tale         1
waste        1
Name: error, Length: 128, dtype: int64

In [10]:
# We can also create a dataframe of only the sentences that have homophones
homophone_df = error_df[error_df['has_homophone']].reset_index(drop=True)
homophone_df.head(20)

Unnamed: 0,sentences,has_homophone,is_error,error_idx,error,correct_word,correct_sentence
0,﻿the project gutenberg ebook of frankenstein; ...,True,True,33.0,know,no,﻿the project gutenberg ebook of frankenstein; ...
1,"you may copy it, give it away or re-use it und...",True,False,,,,"you may copy it, give it away or re-use it und..."
2,"if you are not located in the united states,yo...",True,True,18.0,wear,where,"if you are not located in the united states,yo..."
3,"petersburgh, dec. 11th, 17—.you will rejoice t...",True,True,7.0,here,hear,"petersburgh, dec. 11th, 17—.you will rejoice t..."
4,"i arrived here yesterday, and my first task is...",True,False,,,,"i arrived here yesterday, and my first task is..."
5,do you understand thisfeeling?,True,False,,,,do you understand thisfeeling?
6,"this breeze, witch has travelled from the regi...",True,True,2.0,witch,which,"this breeze, which has travelled from the regi..."
7,i try in vain to be persuaded that the pole is...,True,False,,,,i try in vain to be persuaded that the pole is...
8,"there—for with your leave, my sister, i will p...",True,False,,,,"there—for with your leave, my sister, i will p..."
9,imay there discover the wondrous power which a...,True,False,,,,imay there discover the wondrous power which a...


In [11]:
homophone_df.to_csv('../data/gutenberg-homophone-errors.csv', index=False)