## Downloading Book Data

In [2]:
# Imports
import pandas as pd
import numpy as np
import requests
import nltk
from tqdm import tqdm

nltk.download("punkt")  # Download the punkt tokenizer data
from nltk.tokenize import sent_tokenize

# Top 10 Books on Project Gutenberg; Can expand if needed
book_links = [
    "https://www.gutenberg.org/cache/epub/84/pg84.txt", # Frankenstein
    "https://www.gutenberg.org/cache/epub/2701/pg2701.txt", # Moby Dick; Or, The Whale
    "https://www.gutenberg.org/cache/epub/2641/pg2641.txt", # A Room with a View
    "https://www.gutenberg.org/cache/epub/145/pg145.txt", # Middlemarch
    "https://www.gutenberg.org/cache/epub/1342/pg1342.txt", # Pride and Prejudice
    "https://www.gutenberg.org/cache/epub/100/pg100.txt", # The Complete Works of William Shakespeare
    "https://www.gutenberg.org/cache/epub/37106/pg37106.txt", # Little Women
    "https://www.gutenberg.org/cache/epub/16389/pg16389.txt", # The Enchanted April
    "https://www.gutenberg.org/cache/epub/67979/pg67979.txt" # The Blue Castle
]


# Fuction that reads in project gutenberg books
def book_reader(book_links):
    corpus = ""

    # Loop through each book and add to corpus
    for book_url in tqdm(book_links):
        response = requests.get(book_url)
        book_text = response.text

        # Clean the text
        lowered_book = str(book_text).lower()
        cleaned_text = lowered_book.replace("\r", "").replace("\n", "")

        # Add to corpus
        corpus = corpus + cleaned_text

    return corpus


# Read in the books and tokenize them
all_text = book_reader(book_links)
sentences = sent_tokenize(all_text)
sentences[:10]


[nltk_data] Downloading package punkt to /Users/Austin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 9/9 [00:03<00:00,  2.88it/s]


['\ufeffthe project gutenberg ebook of frankenstein; or, the modern prometheus    this ebook is for the use of anyone anywhere in the united states andmost other parts of the world at no cost and with almost no restrictionswhatsoever.',
 'you may copy it, give it away or re-use it under the termsof the project gutenberg license included with this ebook or onlineat www.gutenberg.org.',
 'if you are not located in the united states,you will have to check the laws of the country where you are locatedbefore using this ebook.title: frankenstein; or, the modern prometheusauthor: mary wollstonecraft shelleyrelease date: october 1, 1993 [ebook #84]                most recently updated: december 2, 2022language: englishcredits: judith boss, christy phillips, lynn hanninen and david meltzer.',
 'html version by al haines.',
 'further corrections by menno de leeuw.',
 '*** start of the project gutenberg ebook frankenstein; or, the modern prometheus ***frankenstein;or, the modern prometheusby mary

## Homophone List

In [4]:
homophones_list = [
    ['accessary', 'accessory'],
    ['ad', 'add'],
    ['ail', 'ale'],
    ['air', 'heir'],
    ['aisle', "I'll", 'isle'],
    ['all', 'awl'],
    ['allowed', 'aloud'],
    ['altar', 'alter'],
    ['arc', 'ark'],
    ['ant', 'aunt'],
    ['ate', 'eight'],
    ['auger', 'augur'],
    ['auk', 'orc'],
    ['aural', 'oral'],
    ['away', 'aweigh'],
    ['aw', 'awe'],
    ['ore', 'oar', 'or'],
    ['axel', 'axle'],
    ['aye', 'eye', 'I'],
    ['bail', 'bale'],
    ['bait', 'bate'],
    ['baize', 'bays'],
    ['bald', 'bawled'],
    ['ball', 'bawl'],
    ['band', 'banned'],
    ['bard', 'barred'],
    ['bare', 'bear'],
    ['bark', 'barque'],
    ['baron', 'barren'],
    ['base', 'bass'],
    ['based', 'baste'],
    ['bazaar', 'bizarre'],
    ['be', 'bee'],
    ['bay', 'bey'],
    ['beach', 'beech'],
    ['bean', 'been'],
    ['beat', 'beet'],
    ['beau', 'bow'],
    ['beer', 'bier'],
    ['bel', 'bell', 'belle'],
    ['berry', 'bury'],
    ['berth', 'birth'],
    ['bight', 'bite', 'byte'],
    ['billed', 'build'],
    ['bitten', 'bittern'],
    ['blew', 'blue'],
    ['bloc', 'block', 'bloque'],
    ['boar', 'bore'],
    ['board', 'bored'],
    ['boarder', 'border'],
    ['bold', 'bowled'],
    ['boos', 'booze'],
    ['born', 'borne'],
    ['bough', 'bow'],
    ['boy', 'buoy'],
    ['brae', 'bray'],
    ['braid', 'brayed'],
    ['braise', 'brays', 'braze'],
    ['brake', 'break'],
    ['bread', 'bred'],
    ['brews', 'bruise'],
    ['bridal', 'bridle'],
    ['broach', 'brooch'],
    ['bur', 'burr', 'brr'],
    ['but', 'butt'],
    ['buy', 'by', 'bye'],
    ['buyer', 'byre'],
    ['calendar', 'calender'],
    ['call', 'caul'],
    ['canvas', 'canvass'],
    ['cast', 'caste'],
    ['caster', 'castor'],
    ['caught', 'court'],
    ['caw', 'core', 'corps'],
    ['cede', 'seed'],
    ['ceiling', 'sealing'],
    ['cell', 'sell'],
    ['censer', 'censor', 'sensor'],
    ['cent', 'scent', 'sent'],
    ['cereal', 'serial'],
    ['cheap', 'cheep'],
    ['check', 'cheque'],
    ['choir', 'quire'],
    ['chord', 'cord'],
    ['cite', 'sight', 'site'],
    ['clack', 'claque'],
    ['clew', 'clue'],
    ['climb', 'clime'],
    ['close', 'cloze'],
    ['coal', 'kohl'],
    ['coarse', 'course'],
    ['coign', 'coin'],
    ['colonel', 'kernel'],
    ['complacent', 'complaisant'],
    ['complement', 'compliment'],
    ['coo', 'coup'],
    ['cops', 'copse'],
    ['council', 'counsel'],
    ['cousin', 'cozen'],
    ['creak', 'creek'],
    ['crews', 'cruise'],
    ['cue', 'kyu', 'queue'],
    ['curb', 'kerb'],
    ['currant', 'current'],
    ['cymbol', 'symbol'],
    ['dam', 'damn'],
    ['days', 'daze'],
    ['dear', 'deer'],
    ['descent', 'dissent'],
    ['desert', 'dessert'],
    ['deviser', 'divisor'],
    ['dew', 'due'],
    ['die', 'dye'],
    ['discreet', 'discrete'],
    ['doe', 'doh', 'dough'],
    ['done', 'dun'],
    ['douse', 'dowse'],
    ['draft', 'draught'],
    ['dual', 'duel'],
    ['earn', 'urn'],
    ['eery', 'eyrie'],
    ['ewe', 'yew', 'you'],
    ['faint', 'feint'],
    ['fah', 'far'],
    ['fair', 'fare'],
    ['fairy', 'ferry'],
    ['fate', 'fete'],
    ['farther', 'father'],
    ['faun', 'fawn'],
    ['faze', 'phase'],
    ['fay', 'fey'],
    ['feat', 'feet'],
    ['ferrule', 'ferule'],
    ['few', 'phew'],
    ['fie', 'phi'],
    ['file', 'phial'],
    ['find', 'fined'],
    ['fir', 'fur'],
    ['fizz', 'phiz'],
    ['flair', 'flare'],
    ['flaw', 'floor'],
    ['flea', 'flee'],
    ['flex', 'flecks'],
    ['flew', 'flu', 'flue'],
    ['floe', 'flow'],
    ['flour', 'flower'],
    ['for', 'fore', 'four'],
    ['foreword', 'forward'],
    ['fort', 'fought'],
    ['forth', 'fourth'],
    ['foul', 'fowl'],
    ['franc', 'frank'],
    ['freeze', 'frieze'],
    ['friar', 'fryer'],
    ['furs', 'furze'],
    ['gait', 'gate'],
    ['galipot', 'gallipot'],
    ['gamble', 'gambol'],
    ['gallop', 'galop'],
    ['gays', 'gaze'],
    ['genes', 'jeans'],
    ['gild', 'guild'],
    ['gilt', 'guilt'],
    ['giro', 'gyro'],
    ['gnaw', 'nor'],
    ['gneiss', 'nice'],
    ['gorilla', 'guerilla'],
    ['grate', 'great'],
    ['greave', 'grieve'],
    ['greys', 'graze'],
    ['grisly', 'grizzly'],
    ['groan', 'grown'],
    ['guessed', 'guest'],
    ['hail', 'hale'],
    ['hair', 'hare'],
    ['hall', 'haul'],
    ['hangar', 'hanger'],
    ['hart', 'heart'],
    ['haw', 'hoar', 'whore'],
    ['hay', 'hey'],
    ['heal', 'heel', "he'll"],
    ['here', 'hear'],
    ['heard', 'herd'],
    ["he'd", 'heed'],
    ['heroin', 'heroine'],
    ['hew', 'hue'],
    ['hi', 'high'],
    ['higher', 'hire'],
    ['him', 'hymn'],
    ['ho', 'hoe'],
    ['hoard', 'horde'],
    ['hoarse', 'horse'],
    ['holey', 'holy', 'wholly'],
    ['hour', 'our'],
    ['idle', 'idol'],
    ['in', 'inn'],
    ['indict', 'indite'],
    ["it's", 'its'],
    ['jewel', 'joule', 'juul'],
    ['key', 'quay'],
    ['knave', 'nave'],
    ['knead', 'need'],
    ['knew', 'new'],
    ['knight', 'night'],
    ['knit', 'nit'],
    ['knob', 'nob'],
    ['knock', 'nock'],
    ['knot', 'not'],
    ['know', 'no'],
    ['knows', 'nose'],
    ['laager', 'lager'],
    ['lac', 'lack'],
    ['lade', 'laid'],
    ['lain', 'lane'],
    ['lam', 'lamb'],
    ['laps', 'lapse'],
    ['larva', 'lava'],
    ['lase', 'laze'],
    ['law', 'lore'],
    ['lay', 'ley'],
    ['lea', 'lee'],
    ['leach', 'leech'],
    ['lead', 'led'],
    ['leak', 'leek'],
    ['lean', 'lien'],
    ['lessen', 'lesson'],
    ['levee', 'levy'],
    ['liar', 'lyre'],
    ['licence', 'license'],
    ['licker', 'liquor'],
    ['lie', 'lye'],
    ['lieu', 'loo'],
    ['links', 'lynx'],
    ['lo', 'low'],
    ['load', 'lode'],
    ['loan', 'lone'],
    ['locks', 'lox'],
    ['loop', 'loupe'],
    ['loot', 'lute'],
    ['made', 'maid'],
    ['mail', 'male'],
    ['main', 'mane'],
    ['maize', 'maze'],
    ['mall', 'maul'],
    ['manna', 'manner'],
    ['mantel', 'mantle'],
    ['mare', 'mayor'],
    ['mark', 'marque'],
    ['marshal', 'martial'],
    ['marten', 'martin'],
    ['mask', 'masque'],
    ['maw', 'more'],
    ['me', 'mi'],
    ['mean', 'mien'],
    ['meat', 'meet', 'mete'],
    ['medal', 'meddle'],
    ['metal', 'mettle'],
    ['meter', 'metre'],
    ['might', 'mite'],
    ['miner', 'minor', 'mynah'],
    ['mind', 'mined'],
    ['missed', 'mist'],
    ['moat', 'mote'],
    ['mode', 'mowed'],
    ['moor', 'more'],
    ['moose', 'mousse'],
    ['morning', 'mourning'],
    ['muscle', 'mussel'],
    ['naval', 'navel'],
    ['nay', 'neigh'],
    ['nigh', 'nye'],
    ['none', 'nun'],
    ['od', 'odd'],
    ['ode', 'owed'],
    ['oh', 'owe'],
    ['one', 'won'],
    ['packed', 'pact'],
    ['packs', 'pax'],
    ['pail', 'pale'],
    ['pain', 'pane'],
    ['pair', 'pare', 'pear'],
    ['palate', 'palette', 'pallet'],
    ['pascal', 'paschal'],
    ['paten', 'patten', 'pattern'],
    ['pause', 'paws', 'pores', 'pours'],
    ['peace', 'piece'],
    ['peak', 'peek', 'pique', 'peke'],
    ['pea', 'pee'],
    ['peal', 'peel'],
    ['pearl', 'purl'],
    ['pedal', 'peddle'],
    ['peer', 'pier'],
    ['pi', 'pie'],
    ['pica', 'pika'],
    ['place', 'plaice'],
    ['plain', 'plane'],
    ['pleas', 'please'],
    ['pole', 'poll'],
    ['plum', 'plumb'],
    ['poof', 'pouffe'],
    ['practice', 'practise'],
    ['praise', 'prays', 'preys'],
    ['principal', 'principle'],
    ['profit', 'prophet'],
    ['quarts', 'quartz'],
    ['quean', 'queen'],
    ['rain', 'reign', 'rein'],
    ['raise', 'rays', 'raze'],
    ['rap', 'wrap'],
    ['raw', 'roar'],
    ['read', 'reed'],
    ['read', 'red'],
    ['real', 'reel'],
    ['reek', 'wreak'],
    ['rest', 'wrest'],
    ['retch', 'wretch'],
    ['review', 'revue'],
    ['rheum', 'room'],
    ['right', 'rite', 'wright', 'write'],
    ['ring', 'wring'],
    ['road', 'rode'],
    ['roe', 'row'],
    ['role', 'roll'],
    ['roo', 'roux', 'rue'],
    ['rood', 'rude'],
    ['root', 'route'],
    ['rose', 'rows'],
    ['rota', 'rotor'],
    ['rote', 'wrote'],
    ['rough', 'ruff'],
    ['rouse', 'rows'],
    ['rung', 'wrung'],
    ['rye', 'wry'],
    ['saver', 'savour'],
    ['scull', 'skull'],
    ['spade', 'spayed'],
    ['sale', 'sail'],
    ['sane', 'seine'],
    ['satire', 'satyr'],
    ['sauce', 'source'],
    ['saw', 'soar', 'sore'],
    ['scene', 'seen'],
    ['sea', 'see'],
    ['seam', 'seem'],
    ['sear', 'seer', 'sere'],
    ['seas', 'sees', 'seize'],
    ['shake', 'sheikh'],
    ['sew', 'so', 'sow'],
    ['shear', 'sheer'],
    ['shoe', 'shoo'],
    ['sic', 'sick'],
    ['side', 'sighed'],
    ['sign', 'sine'],
    ['sink', 'synch'],
    ['slay', 'sleigh'],
    ['sloe', 'slow'],
    ['sole', 'soul'],
    ['some', 'sum'],
    ['son', 'sun'],
    ['sort', 'sought'],
    ['spa', 'spar'],
    ['staid', 'stayed'],
    ['stair', 'stare'],
    ['stake', 'steak'],
    ['stalk', 'stork'],
    ['stationary', 'stationery'],
    ['steal', 'steel'],
    ['stile', 'style'],
    ['storey', 'story'],
    ['straight', 'strait'],
    ['sweet', 'suite'],
    ['swat', 'swot'],
    ['tacks', 'tax'],
    ['tale', 'tail'],
    ['talk', 'torque'],
    ['tare', 'tear'],
    ['taught', 'taut', 'tort'],
    ['te', 'tea', 'tee', 't', 'ti'],
    ['team', 'teem'],
    ['tear', 'tier'],
    ['teas', 'tease'],
    ['terce', 'terse'],
    ['tern', 'turn'],
    ['there', 'their', "they're"],
    ['threw', 'through', 'thru'],
    ['throes', 'throws'],
    ['throne', 'thrown'],
    ['thyme', 'time'],
    ['tic', 'tick'],
    ['tide', 'tied'],
    ['tire', 'tyre'],
    ['to', 'too', 'two'],
    ['toad', 'toed', 'towed'],
    ['told', 'tolled'],
    ['tole', 'toll'],
    ['ton', 'tun'],
    ['tor', 'tore'],
    ['tough', 'tuff'],
    ['troop', 'troupe'],
    ['tuba', 'tuber'],
    ['vain', 'vane', 'vein'],
    ['vale', 'veil'],
    ['vial', 'vile'],
    ['vice', 'vise'],
    ['wade', 'weighed'],
    ['weak', 'week'],
    ['we', 'wee', 'whee'],
    ['way', 'weigh', 'whey'],
    ['wax', 'whacks'],
    ['wart', 'wort'],
    ['watt', 'what'],
    ['warn', 'worn'],
    ['ware', 'wear', 'where'],
    ['war', 'wore'],
    ['wall', 'waul'],
    ['waive', 'wave'],
    ['wait', 'weight'],
    ['wail', 'wale', 'whale'],
    ['wain', 'wane'],
    ["we'd", 'weed'],
    ['weal', "we'll", 'wheel'],
    ['wean', 'ween'],
    ['weather', 'whether'],
    ['weaver', 'weever'],
    ['weir', "we're"],
    ['were', 'whirr'],
    ['wet', 'whet'],
    ['wheald', 'wheeled'],
    ['which', 'witch'],
    ['whig', 'wig'],
    ['while', 'wile'],
    ['whine', 'wine'],
    ['whirl', 'whorl'],
    ['whirled', 'world'],
    ['whit', 'wit'],
    ['white', 'wight'],
    ["who's", 'whose'],
    ['woe', 'whoa'],
    ['wood', 'would'],
    ['yaw', 'yore', 'your', "you're"],
    ['yoke', 'yolk'],
    ["you'll", 'yule']
]

## Error Creation

In [91]:
import random
from tqdm import tqdm


# Function that adds homophone errors at some given probability
def error_creator(sentences, homophones_list, p=0.3):
    
    # Flatten the homophones list
    all_homophones = [
        word for homophone_set in homophones_list for word in homophone_set
    ]

    all_homophones_counts = dict(zip(all_homophones, np.zeros(len(all_homophones))))

    # Initialize lists for output DF
    final_sentences = []
    has_homophone_list = []
    is_error_list = []
    error_list = []
    error_idx_list = []
    correct_word_list = []
    correct_sentence_list = []

    # Loop through sentence by sentence
    for sentence in tqdm(sentences):
        # Split words
        sentence_words = sentence.split(" ")

        # Get homophones in sentence, if any
        sentence_homophones = [
            (word, idx)
            for idx, word in enumerate(sentence_words)
            if word in all_homophones
        ]

        # If no homophones, then move on
        if len(sentence_homophones) == 0:
            final_sentence = sentence
            is_error = False
            error = None
            error_idx = None
            correct_word = None
            correct_sentence = sentence
            has_homophone = False

        else:
            has_homophone = True

            # Randomly decide if we want to add an error given set probability
            if p > random.random():
                # Original sentence is assumed to be grammatically correct
                correct_sentence = sentence
                is_error = True

                # Randomly select a homophone
                if len(sentence_homophones) > 1:
                    # If there are multiple homophones in the sentence, then randomly select one based on the number of times it appears in the sentence
                    sentence_homophones_counts = [(word, all_homophones_counts[word]) for word, idx in sentence_homophones]

                    # Get the weights for each homophone, giving less weight to homophones that are already in the dataset more
                    epsilon = 1e-10  # a small positive value to avoid division by zero
                    max_count = max(count for word, count in sentence_homophones_counts)
                    weights = [1 - (count / (max_count + epsilon)) for word, count in sentence_homophones_counts]

                    # Make selection
                    homophone_tuple = random.choices(sentence_homophones, weights=weights, k=1)[0]        
                else:
                    homophone_tuple = random.sample(sentence_homophones, 1)[0]

                # Get the homophone
                homophone = homophone_tuple[0]

                # Get the index of the homophone; important in case a sentence contains the same one multiple times (to, for example, could easily appear several times in a sentence)
                error_idx = int(homophone_tuple[1])

                # Add to the homophone count
                all_homophones_counts[homophone] += 1

                # As before, the original sentence is assumed to be grammatically correct
                correct_word = homophone

                # Get the alternate homophones
                homophone_alternates = [
                    word for word in homophones_list if homophone in word
                ][0]
                homophone_alternates = [
                    word for word in homophone_alternates if word != homophone
                ]

                # Randomly select an alternate homophone to insert
                error = random.sample(homophone_alternates, 1)[0]

                # Replace the homophone with the alternate homophone
                sentence_words[error_idx] = error

                final_sentence = " ".join(sentence_words)

            # If we don't want to add an error, then just keep the sentence as is
            else:
                final_sentence = sentence
                is_error = False
                error = None
                error_idx = None
                correct_word = None
                correct_sentence = sentence

        # Append to lists
        final_sentences.append(final_sentence)
        is_error_list.append(is_error)
        error_list.append(error)
        error_idx_list.append(error_idx)
        correct_word_list.append(correct_word)
        correct_sentence_list.append(correct_sentence)
        has_homophone_list.append(has_homophone)

    # Create output DF
    output_df = pd.DataFrame(
        {
            "sentence": final_sentences,
            "has_homophone": has_homophone_list,
            "is_error": is_error_list,
            "error_idx": error_idx_list,
            "error": error_list,
            "correct_word": correct_word_list,
            "correct_sentence": correct_sentence_list,
        }
    )
    return output_df


# Run function
# We have seen that the model is good at avoiding false positives so we want to focus on error corrections
# Therefore will set the probability of adding an error to 0.7 for a high error count
error_df = error_creator(sentences=sentences, homophones_list=homophones_list, p=0.7)
error_df.head()


100%|██████████| 68573/68573 [00:17<00:00, 3866.55it/s]


Unnamed: 0,sentence,has_homophone,is_error,error_idx,error,correct_word,correct_sentence
0,﻿the project gutenberg ebook of frankenstein; ...,True,True,33.0,know,no,﻿the project gutenberg ebook of frankenstein; ...
1,"you may copy it, give it away or re-use it und...",True,False,,,,"you may copy it, give it away or re-use it und..."
2,"if you are not located in the united states,yo...",True,False,,,,"if you are not located in the united states,yo..."
3,html version bye al haines.,True,True,2.0,bye,by,html version by al haines.
4,further corrections bye menno de leeuw.,True,True,2.0,bye,by,further corrections by menno de leeuw.


In [92]:
error_df.shape

(68573, 7)

In [93]:
error_df["has_homophone"].value_counts()

True     56484
False    12089
Name: has_homophone, dtype: int64

In [94]:
error_df["is_error"].value_counts()

True     39446
False    29127
Name: is_error, dtype: int64

## Commonly Replaced Words

In [95]:
error_df["correct_word"].value_counts().head(25)

to       1077
in       1053
you       995
for       972
but       958
be        949
not       933
what      868
all       842
your      832
so        823
me        809
would     769
by        761
no        751
him       750
one       711
or        687
were      678
we        663
been      607
which     606
there     602
more      580
their     579
Name: correct_word, dtype: int64

In [96]:
import plotly.express as px

homophone_counts = error_df["correct_word"].value_counts()

# Sort by count
homophone_counts = homophone_counts.sort_values(ascending=False)
fig = px.histogram(
    error_df,
    x="correct_word",
    title="Distribution of Replaced Homophones",
    width=800,
    height=500,
    category_orders={"correct_word": homophone_counts.index},
)
fig.update_layout(xaxis_title="Homophone", yaxis_title="Count", hovermode="x")
fig.show()


In [97]:
error_df["error"].value_counts().head(25)

inn        1053
butt        958
bee         949
knot        933
watt        868
awl         842
mi          809
wood        769
know        751
hymn        750
two         726
won         711
too         701
whirr       678
bean        607
witch       606
they're     597
maw         580
sea         569
hour        539
no          535
sum         526
fore        519
ewe         499
yew         497
Name: error, dtype: int64

## Collecting only sentences with homophones

In [98]:
# We can also create a dataframe of only the sentences that have homophones
homophone_df = error_df[error_df['has_homophone']].reset_index(drop=True)
homophone_df.head(20)

Unnamed: 0,sentence,has_homophone,is_error,error_idx,error,correct_word,correct_sentence
0,﻿the project gutenberg ebook of frankenstein; ...,True,True,33.0,know,no,﻿the project gutenberg ebook of frankenstein; ...
1,"you may copy it, give it away or re-use it und...",True,False,,,,"you may copy it, give it away or re-use it und..."
2,"if you are not located in the united states,yo...",True,False,,,,"if you are not located in the united states,yo..."
3,html version bye al haines.,True,True,2.0,bye,by,html version by al haines.
4,further corrections bye menno de leeuw.,True,True,2.0,bye,by,further corrections by menno de leeuw.
5,"petersburgh, dec. 11th, 17—.you will rejoice t...",True,False,,,,"petersburgh, dec. 11th, 17—.you will rejoice t..."
6,"i arrived here yesterday, and my first task is...",True,True,53.0,mi,me,"i arrived here yesterday, and my first task is..."
7,do you understand thisfeeling?,True,False,,,,do you understand thisfeeling?
8,"this breeze, which has travelled from the regi...",True,True,13.0,mi,me,"this breeze, which has travelled from the regi..."
9,i try in vain to be persuaded that the pole is...,True,False,,,,i try in vain to be persuaded that the pole is...


## Final Dataframe Shape:

In [99]:
homophone_df.shape

(56484, 7)

In [100]:
# Checking it worked
homophone_df["has_homophone"].value_counts()

True    56484
Name: has_homophone, dtype: int64

## Saving to CSV

In [101]:
homophone_df.to_csv('../data/gutenberg-homophone-errors.csv', index=False)