# Prep some data for training

see:
- https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines

In [1]:
import wikipedia, re, random, json
import pandas as pd
from pathlib import Path

In [2]:
path = Path('data/wikipedia/raw')
path.mkdir(parents=True, exist_ok=True)

&darr; start by downloading wikipedia pages, if we don't have them already

In [3]:
for character in [chr(i) for i in range(65, 91)]:
    if (path/f'misspellings-{character}.txt').is_file():
        print('skipping', character, 'already done')
        continue
    page = wikipedia.page(f'Wikipedia:Lists of common misspellings/{character}')
    print('read', page)
    with open(path/f'misspellings-{character}.txt', 'w', encoding='utf-8-sig') as f:
        f.write(page.content)

skipping A already done
skipping B already done
skipping C already done
skipping D already done
skipping E already done
skipping F already done
skipping G already done
skipping H already done
skipping I already done
skipping J already done
skipping K already done
skipping L already done
skipping M already done
skipping N already done
skipping O already done
skipping P already done
skipping Q already done
skipping R already done
skipping S already done
skipping T already done
skipping U already done
skipping V already done
skipping W already done
skipping X already done
skipping Y already done
skipping Z already done


we need to put a few rules together to parse the wiki pages ...
```
Sahasralinga (Sahasralinga)
San Bernadino (San Bernadino [song], San Bernardino [California])
```

&darr; start by reading the lines we need from all files into a list

In [4]:
lines = []
for character in [chr(i) for i in range(65, 91)]:
    with open(path/f'misspellings-{character}.txt', encoding='utf-8-sig') as f:
        lines.extend(f.readlines()[11:])
lines = [l.strip() for l in lines if l.strip() != '']
print(len(lines))

5983


In [5]:
i = 582 # eyeball check data at the end of page A and start of page B
lines[i-3:i+3]

['awsome (awesome)',
 'awya (away)',
 'axe (ax) (acceptable variant)',
 "babys (babies [plural], baby's [possessive], babys [French plural])",
 'bacame (became)',
 'backpeddle (backpedal)']

In [6]:
def word_filter(first_word): # TODO: DRY
    "keep words that contain only characters and hyphens and are not too short"
    if len(first_word) < 4:
        return False
    if re.match(r'^[-a-zA-Z]+$', first_word) is None:
        return False
    return True
print(len(lines))
lines = [l for l in lines if word_filter(l[:l.index('(')-1])]
print(len(lines), random.sample(lines, 5))

5983
5868 ['oxyen (oxygen)', 'directer (director)', 'soudn (sound)', 'natual (natural)', 'prefered (preferred)']


In [7]:
def string_filter(s):
    if 'acceptable' in s:
        return False
    if 'correct' in s:
        return False
    if 'variant' in s:
        return False
    if 'allows both' in s:
        return False
    if 'false positive' in s:
        return False
    if "?" in s:
        return False
    return True
print(len(lines))
lines = [l for l in lines if string_filter(l)]
print(len(lines), random.sample(lines, 5))

5868
5539 ['imanent (eminent, immanent, imminent)', 'sucesful (successful)', 'frought (fraught)', 'burry (bury [consign to grave], burry [with burs, a burry voice])', 'outputted (output [verb])']


In [8]:
def _clean(s):
    s = s.replace(' [plural]', '')
    s = s.replace(' [singular]', '')
    s = s.replace(' [month]', '')
    s = s.replace(' [English]', '')
    s = s.replace(' [english]', '')
    return s
lines = [_clean(l) for l in lines]

In [9]:
pattern = r'^[-a-zA-Z]+ \([-a-zA-Z]+(?:, [-a-zA-Z]+)*\)$'
i = 0
for line in lines:
    if re.match(pattern, line) is None:
        i += 1
        bad_word = line[ : line.index('(')-1]
        correct_words = line[line.index('(')+1 : -1].split(', ')
        correct_words = [w for w in correct_words if word_filter(w)]
        if len(correct_words) > 0:
            print(bad_word, correct_words)
print(i)

adress ['address']
agregate ['aggregate']
agregation ['aggregation']
agression ['aggression']
allright ['alright']
alot ['allot']
amonth ['among', 'amongst']
arbouretum ['arboretum']
archiac ['archaic']
arival ['arrival']
babys ['babies']
batchs ['batches']
beachs ['beaches']
bodys ['bodies']
branchs ['branches']
calfs ['calves']
centennary ['centenary']
chanel ['channel']
childrens ['children']
coachs ['coaches']
comercial ['commercial']
correspondant ['correspondent', 'corresponding']
councellor ['councillor', 'counsellor']
countrys ['countries']
datas ['data']
determent ['deterrent', 'detriment']
diffamation ['defamation']
emision ['emission']
enterprize ['enterprise']
exemple ['example']
existant ['existent']
familys ['families']
faught ['fought']
feets ['feet']
feild ['field']
florescence ['fluorescence']
florescent ['fluorescent']
floride ['fluoride']
foots ['feet']
forcasted ['forecast']
forecasted ['forecast']
freshmans ['freshmen']
frist ['first']
fromer ['former']
gerat ['gre

In [10]:
data = []
for line in lines:
    bad_word = line[ : line.index('(')-1]
    correct_words = line[line.index('(')+1 : -1].split(', ')
    correct_words = [w for w in correct_words if word_filter(w)]
    for correct_word in correct_words:
        data.append(dict(correct=correct_word, mistake=bad_word))

In [11]:
random.sample(data, 20)

[{'correct': 'accommodation', 'mistake': 'accomodation'},
 {'correct': 'anthropomorphization', 'mistake': 'anthromorphization'},
 {'correct': 'violation', 'mistake': 'volation'},
 {'correct': 'Tuesday', 'mistake': 'Teusday'},
 {'correct': 'neighbors', 'mistake': 'neigbours'},
 {'correct': 'continuous', 'mistake': 'continious'},
 {'correct': 'piece', 'mistake': 'peice'},
 {'correct': 'occasionally', 'mistake': 'ocasionally'},
 {'correct': 'second', 'mistake': 'secound'},
 {'correct': 'permission', 'mistake': 'premission'},
 {'correct': 'sedative', 'mistake': 'sedatative'},
 {'correct': 'neighbour', 'mistake': 'nieghbour'},
 {'correct': 'successor', 'mistake': 'successer'},
 {'correct': 'happened', 'mistake': 'happend'},
 {'correct': 'aircraft', 'mistake': 'aircrafts'},
 {'correct': 'shipped', 'mistake': 'shiped'},
 {'correct': 'dignitary', 'mistake': 'dignatary'},
 {'correct': 'Michigan', 'mistake': 'Michagan'},
 {'correct': 'quindecimvir', 'mistake': 'quindecemvir'},
 {'correct': 'over

```
with open('data/wikipedia/wikipedia_common_misspellings.jsonl', 'w') as f:
    for row in data:
        json.dump(row, f)
        f.write('\n')
```

In [12]:
df1 = pd.DataFrame(data)
df1

Unnamed: 0,correct,mistake
0,aberrant,abberant
1,aberration,abberation
2,abbreviated,abbrieviated
3,abbreviated,abbriviated
4,abbreviation,abbriviation
...,...,...
5796,younger,yonger
5797,Yorkshire,Yorkhire
5798,young,younge
5799,yourself,youself


https://en.wikipedia.org/wiki/Commonly_misspelled_English_words

In [13]:
data = []
for line in """absence - absense, absentse, abcense, absance[3][10]
acceptable - acceptible[4]
accidentally/accidently - accidentaly[4]
accommodate - accomodate, acommodate[3][4]
achieve - acheive[3]
acknowledge - acknowlege, aknowledge[3]
acquaintance - acquaintence, aquaintance[3]
acquire - aquire, adquire[4]
acquit - aquit[4]
acreage - acrage, acerage[3]
address - adress[3]
adultery - adultary[3]
advisable - adviseable, advizable[3]
affect - effect[3] (both words exist, but are distinct)
aggression - agression[1]
aggressive - agressive[1]
allegiance - allegaince, allegience, alegiance[3]
almost - allmost[3]
a lot - alot (must be two words), allot[4]
amateur - amatuer, amature[4]
annually - anually, annualy[3]
apparent - apparant, aparent, apparrent, aparrent[4]
arctic - artic[3]
argument - arguement[1][4]
atheist - athiest, athist[3][4]
awful - awfull, aweful[3]
because - becuase, becasue[3]
beautiful - beatiful[3]
becoming - becomeing[3]
beginning - begining[3]
believe - beleive[4]
bellwether - bellweather[3][4]
benefit - benifit[3]
buoy - bouy[3]
buoyant - bouyant[3]
business - buisness[1]
C–D
calendar - calender[3][4]
camouflage - camoflage, camoflague[3]
capitol - capital[3] (both words exist, but are distinct)
Caribbean - Carribean[3]
category - catagory[3][4]
caught - cauhgt, caugt[3]
cemetery - cemetary,[1] cematery[3]
changeable - changable[3][4]
chief - cheif[3]
colleague - collaegue, collegue[3]
column - colum[4]
coming - comming[3]
committed - commited, comitted[3][4]
comparison - comparsion
concede - conceed[3]
congratulate - congradulate[3]
conscientious - consciencious[3][4]
conscious - concious, consious[4]
consensus - concensus[1][3][4]
controversy - contraversy[1]
coolly - cooly[3]
daiquiri - dacquiri, daquiri[4]
deceive - decieve[1][3]
definite - definate,[1] definit[4]
definitely - definitly,[4] definately, definatly, defiantly
desperate - desparate[1][3]
difference - diffrence[3]
dilemma - dilema[3]
disappoint - dissapoint[1]
disastrous - disasterous[3]
drunkenness - drunkeness[4]
dumbbell - dumbell[4]
E–H
embarrass - embarass[1][4]
equipment - equiptment (wrong in numerous webpages)[4]
exceed - excede[4]
exhilarate - exilerate[4]
existence - existance[4]
experience - experiance[4]
extreme - extreem[1]
fascinating - facinating[1]
fiery - firey[4]
fluorescent - flourescent[1]
foreign - foriegn[4]
friend - freind[1]
fulfil - fullfil (American: fulfill)[1]
gauge - guage[1][4]
grateful - gratefull, greatful[1][4]
great - grate, grat[1][4]
guarantee - garantee, garentee, garanty[1][4][10]
guidance - guidence[10]
harass - harrass[1][4]
height - heighth, heigth[4]
hierarchy - heirarchy[4]
hors d'oeuvres - hors derves, ordeurves[3]
humorous - humerous[4]
hygiene - hygene, hygine, hiygeine, higeine, hygeine[3]
hypocrisy/hypocrite - hipocrit[1][3]
I–K
ignorance - ignorence[4]
imitate - immitate[3]
immediately - imediately[1][4]
indict - indite[4]
independent - independant[4][7]
indispensable - indispensible[4]
inoculate - innoculate[4]
intelligence - inteligence, intelligance[4]
jewelry (US)/jewellery (UK) - jewelery[4]
judgment - judgement (only a misspelling in the U.S.)[4]
kernel - kernal (distinct from homophone "colonel")[4]
L–O
leisure - liesure[4]
liaison - liason[1][4]
library - libary, liberry[4]
license - lisence[4] (US always license, UK noun licence)[1]
lightning - lightening[4]
lose - loose[11]
maintenance - maintainance, maintnance[1]
marshmallow - marshmellow[1][4]
medieval - medeval, medevil, mideval[4]
memento - momento[4]
millennium - millenium, milennium[1][4]
miniature - miniture[1][4]
minuscule - miniscule[1][4]
mischievous - mischievious, mischevous, mischevious (The spelling "mischievious" and the corresponding pronunciation are still considered non-standard despite being current and existing since at least the 16th century.)[4][12]
misspell - mispell, misspel[1][4]
necessary - neccessary, necessery[1]
niece - neice[1]
neighbour - nieghbor[4]
noticeable - noticable[4]
occasion - occassion[1]
occasionally - occasionaly, occassionally[4]
occurrence - occurrance, occurence[4]
occurred - occured[1]
omission - ommision, omision[1]
original - orignal[1]
outrageous - outragous[1]
P–Q
parliament - parliment[1]
pastime - passtime, pasttime[4]
perceive - percieve[1]
perseverance - perseverence[4]
personnel - personell, personel[4]
plagiarize - plagerize[3]
playwright - playright, playwrite[4]
possession - posession, possesion[1][4]
potatoes - potatos[1]
precede - preceed[4]
presence - presance[1]
principle - principal[4]
privilege - privelege, priviledge[1][4]
professor - professer[3]
protester - protestor[13]
promise - promiss[10]
pronunciation - pronounciation[4]
proof - prufe[10]
prophecy (as noun) - prophesy (valid as verb)[3]
publicly - publically[4]
quarantine - quarentine[3]
queue - que[3]
questionnaire - questionaire, questionnair[1]
R–S
readable - readible[8]
really - realy[1]
receive - recieve[1][4]
receipt - reciept[4]
recommend - recomend, reccommend[1][4]
referred - refered[4]
reference - referance, refrence[4]
relevant - relevent, revelant[3][4]
religious - religous, religius[10]
repetition - repitition[10]
restaurant - restarant, restaraunt[4]
rhyme - rime[4]
rhythm - rythm, rythem[1][4]
secretary - secratary, secretery[1]
seize - sieze[1]
separate - seperate[1][4]
sergeant - sargent[4]
similar - similer[1]
skilful - skilfull (American: skillful)[1]
speech - speach, speeche (archaic)[10]
successful - succesful, successfull, sucessful[1]
supersede - supercede[4]
surprise - suprise, surprize[1]
T–Z
than - then[14]
their - there, they're[4]
tomatoes - tomatos[1]
tomorrow - tommorow, tommorrow[1]
twelfth - twelth[4]
tyranny - tyrany[4]
underrate - underate[4]
until - untill[4]
upholstery - upholstry[4]
usable/useable - usible[9]
vacuum - vaccuum, vaccum, vacume[4]
vehicle - vehical[1]
vicious - visious[1]
weather - wether, whether[4]
weird - wierd[1][4]
welfare - wellfare, welfair[3]
whether - wether (a wether is a castrated ram) [3]
wilful - wilfull (American: willful)[1]
withhold - withold[1]
writing - writting, writeing[10]""".split('\n'):
    line = re.sub(r'\[\d+\]', '', line)
    line = re.sub(r'\(\d+\)', '', line)
    if ' - ' not in line:
        continue
    correct_word, bad_words = line.split(' - ')
    if not word_filter(correct_word):
        continue
    bad_words = bad_words.split(', ')
    bad_words = [w for w in bad_words if word_filter(w)]
    for bad_word in bad_words:
        data.append(dict(correct=correct_word, mistake=bad_word))

In [14]:
df2 = pd.DataFrame(data)
df2

Unnamed: 0,correct,mistake
0,absence,absense
1,absence,absentse
2,absence,abcense
3,absence,absance
4,acceptable,acceptible
...,...,...
246,welfare,wellfare
247,welfare,welfair
248,withhold,withold
249,writing,writting


In [15]:
df = pd.concat([df1, df2])
for c in df.columns:
    df[c] = df[c].str.upper()
df = df.drop_duplicates().sort_values(['correct', 'mistake'])
df

Unnamed: 0,correct,mistake
12,ABANDON,ABONDON
13,ABANDONED,ABONDONED
15,ABBREVIATE,ABREVIATE
2,ABBREVIATED,ABBRIEVIATED
3,ABBREVIATED,ABBRIVIATED
...,...,...
5796,YOUNGER,YONGER
5800,YOUNGER,YUONGER
5799,YOURSELF,YOUSELF
4711,ZIONIST,SIONIST


In [16]:
df.to_csv('data/wikipedia/wikipedia_common_misspellings.csv', index=False)