### Spell Checker Notebook

Builds a frequencty dictionary tailored to corpus.
Builds a string to string lookup dictionary to speed up spellchecking.

Disclaimer:  
Spell corrections are done on a best effort basis.  
Since words are taken out of context, corrections may be wrong.  
Warning, some of the corrections found are explicit words.  
These corrections are not intended to offend the reader.  
Please pardon mistaken corrections as consequence of algorithmic generalization.

In [1]:
import os
import sys
import regex as re
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from collections import defaultdict
import pickle
from spellhelper import Spellhelper
import time

#### Text Cleaning Routines

In [2]:
def CleanNumeric(token):
    '''
    input numeric token
    clean the token
    return simplified token list
    '''
    n_map = {ord(c): " " for c in "0123456789"}
    alphas = token.translate(n_map)
    toks = ['number'] + nltk.word_tokenize(alphas)
    return toks

In [3]:
def CleanSymbol(token):
    '''
    input symbolic token
    clean the token
    return simplified string
    '''
    #simplify paths
    if (token.count('/')>2)|(token[:2]=='//')|(token[:4]=='www.'):
        clean_tok = ['path']
    else:
        # remove special characters
        sc_map = {ord(c): " " for c in \
                  "!@#$£¥¢©®™§†±≠%^&*()[]{};:,‚./<>?\|¨´'`~-=_+¬∞µ•√∏∫°áàâæëéîñöōüû"}
        clear_sym = token.translate(sc_map)
        ctoks = nltk.word_tokenize(clear_sym)
        clean_tok = []
        for tok in ctoks:
            if not bool(re.search(r'\d', tok)):
                clean_tok.append(tok)
            # process numericals
            else:
                clean_tok += CleanNumeric(tok)
    return clean_tok

In [4]:
def PreClean(token):
    '''
    sort token
    preclean by type
    return cleaned token list
    '''
    if bool(re.match("^[a-zA-Z0-9]*$", token)):
        if bool(re.search(r'\d', token)):
            clean = CleanNumeric(token)
        else:
            clean = [token]
    else:
        clean = CleanSymbol(token)
    return clean  

#### Data Loading Routines

In [5]:
def LoadCorpus(file):
    '''
    Load paragraphs into dataframe
    Convert paragraphs to tokens
    Return counter and tokens
    '''
    # Load Corpus of Paragraphs
    df_mess = pd.read_csv(file)

    # put all messages into a single string
    TEXT=""
    for i,nrows in df_mess.iterrows():
        TEXT += (nrows['message'])
        
    # tokenize the text string
    raw_tokens = nltk.word_tokenize(TEXT.lower())
    
    # pre-clean the tokens
    pc_tokens = []
    for token in raw_tokens:
        pc_tokens += PreClean(token)
    
    #build counter of corpus tokens
    C = nltk.FreqDist(pc_tokens)
    
    return C, pc_tokens

#### Processing Routines

In [6]:
def BuildSCdict(speller, C_corpus):
    '''
    import speller function and Corpus counter
    build a spell correction dictionary
    return spell correction dictionary and rejects list
    '''
    scdict = defaultdict(str)
    sp_rejects = []
    for token in sorted(C_corpus.keys()):
        if token in speller.counter:
            scdict[token] = token
        else:
            sc = speller.spellcheck(token)
            if sc != token:
                scdict[token] = sc
                print(token, ' -> ', sc)
            else:
                sp_rejects.append(token)
                
    return scdict, sp_rejects

In [7]:
def ApplyCorr(string_dict, tokens):
    '''
    import string dictionary
    import list of tokens to correct
    compiled corrected version
    render from token list to string and back
    return list of corrected tokens
    '''
    # input string corrections into list
    corr_strings = []
    for token in tokens:
        if token in string_dict:
            corr_strings.append(string_dict[token])
        else:
            corr_strings.append(token)
            
    # convert list to string then into tokens    
    big_string = ' '.join(corr_strings)
    corr_tokens = nltk.word_tokenize(big_string)
    
    return corr_tokens

In [8]:
def FindTypos(speller, C_corpus):
    '''
    import speller function and Corpus counter
    build an string lookup dictionary of correctly spelled tokens
    return string lookup dictionary and typos list
    '''
    scdict = defaultdict(str)
    sp_typos = []
    for token in sorted(C_corpus.keys()):
        # spelling okay
        if token in speller.counter:
            scdict[token] = token
        # typos
        else:
            sp_typos.append(token)
                
    return sp_typos, scdict

In [9]:
def SpellSeg(speller, token):
    '''
    import spelling function
    import token
    attempt spelling correction and score
    attempt segmentation correction and score
    compare scores and select winning score as correction
    return corrected string and score
    '''
    # spell check
    spell_cor = speller.spellcheck(token)
    if spell_cor != token:
        sc_score = speller.counter[spell_cor]
    else:
        sc_score = 0

    # seg check
    segs = speller.segcheck(token)
    valid = len(''.join(segs))/len(segs)
    if valid > 1.4:
        best = [t for t in segs if len(t)>1]
        seg_cor = ' '.join(best)
        scores = [speller.counter[t] for t in best]
        sg_score = np.mean(scores)
    else:
        sg_score = 0
        
    # flunked out of being corrected
    if (sc_score==0) & (sg_score==0):
        bstring = token
        score = 0
        
    # correction found
    else:
        if sc_score > sg_score:
            bstring = spell_cor
            score = sc_score
        else:
            bstring = seg_cor
            score = sg_score
    
    return bstring, score
    

In [11]:
def FirstPass(speller, string_dict, typos_list):
    '''
    import speller function 
    import string lookup dictionary
    import typo token list
    attempt spelling correction and score
    attempt segmentation correction and score
    compare scores and select winning score as correction
    add best effort correction to string dictionary
    return dictionary and flunked token list
    '''
    flunk_list = []
    for token in typos_list:
        # attempt correction
        bstring, score = SpellSeg(speller, token)
        # flunkies
        if score == 0:
            flunk_list.append(token)
        else:
            # update correction
            string_dict[token] = bstring
            for seg in bstring.split():
                if seg not in string_dict:
                    string_dict[seg] = seg
            print(token, ' -> ', bstring)

    return string_dict, flunk_list

In [24]:
def SecondPass(speller, string_dict, flunk_list):
    '''
    import speller function
    import string dictionary
    import flunked token list
    iterate through flunked texts
    split off probable noise
    permutate splitting within text 
    limit splits out to 30 characters
    attempt corrections on split sides
    recombine correction results
    adopt optimal split result for text
    return dictionary and noise
    '''
    # combine split with segmenting
    noise = []
    for text in flunk_list:
        # assume shorts are noise
        if len(text) < 8:
            noise.append(text)
        # attempt to find best correction pair
        else:
            # noise assumption as base case
            maxscore = 0
            maxres = text
            # slice text and attempt fix
            split_range = min(30, len(text))
            split_min = int(split_range*0.4)
            split_max = int(split_range*0.6)+1
            for i in range(split_min,split_max):
                # split
                text_left = text[:i]
                text_right = text[i:]
                # attempt corrections
                tl_string, tl_score = SpellSeg(speller, text_left)
                tr_string, tr_score = SpellSeg(speller, text_right)
                split_result = ''
                # recombine
                if tl_score > 0:
                    split_result += tl_string + ' '
                if tr_score > 0:
                    split_result += tr_string
                score = tl_score + tr_score
                # looking for the optimal scoring split pair
                if score > maxscore:
                    maxscore = score
                    maxres = split_result.strip()
            # process optimal split pair result
            if maxres == text:
                noise.append(text)
            else:
                print(text, ' -> ', maxres)
                for t in maxres.split():
                    if t not in string_dict:
                        string_dict[t] = t
                string_dict[text] = maxres
       
    return string_dict, noise

#### Main Routine

In [14]:
# Load Data

# Instantiate Spelling tool
speller = Spellhelper()

# big text corpus
C_corpus, corpus_tokens = LoadCorpus('messages.csv')

# estimate number of unique spelling errors
sp_errors = len([t for t in C_corpus if t not in speller.counter])

In [15]:
# Process new words

# new words in corpus but maybe not in english dictionary
new_words = ['ayuda', 'bbc', 'cbs', 'center', 'centered', 
             'cyber', 'debris', 'donde', 'euro', 'fecal', 
             'feces', 'fema', 'foxnews', 'franken', 'fyi', 
             'giardia', 'gmo', 'google', 'gps', 'gui', 
             'haiti', 'http', 'https', 'hungry', 'instagram',
             'mbc', 'meds', 'msnbc', 'nbc', 'nyc', 'omg', 'ppe', 
             'redcross', 'reiki', 'rescue', 'sandy', 'scary', 
             'skyfm', 'skynews', 'sulfate', 'sulfide', 'sulfur', 
             'tele', 'terre', 'tumblr', 'tweeting', 'tweets', 
             'twitter', 'ucla', 'unicef', 'vegan', 'volcano', 
             'wikipedia','wtf']

# Update speller's frequency dictionary
speller.addwords(C_corpus, new_words)

New words added to frequency dictionary.


In [16]:
# Process spell checking

# find typos
sp_typos, scdict = FindTypos(speller, C_corpus)

# attempt first pass corrections
sc_dict, flunk_list = FirstPass(speller, scdict, sp_typos)

# apply corrections
sc_tokens = ApplyCorr(scdict, corpus_tokens)

#build counter of corpus tokens
C_spell = nltk.FreqDist(sc_tokens)

# update speller frequency dict with corpus words
speller.updatefreq(C_spell)

aa  ->  a
aaa  ->  aka
aab  ->  lab
aabad  ->  bad
aabadbecause  ->  bad because
aabout  ->  about
aaca  ->  abaca
aacf  ->  back
aahar  ->  avatar
aai  ->  ali
aaj  ->  adj
aap  ->  map
aar  ->  car
aarthik  ->  parthia
ab  ->  a
ababa  ->  abba
abad  ->  bad
abain  ->  aba in
abala  ->  bal
abandomns  ->  abandons
abandonned  ->  abandoned
abanonned  ->  ban on ned
abaout  ->  about
abass  ->  bass
abattre  ->  batt re
abble  ->  able
abbotabad  ->  abbot bad
abbottabad  ->  abbott bad
abbyloves  ->  abby loves
abbymaroon  ->  abby maroon
abc  ->  acc
abcb  ->  abb
abclivingfloods  ->  living floods
abdi  ->  audi
abdirahman  ->  rah man
abdoul  ->  abdul
abdrahmanov  ->  rah nov
abdulla  ->  abdul la
abdulnasser  ->  abdul nasser
abdulsalami  ->  abdul salami
abebe  ->  be be
abeche  ->  be che
aberdares  ->  dares
abhas  ->  has
abhi  ->  abri
abi  ->  ali
abid  ->  bid
abidin  ->  bid in
abitation  ->  bit at on
abli  ->  able
abo  ->  ago
abomey  ->  above
abondoned  ->  bon do n

allall  ->  all all
allarecommon  ->  all are common
alle  ->  all
aller  ->  allen
allhadino  ->  all had no
alli  ->  all
allieonly  ->  allie only
alll  ->  all
allo  ->  all
allready  ->  all ready
allright  ->  all right
allsitting  ->  all sitting
allthis  ->  all this
allways  ->  all ways
allwhile  ->  all while
almo  ->  also
alo  ->  all
alongwith  ->  along with
alor  ->  al or
alos  ->  also
alot  ->  lot
alpraiwhat  ->  what
alraedy  ->  already
alrai  ->  alai
alreadi  ->  al read
alreadyames  ->  already mes
alreadystuck  ->  already stuck
alrededores  ->  al re de do res
alredy  ->  already
alresdy  ->  already
als  ->  as
alsoplease  ->  also please
alsothere  ->  also there
alsowe  ->  also we
alstom  ->  tom
altay  ->  alta
alternet  ->  internet
altidor  ->  alt id or
altit  ->  alt it
aluminum  ->  aluminium
alur  ->  blur
alway  ->  al way
alyans  ->  ans
alyas  ->  alias
alysonthbpinero  ->  pinero
alzamiento  ->  mien to
amachu  ->  apache
amad  ->  mad
amal  ->

artistsforpeaceandjustice  ->  artists for peace and justice
arusha  ->  rush
arv  ->  are
arwe  ->  are
asadho  ->  amado
asapi  ->  asap
asapplease  ->  asap please
asaude  ->  as aude
asbury  ->  as bury
asean  ->  asian
asg  ->  as
ashaa  ->  asian
ashelter  ->  shelter
asherif  ->  as her if
ashfall  ->  shall
ashong  ->  as hong
ashraf  ->  ashram
ashwal  ->  ash wal
asi  ->  as
asiamonet  ->  as monet
asice  ->  as ice
asistance  ->  as stance
askar  ->  asker
askes  ->  asked
askin  ->  ask in
askingrt  ->  asking
asma  ->  asia
asmaile  ->  smile
asman  ->  as man
asmial  ->  as al
asnlf  ->  self
aspeger  ->  asp eger
asphalting  ->  asp halting
aspx  ->  asp
assaba  ->  as saba
assemblue  ->  as sem blue
assesssments  ->  assessments
asshole  ->  ass hole
assiatncelaboule  ->  la boule
assiciation  ->  as sic at on
assisstance  ->  ass is stance
assistancei  ->  assistance
assistances  ->  assistance
assistive  ->  assisting
assistnce  ->  assistance
assitance  ->  assistanc

batterythe  ->  battery the
battlezone  ->  battle zone
battlin  ->  batt lin
bauchi  ->  chi
baucks  ->  bucks
bawe  ->  base
baygon  ->  dayton
bayilod  ->  baylor
bayingolin  ->  bay in go lin
bayled  ->  bay led
baylod  ->  baylor
baylode  ->  bay lode
bayongbong  ->  yong bong
bazars  ->  bazar
bb  ->  by
bbcbreaking  ->  bbc breaking
bbcworld  ->  bbc world
bbm  ->  bbc
bbq  ->  bbc
bbut  ->  but
bbwcandle  ->  candle
bc  ->  by
bca  ->  bra
bcas  ->  bias
bcs  ->  bus
bcten  ->  been
bcuz  ->  but
bcworld  ->  world
bdpumpkin  ->  pumpkin
bdrcs  ->  bars
bds  ->  ads
beacause  ->  be cause
beachgoers  ->  beach goers
beachline  ->  beach line
beachs  ->  be chs
beamups  ->  be am ups
bearthquake  ->  earthquake
beatwind  ->  be at wind
beatz  ->  beat
beauduy  ->  beauty
beaulieu  ->  beau lieu
beautifuldream  ->  beautiful dream
beautifuls  ->  beautiful
beb  ->  be
bec  ->  be
becasue  ->  because
becaue  ->  because
becaus  ->  be caus
beceause  ->  because
becquerels  ->  be

bossaso  ->  boss so
bosua  ->  bosun
boten  ->  ten
bouba  ->  bob
boucan  ->  can
bouchereau  ->  boucher eau
boudon  ->  oud on
boukan  ->  bhutan
boukano  ->  kano
boukman  ->  bookman
boul  ->  soul
boulveard  ->  boulevard
bouquests  ->  bouquets
bouquetairport  ->  bouquet airport
bouquetsare  ->  bouquets are
bouquetshello  ->  bouquets hello
bouquetswe  ->  bouquets we
bouquin  ->  bouquet
bourate  ->  bour ate
bourdonneed  ->  bourdon need
bourem  ->  bored
bourn  ->  born
bourry  ->  blurry
bourss  ->  bourse
bouta  ->  bout
bouy  ->  buy
boweryelectric  ->  bowery electric
bowles  ->  bow les
bowser  ->  browser
bowsering  ->  ring
bowsers  ->  browsers
bowzers  ->  bowers
boxesanything  ->  boxes anything
boyang  ->  yang
boycottpart  ->  boycott part
bp  ->  by
bpbd  ->  baby
bph  ->  mph
bpl  ->  bel
bprhhaiti  ->  haiti
bqe  ->  be
br  ->  by
brabalance  ->  bra balance
brac  ->  brad
brachi  ->  brach
brack  ->  back
brahimi  ->  bra him
brahmani  ->  brahman
brakna  -

cazeauxplease  ->  aux please
cbm  ->  com
cbo  ->  co
cbsnews  ->  cbs news
cbsnewyork  ->  cbs new york
cc  ->  cd
cca  ->  cha
ccafs  ->  class
cccm  ->  com
cccs  ->  cars
ccd  ->  cd
ccdb  ->  code
ccf  ->  of
ccfsc  ->  csc
ccg  ->  cog
cclock  ->  clock
cclosure  ->  closure
ccm  ->  com
ccmc  ->  come
ccordero  ->  corder
ccp  ->  cup
ccrif  ->  chris
ccritelli  ->  crit ell
ccs  ->  pcs
ccu  ->  ecu
ccvi  ->  xcvi
cdap  ->  cap
cdc  ->  cd
cdmo  ->  como
cdnpolidamn  ->  pol damn
cdpc  ->  cd
cds  ->  cd
cdti  ->  city
cdv  ->  cd
ce  ->  be
cec  ->  dec
ceci  ->  cecil
cecph  ->  tech
cedarbeach  ->  cedar beach
cedaw  ->  cedar
cedep  ->  dep
cedhep  ->  hep
cedor  ->  cedar
ceisme  ->  is me
cel  ->  cell
celcius  ->  celsius
celiac  ->  celia
celiaccan  ->  eli acc an
cellhurricane  ->  cell hurricane
cenor  ->  cen or
cente  ->  center
centers  ->  center
centimeters  ->  cent meters
centralpark  ->  central park
centrebegging  ->  centre begging
centrei  ->  centre
centr

clersine  ->  sine
cli  ->  clip
clientsfontamara  ->  clients font mar
clientsgood  ->  clients good
cliffside  ->  cliff side
cliffsidepark  ->  cliff side park
climatec  ->  climate
climatethere  ->  climate there
climatical  ->  climatic al
clini  ->  clinic
clobberedwe  ->  clobbered we
clockand  ->  clock and
closedplace  ->  closed place
clostridial  ->  clos rid al
clothesnyc  ->  clothes nyc
clotheswe  ->  clothes we
clothingi  ->  clothing
clothingnon  ->  clothing non
clothingsupposed  ->  clothing supposed
clothse  ->  clothes
cloudbursts  ->  cloud bursts
cloudporn  ->  cloud porn
clubzone  ->  club zone
cm  ->  pm
cma  ->  cam
cmaps  ->  maps
cmc  ->  csc
cmd  ->  cd
cmmb  ->  comb
cmonson  ->  mon son
cmt  ->  cut
cn  ->  in
cnn  ->  can
cnnhour  ->  hour
cnnireport  ->  report
cnnmagnitude  ->  magnitude
cnnmd  ->  canned
cnnmuch  ->  much
cnnrt  ->  cart
cnrd  ->  card
cnt  ->  cut
coalitionboth  ->  coalition both
coatswould  ->  coats would
cobblehill  ->  cobble hil

craziestbabeeva  ->  craziest babe eva
crazyabsolute  ->  crazy absolute
crazyhttp  ->  crazy http
crazyrt  ->  crazy
crazywakeningaimee  ->  crazy wakening aimee
crazyyy  ->  crazy
crc  ->  arc
creakdear  ->  creak dear
crecent  ->  recent
creeking  ->  cree king
cref  ->  crew
crenis  ->  credit
creol  ->  creole
crer  ->  crew
crescentmovement  ->  crescent movement
crespin  ->  resp in
crhfp  ->  chip
crimbled  ->  crim bled
criss  ->  cross
cristo  ->  is to
cristof  ->  bristol
cristoseems  ->  is to seems
crm  ->  com
crochu  ->  crochet
crois  ->  cross
croix  ->  cross
croked  ->  cooked
croks  ->  cross
cropdevouring  ->  crop devouring
cropstoday  ->  crops today
crossfifty  ->  cross fifty
crossfires  ->  cross fires
crossgetting  ->  cross getting
crossi  ->  cross
crowdfunding  ->  crowd funding
crownheights  ->  crown heights
crozes  ->  crores
crpf  ->  copy
crs  ->  cars
crte  ->  cute
crudess  ->  crudest
crushedall  ->  crushed all
crutchfield  ->  crutch field
cruzi

despues  ->  disputes
dessaline  ->  saline
dessie  ->  jessie
dessource  ->  source
dest  ->  best
destitutes  ->  de st it utes
destra  ->  de str
destroid  ->  detroit
destroyedsend  ->  destroyed send
destroyedtwelve  ->  destroyed twelve
destroyedwe  ->  destroyed we
destructe  ->  destruct
destryed  ->  destroyed
detergentpasta  ->  detergent past
detroyed  ->  destroyed
detuit  ->  de tui
deudap  ->  dap
deutsche  ->  de uts che
devacuation  ->  evacuation
devastatingthinking  ->  devastating thinking
devastationfunsize  ->  devastation fun size
devastations  ->  de vast at ions
devasted  ->  de vas ted
devasting  ->  de vas ting
deve  ->  dave
developersoverall  ->  developers overall
developped  ->  developed
developpement  ->  development
devestated  ->  de vest ted
devestation  ->  de vest at on
devistatedgive  ->  devi stated give
devlopman  ->  lop man
devloppement  ->  ppe men
devwatching  ->  watching
dewa  ->  dew
dewatering  ->  de watering
deworming  ->  de worming
de

dtpfqhaiti  ->  haiti
dtudents  ->  students
dtus  ->  dts
du  ->  do
duao  ->  dual
duc  ->  dec
duchessdw  ->  duchess
ducoman  ->  com an
ducoste  ->  co ste
duesome  ->  due some
duffort  ->  effort
dufka  ->  duke
dufort  ->  effort
dufour  ->  four
duga  ->  dug
dugs  ->  drugs
dugwells  ->  dug wells
dukuh  ->  dutch
dum  ->  due
dumay  ->  may
dumbonyc  ->  dumbo nyc
dume  ->  due
dumeuse  ->  me use
dumga  ->  duma
dumilso  ->  mil so
dumptrucks  ->  dump trucks
dundgobi  ->  gobi
dungu  ->  dung
dupin  ->  dup in
duplicative  ->  duplicate
duppata  ->  dup pat
dupree  ->  degree
dupuy  ->  duppy
dura  ->  dora
durenan  ->  re nan
durind  ->  during
durinfg  ->  during
durrani  ->  dur ran
durries  ->  curries
durring  ->  during
dussouri  ->  us sour
duststorm  ->  dust storm
dutier  ->  duties
dutta  ->  outta
duval  ->  dual
duvivier  ->  duvalier
dvb  ->  dub
dvd  ->  did
dw  ->  do
dwasimen  ->  was men
dwe  ->  we
dwn  ->  own
dy  ->  by
dyeg  ->  dye
dyewo  ->  diego
dy

epasam  ->  pas am
epi  ->  dpi
epicenter  ->  epic enter
epicentered  ->  epic entered
epicenters  ->  epic enters
epicentred  ->  centred
epicentres  ->  centres
epidemicity  ->  dem city
epidemiologic  ->  logic
eplanatory  ->  plan tory
epon  ->  upon
epuration  ->  education
eq  ->  seq
eqecat  ->  cat
equateur  ->  qua eur
equatoria  ->  equatorial
equipement  ->  equipment
equipmenteverywhere  ->  equipment everywhere
equipos  ->  equips
er  ->  or
eramybailey  ->  era my bailey
erat  ->  eat
erathquake  ->  earthquake
erathquakehow  ->  rath quake how
erati  ->  era
eravur  ->  eraser
erc  ->  etc
erct  ->  erect
erdf  ->  end
erez  ->  perez
ergoclothing  ->  ergo clothing
erice  ->  price
erigavo  ->  rig avo
ering  ->  bring
erinhaiti  ->  erin haiti
erm  ->  term
ernandopaulsen  ->  nan do paul sen
eropean  ->  rope an
erosionfurthermore  ->  erosion furthermore
erosions  ->  erosion
erqin  ->  erin
erra  ->  era
erruption  ->  err up on
ers  ->  res
ersama  ->  sam
ert  ->

fillenet  ->  fillet
filmakers  ->  makers
filtu  ->  filth
finde  ->  find
finf  ->  find
fing  ->  find
finishedhello  ->  finished hello
finishwe  ->  finish we
finsih  ->  finish
fireandrescue  ->  fire and rescue
firefighterproblems  ->  firefighter problems
firehose  ->  fire hose
firma  ->  firm
fisherfolk  ->  fisher folk
fishsculptor  ->  fish sculptor
fisire  ->  is re
fiveth  ->  five
fixxxxxx  ->  xxx xxx
fiza  ->  liza
fkeobquake  ->  quake
fl  ->  al
flamengos  ->  flamingos
flamin  ->  flam in
flareup  ->  flare up
flashflood  ->  flash flood
flashfloods  ->  flash floods
flashlighted  ->  flash lighted
flatbush  ->  flat bush
flatfooted  ->  flat footed
flava  ->  lava
flaves  ->  slaves
flavio  ->  flavin
fleegerian  ->  flee ger an
fleur  ->  eur
fleurs  ->  fleury
flickr  ->  flick
flightsthe  ->  flights the
flightswhich  ->  flights which
flodding  ->  flooding
flomboyant  ->  flo boy ant
floodi  ->  flood
floodline  ->  flood line
floodmost  ->  flood most
floodpr

gangori  ->  gang or
ganjam  ->  gan jam
ganna  ->  anna
ganot  ->  not
gansu  ->  ginsu
ganthier  ->  gather
gantiwarno  ->  gan war no
gantye  ->  gate
gantyehello  ->  hello
gaoled  ->  gao led
gaoon  ->  gao on
gaove  ->  gave
garaina  ->  gar in
garantir  ->  guarantor
garbages  ->  garb ages
garbatulla  ->  garb tull
garbledwhat  ->  garbled what
garde  ->  gar de
gardez  ->  garden
gare  ->  are
garh  ->  gary
garhi  ->  gary
garhmy  ->  army
gari  ->  gary
garisa  ->  gar is
garissa  ->  marissa
garlat  ->  great
garm  ->  farm
garoute  ->  route
garowe  ->  grow
garre  ->  gar re
gart  ->  part
gaskoy  ->  gasket
gasline  ->  gas line
gaslines  ->  gas lines
gaston  ->  gas ton
gastro  ->  castro
gatkoto  ->  gat koto
gattuso  ->  attu so
gau  ->  gay
gauchar  ->  char
gaudioso  ->  audio so
gavi  ->  gave
gayak  ->  kayak
gaybarbiesworld  ->  gay barbies world
gaykeys  ->  gay keys
gaylard  ->  gay lard
gazoline  ->  azo line
gb  ->  go
gbaby  ->  baby
gbv  ->  gov
gd  ->  go

gudliye  ->  godlike
guedon  ->  don
guera  ->  era
guerda  ->  guerra
guerin  ->  erin
guerissa  ->  guerilla
guerrier  ->  terrier
guet  ->  get
guibert  ->  gilbert
guij  ->  guin
guilherme  ->  her me
guilin  ->  dublin
guinand  ->  guin and
guindo  ->  guin do
guiton  ->  gui ton
guitton  ->  gui ton
guizhou  ->  suzhou
guji  ->  fuji
gujrat  ->  rat
gujurat  ->  gujarat
gul  ->  jul
gulab  ->  lab
gulberg  ->  gilbert
gulistan  ->  list an
gulleys  ->  pulleys
gulran  ->  ran
gulu  ->  gulf
gumboro  ->  gumbo
gumti  ->  multi
gunatillake  ->  gun til lake
gunma  ->  gun
gunna  ->  gonna
gunnaa  ->  gunnar
gunsmoke  ->  gun smoke
guntur  ->  gunter
gunung  ->  tuning
guolin  ->  dublin
gurjaani  ->  guarani
gurrrl  ->  guerra
gurudwaras  ->  war as
gushta  ->  gupta
gusmao  ->  gus mao
gusta  ->  gupta
gustan  ->  gust an
gutian  ->  gut an
gutro  ->  auto
guwahati  ->  gujarati
guyane  ->  guy ane
guyon  ->  guy on
guzaresh  ->  resh
gv  ->  go
gves  ->  goes
gvn  ->  gun
gvtpeop

hfh  ->  huh
hfyhaiti  ->  haiti
hgcl  ->  hill
hgemoprayers  ->  emo prayers
hgmfrantic  ->  mfr antic
hh  ->  he
hhaitien  ->  haitian
hhas  ->  has
hhelp  ->  help
hhs  ->  has
hi  ->  i
hiati  ->  haiti
hiddenhoboken  ->  hidden hoboken
hideoutwe  ->  hide out we
hien  ->  hen
highlevel  ->  high level
highrises  ->  high rises
hightide  ->  high tide
hights  ->  rights
higos  ->  highs
hije  ->  hire
hilaire  ->  hila re
hilequakenews  ->  quake news
hilft  ->  hilt
hillah  ->  hill
hillaire  ->  hill re
hillslopes  ->  hill slopes
hillsongnyc  ->  hill song nyc
hilsa  ->  hilda
himachal  ->  mac hal
himath  ->  heath
himpeople  ->  him people
himselfi  ->  himself
hinche  ->  since
hingol  ->  bingo
hipolito  ->  pol to
hiran  ->  iran
hirat  ->  rat
hiroshi  ->  hiroshima
hirota  ->  rota
hisarak  ->  his arak
hited  ->  cited
hithadhoo  ->  hit had hoo
hiti  ->  haiti
hittdog  ->  hit dog
hitted  ->  hit ted
hitthe  ->  hit the
hittin  ->  hit tin
hittinghard  ->  hitting hard


ibaraki  ->  raki
ibm  ->  ism
ibn  ->  in
ibnat  ->  int
ibrahim  ->  bra him
ibu  ->  ibo
ibun  ->  bun
ic  ->  in
icai  ->  can
icane  ->  cane
icanrt  ->  cart
icarda  ->  card
icbc  ->  inc
icddr  ->  cdr
icds  ->  ids
icg  ->  ice
ichardpixel  ->  chard pixel
ichi  ->  chi
ici  ->  ii
ickjsorgasm  ->  orgasm
ickledtink  ->  led ink
ickybuck  ->  icky buck
icmc  ->  inc
icn  ->  in
icnnreport  ->  report
icr  ->  ice
icrc  ->  inc
ict  ->  it
icts  ->  its
icu  ->  ice
icva  ->  iva
idan  ->  id an
idareyoudude  ->  id are you dude
idb  ->  id
idc  ->  id
idelinquency  ->  delinquency
identifiedpeople  ->  identified people
idep  ->  idea
idk  ->  id
idomi  ->  dom
idp  ->  id
idps  ->  ids
idpsinitially  ->  dps initially
idrees  ->  ideas
idrf  ->  if
idsi  ->  ids
ie  ->  in
iebc  ->  web
iec  ->  dec
ieci  ->  dec
ieds  ->  beds
iemand  ->  and
ieuwrlatimes  ->  la times
ifad  ->  fad
ifc  ->  if
ifj  ->  if
ifo  ->  if
ifoghas  ->  fog has
iforas  ->  for as
ifrc  ->  if
ig  

iocc  ->  oct
iom  ->  com
iopatito  ->  pat to
ipa  ->  spa
ipc  ->  pc
ipcc  ->  pc
ipd  ->  id
iphd  ->  ipod
iphonesia  ->  phones
ipps  ->  apps
ips  ->  is
ipv  ->  inv
ir  ->  in
iratherwalk  ->  rather walk
irawaddy  ->  irrawaddy
irb  ->  ira
irc  ->  inc
ircs  ->  iris
ird  ->  id
iredscience  ->  red science
irenes  ->  irene
ireport  ->  report
ireporter  ->  reporter
ireporthaiti  ->  report haiti
irfan  ->  fan
iri  ->  ii
irian  ->  brian
irin  ->  iron
irja  ->  ira
irla  ->  ira
irlco  ->  rico
irmc  ->  irma
irna  ->  iran
irois  ->  iris
irredentism  ->  re dent ism
irrigational  ->  irrigation al
irrigations  ->  rig at ions
irs  ->  is
irshad  ->  shad
irt  ->  it
irwandi  ->  rwanda
isaf  ->  isar
isdpp  ->  supp
isdr  ->  isar
ise  ->  is
ish  ->  is
isi  ->  is
isil  ->  isis
isiolo  ->  solo
isit  ->  is it
iskandar  ->  islander
iskander  ->  is kan der
isla  ->  is la
islamabadbecause  ->  islamabad because
islamabd  ->  islamabad
islami  ->  islamic
islamist

juvernat  ->  vern at
jvp  ->  jap
jwenn  ->  went
jx  ->  of
jz  ->  of
k  ->  a
ka  ->  a
kabajani  ->  kab jan
kabar  ->  bar
kabardinka  ->  kab ard ink
kabbe  ->  kab be
kabezi  ->  zambezi
kabira  ->  kab ira
kabung  ->  kabul
kabwe  ->  kab we
kachand  ->  hand
kachi  ->  chi
kachin  ->  chin
kachipul  ->  chi pul
kachipulflood  ->  chi pul flood
kachipulin  ->  chi pul in
kachipulour  ->  chi pul our
kadas  ->  adas
kadaverspeople  ->  avers people
kadet  ->  cadet
kadt  ->  kat
kaechon  ->  archon
kaemi  ->  kami
kaen  ->  ken
kafou  ->  bayou
kagame  ->  game
kagan  ->  pagan
kaghan  ->  nathan
kah  ->  kay
kaha  ->  kara
kahen  ->  karen
kahuta  ->  hut
kai  ->  kay
kailahun  ->  kail hun
kaisi  ->  haiti
kajhu  ->  kathy
kakinada  ->  akin ada
kakuma  ->  karma
kala  ->  ala
kalak  ->  kayak
kalam  ->  al am
kalbas  ->  kalb as
kalemie  ->  valerie
kaliadem  ->  kali dem
kalima  ->  kali
kalimbeza  ->  kalimba
kalitunsi  ->  kali tuns
kaliurang  ->  rang
kalla  ->  all
kall

konggave  ->  kong gave
konkan  ->  on kan
konna  ->  gonna
konnen  ->  tonnes
konpros  ->  on pros
kons  ->  kong
konso  ->  on so
konstantin  ->  on st ant in
kont  ->  kong
konte  ->  monte
konw  ->  know
kookat  ->  kook at
koor  ->  door
korahe  ->  kor he
koray  ->  ray
kordj  ->  word
koreas  ->  ore as
korilla  ->  kor ill
kosad  ->  road
kosan  ->  kos an
koshi  ->  kochi
koshin  ->  shin
kosi  ->  kos
koslanda  ->  kos land
kot  ->  not
kota  ->  rota
kote  ->  note
kotex  ->  hotel
koti  ->  loti
kotkai  ->  katmai
kotla  ->  koala
kotly  ->  hotly
kotri  ->  lori
kou  ->  you
koufa  ->  ufa
kounya  ->  konya
kounye  ->  lounge
koupla  ->  up la
koupo  ->  coupon
kour  ->  your
kouran  ->  our an
kousen  ->  house
koute  ->  route
kow  ->  now
koy  ->  key
koyna  ->  konya
koyo  ->  koto
koz  ->  khz
kpk  ->  kph
kpke  ->  kike
kpzm  ->  pm
kpzmrescue  ->  rescue
kr  ->  or
kraals  ->  kraal
krai  ->  kari
kratie  ->  katie
kratumban  ->  kra tum ban
kraze  ->  craze
krazii 

likehello  ->  like hello
liketo  ->  like to
likin  ->  kin
lil  ->  oil
lilavois  ->  avo is
lili  ->  lily
lilithia  ->  militia
lillavois  ->  ill avo is
lilling  ->  willing
lilot  ->  pilot
liltwistloveaffair  ->  lilt wist love affair
liman  ->  man
limbdi  ->  lambda
limbe  ->  lime
limmby  ->  jimmy
limonade  ->  mon de
limy  ->  lime
lincolnnebraskans  ->  lincoln nebraskans
lincroft  ->  lin croft
lindas  ->  lind as
lindborg  ->  lind borg
lindo  ->  lin do
lindooo  ->  indoor
lingualism  ->  lingual ism
linwu  ->  links
lipa  ->  lisa
lirr  ->  liar
lisandrosuerocanadarights  ->  sue roc an ada rights
lista  ->  list
listdamn  ->  list damn
liste  ->  list
listin  ->  list in
listrik  ->  listeria
litchis  ->  lit chis
liter  ->  later
liters  ->  liners
litle  ->  title
litota  ->  tot
litterly  ->  bitterly
littlefoodcafe  ->  little food cafe
littlei  ->  little
liu  ->  lib
livable  ->  able
liveandletlive  ->  live and let live
livehello  ->  live hello
livesay  ->  l

mangeoire  ->  man geo re
mangkusubroto  ->  usu bro to
mangli  ->  mangle
mangochi  ->  man go chi
mangonese  ->  manganese
mangos  ->  mango
manh  ->  many
manica  ->  monica
manicaland  ->  man cal and
maniche  ->  manic he
manichwe  ->  manicure
maniga  ->  manga
manigat  ->  man gat
manjhand  ->  hand
manjwa  ->  manga
mankyal  ->  manky al
manman  ->  man man
manmi  ->  man
manoj  ->  manor
manou  ->  manor
manpads  ->  man pads
mantality  ->  mentality
mantequilla  ->  ante quill
mantoloking  ->  oking
manufacturinglow  ->  manufacturing low
manuppussiesschool  ->  man up pussies school
manyattas  ->  many att as
manymy  ->  many my
manysandy  ->  many sandy
manzil  ->  mail
maoqa  ->  maria
maoza  ->  maria
mapaction  ->  map action
mapel  ->  maple
maphanyane  ->  any ane
mapinduzi  ->  map ind uzi
maplecroft  ->  maple croft
mapou  ->  map
mapplease  ->  map please
maqui  ->  maui
maradi  ->  rad
marahastra  ->  mar aha str
marala  ->  mar la
maranata  ->  mar an at
maranatha

messsage  ->  message
messsages  ->  messages
mesures  ->  measures
meteo  ->  meter
meteologic  ->  logic
meteosat  ->  met eos at
meterological  ->  logical
metheir  ->  their
methis  ->  me this
meto  ->  me to
metr  ->  met
metresses  ->  me tresses
metrological  ->  metro logical
metropolitain  ->  metro pol it in
metta  ->  met
metu  ->  menu
meu  ->  me
meuro  ->  metro
meus  ->  me us
mevs  ->  mess
mewe  ->  me we
meyotte  ->  mayotte
mezouar  ->  bezoar
mfa  ->  mia
mfdc  ->  mfd
mft  ->  met
mg  ->  my
mgo  ->  go
mh  ->  my
mha  ->  mhz
mhd  ->  mid
mhefneratm  ->  hefner atm
mherbani  ->  herb an
mhu  ->  thu
mi  ->  i
miamihow  ->  miami how
miamimy  ->  miami my
mianwali  ->  an wal
mianyang  ->  an yang
mianzhu  ->  manchu
michaelle  ->  michelle
michaud  ->  mich aud
michelthe  ->  michel the
michely  ->  michel
michou  ->  micro
microenterprise  ->  micro enterprise
microfinance  ->  micro finance
microinsurance  ->  micro insurance
microlending  ->  micro lending
mic

mubeenevery  ->  been every
mubeenfestival  ->  been festival
muchas  ->  much as
muche  ->  much
muchi  ->  much
muchplease  ->  much please
muda  ->  mud
mudbrick  ->  mud brick
mudende  ->  den de
mudslip  ->  muslim
mueang  ->  means
mugara  ->  mug ara
mughlan  ->  meghan
muguet  ->  august
muhabbat  ->  hab bat
muhammadu  ->  muhammad
muhmmad  ->  muhammad
mujahadeen  ->  mujahideen
mujahedeen  ->  mujahideen
mujahid  ->  mujahidin
mujao  ->  mojo
mujhay  ->  murray
mujud  ->  mud
mujy  ->  my
muks  ->  mugs
mulago  ->  la go
mulanje  ->  melanie
mulilo  ->  julio
mullhollandscannot  ->  mull hollands can not
multidonor  ->  multi donor
multisectoral  ->  multi sectoral
multiskilled  ->  multi skilled
multistakeholder  ->  multi stakeholder
multistorey  ->  multi storey
muluzi  ->  multi
mundlamur  ->  amur
mundo  ->  mun do
mundok  ->  undo
municipalidad  ->  municipal dad
munistha  ->  mun isth
munyinga  ->  munging
muoi  ->  moi
muong  ->  muon
mur  ->  our
murad  ->  rad
mura

njphelp  ->  help
njsandy  ->  sandy
njservice  ->  service
njshut  ->  shut
njvote  ->  note
njyes  ->  yes
nkamira  ->  mira
nkeeping  ->  keeping
nkhosi  ->  hos
nkow  ->  now
nkulu  ->  null
nkwzrrealtime  ->  real time
nlane  ->  lane
nlc  ->  nyc
nm  ->  no
nmc  ->  nyc
nmrh  ->  north
nms  ->  ems
nn  ->  in
nnbrk  ->  snark
nne  ->  one
nned  ->  need
nnt  ->  not
noaa  ->  nova
noailles  ->  no ail les
nobady  ->  nobody
nobodyrealer  ->  nobody realer
nocancellations  ->  no cancellations
nodoubt  ->  no doubt
nofilter  ->  no filter
nofollow  ->  no follow
nogas  ->  no gas
noght  ->  night
noholidayluck  ->  no holiday luck
noi  ->  not
noir  ->  nor
noire  ->  no re
noisei  ->  noise
noite  ->  note
noize  ->  noise
nokis  ->  nokia
nol  ->  not
nomber  ->  number
noncrisis  ->  non crisis
nonengineered  ->  non engineered
nongovernment  ->  non government
nonintact  ->  non intact
nonperisabhle  ->  nonperishable
nonpro  ->  non pro
nonsensicalwhat  ->  nonsensical what
n

oodua  ->  soda
ooo  ->  too
oooo  ->  good
ooouh  ->  booth
oosfsearthquake  ->  earthquake
oosfshuge  ->  huge
op  ->  of
opa  ->  spa
opano  ->  open
opc  ->  pc
opdf  ->  of
opend  ->  open
opene  ->  open
openin  ->  open in
openingi  ->  opening
openingis  ->  opening is
openjust  ->  open just
openning  ->  opening
opensos  ->  open sos
openstreetmaps  ->  open street maps
operan  ->  opera
operationbless  ->  operation bless
operationsafe  ->  operation safe
opersonnele  ->  personnel
opetion  ->  ope on
ophaned  ->  orphaned
ophtamology  ->  ophthalmology
oportunities  ->  opportunities
oportunity  ->  opportunity
opportinities  ->  opp or tin ties
optyon  ->  option
opv  ->  opt
oqvist  ->  oboist
oragnisation  ->  or agni sat on
orand  ->  or and
orchidee  ->  or chi dee
ordinated  ->  ord in ted
ordinating  ->  ord in ting
ordinator  ->  ord in at or
ordinators  ->  ord in tors
oreos  ->  or eos
organisent  ->  organ sent
organistions  ->  organisations
organizaciones  ->  

peedfreaks  ->  peed freaks
peffp  ->  jeff
pegando  ->  peg an do
peguy  ->  guy
pehle  ->  pele
pehly  ->  reply
peipah  ->  pei pah
peir  ->  per
peitit  ->  pei tit
pela  ->  plea
pelase  ->  please
pelauw  ->  palau
pelerin  ->  erin
peligre  ->  eli gre
pell  ->  well
pelluhuetoday  ->  hue today
pemulung  ->  emu lung
penco  ->  pen co
pendefatim  ->  pen def tim
penh  ->  pen
penne  ->  penn
pennlive  ->  penn live
pensando  ->  pens an do
pensei  ->  sense
pensser  ->  pens ser
penyen  ->  pen yen
peole  ->  people
peolpe  ->  people
peoole  ->  people
peoople  ->  people
peopl  ->  people
peopleaffected  ->  people affected
peopleall  ->  people all
peoplean  ->  people an
peoplei  ->  people
peopleiss  ->  peoples
peoplel  ->  people
peoplemy  ->  people my
peopleperson  ->  people person
peoplert  ->  people
peoplethe  ->  people the
peoplethey  ->  people they
peoplewe  ->  people we
peoplewent  ->  people went
peoplewhere  ->  people where
peorle  ->  people
pepare  ->  p

pokots  ->  posts
polearthquake  ->  pol earthquake
polezak  ->  polecat
policer  ->  police
policewhich  ->  police which
poliovirus  ->  polio virus
polioviruses  ->  polio viruses
politcal  ->  pol it cal
politricks  ->  politics
pollutionrt  ->  pollution
polong  ->  long
polri  ->  pori
pomeroy  ->  poverty
pommes  ->  pom mes
pompes  ->  pom pes
pompiers  ->  copiers
ponding  ->  on ding
ponja  ->  sonja
ponsonde  ->  pons on de
pont  ->  post
poors  ->  poor
poossible  ->  possible
popondetta  ->  pop on de
popsicles  ->  pop sic les
populaire  ->  pula re
populatio  ->  population
populationare  ->  population are
por  ->  for
porly  ->  poly
porong  ->  prong
porpt  ->  port
porrada  ->  or rad
porta  ->  port
portail  ->  port ail
portivory  ->  port ivory
portside  ->  port side
portugues  ->  portuguese
portwashington  ->  port washington
poseta  ->  pos eta
posho  ->  posh
posibility  ->  possibility
posiblite  ->  sib lite
positivevibes  ->  positive vibes
posmachan  ->  

qaeada  ->  canada
qaeda  ->  mazda
qaida  ->  aid
qala  ->  ala
qamar  ->  mar
qamber  ->  amber
qandahar  ->  kandahar
qar  ->  car
qasim  ->  basic
qaumi  ->  saudi
qayyum  ->  yum
qdbs  ->  jobs
qdevastated  ->  devastated
qiantang  ->  an tang
qif  ->  if
qimao  ->  ciao
qing  ->  king
qinghai  ->  bingham
qinglian  ->  anglian
qist  ->  list
qj  ->  of
qlp  ->  alp
qnet  ->  net
qnutl  ->  null
qq  ->  sqq
qrc  ->  arc
qsbecmajor  ->  major
qthaiti  ->  haiti
qtwt  ->  two
qu  ->  que
quadrifolia  ->  folia
quakebuilding  ->  quake building
quakechile  ->  quake chile
quakeresponsesantiago  ->  quake response santiago
quakethis  ->  quake this
quakewe  ->  quake we
quakey  ->  qua key
qual  ->  equal
qualcomm  ->  al comm
qualitiese  ->  qualities
qualque  ->  al que
quan  ->  juan
quando  ->  an do
quang  ->  huang
quartier  ->  quarter
quat  ->  quit
quatz  ->  quartz
qubo  ->  quo
qued  ->  que
queenofspain  ->  queen of spain
queensboro  ->  greensboro
queensmuseum  ->  queen

rescuedans  ->  rescued ans
rescuegood  ->  rescue good
rescueplease  ->  rescue please
reseachers  ->  researchers
reservoire  ->  re servo re
resessement  ->  res esse men
resettlements  ->  re settlements
reshun  ->  re shun
residance  ->  res dance
residencewhen  ->  residence when
residentional  ->  resident on al
resistred  ->  resist red
reskape  ->  reshape
resored  ->  re so red
resouces  ->  resources
respectwe  ->  respect we
respisible  ->  responsible
respon  ->  resp on
respondcan  ->  respond can
responderi  ->  responder
respondgood  ->  respond good
respondhi  ->  respond
responsability  ->  responsibility
responsable  ->  resp on sable
responsables  ->  resp on sables
responsablity  ->  responsibility
responseafter  ->  response after
responsecan  ->  response can
responsedo  ->  response do
responsei  ->  response
responsethis  ->  response this
responsewe  ->  response we
responseyou  ->  response you
responsibles  ->  resp on sib les
responsiblity  ->  responsibili

sandycantstopme  ->  sandy cant stop me
sandyclean  ->  sandy clean
sandycommute  ->  sandy commute
sandyct  ->  sandy
sandydamn  ->  sandy damn
sandydiapers  ->  sandy diapers
sandydisasteri  ->  sandy disaster
sandydo  ->  sandy do
sandydrinking  ->  sandy drinking
sandydropping  ->  sandy dropping
sandyeast  ->  sandy east
sandyever  ->  sandy ever
sandyevery  ->  sandy every
sandyeveryone  ->  sandy everyone
sandyfighting  ->  sandy fighting
sandyfirst  ->  sandy first
sandyfrankly  ->  sandy frankly
sandyfuck  ->  sandy fuck
sandygivehimback  ->  sandy give him back
sandygoing  ->  sandy going
sandygram  ->  sandy gram
sandyhalloween  ->  sandy halloween
sandyhappy  ->  sandy happy
sandyhatesbooks  ->  sandy hates books
sandyhelp  ->  sandy help
sandyhelpfirst  ->  sandy help first
sandyhelpi  ->  sandy help
sandyhelpif  ->  sandy help if
sandyhelponly  ->  sandy help only
sandyhelptoday  ->  sandy help today
sandyhere  ->  sandy here
sandyhi  ->  sandhi
sandyhomes  ->  sandy home

seetweets  ->  see tweets
segimos  ->  regimes
seguineau  ->  guin eau
sehta  ->  septa
sehwan  ->  sean
sehwas  ->  was
seija  ->  sea
seime  ->  seize
seing  ->  being
seisme  ->  is me
seismo  ->  seism
seismologic  ->  logic
sejani  ->  jan
sekarang  ->  rang
sekou  ->  seoul
seksyon  ->  season
sel  ->  see
selaab  ->  saab
selab  ->  lab
selatan  ->  lat an
seld  ->  send
seleka  ->  lek
selenagomez  ->  selena gomez
selfdetermination  ->  self determination
sellin  ->  sell in
semadam  ->  sem adam
semboro  ->  seaborg
seme  ->  see
semeru  ->  severe
seminarist  ->  seminar st
sempre  ->  emp re
senagal  ->  senegal
sence  ->  since
sench  ->  bench
sendafa  ->  sendai
sendaround  ->  send around
senderplease  ->  sender please
senegale  ->  senegal
senegalwe  ->  senegal we
sengal  ->  senegal
sengor  ->  eng or
senmak  ->  denmark
senowo  ->  snow
sensationnyc  ->  sensation nyc
sensationus  ->  sensation us
sensationwhite  ->  sensation white
sensenotes  ->  sense notes
sens

smhso  ->  smash
smithcanteen  ->  smith can teen
smolder  ->  mol der
smoldering  ->  mol de ring
sms  ->  sims
smss  ->  mss
sn  ->  in
snaper  ->  per
snato  ->  state
sncrt  ->  snort
sned  ->  send
sngal  ->  legal
snnpr  ->  inner
snooki  ->  snook
snookiouranonnews  ->  snook our an on news
snowdays  ->  snow days
snowhand  ->  snow hand
snowmelt  ->  snow melt
snowpack  ->  snow pack
snowploughs  ->  snow ploughs
snowrt  ->  snort
snowslide  ->  snow slide
snowsucksabout  ->  snow sucks about
soa  ->  so
soares  ->  so res
sobat  ->  so bat
soccere  ->  soccer
socialmediamarketing  ->  social media marketing
socialmediamsnbc  ->  social media msnbc
socialprimerdam  ->  social primer dam
socio  ->  sociol
socksfree  ->  socks free
soclose  ->  so close
socoldthe  ->  so cold the
sodex  ->  codex
soemarmo  ->  soekarno
soeur  ->  so eur
soeurs  ->  sours
sofala  ->  sofa la
sofiagv  ->  sofia
sofrer  ->  softer
sogebank  ->  bank
sogebankthe  ->  bank the
sogesk  ->  sores
sogon 

sukamto  ->  suharto
sukanya  ->  susanna
sukarman  ->  arm an
sukchon  ->  chon
sukhbaatar  ->  baa tar
sukkar  ->  sukkah
sukkur  ->  sulfur
sukkurvillage  ->  village
sul  ->  sun
sulfure  ->  sulfur
sulfuric  ->  sulfur
sulllydude  ->  dude
sulutation  ->  sulu tat on
suly  ->  july
sumatera  ->  sum at era
sumatrain  ->  sumatra in
sumber  ->  number
sumgait  ->  summit
sumgarh  ->  summary
summ  ->  sum
sumra  ->  supra
sunamia  ->  sun am
sunan  ->  sun an
sundarbans  ->  arb ans
sundayfood  ->  sunday food
sungai  ->  sunday
sunsari  ->  sun sari
sunshineandrainbows  ->  sunshine and rainbows
superbrancher  ->  superb rancher
supercereal  ->  super cereal
supercyclone  ->  super cyclone
superfoodtown  ->  super food town
supermoon  ->  super moon
superstorm  ->  super storm
superstormtaketwo  ->  super storm take two
superstormyou  ->  super storm you
supertyphoon  ->  super typhoon
supervis  ->  supervise
suphan  ->  susan
supose  ->  suppose
suppliesall  ->  supplies all
supp

thachatchai  ->  chat cha
thahsel  ->  tassel
thaiti  ->  haiti
thak  ->  that
thaks  ->  thanks
thamk  ->  thank
thammarat  ->  ham mar at
thammasart  ->  hamm as art
thanawat  ->  than wat
thani  ->  than
thankgod  ->  thank god
thanksa  ->  thanks
thanksdelmas  ->  thanks del mas
thankshello  ->  thanks hello
thankshelp  ->  thanks help
thanksi  ->  thanks
thanksinform  ->  thanks in form
thankslet  ->  thanks let
thanksme  ->  thanks me
thanksmessage  ->  thanks message
thanksmy  ->  thanks my
thanksok  ->  thanks
thankspeople  ->  thanks people
thanksplease  ->  thanks please
thanksrt  ->  thanks
thankssandy  ->  thanks sandy
thankssandyshe  ->  thanks sandy she
thankssir  ->  thanks sir
thanksthey  ->  thanks they
thankswater  ->  thanks water
thankswe  ->  thanks we
thankswhat  ->  thanks what
thanksyou  ->  thanks you
thankyou  ->  thank you
thankyouhurricanesandy  ->  thank you hurricane sandy
thankyoumumfor  ->  thank you mum for
thankyousandy  ->  thank you sandy
thankyousan

toughasalyon  ->  tough as lyon
toughing  ->  touching
touloum  ->  toulouse
toumani  ->  truman
toure  ->  tour
toussain  ->  to us sain
toussaint  ->  to us saint
toute  ->  to ute
touterene  ->  to ute rene
toutier  ->  router
touye  ->  house
toysalso  ->  toys also
toysca  ->  tosca
toysexperience  ->  toys experience
toysi  ->  toys
toyssoap  ->  toys soap
toysthe  ->  toys the
tp  ->  to
tr  ->  to
tra  ->  try
trabalhando  ->  bal han do
trabalhar  ->  trafalgar
trac  ->  track
tractables  ->  tables
tradeoff  ->  trade off
tradeswoman  ->  trades woman
tradeswomen  ->  trades women
tradewinds  ->  trade winds
tradgedy  ->  tragedy
trage  ->  trade
tragedyhope  ->  tragedy hope
tragedyrt  ->  tragedy
tragedywhat  ->  tragedy what
tragicrt  ->  tragic
tragsa  ->  tags
traiguen  ->  trainmen
trailblazr  ->  trailblazer
trainload  ->  train load
trainloads  ->  train loads
traiter  ->  trailer
traitres  ->  trailers
tramblement  ->  ramble men
tranbleman  ->  leman
tranf  ->  tran

unliterary  ->  literary
unliveable  ->  live able
unm  ->  una
unmas  ->  mas
unmbrella  ->  rel la
unmil  ->  until
unmiss  ->  units
unnavigable  ->  navigable
unoca  ->  nova
unocha  ->  no cha
unodc  ->  node
unops  ->  units
unos  ->  nos
unosat  ->  no sat
unowa  ->  now
unp  ->  up
unpassable  ->  passable
unpayable  ->  payable
unpermitted  ->  permitted
unplanted  ->  planted
unpurified  ->  purified
unrco  ->  unto
unrealnew  ->  unreal new
unreinforced  ->  reinforced
unrepaired  ->  repaired
unrests  ->  rests
unrivaled  ->  riv led
unsalvageable  ->  salvageable
unscr  ->  user
unseasonally  ->  seasonally
unseaworthy  ->  seaworthy
unsecoord  ->  eco ord
unspecifiednotes  ->  unspecified notes
unsqgreenmarket  ->  green market
unstinted  ->  st in ted
unstitched  ->  stitched
unsustainably  ->  unsustainable
unted  ->  united
untill  ->  until
unv  ->  univ
uofsc  ->  ufos
upc  ->  up
upcan  ->  can
upeoq  ->  upon
upgood  ->  up good
uphere  ->  up here
upi  ->  up
upli

waraffected  ->  war affected
warah  ->  sarah
warangal  ->  war an gal
waras  ->  war as
wardak  ->  war dak
warf  ->  war
wasa  ->  was
wasawa  ->  saw
wasgive  ->  was give
washbowls  ->  wash bowls
washcom  ->  wash com
washingtondcworking  ->  working
washingtonpost  ->  washington post
washingtonwilling  ->  washington willing
wasn  ->  was
wasnt  ->  want
wasu  ->  was
wasunna  ->  wanna
watan  ->  wat an
watancard  ->  wat an card
watche  ->  watch
wate  ->  date
wateraid  ->  water aid
waterare  ->  water are
waterchildren  ->  water children
waterchile  ->  water chile
watercontact  ->  water contact
watere  ->  water
waterflow  ->  water flow
wateri  ->  water
waterim  ->  water
waterit  ->  water it
waterkeeper  ->  water keeper
waterlogging  ->  water logging
watermaker  ->  water maker
waterneed  ->  water need
waternon  ->  water non
waterplease  ->  water please
waterporn  ->  water porn
waterrequest  ->  water request
watersend  ->  water send
watertalking  ->  water t

yddline  ->  line
ydorgnumajor  ->  dor gnu major
ydua  ->  you
ye  ->  be
yeaa  ->  year
yeahright  ->  yeah right
yeal  ->  year
yearhi  ->  year
yearsp  ->  years
yearsso  ->  years so
yearthquake  ->  earthquake
yed  ->  yes
yee  ->  see
yefri  ->  year
yeg  ->  yes
yegyanbyin  ->  an by in
yeh  ->  yes
yele  ->  yale
yelehaiti  ->  haiti
yellowfin  ->  yellow fin
yemyin  ->  my in
yepezakhaiti  ->  haiti
yepp  ->  yep
yersinia  ->  yer sin
yesternight  ->  yester night
yesturday  ->  yesterday
yesus  ->  yes us
yetsimpaties  ->  yet simp ties
yeu  ->  you
yeztganyone  ->  anyone
yg  ->  tyg
yhaiti  ->  haiti
yhan  ->  than
yhink  ->  think
yhujg  ->  young
yhujgall  ->  gall
yhujglegitimate  ->  legitimate
yi  ->  i
yida  ->  ida
yiman  ->  man
yimes  ->  times
yingluck  ->  yin gluck
yizhong  ->  hong
yk  ->  yak
ylra  ->  lyra
yn  ->  in
yo  ->  to
yobe  ->  kobe
yodzalira  ->  lira
yogjakarta  ->  jakarta
yogyakarta  ->  yak art
yoiuhello  ->  hello
yokota  ->  dakota
yola  -> 

In [25]:
# Process Second Pass Segmentations and Spelling
sc_dict, noise = SecondPass(speller, sc_dict, flunk_list)

# apply corrections to corpus
splitseg_tokens = ApplyCorr(sc_dict, sc_tokens)

#build counter of corpus tokens
C_splits = nltk.FreqDist(splitseg_tokens)

# update frequencies of corpus words
speller.updatefreq(C_splits)

# update  noise entries
for n in noise:
    scdict[n] = 'noise'

aaaaaaaa  ->  area area
abdimalik  ->  and mali
abdoulaye  ->  and outage
abdulaziz  ->  and
abdullah  ->  and allah
abdullahi  ->  and allah
abdurrazzaq  ->  abdul azan
abdusami  ->  and us am
abubakar  ->  anu bazar
abutariq  ->  but arid
actualmente  ->  act lament
addthisrt  ->  added is
addtoanyrt  ->  add to not
adgjmpgjm  ->  adam pm
afghanistanbpptk  ->  afghan is
afrasiyab  ->  afr as iyar
afrerese  ->  are rise
aftermathpocalypse  ->  after apocalypse
agelhouk  ->  age hour
ahmadpur  ->  am apr
ahmadyar  ->  am adar
akkaraipattu  ->  aka attu
akosombo  ->  ago combo
alexjsimons  ->  alex simon
alhamdulillah  ->  al ham delilah
alivelshi  ->  live she
allbizoton  ->  all ton
alsopiabm  ->  as opium
amitabha  ->  am it aha
amparihibe  ->  am par while
andcommitsaed  ->  and
angkearhdei  ->  angola ride
anglaise  ->  and raise
angmancuso  ->  man cusp
anisopliae  ->  an is police
annoyingwelp  ->  annoy in help
anpuvalipuram  ->  annual ram
antocurcio  ->  an to curio
antsohihy 

glagaharjo  ->  lag hard
glycemic  ->  nyc epic
gonaïves  ->  on aves
goodmornig  ->  good morning
gopalganj  ->  go allan
gorogoro  ->  for goo
gorontalo  ->  for onto
gotshiddd  ->  gosh did
gouvenement  ->  given men
granthier  ->  era their
gressierwe  ->  press were
gueckedou  ->  get dou
gugulethu  ->  aug thu
guilloux  ->  gui flour
gunungpati  ->  hunan path
gunungsitoli  ->  tunings told
guoguang  ->  go huang
gurubebila  ->  guru be bill
gurukinayan  ->  gorki nay an
gyncologue  ->  col ogre
gyrjfalu  ->  gar fall
haasstsecty  ->  haas st sect
habibullah  ->  habits allah
haiticecph  ->  haiti tech
haitienne  ->  has bienne
haitracwhen  ->  haiti when
haloweensamdy  ->  halo we
hardlinersmnla  ->  hard line
haushasuastill  ->  us has still
hazarwah  ->  has areas
hbergement  ->  here tent
hebergement  ->  he be recent
hebergements  ->  he be elements
heeeeere  ->  here were
heerealfdhc  ->  he ere ludic
helphaitirt  ->  help haiti
hendrasto  ->  send as to
henfrasa  ->  hen f

naypyidaw  ->  nay tidal
nazarbayev  ->  bazar aye
neberdzhaevskaya  ->  need
neededdadu  ->  need dead
nellikuppam  ->  nell up pam
nerettes  ->  were ties
netrakona  ->  near along
netrokona  ->  near kong
newjerseyllapalooza  ->  new jersey lollapalooza
newsomyo  ->  news my
newspapernou  ->  new spa pernod
ngouboua  ->  you bout
nhambulo  ->  cha bulk
ningqiang  ->  in going
nkandabwako  ->  and wake
nkurunziza  ->  our unzip
nkwosseu  ->  no see
nounoune  ->  you none
nourriture  ->  our return
novaspivack  ->  nov as pick
novorossiisk  ->  nov or
novorossiysk  ->  nov or
nowshewra  ->  now sierra
nyahururu  ->  nyc hurry
nyaungdon  ->  you non
nyceecee  ->  nyc epee
nygovcuomo  ->  no como
nyirogongo  ->  hydro on go
nzbnyrdwqxe  ->  non
oecumenic  ->  dec medic
oganizasyon  ->  organic as yon
oloveuparisxo  ->  love up arise
omarepps  ->  soma reaps
ompundja  ->  amp under
onchocerciasis  ->  as is
oolakottai  ->  solar tai
ophiabliu  ->  phi able
orekhovo  ->  ore photo
orthope

thezonedotfm  ->  the
thilafushi  ->  this sushi
thiriposa  ->  the pos
thomazeau  ->  them areas
thstreetyhappy  ->  that happy
tiboukan  ->  tin human
tieremasse  ->  there mass
tikamgarh  ->  team gar
tinsukia  ->  tins via
tiremasse  ->  air impasse
tivoteney  ->  tiv opened
togdheer  ->  told here
toktogul  ->  to mogul
tombouctou  ->  tomboy acton
tomorrowaio  ->  to or rosario
tomorrowaww  ->  to or rowan
tomorrowvacchianonydn  ->  tomorrow
tonkolili  ->  tons oil
torcelle  ->  tor cell
townchardonnierese  ->  town char do
tranblement  ->  trans men
transboundry  ->  transit dry
transmissibility  ->  trans is ability
trenblement  ->  trend men
trenggalek  ->  then gale
trinamariexox  ->  trina
tseikuru  ->  the nauru
tspallomav  ->  tsp all omar
tungshih  ->  ting ship
umdowoban  ->  mud woman
unemplemet  ->  one met
unnikrishnan  ->  until krishna
unsuffient  ->  unsafe tent
upazilas  ->  up las
updateannnnnnd  ->  update an
upperwestlivin  ->  upper we striving
vadamarachchi  

In [26]:
print(noise)

['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 'abhiyan', 'acafdh', 'acahaie', 'adgjmgjmgjmgjmgjmgkmptwptwptwptwptwptwptwptwptwptwpwptwptwgjmgjmgjmad', 'aegipty', 'agqmjrt', 'aguatab', 'akazya', 'altgaav', 'andiamo', 'andkhoi', 'andkhoy', 'ategbo', 'azizbek', 'bahorok', 'baipaza', 'bakhshu', 'batdorj', 'bhojpur', 'bhujthe', 'bizunov', 'blkdnm', 'bosyewe', 'boukani', 'bpptk', 'bsdfzrt', 'bzopsrt', 'caonabo', 'cefecac', 'cejirha', 'chibuto', 'chihuri', 'cnnbrk', 'cogqiun', 'craaaaa', 'ctgjhrt', 'cwfsb', 'daiichi', 'dhshthe', 'dthrulz', 'duwaika', 'duwayqa', 'ecowap', 'eevbomy', 'eleyele', 'elezcnn', 'enicxrt', 'escuche', 'ewiniar', 'fafoune', 'fanmpac', 'fayyaz', 'fekadu', 'fengjie', 'fossoun', 'fotokol', 'franswa', 'fvjuxu', 'fvkhrt', 'gdlcirt', 'gfdrr', 'ggfklif', 'ghurbi', 'gisenyi', 'gpdenh', 'grrrrr', 'guangxi', 'guzarny', 'hadslau', 'hafiqah', 'haytrac', 'hehehhe', 

#### Results

In [29]:
# print results
print('Spelling Error Set Size: ', sp_errors)
print('1st Pass Corrections: ', sp_errors - len(flunk_list))
print('2nd Pass Split Corrections: ', len(flunk_list)-len(noise))
print('Best Guess Possible Noise: ', len(noise))
print('Initial Vocab Count', len(C_corpus))
print('Final Vocab Count:', len(C_splits))

Spelling Error Set Size:  15772
1st Pass Corrections:  14470
2nd Pass Split Corrections:  1028
Best Guess Possible Noise:  274
Initial Vocab Count 34678
Final Vocab Count: 21751


#### Export Files

In [30]:
#save speller frequency dictionary
speller.savefreqdict("disaster_dict_r2.txt") 

Spell Check Counter saved to disaster_dict_r2.txt


In [31]:
# pickle string to string type  
with open('spell_lookup_r2.pkl', 'wb') as handle:
    pickle.dump(scdict, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Test

In [32]:
with open('spell_lookup_r2.pkl', 'rb') as handle:
    cust_dict = pickle.load(handle)

In [33]:
cust_dict.get('valueadded', 'name')

'value added'

In [34]:
cust_dict.get('visualizationgonewrong', 'name')

'visualization gone wrong'

In [35]:
cust_dict.get('tuberculosious', 'name')

'tuberculosis'

In [36]:
cust_dict.get('tombouctou', 'name')

'tomboy acton'

In [37]:
cust_dict.get('tzgmhzg', 'name')

'noise'

In [38]:
cust_dict.get('neverbeforeseen', 'name')

'name'