# Prep some data for training

Source of the raw data (Roger Mitton's spelling-error corpora):
- https://www.dcs.bbk.ac.uk/~ROGER/corpora.html

In [1]:
import re, random, json
import pandas as pd
from pathlib import Path

In [2]:
# Base folder for the Roger Mitton data, relative to the working directory.
path = Path('data') / 'RogerMitton'

In [3]:
def word_filter(first_word):
    """Return True if the word should be kept, False if it should be dropped.

    A word is kept when it is at least 4 characters long and consists only of
    ASCII letters and hyphens.

    Args:
        first_word: Candidate word. ``None`` is accepted and always rejected.

    Returns:
        bool: True to keep the word, False to discard it.
    """
    # A caller may not have a word yet (None); reject it instead of failing on len().
    if first_word is None or len(first_word) < 4:
        return False
    # fullmatch, unlike match with a '$' anchor, also rejects a trailing newline.
    return re.fullmatch(r'[-a-zA-Z]+', first_word) is not None

In [4]:
# Collect (correct, mistake) pairs from every raw .dat file.
# A line starting with '$' is read as the correctly spelled word; every other
# line is read as a misspelling of the most recent '$' word, keeping only the
# text before the first space.
data = []
for file_path in (path/'raw').glob('*.dat'):
    if 'tagged' in file_path.name:
        # holbrook-missp contains the misspellings extracted from the tagged file ...
        print('skipping tagged version', file_path)
        continue
    print('reading', file_path)
    with open(file_path) as f:
        correct_word = None  # no '$' header seen yet in this file
        for line in f:
            try:
                if line.startswith('$'):
                    correct_word = line[1:].strip()
                    continue
                if correct_word is None:
                    # A misspelling before any '$' header has nothing to pair with.
                    continue
                bad_word = line.split(' ')[0].strip()
                if word_filter(correct_word) and word_filter(bad_word):
                    data.append(dict(correct=correct_word, mistake=bad_word))
            except Exception:
                # Show the offending line, then re-raise with the original traceback.
                print(line)
                raise

# Naming the columns up front keeps the frame usable (and sortable) even when
# no pairs were collected.
df = pd.DataFrame(data, columns=['correct', 'mistake'])
for c in df.columns:
    df[c] = df[c].str.upper()
# Upper-casing can create duplicates, so de-duplicate afterwards.
df = df.drop_duplicates().sort_values(['correct', 'mistake'])
df

reading data\RogerMitton\raw\aspell.dat
reading data\RogerMitton\raw\holbrook-missp.dat
skipping tagged version data\RogerMitton\raw\holbrook-tagged.dat
reading data\RogerMitton\raw\missp.dat
reading data\RogerMitton\raw\wikipedia.dat


Unnamed: 0,correct,mistake
1846,A-QUIVER,AQUIVER
34814,ABANDON,ABONDON
34815,ABANDONED,ABONDONED
34816,ABANDONING,ABONDONING
34817,ABANDONS,ABONDONS
...,...,...
34729,YOUTH,YUTH
34730,ZEALOUS,ZELOUS
34731,ZENITH,ZEENITH
34812,ZIONIST,SIONIST


In [5]:
# Write the cleaned pairs into the dataset folder, leaving out the index column.
csv_path = path / 'roger_mitton_common_misspellings.csv'
df.to_csv(csv_path, index=False)