# Data Utilities

Some temporary Python scripts for munging dictionaries and other data tasks.

In [27]:
import re

# Default directory (relative to the notebook) holding the word-list files.
DATA_DIR = '../data'

# One or more ASCII letters. Used with fullmatch() so the WHOLE entry must be
# alphabetic, not just its first characters.
alpha_only = re.compile(r'[a-zA-Z]+')

def get_words(dict_file_name, data_dir=None):
  """Load a word list and return its five-letter, purely alphabetic words.

  The file is read as latin-1 text with one entry per line. Entries made up
  entirely of ASCII letters are lower-cased, and those exactly five characters
  long are kept. A count is printed after each stage.

  Args:
    dict_file_name: Name of the word-list file inside the data directory.
    data_dir: Directory containing the file. Defaults to the module-level
      DATA_DIR, looked up at call time.

  Returns:
    A set of lower-case five-letter words.

  Raises:
    OSError: If the file cannot be opened.
  """
  if data_dir is None:
    data_dir = DATA_DIR

  with open(f"{data_dir}/{dict_file_name}", encoding='latin-1', mode='r') as f:
    words = f.read().splitlines()

  print(f"{len(words):,} words read from {dict_file_name}")

  # fullmatch, not match: match() only anchors at the start of the string, so
  # entries such as "don't" or "x-ray" would wrongly count as alpha only.
  words = [word.lower() for word in words if alpha_only.fullmatch(word)]
  print(f"... {len(words):,} are alpha only")

  words = [word for word in words if len(word) == 5]
  print(f"... {len(words):,} are 5 letters long\n")

  return set(words)

In [28]:
# Load the three word lists in order; each get_words call prints its own counts
# and returns a set of five-letter words.
# NOTE(review): the name `dict` shadows the builtin `dict` type for the rest of
# the kernel session. It is kept here because other cells may rely on it.
dict, usa, english = map(get_words, ('dict.txt', 'usa2.txt', 'english3.txt'))


42,700 words read from dict.txt
... 42,697 are alpha only
... 4,430 are 5 letters long

77,722 words read from usa2.txt
... 77,718 are alpha only
... 5,442 are 5 letters long

194,433 words read from english3.txt
... 194,433 are alpha only
... 11,435 are 5 letters long



In [29]:
# dict.txt is missing many words, so it is left out of the intersection;
# only the usa2 and english3 word sets are combined.
isect = usa & english
print(f"{len(isect):,} words in usa ISECT english")

5,282 words in uses ISECT english


In [30]:
# Write a JSON file with this cleaned-up dictionary
import json

# Serialize the intersected words, alphabetically sorted, as an indented JSON array.
words_json = json.dumps(sorted(isect), indent=2)
with open(f'{DATA_DIR}/words.json', 'w') as f:
  f.write(words_json)


# Test Words

In [31]:
import csv

# Take the first column of every non-empty CSV row as a test word.
# newline='' leaves newline handling to the csv module, as its documentation
# recommends for files passed to csv.reader.
with open(f"{DATA_DIR}/darrellp.csv", 'r', newline='') as f:
  # csv.reader yields an empty list for a blank line; `if row` skips those
  # instead of letting row[0] raise IndexError.
  words = [row[0] for row in csv.reader(f) if row]

print(f"{len(words):,} words read from darrellp.csv")

# Save the test words, in file order, as an indented JSON array.
with open(f"{DATA_DIR}/test-words.json", 'w') as f:
  json.dump(words, f, indent=2)

722 words read from darrellp.csv
