# Corpora Ingestion: en_50k_2018

## Overview

I downloaded this file from HermitDave's FrequencyWords repo: https://github.com/hermitdave/FrequencyWords/blob/master/content/2018/en/en_50k.txt

## Imports

In [1]:
import pandas as pd
import numpy as np
from collections import Counter

## Global Variables

In [2]:
# The only characters a word may contain to be considered valid:
# the 26 lowercase ASCII letters, in alphabetical order.
valid_chars = list('abcdefghijklmnopqrstuvwxyz')

## Helper Functions

In [20]:
def get_char_counter(words, frequencies):
    """Return frequency-weighted character totals for a list of words.

    Parameters
    ----------
    words : iterable
        Words to tally. Each item is passed through ``str()`` before
        counting, so non-string items are counted by their string form.
    frequencies : iterable
        Weight of each word, paired positionally with ``words``. Pairing
        uses ``zip``, so surplus items in the longer input are ignored.

    Returns
    -------
    collections.Counter
        Maps each character to the sum, over all words, of
        (occurrences of that character in the word) * (the word's frequency).
    """
    totals = Counter()
    for word, frequency in zip(words, frequencies):
        # Count this word's characters once, then scale by the word's weight.
        for char, occurrences in Counter(str(word)).items():
            totals[char] += occurrences * frequency
    return totals

def is_invalid_word(word, invalid_char_list):
    """Report whether a word contains any disallowed character.

    Parameters
    ----------
    word : object
        The word to inspect. It is converted with ``str()`` first, so
        non-string values are checked by their string form.
    invalid_char_list : iterable of str
        Characters (or substrings) that make a word invalid.

    Returns
    -------
    bool
        True if at least one entry of ``invalid_char_list`` occurs in the
        word (the word IS invalid); False otherwise, including when
        ``invalid_char_list`` is empty.
    """
    text = str(word)  # convert once rather than once per candidate character
    return any(invalid_char in text for invalid_char in invalid_char_list)

## Raw Text EDA

In [4]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact file exists. Consider a configurable data directory.
file_path = r'D:\code_repos\LexGen\data\corpora\en_50k_2018\en_50k_2018-RAW.txt'

# The file is read as space-separated "<word> <frequency>" pairs with no header row.
# dtype=str plus keep_default_na=False keeps every word as text: pandas'
# default missing-value markers include literal strings such as "null",
# "nan" and "n/a", so without these options corpus words spelled that way
# would be loaded as float NaN instead of the word itself.
raw_words = pd.read_csv(
    file_path,
    sep=' ',
    header=None,
    names=['word', 'frequency'],
    dtype={'word': str},
    keep_default_na=False,
)

In [5]:
# Preview the first five rows to confirm the word and frequency columns parsed.
raw_words.head()

Unnamed: 0,word,frequency
0,you,28787591
1,i,27086011
2,the,22761659
3,to,17099834
4,a,14484562


In [6]:
# Tally every character in the corpus, weighting each word's characters by
# that word's frequency. Both columns are passed as plain Python lists.
char_counter = get_char_counter(raw_words['word'].tolist(), raw_words['frequency'].tolist())

In [7]:
# Show the weighted totals; Counter's repr lists them from most to least common.
print(char_counter)

Counter({'e': 317758171, 't': 252863727, 'o': 243632065, 'a': 208469923, 'i': 192792447, 'n': 177477796, 'h': 159841703, 's': 159386595, 'r': 143335320, 'l': 114242322, 'u': 100426995, 'd': 95888458, 'y': 90697550, 'm': 74980830, 'w': 72508000, 'g': 66429118, 'c': 59766718, 'f': 45788687, 'b': 41078194, 'p': 40235384, "'": 38976555, 'k': 36833453, 'v': 26855865, 'j': 7002669, 'x': 3367545, 'z': 1745765, '-': 1623514, 'q': 1510497, '.': 1430364, '0': 114275, '1': 88837, '2': 56690, 'é': 52966, '`': 34653, '3': 31264, '4': 25179, '5': 24570, '8': 23131, '9': 22464, '6': 17965, 'ο': 17920, '7': 17633, 'ö': 15343, 'ñ': 10737, 'í': 8925, 'á': 7199, 'ü': 4462, 'ó': 4370, 'ç': 4166, 'è': 3892, 'ã': 3514, 'à': 2040, 'ú': 1587, 'â': 1152, 'ø': 982, 'ô': 785, 'ä': 742, 'î': 737, 'τ': 697, 'υ': 646, 'ò': 557, 'η': 553, 'ì': 532, 'ë': 498, 'ê': 469, 'ﬁ': 418, 'ﬂ': 415, 'æ': 373, 'ş': 366, 'İ': 328, 'ï': 320, 'ν': 308, 'û': 270, 'µ': 270, 'å': 207, 'у': 195, 'ý': 166})


In [8]:
# Keep only the totals for the allowed letters, in valid_chars order.
# Indexing a Counter with an absent key yields 0 rather than raising.
good_char_counter = {letter: char_counter[letter] for letter in valid_chars}

In [9]:
# Totals for the allowed letters only, in valid_chars order.
print(good_char_counter)

{'a': 208469923, 'b': 41078194, 'c': 59766718, 'd': 95888458, 'e': 317758171, 'f': 45788687, 'g': 66429118, 'h': 159841703, 'i': 192792447, 'j': 7002669, 'k': 36833453, 'l': 114242322, 'm': 74980830, 'n': 177477796, 'o': 243632065, 'p': 40235384, 'q': 1510497, 'r': 143335320, 's': 159386595, 't': 252863727, 'u': 100426995, 'v': 26855865, 'w': 72508000, 'x': 3367545, 'y': 90697550, 'z': 1745765}


In [10]:
# Every character seen in the corpus that is not one of the allowed letters,
# in the order it was first encountered.
invalid_chars = [seen_char for seen_char in char_counter if seen_char not in valid_chars]

In [11]:
# Weighted totals for the characters that fall outside valid_chars.
bad_char_counter = {bad_char: char_counter[bad_char] for bad_char in invalid_chars}

In [12]:
# Totals for every character outside valid_chars (punctuation, digits,
# accented and non-Latin letters).
print(bad_char_counter)

{"'": 38976555, '.': 1430364, '-': 1623514, 'ö': 15343, '2': 56690, '1': 88837, '3': 31264, '0': 114275, '`': 34653, 'é': 52966, '4': 25179, 'ñ': 10737, '8': 23131, '5': 24570, '9': 22464, '7': 17633, 'ο': 17920, '6': 17965, 'ç': 4166, 'í': 8925, 'ü': 4462, 'á': 7199, 'à': 2040, 'ã': 3514, 'ó': 4370, 'è': 3892, 'ä': 742, 'η': 553, 'â': 1152, 'ø': 982, 'ú': 1587, 'ê': 469, 'υ': 646, 'ş': 366, 'İ': 328, 'ï': 320, 'ô': 785, 'ν': 308, 'τ': 697, 'ò': 557, 'û': 270, 'ì': 532, 'µ': 270, 'î': 737, 'ë': 498, 'ﬂ': 415, 'ﬁ': 418, 'æ': 373, 'å': 207, 'у': 195, 'ý': 166}


In [13]:
# Number of distinct characters in the corpus that are not in valid_chars.
print(len(invalid_chars))

51


In [14]:
# Smoke test: '.' is among the invalid characters found above, so a word
# containing it should be flagged (expected output: True).
res = is_invalid_word('wor.d', invalid_chars)
print(res)

True


In [15]:
# Flag each row whose word contains at least one character outside
# valid_chars. Series.apply forwards invalid_char_list to is_invalid_word
# as a keyword argument.
raw_words['invalid'] = raw_words['word'].apply(is_invalid_word, invalid_char_list=invalid_chars)

In [16]:
# Re-display the first rows to confirm the new boolean 'invalid' column.
raw_words.head()

Unnamed: 0,word,frequency,invalid
0,you,28787591,False
1,i,27086011,False
2,the,22761659,False
3,to,17099834,False
4,a,14484562,False


In [18]:
# Count how many words are clean (False) versus flagged as invalid (True).
raw_words['invalid'].value_counts()

False    46717
True      3283
Name: invalid, dtype: int64

In [19]:
# Show only the rows flagged as invalid, using the boolean column as a mask.
raw_words[raw_words['invalid']]

Unnamed: 0,word,frequency,invalid
5,'s,14291013,True
9,'t,9628970,True
25,'m,4386306,True
31,'re,4059719,True
43,'ll,2913428,True
...,...,...,...
49960,podnapisi.net,159,True
49963,word-,159,True
49985,girls-,159,True
49996,hyeon-to,159,True


In [24]:
# Independent cross-check of the character tally: the corpus-wide character
# count is the sum over all rows of (word length * word frequency).
total_chars = sum(
    len(str(word)) * frequency
    for word, frequency in zip(raw_words['word'].tolist(), raw_words['frequency'].tolist())
)
print(total_chars)

2777551998


In [25]:
# Adding up every per-character total should reproduce total_chars.
total_chars_check = sum(char_counter.values())
print(total_chars_check)

2777551998


In [26]:
# The valid and invalid tallies partition char_counter, so together they
# should add up to the same grand total.
valid_total = sum(good_char_counter.values())
# Continue accumulating from the valid total, adding the invalid tallies in order.
total_chars_check2 = sum(bad_char_counter.values(), valid_total)
print(total_chars_check2)

2777551998
