# Derive Valid Characters

## Import Statements

In [18]:
from nate_givens_toolkit import cloud_io as cloud
from collections import Counter
import pandas as pd

## Global Variables

In [4]:
# Storage bucket passed to every cloud read/write in this notebook.
BUCKET = 'lexgen'

# Directory prefixes inside the bucket.
DATA_DIR = 'data_files/'            # inventory and valid-chars tables
RAW_CORPORA_DIR = 'raw_corpora/'    # unprocessed word/frequency corpora
CLEAN_CORPORA_DIR = 'clean_corpora/'

## Custom Functions

In [13]:
def get_char_counter(words, frequencies):
    """Count characters across all words, weighting each word by its frequency.

    Args:
        words: iterable of words; each entry is converted with ``str()`` before
            its characters are counted, so non-string entries are counted by
            their string form.
        frequencies: iterable of weights, paired positionally with ``words``
            (pairing stops at the shorter of the two iterables).

    Returns:
        A ``Counter`` mapping each character to the sum, over all words, of
        (occurrences of the character in the word) * (the word's frequency).
    """
    totals = Counter()
    for token, weight in zip(words, frequencies):
        occurrences_in_word = Counter(str(token))
        # scale the per-word character counts by how often the word occurs
        weighted = {char: count * weight for char, count in occurrences_in_word.items()}
        totals.update(weighted)

    return totals

## Logic

### Read in raw corpora

In [7]:
# Load the pipe-delimited inventory of raw corpora ('raw_corpora_inventory.dat') from DATA_DIR in BUCKET.
raw_corpora = cloud.read_csv_from_s3('raw_corpora_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [8]:
# Preview the first rows to confirm the inventory loaded with the expected columns.
raw_corpora.head()

Unnamed: 0,filename,lang_code,source_url,last_load_dtime,note
0,en_full_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 02:04:07.270824,HermitDave's version of the full 2018 English ...
1,en_50k_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.589244,HermitDave's version of the top 50k 2018 Engli...
2,en_full_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.984856,HermitDave's version of the full 2016 English ...
3,en_50k_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:54.377632,HermitDave's version of the top 50k 2016 Engli...
4,de_full_2018.txt,de,https://raw.githubusercontent.com/hermitdave/F...,2021-04-06 19:42:26.967039,HermitDave's version of the full 2018 German f...


### Read in valid chars

In [32]:
# Load the existing pipe-delimited table of valid characters per language code ('valid_chars.dat').
valid_chars_table = cloud.read_csv_from_s3('valid_chars.dat', DATA_DIR, BUCKET, sep='|')

In [35]:
# Preview which languages already have a valid-character entry.
valid_chars_table.head()

Unnamed: 0,lang_code,valid_chars
0,en,"a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,..."


### Read in a raw corpus

In [11]:
# Corpus to analyze in this run; it is checked against the inventory's `filename` column before loading.
raw_corpus_filename = 'de_full_2018.txt'

In [12]:
# Only corpora registered in the inventory may be loaded. Fail loudly otherwise, so that
# later cells never run against an undefined (or stale, from an earlier run) `raw_corpus`.
if raw_corpus_filename not in raw_corpora['filename'].tolist():
    raise ValueError(
        "'{}' is not listed in the raw corpora inventory".format(raw_corpus_filename)
    )

# The corpus is read as space-delimited with no header row; columns are named word and freq.
raw_corpus = cloud.read_csv_from_s3(
    raw_corpus_filename,
    RAW_CORPORA_DIR,
    BUCKET,
    sep=' ',
    header=None,
    names=['word', 'freq'],
)

In [16]:
# Preview the first rows of the corpus (columns: word, freq).
raw_corpus.head()

Unnamed: 0,word,freq
0,ich,5890279
1,sie,3806767
2,das,3122198
3,ist,3025610
4,du,2947020


### Get a counter with all characters

In [19]:
# Tally every character in the corpus, weighting each word by its frequency.
corpus_words = raw_corpus['word'].tolist()
corpus_freqs = raw_corpus['freq'].tolist()
all_char_counter = get_char_counter(corpus_words, corpus_freqs)

### Derive valid chars

In [20]:
# Show the frequency-weighted totals (the Counter repr lists the most common characters first)
# to help judge which characters are genuine and which are rare noise.
print(all_char_counter)

Counter({'e': 110037950, 'n': 69631298, 'i': 67774894, 's': 53684871, 'r': 47256356, 'a': 46870308, 't': 44486226, 'h': 43631118, 'd': 33252087, 'c': 28056596, 'u': 26122793, 'l': 25585891, 'm': 21624302, 'g': 18235674, 'o': 18098994, 'w': 16256823, 'b': 14513070, 'k': 10501365, 'f': 10374888, 'z': 6821909, 'v': 5160971, 'p': 4741384, 'ü': 4170553, 'j': 2957530, 'ä': 2883019, 'ö': 2191455, 'y': 1721890, 'ß': 1492100, '-': 536179, '.': 362103, 'x': 298728, 'q': 124688, '0': 49740, 'é': 29773, '1': 23139, '2': 18019, '4': 16776, '3': 13201, '5': 11018, '9': 8173, '6': 8002, '7': 7810, '8': 7433, 'ã': 7219, 'ñ': 6549, 'è': 4626, 'á': 3976, 'í': 3185, 'ó': 2924, 'ç': 2095, 'à': 1667, 'ô': 1634, '`': 1441, 'ø': 1068, 'û': 953, 'ú': 875, 'å': 798, 'ο': 696, 'ÿ': 691, 'ý': 648, 'â': 639, 'ð': 638, 'ê': 634, 'ë': 607, 'ï': 576, 'î': 543, 'ν': 485, 'ì': 444, ',': 425, 'ı': 379, 'ﬂ': 286, 'İ': 285, 'š': 285, 'ş': 272, 'æ': 269, 'þ': 253, 'ﬁ': 231, 'ò': 230, 'ğ': 216, 'κ': 178, 'œ': 173, 'õ': 172

In [23]:
# List the distinct characters in first-seen order, as a copy-paste starting point for manual curation.
print(all_char_counter.keys())

dict_keys(['i', 'c', 'h', 's', 'e', 'd', 'a', 't', 'u', 'n', 'r', 'w', 'z', 'j', 'm', 'f', 'b', 'o', 'ü', 'v', 'k', 'g', 'l', 'ß', 'ö', 'y', 'ä', '.', 'p', 'x', 'q', '0', '4', 'ñ', '-', 'é', 'ç', '2', 'ã', 'à', '9', '3', '6', '1', 'í', 'è', 'á', '8', 'ó', '7', '5', 'ô', 'İ', 'ø', 'â', 'î', 'ÿ', '`', 'û', 'ï', 'ú', 'ı', 'å', 'ë', 'ğ', 'ý', 'þ', 'ê', 'ð', 'ş', 'ò', 'ì', 'ù', 'ν', 'ο', ',', 'æ', 'α', 'č', 'ρ', 'μ', 'š', 'ﬂ', 'ă', 'œ', 'õ', 'ε', 'ō', 'ž', 'ě', 'ﬁ', 'κ', 'ė', 'ā', 'ł', 'η', 'ż', 'ь', 'µ', 'ń', 'τ', 'ť', 'ř', 'ĺ', 'ū', 'ď', 'β', 'ć', 'д', 'ц', 'º', 'ҫ', 'ǎ', '黤', 'ţ', 'χ', 'ệ', 'ĩ', 'ś', 'ą', 'ę', 'ª', 'я', 'б', 'ѐ', 'ē', '膕', '適', '遳', 'ů', 'ﬀ', 'ő'])


In [24]:
# Copy-paste the keys printed above, then use Google plus the weighted frequencies
# to eliminate the invalid characters (digits, punctuation, and letters foreign to the language).
valid_char_list = ['i', 'c', 'h', 's', 'e', 'd', 'a', 't', 'u', 'n', 'r', 'w', 'z', 'j', 'm', 'f', 'b', 'o', 'ü', 'v', 'k', 'g', 'l', 'ß', 'ö', 'y', 'ä', 'p', 'x', 'q']

In [26]:
# Check the length as a sanity check on the curated list
# (for the German list above: 26 ASCII letters plus ä, ö, ü, ß = 30).
len(valid_char_list)

30

In [27]:
# Serialize the curated characters as one comma-separated string and echo it for review.
valid_chars = str.join(',', valid_char_list)
print(valid_chars)

i,c,h,s,e,d,a,t,u,n,r,w,z,j,m,f,b,o,ü,v,k,g,l,ß,ö,y,ä,p,x,q


### Write valid chars to S3

In [36]:
# Row to add to the valid-chars table: a language code and its serialized character set.
new_row = dict(lang_code='de', valid_chars=valid_chars)

In [39]:
# Add the language only if it is not already in the table, so re-running this cell
# neither creates duplicate rows nor rewrites the stored file.
if new_row['lang_code'] not in valid_chars_table['lang_code'].tolist():
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with a one-row frame is the supported equivalent.
    valid_chars_table = pd.concat(
        [valid_chars_table, pd.DataFrame([new_row])],
        ignore_index=True,
    )
    cloud.write_csv_to_s3('valid_chars.dat', DATA_DIR, BUCKET, valid_chars_table, sep='|', index=False)