# Derive Valid Characters

## Import Statements

In [1]:
from nate_givens_toolkit import cloud_io as cloud
from collections import Counter
import pandas as pd

## Global Variables

In [2]:
# Directory and bucket names passed to the cloud_io read/write helpers in the cells below.
# directory argument used when reading an individual raw corpus file
RAW_CORPORA_DIR = 'raw_corpora/'
# presumably the directory for cleaned corpora -- TODO confirm where it is used
CLEAN_CORPORA_DIR = 'clean_corpora/'
# directory argument used for the raw corpora inventory and the valid_chars table
DATA_DIR = 'data_files/'
# bucket argument used for every read and write
BUCKET = 'lexgen'

## Custom Functions

In [3]:
def get_char_counter(words, frequencies):
    """Return a Counter of characters across all words, weighted by frequency.

    `words` and `frequencies` are iterated in lockstep (extra items in the
    longer one are ignored). Each word is converted with `str()` before its
    characters are counted, and every per-word character count is multiplied
    by that word's frequency before being added to the running total.
    """
    totals = Counter()
    for token, weight in zip(words, frequencies):
        occurrences = Counter(str(token))
        # scale this word's character counts by how often the word occurs
        totals.update({ch: n * weight for ch, n in occurrences.items()})
    return totals

## Logic

### Read in raw corpora

In [4]:
# Load the pipe-delimited inventory of raw corpora (one row per corpus file) from DATA_DIR in BUCKET.
raw_corpora = cloud.read_csv_from_s3('raw_corpora_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [5]:
raw_corpora.head()

Unnamed: 0,filename,lang_code,source_url,last_load_dtime,note
0,en_full_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 02:04:07.270824,HermitDave's version of the full 2018 English ...
1,en_50k_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.589244,HermitDave's version of the top 50k 2018 Engli...
2,en_full_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.984856,HermitDave's version of the full 2016 English ...
3,en_50k_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:54.377632,HermitDave's version of the top 50k 2016 Engli...
4,de_full_2018.txt,de,https://raw.githubusercontent.com/hermitdave/F...,2021-04-06 19:42:26.967039,HermitDave's version of the full 2018 German f...


### Read in valid chars

In [6]:
# Load the pipe-delimited table that maps each lang_code to its comma-separated valid characters.
valid_chars_table = cloud.read_csv_from_s3('valid_chars.dat', DATA_DIR, BUCKET, sep='|')

In [7]:
valid_chars_table.head()

Unnamed: 0,lang_code,valid_chars
0,en,"a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,..."
1,de,"i,c,h,s,e,d,a,t,u,n,r,w,z,j,m,f,b,o,ü,v,k,g,l,..."


### Read in a raw corpus

In [8]:
raw_corpus_filename = 'af_full_2018.txt'

In [9]:
# Only load corpora that are registered in the inventory. Fail loudly otherwise, so the
# following cells do not hit a NameError or silently reuse a raw_corpus from an earlier run.
if raw_corpus_filename not in raw_corpora['filename'].tolist():
    raise ValueError(f"'{raw_corpus_filename}' is not listed in raw_corpora_inventory.dat")

# The corpus file is space-separated with no header row: one word and its frequency per line.
raw_corpus = cloud.read_csv_from_s3(
    raw_corpus_filename,
    RAW_CORPORA_DIR,
    BUCKET,
    sep=' ',
    header=None,
    names=['word', 'freq'],
)

In [10]:
raw_corpus.head()

Unnamed: 0,word,freq
0,die,12974
1,nie,12403
2,ek,12328
3,is,9816
4,het,8332


### Get a counter with all characters

In [11]:
# Count every character in the corpus, weighting each word's characters by the word's frequency.
all_char_counter = get_char_counter(raw_corpus['word'].tolist(), raw_corpus['freq'].tolist())

### Derive valid chars

In [12]:
print(all_char_counter)

Counter({'e': 219585, 'a': 116906, 'i': 107748, 'n': 104310, 'o': 89998, 's': 84357, 'r': 78152, 't': 72934, 'd': 66182, 'l': 53109, 'k': 52964, 'm': 44400, 'g': 43203, 'h': 32355, 'u': 31772, 'y': 28072, 'w': 26592, 'v': 22984, 'j': 18548, 'b': 16779, 'p': 15742, 'f': 9219, 'c': 2415, 'ê': 2161, '-': 1055, 'ë': 777, 'z': 767, 'ô': 268, 'é': 231, 'x': 207, '.': 157, 'ö': 101, 'ï': 78, '0': 76, '1': 75, 'q': 67, 'è': 62, 'ü': 60, '2': 49, '4': 45, '6': 43, '5': 34, '3': 29, 'ä': 20, 'ł': 20, '7': 18, '8': 16, 'â': 11, 'ó': 9, '9': 8, 'ȇ': 7, 'û': 6, 'ń': 6, 'á': 5, 'ś': 3, 'ñ': 2, 'ę': 2, 'í': 2, 'ż': 2, ',': 2, 'ç': 2, 'ź': 1, 'à': 1, 'î': 1, 'ß': 1})


In [13]:
print(all_char_counter.keys())

dict_keys(['d', 'i', 'e', 'n', 'k', 's', 'h', 't', 'j', 'y', 'm', 'o', 'w', 'a', 'u', 'v', 'r', 'g', 'l', 'p', 'b', 'ê', 'f', 'ô', 'ë', 'z', 'c', 'é', 'x', 'ü', 'q', '6', 'è', 'ö', 'ä', '.', '-', 'ï', 'â', 'û', '3', '4', '0', '1', '5', 'ȇ', '2', '7', 'ó', '9', '8', 'ł', 'ñ', 'ę', 'í', 'á', 'ń', 'ś', 'ź', 'ż', 'à', 'î', ',', 'ß', 'ç'])


In [17]:
# copy-pasted list of what I expect for this language
valid_chars_ext = ['a', 'b', 'c', 'd', 'e', 'è', 'ê', 'ë', 'f', 'g', 'h', 'i', 'î', 'ï', 'j', 'k', 'l', 'm', 'n', 'o', 'ô', 'ö', 'p', 'q', 'r', 's', 't', 'u', 'û', 'ü', 'v', 'w', 'x', 'y', 'z']

In [24]:
# copy-paste the keys and then use Google + the frequencies above to eliminate the invalid chars
# valid_char_list = ['d', 'i', 'e', 'n', 'k', 's', 'h', 't', 'j', 'y', 'm', 'o', 'w', 'a', 'u', 'v', 'r', 'g', 'l', 'p', 'b', 'ê', 'f', 'ô', 'ë', 'z', 'c', 'é', 'x', 'ü', 'q', '6', 'è', 'ö', 'ä', 'ï', 'â', 'û', '3', '4', '0', '1', '5', 'ȇ', '2', '7', 'ó', '9', '8', 'ł', 'ñ', 'ę', 'í', 'á', 'ń', 'ś', 'ź', 'ż', 'à', 'î', ',', 'ß', 'ç']

In [15]:
# keep the corpus characters that are in the expected alphabet, in the order the corpus produced them
valid_char_list = [ch for ch in all_char_counter if ch in valid_chars_ext]

In [16]:
# check the length
len(valid_char_list)

35

In [18]:
# convert to a comma-separated string, the same format as the existing valid_chars column,
# and print it for a visual check before writing
valid_chars = ','.join(valid_char_list)
print(valid_chars)

d,i,e,n,k,s,h,t,j,y,m,o,w,a,u,v,r,g,l,p,b,ê,f,ô,ë,z,c,x,ü,q,è,ö,ï,û,î


### Write valid chars to S3

In [19]:
# row to add to the valid chars table for this language
new_row = {
    'lang_code': 'af',
    'valid_chars': valid_chars,
}

In [20]:
# Only add the language if it is not already in the table, so re-running the notebook
# does not create duplicate rows.
if new_row['lang_code'] not in valid_chars_table['lang_code'].tolist():
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat with a
    # one-row DataFrame is the supported equivalent.
    valid_chars_table = pd.concat(
        [valid_chars_table, pd.DataFrame([new_row])],
        ignore_index=True,
    )
    cloud.write_csv_to_s3('valid_chars.dat', DATA_DIR, BUCKET, valid_chars_table, sep='|', index=False)
else:
    # Make the skip visible instead of silently doing nothing.
    print(f"lang_code '{new_row['lang_code']}' is already in valid_chars.dat; nothing written")