# Clean Corpora - One Off

## Imports and Settings

In [1]:
from nate_givens_toolkit import cloud_io as cloud
import pandas as pd
from datetime import datetime

## Global Variables

In [2]:
RAW_CORPORA_DIR = 'raw_corpora/'
CLEAN_CORPORA_DIR = 'clean_corpora/'
DATA_DIR = 'data_files/'
BUCKET = 'lexgen'

## Functions

In [3]:
def is_invalid_word(word, valid_chars):
    # take a word and list of valid chars
    # return true if the word contains any chars *not* in valid_chars
    # otherwise return false
    return any([x not in valid_chars for x in str(word)])

## Logic

### Specify Clean Corpus Variables

In [4]:
raw_corpus_filename = 'af_full_2018.txt'
clean_corpus_filename = 'af_full_2018_A.txt'
clean_corpus_note = 'Cleaned version of af_full_2018 with top 200 words having frequency reset to exclusive mean'

### Read in Data Tables

#### Raw Corpora

In [5]:
raw_corpora = cloud.read_csv_from_s3('raw_corpora_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [6]:
raw_corpora.head()

Unnamed: 0,filename,lang_code,source_url,last_load_dtime,note
0,en_full_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 02:04:07.270824,HermitDave's version of the full 2018 English ...
1,en_50k_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.589244,HermitDave's version of the top 50k 2018 Engli...
2,en_full_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.984856,HermitDave's version of the full 2016 English ...
3,en_50k_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:54.377632,HermitDave's version of the top 50k 2016 Engli...
4,de_full_2018.txt,de,https://raw.githubusercontent.com/hermitdave/F...,2021-04-06 19:42:26.967039,HermitDave's version of the full 2018 German f...


#### Clean Corpora

In [7]:
clean_corpora = cloud.read_csv_from_s3('clean_corpora_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [8]:
clean_corpora.head()

Unnamed: 0,filename,raw_corpora_filename,last_load_dtime,note
0,en_50k_2018_A.txt,en_50k_2018.txt,2021-04-01 01:07:27.259981,Cleaned version of en_50k_2018 with top 200 wo...
1,de_50k_2018_A.txt,de_50k_2018.txt,2021-04-06 20:25:02.967162,Cleaned version of de_50k_2018 with top 200 wo...


#### Valid Characters

In [9]:
valid_chars_table = cloud.read_csv_from_s3('valid_chars.dat', DATA_DIR, BUCKET, sep='|')

In [10]:
valid_chars_table.head()

Unnamed: 0,lang_code,valid_chars
0,en,"a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,..."
1,de,"i,c,h,s,e,d,a,t,u,n,r,w,z,j,m,f,b,o,ü,v,k,g,l,..."
2,af,"d,i,e,n,k,s,h,t,j,y,m,o,w,a,u,v,r,g,l,p,b,ê,f,..."


## Populate Secondary Variables

#### Use raw_corpus_filename to get lang_code

In [11]:
lang_code = raw_corpora.loc[raw_corpora['filename'] == raw_corpus_filename]['lang_code'].values[0]

#### Use lang_code to get clean_chars

In [12]:
valid_chars = valid_chars_table.loc[valid_chars_table['lang_code'] == lang_code]['valid_chars'].values[0].split(',')

### Read in Raw Corpus

In [13]:
raw_corpus = cloud.read_csv_from_s3(raw_corpus_filename, RAW_CORPORA_DIR, BUCKET, sep=' ', header = None, names = ['word', 'freq'])

In [14]:
raw_corpus.head()

Unnamed: 0,word,freq
0,die,12974
1,nie,12403
2,ek,12328
3,is,9816
4,het,8332


### Clean Raw Corpus

#### Flag and then remove words that contain invalid characters

In [15]:
raw_corpus['invalid'] = raw_corpus['word'].apply(is_invalid_word, valid_chars=valid_chars)

In [16]:
valid_corpus = raw_corpus.copy(deep=True)

In [17]:
valid_corpus = valid_corpus[raw_corpus['invalid'] == False]

In [18]:
valid_corpus.drop(labels='invalid', axis=1, inplace=True)

In [19]:
valid_corpus.reset_index

<bound method DataFrame.reset_index of                   word   freq
0                  die  12974
1                  nie  12403
2                   ek  12328
3                   is   9816
4                  het   8332
...                ...    ...
18504            manna      1
18505          varkvet      1
18506     delikatessen      1
18507  fabriekswerkers      1
18509         gemagtig      1

[17526 rows x 2 columns]>

#### Replace the frequency of the top-200 words with the average frequency

In [20]:
mean_freq = valid_corpus.loc[200:, 'freq'].mean()

In [21]:
valid_corpus.loc[:199, 'freq'] = mean_freq

### Write Clean Corpora to S3

In [22]:
cloud.write_csv_to_s3(clean_corpus_filename, CLEAN_CORPORA_DIR, BUCKET, valid_corpus, sep='|', index=False)

### Update Clean Corpora Inventory

In [23]:
new_row = {
    'filename': clean_corpus_filename
    ,'raw_corpora_filename': raw_corpus_filename
    ,'last_load_dtime': str(datetime.utcnow())
    ,'note': clean_corpus_note
}

In [24]:
clean_corpora = clean_corpora.append(new_row, ignore_index=True)

In [25]:
cloud.write_csv_to_s3('clean_corpora_inventory.dat', DATA_DIR, BUCKET, clean_corpora, sep='|', index=False)