# Clean Corpora - One Off

## Imports and Settings

In [125]:
from nate_givens_toolkit import cloud_io as cloud
import pandas as pd
from datetime import datetime

## Global Variables

In [19]:
RAW_CORPORA_DIR = 'raw_corpora/'
CLEAN_CORPORA_DIR = 'clean_corpora/'
DATA_DIR = 'data_files/'
BUCKET = 'lexgen'

## Functions

In [66]:
def is_invalid_word(word, valid_chars):
    # take a word and list of valid chars
    # return true if the word contains any chars *not* in valid_chars
    # otherwise return false
    return any([x not in valid_chars for x in str(word)])

## Logic

### Specify Clean Corpus Variables

In [121]:
raw_corpus_filename = 'en_50k_2018.txt'
clean_corpus_filename = 'en_50k_2018_A.txt'
clean_corpus_note = 'Cleaned version of en_50k_2018 with top 200 words having frequency reset to exclusive mean'

### Read in Data Tables

#### Raw Corpora

In [9]:
raw_corpora = cloud.read_csv_from_s3('raw_corpora_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [10]:
raw_corpora.head()

Unnamed: 0,filename,lang_code,source_url,last_load_dtime,note
0,en_full_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 02:04:07.270824,HermitDave's version of the full 2018 English ...
1,en_50k_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.589244,HermitDave's version of the top 50k 2018 Engli...
2,en_full_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.984856,HermitDave's version of the full 2016 English ...
3,en_50k_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:54.377632,HermitDave's version of the top 50k 2016 Engli...


#### Clean Corpora

In [153]:
clean_corpora = cloud.read_csv_from_s3('clean_corpora_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [154]:
clean_corpora.head()

Unnamed: 0,filename,raw_corpora_filename,last_load_dtime,note


#### Clean Characters

In [131]:
clean_chars_table = cloud.read_csv_from_s3('clean_chars.dat', DATA_DIR, BUCKET, sep='|')

In [132]:
clean_chars_table.head()

Unnamed: 0,lang_code,clean_char_list
0,en,"a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,..."


## Populate Secondary Variables

#### Use raw_corpus_filename to get lang_code

In [133]:
lang_code = raw_corpora.loc[raw_corpora['filename'] == raw_corpus_filename]['lang_code'].values[0]

#### Use lang_code to get clean_chars

In [134]:
clean_chars = clean_chars_table.loc[clean_chars_table['lang_code'] == lang_code]['clean_char_list'].values[0].split(',')

### Read in Raw Corpus

In [135]:
raw_corpus = cloud.read_csv_from_s3(raw_corpus_filename, RAW_CORPORA_DIR, BUCKET, sep=' ', header = None, names = ['word', 'freq'])

In [136]:
raw_corpus.head()

Unnamed: 0,word,freq
0,you,28787591
1,i,27086011
2,the,22761659
3,to,17099834
4,a,14484562


### Clean Raw Corpus

#### Flag and then remove words that contain invalid characters

In [137]:
raw_corpus['invalid'] = raw_corpus['word'].apply(is_invalid_word, valid_chars=clean_chars)

In [138]:
valid_corpus = raw_corpus.copy(deep=True)

In [139]:
valid_corpus = valid_corpus[raw_corpus['invalid'] == False]

In [140]:
valid_corpus.drop(labels='invalid', axis=1, inplace=True)

In [141]:
valid_corpus.reset_index

<bound method DataFrame.reset_index of               word      freq
0              you  28787591
1                i  27086011
2              the  22761659
3               to  17099834
4                a  14484562
...            ...       ...
49993         mati       159
49994  pericardial       159
49995        rolli       159
49997      redline       159
49998          pho       159

[46717 rows x 2 columns]>

#### Replace the frequency of the top-200 words with the average frequency

In [142]:
mean_freq = valid_corpus.loc[200:, 'freq'].mean()

In [143]:
valid_corpus.loc[:199, 'freq'] = mean_freq

### Write Clean Corpora to S3

In [152]:
cloud.write_csv_to_s3(clean_corpus_filename, CLEAN_CORPORA_DIR, BUCKET, valid_corpus, sep='|', index=False)

### Update Clean Corpora Inventory

In [155]:
new_row = {
    'filename': clean_corpus_filename
    ,'raw_corpora_filename': raw_corpus_filename
    ,'last_load_dtime': str(datetime.utcnow())
    ,'note': clean_corpus_note
}

In [159]:
clean_corpora = clean_corpora.append(new_row, ignore_index=True)

In [160]:
cloud.write_csv_to_s3('clean_corpora_inventory.dat', DATA_DIR, BUCKET, clean_corpora, sep='|', index=False)