# Raw Corpora Intake - Batch from File

## Imports and Settings

In [1]:
from nate_givens_toolkit import cloud_io as cloud
import pandas as pd
from datetime import datetime
import ntpath

## Global Variables

In [2]:
# --- S3 locations -----------------------------------------------------------
BUCKET = 'lexgen'               # bucket passed to every cloud_io S3 call below
CORPORA_DIR = 'raw_corpora/'    # S3 directory the downloaded corpora are pushed to
DATA_DIR = 'data_files/'        # S3 directory holding raw_corpora_inventory.dat

# --- Local paths ------------------------------------------------------------
LOCAL_DIR = '/data/lexgen/raw_corpora/'   # local staging directory for downloaded corpora
FILE = 'batch_lang_import.csv'            # batch definition file, opened relative to the working directory

## Logic

### Specify Batch Variables

In [3]:
# Holds one dict per data row of the batch file (filename, lang_code,
# source_url, last_load_dtime, note).
batch = []

In [4]:
# Load every line of the batch file, then drop the first one (the header row).
with open(FILE) as batch_file:
    content = list(batch_file)[1:]

In [5]:
# Turn each data row of the batch file into an inventory record.
# Expected row layout: filename,lang_code,source_url,note
#
# `batch` is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every record.
batch = []
for line in content:
    # Tolerate blank lines (e.g. a trailing newline at the end of the file).
    if not line.strip():
        continue
    # maxsplit=3: only the first three commas are field separators, so any
    # commas inside the free-text note are kept instead of being cut off.
    tokens = line.rstrip('\n').split(',', 3)
    if len(tokens) != 4:
        raise ValueError(f'Expected 4 comma-separated fields in {FILE}, got {len(tokens)}: {line!r}')
    batch.append({
        'filename': tokens[0]
        , 'lang_code': tokens[1]
        , 'source_url': tokens[2]
        , 'last_load_dtime': str(datetime.utcnow())
        # the batch file escapes apostrophes as \' -- undo that here
        , 'note': tokens[3].replace("\\'", "'")
    })

In [6]:
# Display the parsed records for a visual check before anything is downloaded.
batch

[{'filename': 'pt_50k_2016.txt',
  'lang_code': 'pt',
  'source_url': 'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/pt/pt_50k.txt',
  'last_load_dtime': '2021-05-11 22:10:29.685382',
  'note': "Hermit Dave's version of the top 50k words from OpenSubtitles (2016) for Portuguese"},
 {'filename': 'pt_full_2016.txt',
  'lang_code': 'pt',
  'source_url': 'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/pt/pt_full.txt',
  'last_load_dtime': '2021-05-11 22:10:29.685392',
  'note': "Hermit Dave's version of the full list of words from OpenSubtitles (2016) for Portuguese"},
 {'filename': 'pt_50k_2018.txt',
  'lang_code': 'pt',
  'source_url': 'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2018/pt/pt_50k.txt',
  'last_load_dtime': '2021-05-11 22:10:29.685396',
  'note': "Hermit Dave's version of the top 50k words from OpenSubtitles (2018) for Portuguese"},
 {'filename': 'pt_full_2018.txt',
  'lang_cod

### Read in Raw Corpora Inventory Table

In [7]:
# Fetch the current inventory table (pipe-delimited) from S3.
raw_corpora_inventory = cloud.read_csv_from_s3(
    'raw_corpora_inventory.dat',
    DATA_DIR,
    BUCKET,
    sep='|',
)

In [8]:
# Display the inventory as loaded, before any batch rows are added.
raw_corpora_inventory

Unnamed: 0,filename,lang_code,source_url,last_load_dtime,note
0,en_full_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 02:04:07.270824,HermitDave's version of the full 2018 English ...
1,en_50k_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.589244,HermitDave's version of the top 50k 2018 Engli...
2,en_full_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.984856,HermitDave's version of the full 2016 English ...
3,en_50k_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:54.377632,HermitDave's version of the top 50k 2016 Engli...
4,de_full_2018.txt,de,https://raw.githubusercontent.com/hermitdave/F...,2021-04-06 19:42:26.967039,HermitDave's version of the full 2018 German f...
...,...,...,...,...,...
96,no_full_2018.txt,no,https://raw.githubusercontent.com/hermitdave/F...,2021-05-11 21:45:44.840764,Hermit Dave's version of the full list of word...
97,pl_50k_2016.txt,pl,https://raw.githubusercontent.com/hermitdave/F...,2021-05-11 21:45:44.840766,Hermit Dave's version of the top 50k words fro...
98,pl_full_2016.txt,pl,https://raw.githubusercontent.com/hermitdave/F...,2021-05-11 21:45:44.840769,Hermit Dave's version of the full list of word...
99,pl_50k_2018.txt,pl,https://raw.githubusercontent.com/hermitdave/F...,2021-05-11 21:45:44.840772,Hermit Dave's version of the top 50k words fro...


### Check Batch for Conflicts In Raw Corpora Inventory Table

We don't want to overwrite any existing filenames or duplicate any source URLs, so the whole batch is cancelled if any entry conflicts with the inventory.

In [9]:
# Gate for the processing step: the conflict check below sets this to False
# when a batch entry collides with the inventory.
can_proceed = True

In [10]:
# Validate the batch against the inventory before downloading anything.
# A filename or source_url that is already in the inventory cancels the whole
# intake (can_proceed is set to False and the loop stops at the first conflict).

# Build the lookup sets once instead of recomputing .unique() for every entry.
existing_filenames = set(raw_corpora_inventory['filename'])
existing_source_urls = set(raw_corpora_inventory['source_url'])

for source_corpora in batch:
    if source_corpora['filename'] in existing_filenames:
        can_proceed = False
        print(f'filename must be unique, but {source_corpora["filename"]} is already present in raw_corpora_inventory. Canceling intake.')
        break
    if source_corpora['source_url'] in existing_source_urls:
        can_proceed = False
        # Report the conflicting URL itself, not the (non-conflicting) filename.
        print(f'source_url must be unique, but {source_corpora["source_url"]} is already present in raw_corpora_inventory. Canceling intake.')
        break

### Process Batch

In [11]:
# Download each corpus, upload it to S3, and record it in the inventory.
# Nothing runs unless the conflict check left can_proceed set to True.
if can_proceed:
    for source_corpora in batch:
        # Split the URL into its last path component and the part before it.
        # ntpath accepts '/' as a separator, so it also works on URLs.
        url_filename = ntpath.basename(source_corpora['source_url'])
        url_path = ntpath.dirname(source_corpora['source_url']) + '/'
        # copy file locally
        cloud.pull_data_from_url(url_filename, url_path, LOCAL_DIR, local_filename=source_corpora['filename'], overwrite=True)
        # copy file to s3
        cloud.push_file_to_s3(source_corpora['filename'], LOCAL_DIR, CORPORA_DIR, BUCKET, overwrite=True)
        # update raw_corpora_inventory locally
        # (DataFrame.append was removed in pandas 2.0; concatenating a one-row
        # frame is the supported equivalent and works on older versions too)
        raw_corpora_inventory = pd.concat(
            [raw_corpora_inventory, pd.DataFrame([source_corpora])],
            ignore_index=True,
        )
        # push raw_corpora_inventory to s3
        # (written inside the loop, so the S3 copy is updated after each file)
        cloud.write_csv_to_s3('raw_corpora_inventory.dat', DATA_DIR, BUCKET, raw_corpora_inventory, sep='|', index=False)