## Average letters extracted per column

1. Given a year, load the column dates TSV into a dataframe.
2. Count the letters extracted for each associated filename.
3. Draw a graph of the average number of letters per column.
4. Export a new CSV with page_image, date, avg_letter_count.

In [1]:
import os
import re
import csv
import glob
import pandas as pd

# Directory layout: the notebook runs from a sub-directory of the project
# root, and all inputs live under <project root>/data.
current_directory = os.getcwd()
prj_root = os.path.dirname(current_directory)
data_dir = f'{prj_root}/data'
txt_pre_dir = f'{data_dir}/TXT_PRE'
column_dates_dir = f'{data_dir}/column_dates'

# Year to process; selects both the column-dates TSV and the TXT_PRE folder.
proc_year = '1978'

# Load only the two columns used downstream; `cleaned_date` is parsed into
# datetimes, the file itself is tab-separated.
data = pd.read_csv(
    f'{column_dates_dir}/{proc_year}.tsv',
    sep='\t',
    usecols=['page_image_name', 'cleaned_date'],
    parse_dates=['cleaned_date'],
)

data.head(5)

Unnamed: 0,page_image_name,cleaned_date
0,dds-90325-page-8,1978-01-03
1,dds-90326-page-8,1978-01-04
2,dds-90327-page-8,1978-01-05
3,dds-90328-page-8,1978-01-06
4,dds-90329-page-8,1978-01-07


In [2]:
import fnmatch
import os

# Basenames (directory part stripped) of every .txt file found for the
# processed year, in sorted-path order.
all_txt_pres = [
    os.path.basename(txt_path)
    for txt_path in sorted(glob.glob(f'{txt_pre_dir}/{proc_year}/*.txt'))
]

# Rows of [page_image_name, cleaned_date, letter_count], later turned into a
# dataframe.
d = []
for index, page_image_name, date_str in zip(
    data.index, data['page_image_name'], data['cleaned_date']
):
    # A text file is attributed to a page when its name starts with the page
    # image name.
    # NOTE(review): a plain prefix test would also match e.g. "...-page-80"
    # for "...-page-8" if such files exist -- confirm against the TXT_PRE
    # naming scheme.
    letter_count = sum(
        1 for file in all_txt_pres if file.startswith(page_image_name)
    )

    d.append([page_image_name, date_str, letter_count])

    # Preview: echo only the rows whose index label is below 10.
    if index < 10:
        print("%-20s ====> %10s =====> %20s" % (page_image_name, date_str, letter_count))

dds-90325-page-8     ====> 1978-01-03 00:00:00 =====>                   19
dds-90326-page-8     ====> 1978-01-04 00:00:00 =====>                   18
dds-90327-page-8     ====> 1978-01-05 00:00:00 =====>                   17
dds-90328-page-8     ====> 1978-01-06 00:00:00 =====>                   14
dds-90329-page-8     ====> 1978-01-07 00:00:00 =====>                   12
dds-90330-page-8     ====> 1978-01-09 00:00:00 =====>                   11
dds-90331-page-8     ====> 1978-01-10 00:00:00 =====>                   18
dds-90332-page-8     ====> 1978-01-11 00:00:00 =====>                   18
dds-90333-page-8     ====> 1978-01-12 00:00:00 =====>                   15
dds-90334-page-8     ====> 1978-01-14 00:00:00 =====>                   14


In [3]:
# Turn the collected [page_image_name, cleaned_date, letter_count] rows into
# a dataframe with named columns.
headers = ['page_image_name', 'cleaned_date', 'letters_count']
new_data = pd.DataFrame(data=d, columns=headers)

# Export next to the notebook as COUNT_<year>.csv.
# NOTE(review): the file carries a .csv extension but is written
# tab-separated -- readers must pass sep='\t'.
cleaned_csv = os.path.join(current_directory, 'COUNT_{}.csv'.format(proc_year))
new_data.to_csv(
    path_or_buf=cleaned_csv,
    columns=headers,
    index=False,
    sep='\t',
    encoding='utf-8',
)

new_data.head(10)

Unnamed: 0,page_image_name,cleaned_date,letters_count
0,dds-90325-page-8,1978-01-03,19
1,dds-90326-page-8,1978-01-04,18
2,dds-90327-page-8,1978-01-05,17
3,dds-90328-page-8,1978-01-06,14
4,dds-90329-page-8,1978-01-07,12
5,dds-90330-page-8,1978-01-09,11
6,dds-90331-page-8,1978-01-10,18
7,dds-90332-page-8,1978-01-11,18
8,dds-90333-page-8,1978-01-12,15
9,dds-90334-page-8,1978-01-14,14
