## Extract bylines from OCR letters

1. Loop through a directory letters. Extracted by year.
* Parse each letter simply to extract the last occuring sentences before an empty line.
* Create a tsv file with extracted byline and file name

In [1]:
import os
import sys
import glob
from pathlib import Path
from pprint import pprint
import math
import re
from collections import Counter

current_directory = os.getcwd()
prj_root = os.path.dirname(current_directory)
data_dir = f'{prj_root}/data'
txt_xml_dir = f'{prj_root}/data/TXT_XML'

proc_year = "1978"
path_list = []
for f in sorted(Path(txt_xml_dir).glob(f'{proc_year}/*.txt')):
    txt_path = str(f) # cast PosixPath to str
    txt_name = os.path.basename(txt_path)
    path_list.append(txt_name)

print(f"Total files extract in {proc_year} columns are: {len(path_list)}\n")
pprint(path_list[0:10], indent=2)

Total files extract in 1978 columns are: 4424

[ 'dds-90325-page-8-article-01.txt',
  'dds-90325-page-8-article-03.txt',
  'dds-90325-page-8-article-05.txt',
  'dds-90325-page-8-article-06.txt',
  'dds-90325-page-8-article-08.txt',
  'dds-90325-page-8-article-10.txt',
  'dds-90325-page-8-article-12.txt',
  'dds-90325-page-8-article-13.txt',
  'dds-90325-page-8-article-14.txt',
  'dds-90325-page-8-article-15.txt']


### Create list of titles

**Extract the first 'paragraph' in a text file.** Since the OCR output separated the title from the body text using an empty line in the extract texts, this characteristic was used to retrieve titles.

#### Split the byline depending on length
1. If the byline has one line. Split the byline into two using the last comma character. The first section being the name and the following section as location.
* If three. The first line is name/title. The second is company/organization and the third is location.
* If four. The first line is name. The second is title/position. The third is company/organization and the fourth is location.

In [2]:
def proc_byline(byline_list):
    lines = len(byline_list)
    reader_name = ''
    reader_title = ''
    reader_org = ''
    reader_location = ''
    
    if lines == 1:
        stripped_byline = byline_list[0].rstrip('\,') # strip any ending commas
        byline = stripped_byline.rpartition(',')
        reader_name = byline[0]
        reader_location = byline[1]
    elif lines == 2:
        reader_name = byline_list[0]
        reader_location = byline_list[1]
    elif lines == 3:
        reader_name = byline_list[0]
        reader_title = byline_list[1]
        reader_location = byline_list[2]
    elif lines == 4:
        reader_name = byline_list[0]
        reader_title = byline_list[1]
        reader_org = byline_list[2]        
        reader_location = byline_list[3] 
    
    return reader_name, reader_location, reader_title, reader_org

In [3]:
import pandas as pd

bylines_and_files_dir = f'{data_dir}/bylines_and_files'

if not os.path.exists(bylines_and_files_dir):
    os.makedirs(bylines_and_files_dir)

def words(text): return re.findall(r'\w+', text)
# def words(text): return re.findall(r'[^-,\.\n\r\s]+', text, flags=re.ASCII) # [^,\s]+ match any text that is not a tab and not a whitespace.

valid_files = 0
bylines_list = []
for idx, path in enumerate(path_list):
    # print(f'------++++++++++++==================== BEGIN {path} ================++++++++++------')
    txt_name = os.path.basename(path)
    txt_sans_ext = os.path.splitext(txt_name)[0]
    txt_path = f"{txt_xml_dir}/{proc_year}/{path}"
    
    # only handle non-empty files
    if os.stat(txt_path).st_size >= 20: # more 20 bytes at least
        valid_files += 1
        with open(txt_path, 'r', encoding='utf-8') as infile:
            file_lines = infile.readlines()
            last_pos = 0
            try:
                # to get last element occurrence of paragraph 
                last_pos = max(idx for idx, aline in enumerate(file_lines)  
                                                    if aline == '\n')
            except ValueError as e:
                pass
            
            byline_section = file_lines[last_pos+1:]
            if last_pos > 0 and len(byline_section) <= 3:  # position found
                byline_fragments = []
                
                byline_section = [x.strip() for x in byline_section]
                reader_name, reader_location, reader_title, reader_org = proc_byline(byline_section)
                reader_name = re.sub('^[^a-zA-Z]*|[^a-zA-Z]*$','',reader_name)
                bylines_list.append([reader_name, reader_location, reader_title, reader_org, txt_name, len(byline_section)])
            else: # means it was never set due to error caught above
                pass

pprint(bylines_list[50:60], indent=2)

titles_headers = ['reader_name', 'reader_location', 'reader_title', 'reader_org', 'txt_name', 'lines_count']
titles_df = pd.DataFrame(bylines_list, columns=titles_headers)

[ ['', '', '', '', 'dds-90330-page-8-article-21.txt', 1],
  ['', '', '', '', 'dds-90331-page-8-article-01.txt', 1],
  ['a', 'jot', '', '', 'dds-90331-page-8-article-14.txt', 2],
  [ 'HAND IT “TO YOu, MATE',
    'LIKE YOU CAN',
    '—— NONE OF US CANGET THE GIRLS',
    '',
    'dds-90331-page-8-article-17.txt',
    3],
  ['', '', '', '', 'dds-90331-page-8-article-21.txt', 1],
  ['egal', 'with', '', '', 'dds-90331-page-8-article-25.txt', 2],
  ['', '', '', '', 'dds-90331-page-8-article-29.txt', 1],
  ['', '', '', '', 'dds-90332-page-8-article-04.txt', 1],
  [ 'SESTT SCS Ua',
    'REE RA ake',
    '£5 3',
    '',
    'dds-90332-page-8-article-09.txt',
    3],
  ['', '', '', '', 'dds-90332-page-8-article-14.txt', 1]]


### Assign dates to each file

In [4]:
from datetime import datetime

column_dates_df = pd.read_csv(f'{data_dir}/column_dates/{proc_year}.tsv', 
                   delimiter='\t', 
                   usecols=['page_image_name', 'cleaned_date'],
                   na_filter=False
                  )

bylines_list_with_dates = []
for index, row in titles_df.iterrows():
    reader_name = row['reader_name']
    reader_location = row['reader_location']
    reader_title = row['reader_title']
    reader_org = row['reader_org']
    txt_name = row['txt_name']
    lines_count = row['lines_count']

    for jndex, row in column_dates_df.iterrows():
        page_image_name = row['page_image_name']
        cleaned_date = row['cleaned_date']
        cleaned_datetime_obj = datetime.strptime(cleaned_date , '%Y-%m-%d')    

        if txt_name.startswith(page_image_name):
            # know the date of the file
            bylines_list_with_dates.append([
                reader_name, 
                reader_location, 
                reader_title, 
                reader_org, 
                txt_name, 
                cleaned_date,
                lines_count
            ])

# updated titles headers and df with known date of column
titles_headers = [
    'reader_name', 
    'reader_location', 
    'reader_title', 
    'reader_org', 
    'txt_name', 
    'letter_date', 
    'lines_count'
]
titles_df = pd.DataFrame(bylines_list_with_dates, columns=titles_headers)

### Now write to file with dates noted

In [5]:
# export to tsv
processed_tsv = os.path.join(bylines_and_files_dir, f'{proc_year}.tsv')
titles_df.to_csv(processed_tsv, 
                sep='\t',
                encoding='utf-8', 
                index=False,
                columns = titles_headers)

titles_df.head(5)

Unnamed: 0,reader_name,reader_location,reader_title,reader_org,txt_name,letter_date,lines_count
0,,,,,dds-90325-page-8-article-03.txt,1978-01-03,1
1,,,,,dds-90325-page-8-article-06.txt,1978-01-03,1
2,,,,,dds-90325-page-8-article-10.txt,1978-01-03,1
3,institutions: ( ~ os,Kisumu,"2° F.R.O. Akubs Koyugy,",,dds-90325-page-8-article-12.txt,1978-01-03,3
4,,,,,dds-90325-page-8-article-13.txt,1978-01-03,1
