## Extract titles from raw letters

1. Loop through a directory of letters, extracted by year.
2. Parse each letter to extract the first sentences occurring before an empty line.
3. Create a TSV file with the extracted title and the file name.

In [7]:
import os
import sys
import glob
from pathlib import Path
from pprint import pprint
import math
import re
from collections import Counter

# Directory layout: the project root is taken to be the parent of the current
# working directory, and every data folder is derived from it.
current_directory = os.getcwd()
prj_root = os.path.dirname(current_directory)
data_dir = f'{prj_root}/data'
txt_dir = f'{data_dir}/TXT_XML'
training_letters_dir = f'{data_dir}/training/letters'

# Year whose letters are processed by the rest of the notebook.
proc_year = "1978"

# Keep only the bare file names (no directory part) of the year's .txt files,
# ordered by their sorted full paths.
path_list = [
    txt_file.name
    for txt_file in sorted(Path(txt_dir).glob(f'{proc_year}/*.txt'))
]

print(f"Total files extract in {proc_year} columns are: {len(path_list)}\n")
pprint(path_list[0:10], indent=2)

Total files extract in 1977 columns are: 4838

[ 'dds-90009-page-8-article-01.txt',
  'dds-90009-page-8-article-02.txt',
  'dds-90009-page-8-article-03.txt',
  'dds-90009-page-8-article-05.txt',
  'dds-90009-page-8-article-06.txt',
  'dds-90010-page-8-article-01.txt',
  'dds-90010-page-8-article-03.txt',
  'dds-90010-page-8-article-05.txt',
  'dds-90010-page-8-article-08.txt',
  'dds-90010-page-8-article-10.txt']


### Create list of titles

**Extract the first 'paragraph' in a text file.** The OCR output separates the title from the body text with an empty line in the extracted texts, so this characteristic is used to retrieve the titles.

In [8]:
import pandas as pd

# Output directory for the per-year TSV files of extracted titles.
titles_and_files_dir = f'{data_dir}/titles_and_files'

# exist_ok=True makes the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(titles_and_files_dir, exist_ok=True)

# Tokens are maximal runs of characters that are not a hyphen, comma, period
# or (ASCII) whitespace. A plain r'\w+' pattern was the earlier alternative.
_TOKEN_PATTERN = re.compile(r'[^-,\.\n\r\s]+', flags=re.ASCII)


def words(text):
    """Split ``text`` into tokens and return them as a list of strings.

    Hyphens, commas, periods and ASCII whitespace act as separators and are
    dropped. Because the pattern is compiled with ``re.ASCII``, ``\\s`` only
    covers ASCII whitespace, so non-ASCII characters (for example an em dash
    or a typographic apostrophe) stay inside the tokens.
    """
    return _TOKEN_PATTERN.findall(text)

# Files smaller than this many bytes are treated as empty and skipped.
MIN_FILE_BYTES = 20
# A first 'paragraph' longer than this many characters (newlines included)
# is considered body text rather than a title.
MAX_TITLE_CHARS = 100


def _first_paragraph_title(file_lines, max_chars=MAX_TITLE_CHARS):
    """Return the lines forming the title of a letter, or [] if there is none.

    Parameters
    ----------
    file_lines : list of str
        Lines of the letter as returned by ``readlines()`` (newlines kept).
    max_chars : int
        Length limit, in characters, for a paragraph to count as a title.

    Returns
    -------
    list of str
        * The lines before the first empty line, when their total length is
          strictly below ``max_chars``.
        * All lines, when the file has no empty line at all and its total
          length is at most ``max_chars``.
        * An empty list otherwise.
    """
    for j, line in enumerate(file_lines):
        if line == '\n':
            candidate = file_lines[:j]
            # Guard against a body paragraph masquerading as a title.
            if sum(len(x) for x in candidate) < max_chars:
                return candidate
            return []

    # No empty line found: the whole file is a single 'paragraph'.
    # The inclusive limit here (vs. the strict one above) is kept as in the
    # original extraction rules.
    if file_lines and sum(len(x) for x in file_lines) <= max_chars:
        return file_lines
    return []


valid_files = 0   # number of files large enough to be processed
titles_list = []  # rows of [cleaned title, file name, raw title length]
for path in path_list:
    txt_name = os.path.basename(path)
    txt_path = f"{txt_dir}/{proc_year}/{path}"

    # only handle non-empty files
    if os.stat(txt_path).st_size < MIN_FILE_BYTES:
        continue
    valid_files += 1

    with open(txt_path, 'r', encoding='utf-8') as infile:
        file_lines = infile.readlines()

    title_fragments = _first_paragraph_title(file_lines)
    if not title_fragments:
        continue

    # Measure the fragments that were actually selected for THIS file.
    # The previous version reused a variable that was only assigned in the
    # empty-line branch, so single-paragraph files reported the length of an
    # earlier file's title (or raised NameError when there was none).
    title_char_count = sum(len(x) for x in title_fragments)

    # Join the raw lines, tokenise with words() to drop punctuation and
    # newlines, then re-join the tokens with single spaces.
    title = ' '.join(words(' '.join(title_fragments)))
    titles_list.append([title, txt_name, title_char_count])

# Column order shared by the DataFrame and the exported file.
headers = ['title', 'txt_name', 'title_length']
titles_df = pd.DataFrame(data=titles_list, columns=headers)

# Write one tab-separated file per processed year, without the index column.
processed_tsv = os.path.join(titles_and_files_dir, f'{proc_year}.tsv')
titles_df.to_csv(processed_tsv, sep='\t', encoding='utf-8', index=False, columns=headers)

print(f"Valid files found: {valid_files}")
# Last expression of the cell: preview the first rows.
# (For the raw rows instead, pprint a slice of titles_list.)
titles_df.head(10)

Valid files found: 3935


Unnamed: 0,title,txt_name,title_length
0,Wiolence in Southern Africa —,dds-90009-page-8-article-01.txt,31
1,An end to non violence,dds-90009-page-8-article-02.txt,23
2,The war of the ape and the fish,dds-90009-page-8-article-03.txt,32
3,The power struggle in China,dds-90009-page-8-article-05.txt,28
4,Let the people decide,dds-90009-page-8-article-06.txt,22
5,Harassing wananchi,dds-90010-page-8-article-01.txt,19
6,MERU KANU ELECTIONS WERE UNFAIR AND UNDEMOCRATIC,dds-90010-page-8-article-05.txt,49
7,Eliminate comuption,dds-90010-page-8-article-08.txt,20
8,Excellent bank services,dds-90010-page-8-article-10.txt,24
9,ABOLISH CHRISTMAS HOLIDAY BECAUSE IT’S UNPRODU...,dds-90010-page-8-article-12.txt,52
