In [13]:
import json
from striprtf.striprtf import rtf_to_text
import os
import re
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

### Functions

* Create functions to:
    * Process file:
        1. Load RTF file
        2. Convert to `str` object
        3. Split into a list of documents
    * Process article/document:
        1. Split document into components
        2. Extract needed fields, e.g. date, source, etc.
        3. Create a dictionary


In [14]:
def process_RTF_file(fpath):
    """Load one RTF file and split its text into raw document strings.

    Parameters
    ----------
    fpath : str
        Path of the RTF file to read.

    Returns
    -------
    list of str
        The plain-text content split on the literal marker
        'End of Document'. Pieces are returned as-is (not stripped), so
        the list can contain blank entries, e.g. after the last marker.
    """
    print(f'Processing {fpath}...')

    # 1. load file; the context manager closes the handle even if the
    #    read fails (the bare open().read() left it to the garbage collector)
    with open(fpath) as rtf_file:
        rtf = rtf_file.read()

    # 2. convert to plain text str
    text = rtf_to_text(rtf, errors="ignore")

    # 3. split into a list of documents
    docs = text.split('End of Document')

    return docs

In [16]:
def process_doc(doc):
    """Parse one exported article into a dictionary of its fields.

    The document is expected to contain a header, a line starting with
    'Body', the article text, and optionally a footer that starts at
    'Load-Date:' or 'Correction-Date:'.

    Parameters
    ----------
    doc : str
        Plain text of a single document.

    Returns
    -------
    dict
        Always contains 'body', 'footer', 'pubdate', 'title' and
        'publication'. 'subtitle' is added when the date is on the fourth
        header line, and 'edition' when the publication date is followed
        by 'day,' and more text. Every header line shaped like
        'Name: value' is also stored under the lower-cased name; such a
        line overwrites an earlier key with the same name.

    Raises
    ------
    ValueError
        If the document has no line starting with 'Body'.
    """
    date_RE = re.compile(r'^[JFMASOND][a-z]+ \d{1,2}, \d{4}')
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    field_RE = re.compile(r'^([A-Za-z]+):\s(.*)')

    doc = doc.strip().replace('\xa0', ' ')
    doc_dict = {}

    # 1. find body indices
    body_start = doc.index('\nBody') + 5

    # The body ends at the first footer marker that is present, checked in
    # this order, or at the end of the text when neither is found.
    for marker in ('Load-Date:', 'Correction-Date:'):
        body_end = doc.find(marker)
        if body_end != -1:
            break
    else:
        body_end = len(doc)

    doc_dict['body'] = re.sub(' +', ' ', doc[body_start:body_end].strip())
    doc_dict['footer'] = doc[body_end:]

    # 2. assume first three lines are:
    #    TITLE
    #    PUBLICATION
    #    DATE

    # body_start - 4 keeps the newline that precedes 'Body' in the header
    header = doc[:body_start - 4]
    header_lines = header.split('\n')

    if len(header_lines) < 2:
        print(header_lines, doc)

    # index of the first header line (from the third on) that starts with a date
    date_idx = next(
        (idx for idx, line in enumerate(header_lines[2:], 2) if date_RE.match(line)),
        None,
    )

    if date_idx is not None:
        doc_dict['pubdate'] = header_lines[date_idx]
    else:
        # No date in the header: fall back to the first footer line.
        footer_first_line = doc_dict['footer'].split('\n')[0]

        if doc_dict['footer'].startswith('Correction-Date:'):
            date_str = footer_first_line.split(':')[1]
            # Cut anything after the weekday, but only append 'day' when a
            # weekday was actually found (otherwise the value got a stray
            # 'day' suffix).
            before_day, sep, _ = date_str.partition('day,')
            if sep:
                date_str = f'{before_day}day'
            date_str = date_str.strip()
            doc_dict['pubdate'] = date_str
            print("#### ", date_str)
        elif doc_dict['footer'].startswith('Load-Date'):
            date_str = footer_first_line.split(':')[1].strip()
            print(">>>> ", date_str)
            doc_dict['pubdate'] = date_str
        else:
            doc_dict['pubdate'] = None

    if date_idx == 3:
        # date on the fourth line: the second line is treated as a subtitle
        doc_dict['title'] = header_lines[0]
        doc_dict['subtitle'] = header_lines[1]
        doc_dict['publication'] = header_lines[2]
    else:
        doc_dict['title'] = header_lines[0]
        doc_dict['publication'] = header_lines[1]

    # Split '<date> <weekday>, <edition>' at the first 'day,'. partition()
    # always yields exactly one split, so repeated separators or a missing
    # space after the comma no longer raise on unpacking.
    pubdate = doc_dict['pubdate']
    if pubdate and 'day,' in pubdate:
        before_day, _, edition = pubdate.partition('day,')
        doc_dict['pubdate'] = f'{before_day}day'
        doc_dict['edition'] = edition.strip()

    # 'Name: value' header lines become extra lower-cased keys
    for hline in header_lines:
        lmatch = field_RE.match(hline)
        if lmatch:
            doc_dict[lmatch.group(1).lower()] = lmatch.group(2)

    return doc_dict

### Process the RTF files in `raw data/post`

In [18]:
# Folder whose .rtf files are processed in this section
postDIR = 'raw data/post'

In [21]:
# Paths of every .rtf file in postDIR. os.listdir returns names in arbitrary
# order, so sort them to keep the document order reproducible between runs.
postRTF_files = sorted(f'{postDIR}/{fn}' for fn in os.listdir(postDIR) if fn.endswith('.rtf'))

In [22]:
# Number of .rtf files found in postDIR
len(postRTF_files)

51

In [35]:
# Display the first 50 collected paths
postRTF_files[0:50]

['raw data/post/files_after_1.rtf',
 'raw data/post/files_after_101.rtf',
 'raw data/post/files_after_1083.rtf',
 'raw data/post/files_after_1183.rtf',
 'raw data/post/files_after_1283.rtf',
 'raw data/post/files_after_1383.rtf',
 'raw data/post/files_after_1483.rtf',
 'raw data/post/files_after_1583.rtf',
 'raw data/post/files_after_1683.rtf',
 'raw data/post/files_after_1783.rtf',
 'raw data/post/files_after_1883.rtf',
 'raw data/post/files_after_1973.rtf',
 'raw data/post/files_after_201.rtf',
 'raw data/post/files_after_2073.rtf',
 'raw data/post/files_after_2173.rtf',
 'raw data/post/files_after_2273.rtf',
 'raw data/post/files_after_2373.rtf',
 'raw data/post/files_after_2473.rtf',
 'raw data/post/files_after_2573.rtf',
 'raw data/post/files_after_2673.rtf',
 'raw data/post/files_after_2773.rtf',
 'raw data/post/files_after_2873.rtf',
 'raw data/post/files_after_3000 (1).rtf',
 'raw data/post/files_after_3000 (10).rtf',
 'raw data/post/files_after_3000 (2).rtf',
 'raw data/post/f

In [None]:
# Step 1: gather the raw documents from every post RTF file.
# WARNING: THIS TAKES HOURS TO RUN ON YOUR OWN COMPUTER
# A plain loop is kept so that documents already collected remain in
# post_docs if the run is interrupted.
post_docs = []
for rtf_path in postRTF_files:
    post_docs += process_RTF_file(rtf_path)

In [None]:
# Step 2: parse every non-blank raw document into a dictionary.
all_post_docs = []
for raw_doc in post_docs:
    if raw_doc.strip() != '':
        all_post_docs.append(process_doc(raw_doc))

In [43]:
# Save the parsed post documents as JSON.
# The original cell dumped `all_docs`, a name no cell in this notebook
# defines; the list built for the post files is `all_post_docs`.
with open('clean data/post_corpus.json', 'w') as out:
    out.write(json.dumps(all_post_docs, indent=4))

### Process the RTF files in `raw data/pre`

In [38]:
# Folder whose .rtf files are processed in this section
preDIR = 'raw data/pre'

In [40]:
# Paths of every .rtf file in preDIR. os.listdir returns names in arbitrary
# order, so sort them to keep the document order reproducible between runs.
preRTF_files = sorted(f'{preDIR}/{fn}' for fn in os.listdir(preDIR) if fn.endswith('.rtf'))

In [41]:
# Number of .rtf files found in preDIR
len(preRTF_files)

74

In [42]:
# Display the first 73 collected paths
preRTF_files[0:73]

['raw data/pre/files_before_0_1000 (1).rtf',
 'raw data/pre/files_before_0_1000 (10).rtf',
 'raw data/pre/files_before_0_1000 (2).rtf',
 'raw data/pre/files_before_0_1000 (3).rtf',
 'raw data/pre/files_before_0_1000 (4).rtf',
 'raw data/pre/files_before_0_1000 (5).rtf',
 'raw data/pre/files_before_0_1000 (6).rtf',
 'raw data/pre/files_before_0_1000 (7).rtf',
 'raw data/pre/files_before_0_1000 (8).rtf',
 'raw data/pre/files_before_0_1000 (9).rtf',
 'raw data/pre/files_before_1000_2000 (1).rtf',
 'raw data/pre/files_before_1000_2000 (10).rtf',
 'raw data/pre/files_before_1000_2000 (2).rtf',
 'raw data/pre/files_before_1000_2000 (3).rtf',
 'raw data/pre/files_before_1000_2000 (4).rtf',
 'raw data/pre/files_before_1000_2000 (5).rtf',
 'raw data/pre/files_before_1000_2000 (6).rtf',
 'raw data/pre/files_before_1000_2000 (7).rtf',
 'raw data/pre/files_before_1000_2000 (8).rtf',
 'raw data/pre/files_before_1000_2000 (9).rtf',
 'raw data/pre/files_before_2000_3000 (1).rtf',
 'raw data/pre/files

In [None]:
# Step 1: gather the raw documents from every pre RTF file.
# WARNING: THIS TAKES HOURS TO RUN ON YOUR OWN COMPUTER
# A plain loop is kept so that documents already collected remain in
# pre_docs if the run is interrupted.
pre_docs = []
for rtf_path in preRTF_files:
    pre_docs += process_RTF_file(rtf_path)

In [None]:
# Number of raw document chunks collected from the pre RTF files
# (blank chunks are only filtered out in the next step)
len(pre_docs)

In [None]:
# Step 2: parse every non-blank raw document into a dictionary.
all_pre_docs = []
for raw_doc in pre_docs:
    if raw_doc.strip() != '':
        all_pre_docs.append(process_doc(raw_doc))

In [None]:
# Save the parsed pre documents as indented JSON.
with open('clean data/pre_corpus.json', 'w') as corpus_file:
    serialized = json.dumps(all_pre_docs, indent=4)
    corpus_file.write(serialized)