# Parse

A [PDF data table](https://github.com/palewire/chicago-trees-analysis/blob/main/input/01.01.2008_12.31.2020311CSR.1.pdf) released by the Chicago Department of Transportation was converted into a text file via Tabula. This notebook parses the results and outputs a data table of comma-separated values.

Import Python tools

In [4]:
import re
import csv
import pandas as pd
from io import StringIO

Read in the text file parsed by Tabula.

In [5]:
# Load every line of the Tabula export into memory; line endings are kept
# here and trimmed later, when the rows are grouped into records.
with open("input/tabula-01.01.2008_12.31.2020311CSR.1.csv") as tabula_file:
    data = list(tabula_file)

Parse the raw data

In [6]:
def is_header(row):
    """
    Tell whether a line is a table header, which carries no record data.

    A header is any line whose text begins with "SR #".

    Returns True or False.
    """
    header_prefix = "SR #"
    return row.startswith(header_prefix)

In [7]:
def is_new_row(row):
    """
    Identifies if the provided row is the start of a new record.

    A record starts with two digits, a hyphen and another digit at the very
    beginning of the line (for example "08-0...").

    Returns True or False.
    """
    # Raw string so \d reaches the regex engine instead of being treated as an
    # (invalid) string escape; re.match anchors at the start like the "^" did.
    # The comparison turns the Match/None result into the documented boolean.
    return re.match(r"\d{2}-\d", row) is not None

In [8]:
record_list = []
record = []
# Walk the Tabula output line by line, batching the stacked lines that
# belong to the same record.
for row in data:
    # Trim whitespace
    row = row.strip()
    # Header rows are ignored
    if is_header(row):
        continue
    # A line that opens a new record closes out the batch collected so far
    if is_new_row(row):
        if record:
            record_list.append(record)
        record = [row]
    # Anything else is a continuation line of the current record
    else:
        record.append(row)
# The loop only stores a batch when the *next* record begins, so the final
# batch must be flushed here; otherwise the last record in the file is lost.
if record:
    record_list.append(record)

Concatenate the raw data

In [9]:
def parse_record_set(record_set):
    """
    Parse each raw line of a record as CSV.

    Every string in record_set is read with a comma-delimited csv reader and
    the first parsed row is kept.

    Returns a list with one list of cell values per input line.
    """
    return [
        list(csv.reader(StringIO(raw_line), delimiter=','))[0]
        for raw_line in record_set
    ]

In [10]:
def concat_record_set(record_set):
    """
    Combine the stacked rows of one record into a single line of CSV data.

    The first parsed row is the base. Each non-empty cell of every following
    row is stripped and appended, after a space, to the base cell in the same
    position.

    Returns the merged list of cell values.
    """
    parsed_lines = parse_record_set(record_set)
    merged, continuation_lines = parsed_lines[0], parsed_lines[1:]
    for extra in continuation_lines:
        for position, fragment in enumerate(extra):
            # Empty cells contribute nothing to the merged value
            if not fragment:
                continue
            merged[position] += f" {fragment.strip()}"
    return merged

In [11]:
# One dataframe row per record, built from its merged cell values
df = pd.DataFrame([concat_record_set(batch) for batch in record_list])

Tidy up the dataframe

In [12]:
# Name the positional columns in the order the cells were parsed
df.columns = [
    "sr_number",
    "type",
    "raw_location",
    "area",
    "created_date",
    "activity_date",
    "status",
    "group",
    "description",
]

In [13]:
# The address is whatever precedes the first comma of the raw location
df["address"] = df["raw_location"].map(
    lambda location: location.partition(",")[0].strip()
)

Parse out the wards

In [14]:
def get_ward(row):
    """
    Determine the ward label for a single record.

    Uses the stripped 'area' value when it is not blank. Otherwise falls back
    to the first "<one or two digits>-Ward" token found in 'raw_location'.

    Returns the ward string, or pd.NA when neither field provides one.
    """
    area = row['area'].strip()
    if area:
        return area
    # Raw string so \d reaches the regex engine instead of being treated as an
    # (invalid) string escape.
    match = re.search(r"\d{1,2}-Ward", row['raw_location'])
    if match:
        return match.group(0)
    return pd.NA

In [15]:
# Resolve the ward row by row, since it draws on two different columns
df["ward"] = df.apply(get_ward, axis="columns")

Write out the result

In [17]:
# Save the tidy table without the dataframe's positional index
df.to_csv(
    path_or_buf="input/parsed-01.01.2008_12.31.2020311CSR.1.csv",
    index=False,
)