In [116]:
import pandas as pd
from collections import defaultdict

# Table Data Conversion
**Name**: Nicholas A. Del Grosso

**Date**: Sept. 6, 2018

**Description**: A demonstration of cleaning an ill-formed automatic table extraction from Tabula in Python using the Pandas package.

The Tabula software is used for extracting tables from PDFs. It is free, and can be found at https://tabula.technology/

Pandas is a Python library for working with tables.  This is similar to the 'dplyr' library in R.  A tutorial for using it can be found here: 

  - https://www.datacamp.com/community/tutorials/pandas-tutorial-dataframe-python
  
  - http://pandas.pydata.org/pandas-docs/stable/
  

## Load Data

Let's see what this looks like straight out of Tabula...

In [260]:
filename = 'tabula-buildingreportjul2228201851157.csv'
df = pd.read_csv(filename, dtype=str, na_filter='')
df.head(30)

Unnamed: 0,Folder Number Sub Type,Work Proposed,Unnamed: 2,Status,Issue Date,Unnamed: 5,Constr. Value,Unnamed: 7,Folder Name
0,18 830938 000 00 NB,,,Issued,7/24/18,,,0,12111 3rd Ave
1,18 830939 000 00 NB,,,Issued,7/24/18,,,0,12417 No 2 Rd Unit 230
2,Owner: Porte Industries Ltd,,,,,,,,
3,Owner: Romana Investments Ltd,,,,,,,,
4,Owner: West Coast Planners Ltd,,,,,,,,
5,18 830943 000 00 NB,,,Issued,7/24/18,,,0,12111 Bridgeport Rd Unit 110
6,Owner: Kingvision Enterprises Ltd,,,,,,,,
7,18 831140 000 00 NB,,,Final,7/25/18,,,0,9597 Francis Rd
8,Applicant: Coso Investments Corp. (Michael Coq...,,,,,,,,
9,Owner: Margaret L Tecson,,,,,,,,


## Clean Out "Unnamed" Columns

Merge Unnamed columns with adjacent columns, remove rows that contain co.umn names

In [261]:
df['Work Proposed'] = df['Work Proposed'].str.cat(df['Unnamed: 2'], sep='')
df = df[df['Folder Number Sub Type'].str.contains('Total:|Folder Number') == False]
df = df[df['Folder Number Sub Type'].astype(bool)]
df['Address'] = df['Unnamed: 7'].transform(lambda ss: '' if len(ss.strip()) < 2 else ss).str.cat(df['Folder Name'])
df['Constr. Value'] = df['Unnamed: 5'].str.cat(df['Constr. Value'], sep='')
df = df.drop(labels=['Unnamed: 2', 'Unnamed: 7', 'Unnamed: 5', 'Folder Name'], axis=1)
df.head(20)

Unnamed: 0,Folder Number Sub Type,Work Proposed,Status,Issue Date,Constr. Value,Address
0,18 830938 000 00 NB,,Issued,7/24/18,,12111 3rd Ave
1,18 830939 000 00 NB,,Issued,7/24/18,,12417 No 2 Rd Unit 230
2,Owner: Porte Industries Ltd,,,,,
3,Owner: Romana Investments Ltd,,,,,
4,Owner: West Coast Planners Ltd,,,,,
5,18 830943 000 00 NB,,Issued,7/24/18,,12111 Bridgeport Rd Unit 110
6,Owner: Kingvision Enterprises Ltd,,,,,
7,18 831140 000 00 NB,,Final,7/25/18,,9597 Francis Rd
8,Applicant: Coso Investments Corp. (Michael Coq...,,,,,
9,Owner: Margaret L Tecson,,,,,


The Address and Constr. Value columns are still a bit confused, with Addresses in some Constr. Value rows.  This needs to be cleaned up.  

In [262]:
missing_address = df['Address'].eq('') & (df['Constr. Value'].str.strip())
df.loc[missing_address, 'Address'] = df.loc[missing_address, 'Constr. Value']
df.loc[missing_address, 'Constr. Value'] = '' 
df.loc[df['Constr. Value'] == '0', 'Constr. Value'] = ''
df.head(20)

Unnamed: 0,Folder Number Sub Type,Work Proposed,Status,Issue Date,Constr. Value,Address
0,18 830938 000 00 NB,,Issued,7/24/18,,12111 3rd Ave
1,18 830939 000 00 NB,,Issued,7/24/18,,12417 No 2 Rd Unit 230
2,Owner: Porte Industries Ltd,,,,,
3,Owner: Romana Investments Ltd,,,,,
4,Owner: West Coast Planners Ltd,,,,,
5,18 830943 000 00 NB,,Issued,7/24/18,,12111 Bridgeport Rd Unit 110
6,Owner: Kingvision Enterprises Ltd,,,,,
7,18 831140 000 00 NB,,Final,7/25/18,,9597 Francis Rd
8,Applicant: Coso Investments Corp. (Michael Coq...,,,,,
9,Owner: Margaret L Tecson,,,,,


## Extract Values from Non-Tabular Organization

The rows in "Folder Number Sub Type" contain more than one type of data.  Let's turn that column into multiple columns.

In [263]:
def extract_owners(nameseq):
    names = defaultdict(lambda: defaultdict(list))
    for name in nameseq:
        keyval = name.split(': ')
        if len(keyval) == 1:
            currname = keyval[0]
        else:
            key, val = keyval
            names[currname][key + 's'].append(val)
    return pd.DataFrame(names).T.fillna('').transform(lambda ll: ll.str.join(', '))

names = extract_owners(df['Folder Number Sub Type'].tolist())
df.set_index('Folder Number Sub Type', inplace=True)
# df.append(names)
df = pd.merge(names, df, left_index=True, right_index=True).reset_index()
df.head(20)

Unnamed: 0,index,Applicants,Contractors,Occupiers,Owners,Work Proposed,Status,Issue Date,Constr. Value,Address
0,16 734158 000 02 B7 One Family Dwelling,Metro Van Con Ltd (Bhupinder Johal),Metro Van Construction Ltd (604) 825-4651,,"Lakhbir S Bath, Harpreet K Bath, Manjit K Bath...",Revision,Issued,7/26/18,,10191 Blundell Rd - Bldg Design Mod
1,16 750521 000 00 B7 One Family Dwelling,Penta Builders Group (Patrick Mullin),Penta Builders Group Inc (604) 244-9594,,Jin L Gu,New,Issued,7/23/18,435305.0,11740 Pintail Dr
2,16 752300 000 01 B7 One Family Dwelling,Insite Architecture (Geoffrey Lee),,,,Revision,Issued,7/25/18,,10381 Hollywell Dr---Building Design Mo
3,17 762301 000 01 B9 Retail,Pattie Dehaan,Mavacon Inc (450) 902-1999,,"Richmond Centre (6060minoru Blvd)Ltd, Ari Rich...",Revision,Issued,7/25/18,,06060 Minoru Blvd Unit 1758
4,17 781953 000 01 BB Warehouse,Steve Kalsi,Frise Construction & Consulting Co Ltd (604) 7...,,Mason Investments Ltd,Revision,Issued,7/24/18,,13900 Bridgeport Rd---Bldg Design Mod
5,17 783416 000 02 B7 Single Family/Suite,Yan Min Yang,Jun Ren (604) 721-7932,,"Xuefang Su, Yan M Yang",Revision,Issued,7/26/18,,8271 Leslie Rd---Building Design Modific
6,17 790304 000 01 B7 Hotel,Mike Amiri,MYK Construction Ltd (604) 729-8520,,Maple Hospitality Inc,Revision,Issued,7/26/18,,8811 Bridgeport Rd - Building Design Mo
7,17 792047 000 00 B7 Single Family/Suite,Maryem Ahbib,Professional Home Builders Ltd (604) 600-7270,,Jasjit S Uppal,New,Issued,7/23/18,444441.0,10640 Gilmore Cr
8,18 802241 000 00 B8 Office,Seng Tsoi Architects (Seng Tsoi),Mansouri Enterprises Inc (604) 298-8388,,0989705 Bc Ltd,New,Issued,7/23/18,1000000.0,7960 Alderbridge Way - Temporary Sales
9,18 817691 000 00 B7 One Family Dwelling,Barstow Construction Ltd (Russ Barstow),Barstow Construction Ltd (778) 892-5462,,"Brandon M Crossley, Stephanie L Crossley",NewIssued,,7/24/18,,"412,21510871 Bonavista Gate"


There is more data in the first column than I noticed--both an ID number and the type of dwelling are there.  These should also be in seperate columns.

In [264]:
def split_at_last_digit(ss):
    bb = list(map(str.isdigit, ss))
    try:
        idx = len(bb) - bb[::-1].index(True)
        return ss[:idx], ss[idx:]
    except ValueError:
        return '', ss

    
df = pd.DataFrame(df['index'].apply(split_at_last_digit).tolist(), columns=['ID', 'Building Type']).join(df)
df = df.drop(['index'], axis=1)
df.head(20)

Unnamed: 0,ID,Building Type,Applicants,Contractors,Occupiers,Owners,Work Proposed,Status,Issue Date,Constr. Value,Address
0,16 734158 000 02 B7,One Family Dwelling,Metro Van Con Ltd (Bhupinder Johal),Metro Van Construction Ltd (604) 825-4651,,"Lakhbir S Bath, Harpreet K Bath, Manjit K Bath...",Revision,Issued,7/26/18,,10191 Blundell Rd - Bldg Design Mod
1,16 750521 000 00 B7,One Family Dwelling,Penta Builders Group (Patrick Mullin),Penta Builders Group Inc (604) 244-9594,,Jin L Gu,New,Issued,7/23/18,435305.0,11740 Pintail Dr
2,16 752300 000 01 B7,One Family Dwelling,Insite Architecture (Geoffrey Lee),,,,Revision,Issued,7/25/18,,10381 Hollywell Dr---Building Design Mo
3,17 762301 000 01 B9,Retail,Pattie Dehaan,Mavacon Inc (450) 902-1999,,"Richmond Centre (6060minoru Blvd)Ltd, Ari Rich...",Revision,Issued,7/25/18,,06060 Minoru Blvd Unit 1758
4,17 781953 000 01,BB Warehouse,Steve Kalsi,Frise Construction & Consulting Co Ltd (604) 7...,,Mason Investments Ltd,Revision,Issued,7/24/18,,13900 Bridgeport Rd---Bldg Design Mod
5,17 783416 000 02 B7,Single Family/Suite,Yan Min Yang,Jun Ren (604) 721-7932,,"Xuefang Su, Yan M Yang",Revision,Issued,7/26/18,,8271 Leslie Rd---Building Design Modific
6,17 790304 000 01 B7,Hotel,Mike Amiri,MYK Construction Ltd (604) 729-8520,,Maple Hospitality Inc,Revision,Issued,7/26/18,,8811 Bridgeport Rd - Building Design Mo
7,17 792047 000 00 B7,Single Family/Suite,Maryem Ahbib,Professional Home Builders Ltd (604) 600-7270,,Jasjit S Uppal,New,Issued,7/23/18,444441.0,10640 Gilmore Cr
8,18 802241 000 00 B8,Office,Seng Tsoi Architects (Seng Tsoi),Mansouri Enterprises Inc (604) 298-8388,,0989705 Bc Ltd,New,Issued,7/23/18,1000000.0,7960 Alderbridge Way - Temporary Sales
9,18 817691 000 00 B7,One Family Dwelling,Barstow Construction Ltd (Russ Barstow),Barstow Construction Ltd (778) 892-5462,,"Brandon M Crossley, Stephanie L Crossley",NewIssued,,7/24/18,,"412,21510871 Bonavista Gate"


## Save Data to Excel

In [265]:
df.to_excel(filename.split('-')[1] + '.xlsx')