In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from datetime import date

# Data Loading

SF DBI’s development pipeline datasets use four nearly-identical data models. The parcel identifier was called “BLKLOT” or “Block Lot” in older datasets and is called APN (Assessor's Parcel Number) in the most recent datasets. The same parcel ID can be written with a leading zero in one dataset and without a leading zero in another. Prior to 2014 Q3, datasets don’t have fields for affordable unit counts (total affordable units and net affordable units).

Some project records don’t include a building permit ID or use the placeholder “MULTIPLE” instead of actual permit references. Initially, we remove permitless projects from the dataset, then re-add them at a later stage.

In [6]:
def loadData(fileName, label, fmt = 1):
    """Load one quarterly pipeline CSV and normalise it to the unified schema.

    Parameters
    ----------
    fileName : str
        Path of the CSV file to read. The path is echoed to stdout.
    label : str
        Human-readable snapshot label. Currently unused inside the function;
        kept so existing callers keep working.
    fmt : int, default 1
        Selects the column layout of the file:
        2/3 use ``BLKLOT`` for the parcel id, 4 uses ``Block Lot`` /
        ``DBI Permit`` / ``Location 1`` and has no affordable-unit columns
        (they are filled with 0), 5 renames ``PROPUSE``, 6 uses ``BPAPPLNO``,
        ``AFFORDABLE`` and ``AFFORDABLENET``. Any other value is read with the
        default ``APN`` / ``BP_APPLNO`` layout.

    Returns
    -------
    (X, M, D) : tuple of DataFrame
        X -- records with UNITS > 0 and NET_UNITS >= 0 that carry a permit id,
             de-duplicated on the permit id (first occurrence kept), indexed
             by ``PERMIT_ID`` and restricted to the unified columns.
        M -- records whose permit id is empty or the ``MULTIPLE`` placeholder.
        D -- every record sharing its permit id with another record, sorted by
             permit id, for manual inspection.
    """
    columns = ['UNITS', 'NET_UNITS', 'AFF_UNITS', 'NET_AFF_UNITS', 'NAMEADDR', 'APN', 'BESTSTAT', 'BESTDATE', 'LOCATION']
    print(fileName)

    def permit_id(raw):
        # Permit numbers get an 'N' prefix and lose their commas, so an empty
        # cell becomes 'N' and the placeholder becomes 'NMULTIPLE'.
        return 'N' + raw.replace(',', '')

    def strip_leading_zeros(raw):
        return raw.lstrip('0')

    def drop_first_four(raw):
        # NOTE(review): presumably removes a fixed prefix / zero padding from
        # the APN -- confirm against the raw files.
        return raw[4:]

    if fmt == 2 or fmt == 3:
        converters = { 'BLKLOT': strip_leading_zeros, 'BP_APPLNO': permit_id }
    elif fmt == 4:
        converters = { 'Block Lot': strip_leading_zeros, 'DBI Permit': permit_id }
    elif fmt == 6:
        converters = { 'APN': drop_first_four, 'BPAPPLNO': permit_id }
    else:
        converters = { 'APN': drop_first_four, 'BP_APPLNO': permit_id }

    # `parse_dates=[]` together with `infer_datetime_format=True` parsed no
    # columns and only triggered a deprecation warning on recent pandas;
    # BESTDATE is converted explicitly further down.
    X = pd.read_csv(fileName, sep=',', quotechar='"', converters=converters)

    # Bring every layout to the unified column names.
    if fmt == 2 or fmt == 3:
        X = X.rename(columns={"AFF_UNITS_NET": "NET_AFF_UNITS", "BLKLOT": "APN"})
    if fmt == 3:
        X = X.rename(columns={"UNITSNET": "NET_UNITS", "Geography": "LOCATION"})
    if fmt == 4:
        X['LOCATION'] = X['Location 1']
        # The first line of 'Location 1' is used as the project name/address.
        X['NAMEADDR'] = X['Location 1'].apply(lambda x: x.split('\n')[0])

        X = X.rename(columns={"Units": "UNITS", "Net Added Units": "NET_UNITS", "Best Stat": "BESTSTAT", "Best Date": "BESTDATE", 'Block Lot': 'APN',
                             'DBI Permit': 'BP_APPLNO'})
        # This layout has no affordable-unit columns.
        X['NET_AFF_UNITS'] = 0
        X['AFF_UNITS'] = 0
    if fmt == 5:
        X = X.rename(columns={"PROPUSE": "PROJECT_TYPE"})
    if fmt == 6:
        X = X.rename(columns={"UNITSNET": "NET_UNITS", "Location": "LOCATION", "BPAPPLNO": "BP_APPLNO",
                              "AFFORDABLE": "AFF_UNITS", "AFFORDABLENET": "NET_AFF_UNITS"})
        X['NET_AFF_UNITS'] = X['NET_AFF_UNITS'].fillna(0)
        X['AFF_UNITS'] = X['AFF_UNITS'].fillna(0)
    X = X.rename(columns={"Location": "LOCATION"})
    # Keep only the last line of a (possibly multi-line) location cell.
    X['LOCATION'] = X['LOCATION'].apply(lambda x: str(x).split('\n')[-1])

    X = X[X['UNITS'] > 0]
    X = X[X['NET_UNITS'] >= 0]

    # Filtering records without permits
    without_permit = (X['BP_APPLNO'] == 'N') | (X['BP_APPLNO'] == 'NMULTIPLE')
    M = X[without_permit][columns+['BP_APPLNO']]
    X = X[~without_permit]

    # Filtering duplicated records
    D = X[X.duplicated('BP_APPLNO', keep=False)][columns+['BP_APPLNO']].sort_values('BP_APPLNO')
    X = X[~X.duplicated('BP_APPLNO')]

    # Set unique index by Permit ID
    X = X.set_index('BP_APPLNO')
    X.index.names = ['PERMIT_ID']
    X['BESTDATE'] = pd.to_datetime(X["BESTDATE"])
    unit_columns = ['UNITS', 'NET_UNITS', 'NET_AFF_UNITS', 'AFF_UNITS']
    X[unit_columns] = X[unit_columns].astype(int)

    X = X[columns]

    return X,M,D

# Helper Indexes
# Each constant is the position of one quarterly snapshot in `files` (and thus
# in every list built from it), so a quarter can be addressed by name.

y13q4 = 0
y14q1 = 1
y14q2 = 2
y14q3 = 3
y14q4 = 4
y15q1 = 5
y15q2 = 6
y15q3 = 7
y15q4 = 8
y16q1 = 9
y16q2 = 10
y16q3 = 11
y16q4 = 12
y17q1 = 13
y17q2 = 14  # added: `files` has a 2017'Q2 entry that previously had no index

# Files and versions
# 'format' selects the column layout that loadData has to translate.

files = [
    {'label': '2013\'Q4', 'file': 'data/SF_Development_Pipeline_2013_Q4.csv', 'format': 4},
    {'label': '2014\'Q1', 'file': 'data/SF_Development_Pipeline_2014_Q1.csv', 'format': 4},
    {'label': '2014\'Q2', 'file': 'data/SF_Development_Pipeline_2014_Q2.csv', 'format': 4},
    {'label': '2014\'Q3', 'file': 'data/SF_Development_Pipeline_2014_Q3.csv', 'format': 4},
    {'label': '2014\'Q4', 'file': 'data/SF_Development_Pipeline_2014_Q4.csv', 'format': 3},
    {'label': '2015\'Q1', 'file': 'data/SF_Development_Pipeline_2015_Q1.csv', 'format': 2},
    {'label': '2015\'Q2', 'file': 'data/SF_Development_Pipeline_2015_Q2.csv', 'format': 1},
    {'label': '2015\'Q3', 'file': 'data/SF_Development_Pipeline_2015_Q3.csv', 'format': 1},
    {'label': '2015\'Q4', 'file': 'data/SF_Development_Pipeline_2015_Q4.csv', 'format': 1},
    {'label': '2016\'Q1', 'file': 'data/SF_Development_Pipeline_2016_Q1.csv', 'format': 1},
    {'label': '2016\'Q2', 'file': 'data/SF_Development_Pipeline_2016_Q2.csv', 'format': 5},
    {'label': '2016\'Q3', 'file': 'data/SF_Development_Pipeline_2016_Q3.csv', 'format': 1},
    {'label': '2016\'Q4', 'file': 'data/SF_Development_Pipeline_2016_Q4.csv', 'format': 1},
    {'label': '2017\'Q1', 'file': 'data/SF_Development_Pipeline_2017_Q1.csv', 'format': 1},
    {'label': '2017\'Q2', 'file': 'data/SF_Development_Pipeline_2017_Q2.csv', 'format': 6}
]

# Loading Data
# One entry per snapshot, in the order of `files`: the cleaned table, the
# permit-less records and the duplicated records returned by loadData.
count = len(files)
labels = [spec['label'] for spec in files]
data, missing, duplicates = [], [], []

for spec in files:
    cleaned, permitless, repeated = loadData(spec['file'], spec['label'], spec['format'])
    data.append(cleaned)
    missing.append(permitless)
    duplicates.append(repeated)

# Write every cleaned snapshot back out under a unified file name.
for label, cleaned in zip(labels, data):
    fname = 'data/Pipeline_Unified_'+label.replace('\'','_')+'.csv'
    cleaned.to_csv(fname)

data/SF_Development_Pipeline_2013_Q4.csv
data/SF_Development_Pipeline_2014_Q1.csv
data/SF_Development_Pipeline_2014_Q2.csv
data/SF_Development_Pipeline_2014_Q3.csv
data/SF_Development_Pipeline_2014_Q4.csv
data/SF_Development_Pipeline_2015_Q1.csv
data/SF_Development_Pipeline_2015_Q2.csv
data/SF_Development_Pipeline_2015_Q3.csv
data/SF_Development_Pipeline_2015_Q4.csv
data/SF_Development_Pipeline_2016_Q1.csv
data/SF_Development_Pipeline_2016_Q2.csv
data/SF_Development_Pipeline_2016_Q3.csv
data/SF_Development_Pipeline_2016_Q4.csv
data/SF_Development_Pipeline_2017_Q1.csv
data/SF_Development_Pipeline_2017_Q2.csv


# Displaying duplicated records
Printing out all records with duplicate building permit id for manual inspection.

In [5]:
# Show, per snapshot, the records that share a permit id (skipping snapshots
# that have none).
for label, repeated in zip(labels, duplicates):
    if len(repeated) == 0:
        continue
    print()
    print("Duplicated records at {}".format(label))
    display(repeated)


Duplicated records at 2013'Q4


Unnamed: 0,UNITS,NET_UNITS,AFF_UNITS,NET_AFF_UNITS,NAMEADDR,APN,BESTSTAT,BESTDATE,LOCATION,BP_APPLNO
265,7.0,7.0,0,0,4132 Third Street,5260002,BP ISSUED,08-Aug-13,"(37.740113999999998, -122.389089)",N200707055953
812,7.0,7.0,0,0,4132 03rd St,5260003,BP ISSUED,08-Aug-13,"(37.740068999999998, -122.388912)",N200707055953
188,2.0,1.0,0,0,1076 Hampshire St,4152016,CONSTRUCTION,21-Feb-12,"(37.754974400000002, -122.4076295)",N200709193092
515,2.0,2.0,0,0,1078 Hampshire St,4152046,CONSTRUCTION,21-Feb-12,"(37.754857000000001, -122.407522)",N200709193092
127,1.0,1.0,0,0,83 Panorama Dr,2821010,BP APPROVED,02-Mar-12,"(37.747147900000002, -122.4509959)",N200711077576
527,1.0,1.0,0,0,83 Panorama Dr,2821023,BP APPROVED,02-Mar-12,"(37.747062999999997, -122.45041500000001)",N200711077576
299,2.0,1.0,0,0,268 Madison St,5943008,CONSTRUCTION,13-Sep-13,"(37.725757999999999, -122.42266100000001)",N200711077587
520,1.0,1.0,0,0,268 Madison St,5943051,CONSTRUCTION,13-Sep-13,"(37.725698000000001, -122.422527)",N200711077587
307,1.0,1.0,0,0,138 Alpha St,6208003,BP ISSUED,25-Apr-13,"(37.714095, -122.404408)",N200806194898
530,1.0,1.0,0,0,138 Alpha St,6208056,BP ISSUED,25-Apr-13,"(37.714331000000001, -122.404082)",N200806194898



Duplicated records at 2014'Q1


Unnamed: 0,UNITS,NET_UNITS,AFF_UNITS,NET_AFF_UNITS,NAMEADDR,APN,BESTSTAT,BESTDATE,LOCATION,BP_APPLNO
657,7.0,7.0,0,0,4132 03rd St,5260003,CONSTRUCTION,26-Feb-14,"(37.740068999999998, -122.388912)",N200707055953
124,7.0,7.0,0,0,4132 Third Street,5260002,CONSTRUCTION,26-Feb-14,"(37.740113999999998, -122.389089)",N200707055953
60,2.0,1.0,0,0,1076 Hampshire St,4152016,CONSTRUCTION,21-Feb-12,"(37.754974400000002, -122.4076295)",N200709193092
627,2.0,2.0,0,0,1078 Hampshire St,4152046,CONSTRUCTION,21-Feb-12,"(37.754857000000001, -122.407522)",N200709193092
756,1.0,1.0,0,0,83 Panorama Dr,2821023,BP APPROVED,02-Mar-12,"(37.747062999999997, -122.45041500000001)",N200711077576
281,1.0,1.0,0,0,83 Panorama Dr,2821010,BP APPROVED,02-Mar-12,"(37.747147900000002, -122.4509959)",N200711077576
748,1.0,1.0,0,0,268 Madison St,5943051,CONSTRUCTION,08-Jan-14,"(37.725698000000001, -122.422527)",N200711077587
151,2.0,1.0,0,0,268 Madison St,5943008,CONSTRUCTION,08-Jan-14,"(37.725757999999999, -122.42266100000001)",N200711077587
758,1.0,1.0,0,0,138 Alpha St,6208056,BP ISSUED,25-Apr-13,"(37.714331000000001, -122.404082)",N200806194898
159,1.0,1.0,0,0,138 Alpha St,6208003,BP ISSUED,25-Apr-13,"(37.714095, -122.404408)",N200806194898



Duplicated records at 2014'Q2


Unnamed: 0,UNITS,NET_UNITS,AFF_UNITS,NET_AFF_UNITS,NAMEADDR,APN,BESTSTAT,BESTDATE,LOCATION,BP_APPLNO
735,7.0,7.0,0,0,4132 03rd St,5260003,CONSTRUCTION,06/30/2014 12:00:00 AM,"(37.740069, -122.388912)",N200707055953
207,7.0,7.0,0,0,4132 Third Street,5260002,CONSTRUCTION,06/30/2014 12:00:00 AM,"(37.740114, -122.389089)",N200707055953
741,2.0,2.0,0,0,1078 Hampshire St,4152046,CONSTRUCTION,02/21/2012 12:00:00 AM,"(37.754857, -122.407522)",N200709193092
208,2.0,1.0,0,0,1076 Hampshire St,4152016,CONSTRUCTION,02/21/2012 12:00:00 AM,"(37.7549744, -122.4076295)",N200709193092
9,1.0,1.0,0,0,83 Panorama Dr,2821010,BP ISSUED,04/04/2014 12:00:00 AM,"(37.7471479, -122.4509959)",N200711077576
754,1.0,1.0,0,0,83 Panorama Dr,2821023,BP ISSUED,04/04/2014 12:00:00 AM,"(37.747063, -122.450415)",N200711077576
746,1.0,1.0,0,0,268 Madison St,5943051,CONSTRUCTION,01/08/2014 12:00:00 AM,"(37.725698, -122.422527)",N200711077587
306,2.0,1.0,0,0,268 Madison St,5943008,CONSTRUCTION,01/08/2014 12:00:00 AM,"(37.725758, -122.422661)",N200711077587
313,1.0,1.0,0,0,138 Alpha St,6208003,BP ISSUED,04/25/2013 12:00:00 AM,"(37.714095, -122.404408)",N200806194898
756,1.0,1.0,0,0,138 Alpha St,6208056,BP ISSUED,04/25/2013 12:00:00 AM,"(37.714331, -122.404082)",N200806194898



Duplicated records at 2014'Q3


Unnamed: 0,UNITS,NET_UNITS,AFF_UNITS,NET_AFF_UNITS,NAMEADDR,APN,BESTSTAT,BESTDATE,LOCATION,BP_APPLNO
514,7.0,7.0,0,0,4132 03rd St,5260003,CONSTRUCTION,28-Jul-14,"(37.740068999999998, -122.388912)",N200707055953
236,7.0,7.0,0,0,4132 Third Street,5260002,CONSTRUCTION,28-Jul-14,"(37.740113999999998, -122.389089)",N200707055953
171,2.0,1.0,0,0,1076 Hampshire St,4152016,CONSTRUCTION,21-Feb-12,"(37.754974400000002, -122.4076295)",N200709193092
654,2.0,2.0,0,0,1078 Hampshire St,4152046,CONSTRUCTION,21-Feb-12,"(37.754857000000001, -122.407522)",N200709193092
668,1.0,1.0,0,0,83 Panorama Dr,2821023,CONSTRUCTION,23-Sep-14,"(37.747062999999997, -122.45041500000001)",N200711077576
112,1.0,1.0,0,0,83 Panorama Dr,2821010,CONSTRUCTION,23-Sep-14,"(37.747147900000002, -122.4509959)",N200711077576
659,1.0,1.0,0,0,268 Madison St,5943051,CONSTRUCTION,08-Jan-14,"(37.725698000000001, -122.422527)",N200711077587
269,2.0,1.0,0,0,268 Madison St,5943008,CONSTRUCTION,08-Jan-14,"(37.725757999999999, -122.42266100000001)",N200711077587
270,1.0,1.0,0,0,138 Alpha St,6208003,BP ISSUED,25-Apr-13,"(37.714095, -122.404408)",N200806194898
670,1.0,1.0,0,0,138 Alpha St,6208056,BP ISSUED,25-Apr-13,"(37.714331000000001, -122.404082)",N200806194898



Duplicated records at 2014'Q4


Unnamed: 0,UNITS,NET_UNITS,AFF_UNITS,NET_AFF_UNITS,NAMEADDR,APN,BESTSTAT,BESTDATE,LOCATION,BP_APPLNO
411,2,1,0,0,1076 HAMPSHIRE ST,4152016,CONSTRUCTION,02/21/2012,"(37.754974400000002, -122.4076295)",N200709193092
448,2,2,0,0,1078 HAMPSHIRE ST,4152046,CONSTRUCTION,02/21/2012,"(37.754857000000001, -122.407522)",N200709193092
400,2,2,0,0,447 LINDEN ST,818048,CONSTRUCTION,10/10/2014,"(37.775965900000003, -122.42527699999999)",N200912304034
409,2,2,0,0,443 LINDEN ST,818049,CONSTRUCTION,10/10/2014,"(37.776243000000001, -122.42505300000001)",N200912304034
69,98,98,0,0,1239 TURK ST,757027,CONSTRUCTION,03/27/2015,"(37.780832599999997, -122.42944060000001)",N201207104447
70,98,98,98,98,1100 GOLDEN GATE AV,757025,CONSTRUCTION,03/27/2015,"(37.780004699999999, -122.428684)",N201207104447
38,191,191,160,160,218 BUCHANAN ST,857001A,CONSTRUCTION,03/19/2015,"(37.772350000000003, -122.42627899999999)",N201209059006
55,133,133,160,160,55 LAGUNA STREET,857001,CONSTRUCTION,03/19/2015,"(37.7710334, -122.4252554)",N201209059006
53,139,139,0,0,555 FULTON ST,794028,CONSTRUCTION,03/31/2015,"(37.7781941, -122.42581)",N201305036062
54,136,136,0,0,746 LAGUNA ST,794015,CONSTRUCTION,05/13/2010,"(37.778094699999997, -122.42641860000001)",N201305036062



Duplicated records at 2015'Q1


Unnamed: 0,UNITS,NET_UNITS,AFF_UNITS,NET_AFF_UNITS,NAMEADDR,APN,BESTSTAT,BESTDATE,LOCATION,BP_APPLNO
794,2,2,0,0,1078 HAMPSHIRE ST,4152046,CONSTRUCTION,02/21/2012,"(37.75487194240, -122.40775958600)",N200709193092
863,2,1,0,0,1076 HAMPSHIRE ST,4152016,CONSTRUCTION,02/21/2012,"(37.75494040740, -122.40776607900)",N200709193092
523,2,2,0,0,443 LINDEN ST,818049,CONSTRUCTION,10/10/2014,"(37.77599145770, -122.42523007200)",N200912304034
782,2,2,0,0,447 LINDEN ST,818048,CONSTRUCTION,10/10/2014,"(37.77596050200, -122.42532017400)",N200912304034



Duplicated records at 2015'Q3


Unnamed: 0,UNITS,NET_UNITS,AFF_UNITS,NET_AFF_UNITS,NAMEADDR,APN,BESTSTAT,BESTDATE,LOCATION,BP_APPLNO
321,8,6,0,0,95 LELAND AV,6250028,BP REINSTATED,03/05/2010,"(37.71168292210, -122.40561714300)",N200704128664
346,8,5,0,0,95 LELAND AV,6250037,BP REINSTATED,03/05/2010,"(37.71172071370, -122.40571413900)",N200704128664
170,59,59,7,7,249 PENNSYLVANIA AV,3999002,BP Filed,05/29/2015,"(37.76297137300, -122.39317202300)",N201505297549
222,16,16,0,0,502 07TH ST,3780001,BP FILED,05/29/2015,"(37.77399389340, -122.40478807300)",N201505297549



Duplicated records at 2016'Q2


Unnamed: 0,UNITS,NET_UNITS,AFF_UNITS,NET_AFF_UNITS,NAMEADDR,APN,BESTSTAT,BESTDATE,LOCATION,BP_APPLNO
415,2,1,0,0,1948 & 1948A QUESADA AV,5329011,BP FILED,04/11/2016,"(37.73625912340, -122.39606265400)",N201604114413
416,1,1,0,0,1948 QUESADA AV,5329011,BP FILED,04/11/2016,"(37.73625912340, -122.39606265400)",N201604114413



Duplicated records at 2017'Q2


Unnamed: 0,UNITS,NET_UNITS,AFF_UNITS,NET_AFF_UNITS,NAMEADDR,APN,BESTSTAT,BESTDATE,LOCATION,BP_APPLNO
997,1.0,1.0,0.0,0.0,300 - 308 UNIVERSITY ST,5991001,BP ISSUED,06/09/2017 12:00:00 AM,"(37.7263450623, -122.4142608643)",N201407302510
999,1.0,1.0,0.0,0.0,302 UNIVERSITY ST,5991002,BP ISSUED,06/09/2017 12:00:00 AM,"(37.7262802124, -122.4142303467)",N201407302510
374,9.0,6.0,0.0,0.0,271 UPPER TER,2628032,BP FILED,03/20/2015 12:00:00 AM,"(37.7622299194, -122.4454345703)",N201503201418
603,2.0,2.0,0.0,0.0,271 UPPER TR,2628032,BP FILED,03/20/2015 12:00:00 AM,"(37.7622299194, -122.4454345703)",N201503201418
201,32.0,32.0,0.0,0.0,241 10TH ST,3518020,CONSTRUCTION,08/18/2017 12:00:00 AM,"(37.773651123, -122.4134368896)",N201506159021
215,28.0,28.0,0.0,0.0,241 10TH ST,3518020,CONSTRUCTION,08/18/2017 12:00:00 AM,"(37.773651123, -122.4134368896)",N201506159021


# Fixing duplicates
Re-adding duplicate records with corrected values.

In [None]:
def fix_duplicate(permitId, units, netUnits, affUnits, netAffUnits, addr, apn, beststat, bestdate):
    """Overwrite a duplicated permit with manually corrected values.

    For every snapshot whose `duplicates` table lists `permitId`, the row kept
    in `data` for that permit is updated in place, and the permit is then
    removed from the snapshot's `duplicates` table.

    Parameters
    ----------
    permitId : str
        Permit id as stored in the index of `data[i]` / the BP_APPLNO column.
    units, netUnits, affUnits, netAffUnits : int
        Corrected unit counts.
    addr, apn, beststat : str
        Corrected NAMEADDR, APN and BESTSTAT values.
    bestdate : datetime.date or anything accepted by pandas.Timestamp
        Corrected BESTDATE.

    Notes
    -----
    The previous implementation assigned an 8-element list to a whole row,
    but the cleaned tables have 9 columns (LOCATION is the ninth), so the
    assignment raised. Columns are now addressed by name and LOCATION keeps
    the value of the retained record.
    """
    corrected = {
        'UNITS': units,
        'NET_UNITS': netUnits,
        'AFF_UNITS': affUnits,
        'NET_AFF_UNITS': netAffUnits,
        'NAMEADDR': addr,
        'APN': apn,
        'BESTSTAT': beststat,
        # Stored as a Timestamp so the BESTDATE column keeps its datetime dtype.
        'BESTDATE': pd.Timestamp(bestdate),
    }
    for i in range(count):
        flagged = duplicates[i]['BP_APPLNO'] == permitId
        if flagged.any():
            # loadData keeps the first occurrence of a duplicated permit, so
            # the row is expected to exist in data[i].
            for column, value in corrected.items():
                data[i].loc[permitId, column] = value
        duplicates[i] = duplicates[i][~flagged]

# Manually reconciled values for every permit id that appears more than once
# in at least one snapshot (see the tables printed above).
fix_duplicate('N200709193092', 2, 1, 0, 0, '1076-1078 Hampshire St', '4152016', "CONSTRUCTION", date(2012, 2, 21))
# The snapshots list this permit with a best date of 08-Jan-14 / 01/08/2014;
# the previous date(2014, 2, 8) looks like a month/day slip.
# NOTE(review): confirm against the source data.
fix_duplicate('N200711077587', 2, 1, 0, 0, '268 Madison St', '5943008', "CONSTRUCTION", date(2014, 1, 8))
fix_duplicate('N200707055953', 7, 7, 0, 0, '4132 03rd St', '5260003', "CONSTRUCTION", date(2014, 7, 28))
fix_duplicate('N200912304034', 2, 2, 0, 0, '447 Linden St', '818048', "CONSTRUCTION", date(2014, 10, 10))
fix_duplicate('N201209059006', 191, 191, 0, 0, '218 Buchanan St', '857001', "CONSTRUCTION", date(2014, 7, 31))
fix_duplicate('N200711077576', 1, 1, 0, 0, '83 Panorama Dr', '2821010', "CONSTRUCTION", date(2014, 9, 23))
fix_duplicate('N201207104447', 98, 98, 0, 0, '1100 GOLDEN GATE AV', '757025', "CONSTRUCTION", date(2015, 3, 27))
fix_duplicate('N201305036062', 139, 139, 0, 0, '555 FULTON ST', '794028', "CONSTRUCTION", date(2015, 3, 31))

# Anything still listed here has not been reconciled by fix_duplicate yet.
unresolved = [(label, repeated) for label, repeated in zip(labels, duplicates) if len(repeated) > 0]
for label, repeated in unresolved:
    print()
    print("Duplicated records at {}".format(label))
    display(repeated)
hasDuplicates = len(unresolved) > 0
if not hasDuplicates:
    print("No duplicates present!")

# Displaying records without Permit ID
Printing all records without Permit ID

In [None]:
# Show, per snapshot, the records that carry no usable permit id (skipping
# snapshots that have none).
for label, permitless in zip(labels, missing):
    if len(permitless) == 0:
        continue
    print()
    print("Records without Permit ID at {}".format(label))
    display(permitless)

# Fixing records without Permit ID
Correcting records without Permit ID

In [None]:
def fix_missing(permitId, units, netUnits, affUnits, netAffUnits, addr, apn):
    """Re-add a permit-less project to the cleaned tables under a custom id.

    For every snapshot whose `missing` table holds at least one record with
    an APN in `apn`, a single row indexed by `permitId` is appended to
    `data[i]`, and all matching records are removed from `missing[i]`.

    Parameters
    ----------
    permitId : str
        Synthetic permit id used as the index label of the new row.
    units, netUnits, affUnits, netAffUnits : int
        Corrected unit counts for the project.
    addr : str
        NAMEADDR value for the new row.
    apn : list of str
        Parcel ids that identify the project's permit-less records; the
        first one is stored as the row's APN.

    Notes
    -----
    The previous implementation appended a 6-element list to tables that
    have 9 columns, which raised. BESTSTAT, BESTDATE and LOCATION are now
    copied from the first matching permit-less record of the snapshot.
    """
    for i in range(count):
        matched = missing[i]['APN'].isin(apn)
        if matched.any():
            source = missing[i][matched].iloc[0]
            row = {
                'UNITS': units,
                'NET_UNITS': netUnits,
                'AFF_UNITS': affUnits,
                'NET_AFF_UNITS': netAffUnits,
                'NAMEADDR': addr,
                'APN': apn[0],
                'BESTSTAT': source['BESTSTAT'],
                # `missing` still holds the raw date text from the CSV.
                'BESTDATE': pd.to_datetime(source['BESTDATE']),
                'LOCATION': source['LOCATION'],
            }
            # A plain list in column order lets pandas infer the dtypes of
            # the appended row.
            data[i].loc[permitId] = [row.get(column) for column in data[i].columns]
        missing[i] = missing[i][~matched]
        
# (custom permit id, units, net units, address, parcel ids) of the permit-less
# projects that are re-added; affordable counts are 0 for all of them.
manual_projects = (
    ("N_CUSTOM_JAMESTOWN", 132, 14, '833-881 Jamestown', ['4991277']),
    ("N_CUSTOM_BROTHERHOOD", 196, 196, '800 BROTHERHOOD WAY', ['7331003', '7331005']),
    ("N_CUSTOM_SHIPYARD", 229, 229, 'HUNTERS POINT SHIPYARD, PHASE I', ['4591C001', '4624031']),
)
for custom_id, total, net, address, parcels in manual_projects:
    fix_missing(custom_id, total, net, 0, 0, address, parcels)

# Anything still listed here has not been re-added by fix_missing.
unresolved = [(label, permitless) for label, permitless in zip(labels, missing) if len(permitless) > 0]
for label, permitless in unresolved:
    print()
    print("Records without Permit ID at {}".format(label))
    display(permitless)
hasMissing = len(unresolved) > 0
if not hasMissing:
    print("No missing present")

# Searching for incorrect unit values
Searching for projects whose unit counts vary from quarter to quarter and for projects whose net units exceed their total units.

In [None]:
def find_incorrect(column):
    """Find permits whose `column` value differs between any two snapshots.

    Every pair of snapshots (i, j) with i < j in the global `data` list is
    compared on the permits they have in common.

    Parameters
    ----------
    column : str
        Name of the column to compare, e.g. 'UNITS' or 'NET_UNITS'.

    Returns
    -------
    dict
        Maps each permit id with at least one disagreement to
        ``{'min': ..., 'max': ...}``, the smallest and largest integer value
        seen in the disagreeing pairs.

    Notes
    -----
    The earlier snapshot used to be narrowed cumulatively inside the inner
    loop, so a permit absent from one intermediate snapshot was never
    compared against later snapshots. Each pair is now compared against the
    full earlier snapshot.
    """
    incorrect = {}
    for i in range(count - 1):
        earlier = data[i]
        for j in range(i + 1, count):
            later = data[j]
            shared = earlier.index.intersection(later.index)
            if len(shared) == 0:
                continue
            # Both selections follow the order of `shared`, so the values can
            # be compared position by position.
            before = earlier.loc[shared, column]
            after = later.loc[shared, column]
            changed = before.to_numpy() != after.to_numpy()
            for permit in shared[changed]:
                a = int(before.loc[permit])
                b = int(after.loc[permit])
                bounds = incorrect.setdefault(permit, {'min': min(a, b), 'max': max(a, b)})
                bounds['min'] = min(bounds['min'], a, b)
                bounds['max'] = max(bounds['max'], a, b)
    return incorrect


# Permits whose total / net unit counts disagree between snapshots.
incorrect = find_incorrect('UNITS')
incorrectNet = find_incorrect('NET_UNITS')

for heading, fluctuating in (("Unit number fluctuation", incorrect),
                             ("Net Unit number fluctuation", incorrectNet)):
    print(heading)
    display(fluctuating)

In [None]:
def fix_units(permitId, units, column):
    """Set `column` to `units` for `permitId` in every snapshot that has it.

    Parameters
    ----------
    permitId : str
        Index label of the permit in the global `data` tables.
    units : int
        Value to store.
    column : str
        Column to overwrite, e.g. 'UNITS' or 'NET_UNITS'.

    Snapshots that do not contain the permit are left untouched.
    """
    for i in range(count):
        if permitId in data[i].index:
            # Write the single cell directly rather than copying the whole
            # row out, mutating the copy and assigning it back.
            data[i].loc[permitId, column] = units

# Where snapshots disagree, settle on the largest value that was observed.
for column, fluctuating in (('UNITS', incorrect), ('NET_UNITS', incorrectNet)):
    for permit, bounds in fluctuating.items():
        fix_units(permit, bounds['max'], column)

# 201 Folsom
for column in ('UNITS', 'NET_UNITS'):
    fix_units('N201207124717', 656, column)

# Detection of completed buildings, p.1
Identifying completed projects as the ones that were in construction in a given quarter and are removed from the pipeline in the following quarter.

In [None]:
def buildStats(data):
    """Summarise a list of per-quarter tables of completed projects.

    Parameters
    ----------
    data : list of DataFrame
        One table per quarter with the columns UNITS, NET_UNITS, AFF_UNITS
        and NET_AFF_UNITS. (The function used to ignore this argument and
        read the global `completed` instead.)

    Returns
    -------
    dict
        'buildings', 'units' and 'netUnits' hold one value per table: the
        row count and the column sums. 'aff' and 'netAff' hold the
        affordable-unit sums from the fifth table onwards only.
    """
    # The first four snapshots come from the layout without affordable-unit
    # columns (loadData fills them with 0), so they are left out.
    with_affordable = data[4:]
    return {
        "buildings": [len(table) for table in data],
        "units": [table['UNITS'].values.sum() for table in data],
        "netUnits": [table['NET_UNITS'].values.sum() for table in data],
        "aff": [table['AFF_UNITS'].values.sum() for table in with_affordable],
        "netAff": [table['NET_AFF_UNITS'].values.sum() for table in with_affordable],
    }

def printStats(stats):
    """Print the building counts and the net (affordable) unit figures.

    `stats` is a dict as returned by buildStats. The two "Total" lines are
    the sums of the per-quarter net units and net affordable units.
    """
    netUnits = stats["netUnits"]
    netAff = stats["netAff"]
    report = (
        ("Buildings: {}", stats["buildings"]),
        ("Net Units: {}", netUnits),
        ("Total Units: {}", sum(netUnits)),
        ("Net Affordable Units: {}", netAff),
        ("Total Affordable Units: {}", sum(netAff)),
    )
    for template, value in report:
        print(template.format(value))

def contains(data, key):
    """Return True if `key` is an index label of any table in `data`."""
    return any(key in table.index for table in data)

In [None]:
# A project counts as completed in a quarter when its permit is present in
# one snapshot and absent from the very next one.
completed = [
    current[~current.index.isin(following.index)]
    for current, following in zip(data, data[1:])
]

stats = buildStats(completed)
printStats(stats)

# Detection of completed buildings, p.2
Correcting for projects that re-appear in the pipeline after being removed.

In [None]:
# Same as above, but a permit that shows up again in ANY later snapshot is
# not counted as completed.
completed = []
for position, snapshot in enumerate(data[:-1]):
    seen_later = np.zeros(len(snapshot), dtype=bool)
    for later in data[position + 1:]:
        seen_later |= snapshot.index.isin(later.index)
    completed.append(snapshot[~seen_later])

stats = buildStats(completed)
printStats(stats)

# Percent of units in top 10 projects for each quarter
Here we print the top ten projects (by net new units) for each quarter and compare their contribution to the total net number of units completed in the same period. As we see below, in a typical quarter the top ten projects account for 90%+ of citywide housing production.

In [None]:
# completed[i] is reported under the label of the following snapshot.
for label, finished in zip(labels[1:], completed):
    netUnits = finished['NET_UNITS']
    topUnits = netUnits.sort_values(ascending=False).head(10).values.sum()
    units = netUnits.values.sum()
    print(label)
    # Share of the quarter's net units contributed by its ten largest projects.
    print(topUnits/units)

# Group Stats by Year

In [None]:
# completed[i] (and therefore stats['netUnits'][i]) is attributed to the
# quarter named by labels[i+1]. Grouping by the year in that label keeps the
# yearly totals right when snapshots are added; the previous hard-coded last
# line printed the 2017'Q2 figure under "Year 2017 (Q1)" and dropped Q1.
yearly = {}
for i, quarterUnits in enumerate(stats['netUnits']):
    year, quarter = labels[i+1].split('\'')
    yearly.setdefault(year, []).append((quarter, quarterUnits))

for year, quarters in yearly.items():
    yearCount = int(sum(quarterUnits for _, quarterUnits in quarters))
    if len(quarters) == 4:
        print("Year {}:      {}".format(year, yearCount))
    else:
        # Partial year: name the quarters that are actually covered.
        covered = quarters[0][0]
        if len(quarters) > 1:
            covered = "{}-{}".format(quarters[0][0], quarters[-1][0])
        print("Year {} ({}): {}".format(year, covered, yearCount))

# Result Table (top 10)

In [None]:
# Ten largest completed projects (by net units) per quarter; completed[i] is
# shown under the label of the following snapshot.
for label, finished in zip(labels[1:], completed):
    print(label)
    largest = finished.sort_values('NET_UNITS', ascending=False).head(10)
    display(largest)

# Exporting datasets

In [None]:
# Cleaned snapshots, one CSV per quarter.
for label, cleaned in zip(labels, data):
    fname = 'data_complete/PipelineCleaned_'+label.replace('\'','_')+'.csv'
    cleaned.to_csv(fname)
# Completed-project tables: the file name is built, but the export itself is
# currently disabled.
for position, label in enumerate(labels[1:]):
    fname = 'data_complete/PipelineCompleted_'+label.replace('\'','_')+'.csv'
    #completed[position].to_csv(fname)

# Total in pipeline per quarter

In [None]:
# Net units in the pipeline for every snapshot.
for label, cleaned in zip(labels, data):
    pipelineUnits = cleaned['NET_UNITS'].values.sum()
    print("{}: {}".format(label, pipelineUnits))