In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Loading

Мы имеем 4 практически идентичных формата датасетов на портале sfdata.org за исключением наименования колонок и уникального идентификатора. Порядковый номер увеличивается по мере старения формата. 1 - самый новый, 4 самый старый.

Последний отличается от старых форматом идентификаторов объектов, но по факту в новом версии они просто имеют префикс "APN" записанное в колонку "APN", а в старой это число, которое иногда имеет префикс "0" и записано в колонках "BLKLOT" или "Block Lot". В данных до 3 квартала 2014 отсутствует информация об affordable housing.

Для анализа использовались данные только со статусом CONSTRUCTION в колонках "BESTSTAT" (или аналогичном), которые имеют указанное количество NET_UNITS и UNITS большее или равное нулю.

Далее некоторые пермиты не указаны и дальнейшее исследование показало что это большие сложные проекты (вроде создания новых улиц) и мы добавим к статистике позже. На данном этапе просто выведем все проекты без пермитов и удалим их из датасета.

In [2]:
def loadData(fileName, fmt = 1):
    col = 'BP_APPLNO'
    converters = { 'APN': lambda x: x[4:], 'BP_APPLNO': lambda x: 'N'+x.replace(',','') }
    if fmt == 2 or fmt == 3:
        col = 'BP_APPLNO'
        converters = { 'BLKLOT': lambda x: x.lstrip('0'), 'BP_APPLNO': lambda x: 'N'+x.replace(',','') }
    if fmt == 4:
        col = 'DBI Permit'
        converters = { 'Block Lot': lambda x: x.lstrip('0'), 'DBI Permit': lambda x: 'N'+x.replace(',',''),
                     'Location 1': lambda x: x.split('\n')[0]}
        
    X = pd.read_csv(fileName, sep=',', parse_dates=[], infer_datetime_format=True, quotechar='"', converters=converters)
    
    if fmt == 2 or fmt == 3:
        X = X.rename(columns={"AFF_UNITS_NET": "NET_AFF_UNITS", "BLKLOT": "APN"})
    if fmt == 3:
        X = X.rename(columns={"UNITSNET": "NET_UNITS"})
    if fmt == 4:
        X = X.rename(columns={"Units": "UNITS", "Net Added Units": "NET_UNITS", "Best Stat": "BESTSTAT", 'Location 1': 'NAMEADDR', 'Block Lot': 'APN'})
    if fmt == 5:
        X = X.rename(columns={"PROPUSE": "PROJECT_TYPE"})
    X = X[X['BESTSTAT'] == 'CONSTRUCTION']
    X = X[X['UNITS'] >= 0]
    X = X[X['NET_UNITS'] >= 0]
    
    # Filtering records without permits
#     if len(X[X[col] == 'N'])>0:
#         print(fileName)
#         print(X[X[col] == 'N'][[col, 'NAMEADDR','APN', 'UNITS','NET_UNITS','PROJECT_TYPE', "NET_AFF_UNITS"]])
    X = X[X[col] != 'N']
    
#     D = X[X.duplicated(col, keep=False)]
#     if len(D) > 0:
#         print(fileName)
#         print(D[['APN', col, 'NAMEADDR', 'UNITS', 'NET_UNITS']])
#         X=X[X.duplicated(col)]
    
    X = X.set_index(col)
    X.index.names = ['PERMIT_ID']
    
    return X

labels = ['2013\'Q4', '2014\'Q1', '2014\'Q2', '2014\'Q3', '2014\'Q4', '2015\'Q1', '2015\'Q2', '2015\'Q3',
         '2015\'Q4', '2016\'Q1', '2016\'Q2', '2016\'Q3', '2016\'Q4']

data = []

y13q4 = 0
y14q1 = 1
y14q2 = 2
y14q3 = 3
y14q4 = 4
y15q1 = 5
y15q2 = 6
y15q3 = 7
y15q4 = 8
y16q1 = 9
y16q2 = 10
y16q3 = 11
y16q4 = 12
y17q1 = 13

data.append(loadData('data/San_Francisco_Development_Pipeline_2013_Quarter_4.csv', 4))
data.append(loadData('data/San_Francisco_Development_Pipeline_2014_Quarter_1.csv', 4))
data.append(loadData('data/San_Francisco_Development_Pipeline_2014_Quarter_2.csv', 4))
data.append(loadData('data/San_Francisco_Development_Pipeline_2014_Quarter_3.csv', 4))
data.append(loadData('data/San_Francisco_Development_Pipeline_2014_Quarter_4.csv', 3))
data.append(loadData('data/San_Francisco_Development_Pipeline_2015_Quarter_1.csv', 2))
data.append(loadData('data/San_Francisco_Development_Pipeline_2015_Quarter_2.csv'))
data.append(loadData('data/San_Francisco_Development_Pipeline_2015_Quarter_3.csv'))
data.append(loadData('data/San_Francisco_Development_Pipeline_2015_Quarter_4.csv'))
data.append(loadData('data/SF_Development_Pipeline_2016_Q1.csv'))
data.append(loadData('data/SF_Development_Pipeline_2016_Q2.csv', 5))
data.append(loadData('data/SF_Development_Pipeline_2016_Q3.csv'))
data.append(loadData('data/SF_Development_Pipeline_2016_Q4.csv'))
data.append(loadData('data/SF_Development_Pipeline_2017_Q1.csv'))

# Find invalid buildings

In [3]:
# for i in range(len(data)-2):
#     X = data[i]
#     X2 = data[i+1]
#     X = X[X.index.isin(X2.index)].sort_index()
#     X2 = X2[X2.index.isin(X.index)].sort_index()
#     Y = X[X['NET_UNITS'] != X2['NET_UNITS']]
#     if len(Y) > 0:
#         print(labels[i])
#     for index, row in Y.iterrows():        
#         print(Y.loc[index][['NAMEADDR']][0])
#         print("Was: {}".format(Y.loc[index][['NET_UNITS']][0]))
#         print("Became: {}".format(X2.loc[index][['NET_UNITS']][0]))
#         print()

# Detection of completed buildings, p.1
Для поиска завершенных строений мы искали в следующем квартале отсутствующие записи о строительстве сравнивая их по идентификатору стройки.

In [4]:
def buildStats(data):
    buildings = list(map(lambda x: len(x), completed))
    units = list(map(lambda x: x['UNITS'].values.sum(), completed))
    netUnits = list(map(lambda x: x['NET_UNITS'].values.sum(), completed))
    aff = list(map(lambda x: x['AFF_UNITS'].values.sum(), completed[4:]))
    netAff = list(map(lambda x: x['NET_AFF_UNITS'].values.sum(), completed[4:]))
    return { "buildings": buildings, "units": units, "netUnits": netUnits, "aff": aff, "netAff": netAff }

def printStats(stats):
    print("Buildings: {}".format(stats["buildings"]))
    print("Net Units: {}".format(stats["netUnits"]))
    print("Total Units: {}".format(sum(stats["netUnits"])))
    print("Net Affordable Units: {}".format(stats["netAff"]))
    print("Total Affordable Units: {}".format(sum(stats["netAff"])))

def contains(data, key):
    for i in range(len(data)):
        if key in data[i].index:
            return True
    return False

In [5]:
completed = []
for i in range(len(data)-1):
    X = data[i]
    X2 = data[i+1]
    X = X[~X.index.isin(X2.index)]
    completed.append(X)

stats = buildStats(completed)
printStats(stats)

Buildings: [19, 15, 29, 54, 21, 15, 19, 24, 80, 29, 38, 47, 54]
Net Units: [1672.0, 328.0, 732.0, 1761.0, 601, 653, 604, 633, 1524, 901, 865, 983, 977]
Total Units: 12234.0
Net Affordable Units: [160, 10, 34, 167, 64, 96, 175, 116, 71]
Total Affordable Units: 893


# Detection of completed buildings, p.2
Однако, сравнение только со следующим кварталом показало что некоторые строения пропадают и появляются через несколько кварталов, потому мы начали фильтровать со всеми следующими кварталами.

In [6]:
completed = []
for i in range(len(data)-1):
    X = data[i]
    for j in range(i+1,len(data)):
        X2 = data[j]
        X = X[~X.index.isin(X2.index)]
    completed.append(X)

stats = buildStats(completed)
printStats(stats)

Buildings: [19, 15, 29, 49, 20, 14, 19, 24, 80, 29, 38, 45, 54]
Net Units: [1672.0, 328.0, 732.0, 1355.0, 485, 174, 604, 633, 1524, 901, 865, 768, 977]
Total Units: 11018.0
Net Affordable Units: [0, 10, 34, 167, 64, 96, 175, 23, 71]
Total Affordable Units: 640


# Detection of completed buildings, p.3
Однако, мы нашли некоторые стройки которые исчезают и появляются с другим номером, но по тому же адресу. Потому мы начали фильтровать еще и по полному совпадению адреса.

In [7]:
completed = []
for i in range(len(data)-1):
    X = data[i]
    for j in range(i+1,len(data)):
        X2 = data[j]
        X = X[~X.index.isin(X2.index)]
        print("----")
        print("Filtered {}/{}: {}".format(i, j, len(X[X['NAMEADDR'].isin(X2['NAMEADDR'])])))
        print("----")
        print(X[X['NAMEADDR'].isin(X2['NAMEADDR'])].sort_values('NET_UNITS', ascending=False)[['NAMEADDR', 'NET_UNITS', 'UNITS']])
        print("----")        
        print(X2[X2['NAMEADDR'].isin(X['NAMEADDR'])].sort_values('NET_UNITS', ascending=False)[['NAMEADDR', 'NET_UNITS', 'UNITS']])
        print("----")
        
        X = X[~X['NAMEADDR'].isin(X2['NAMEADDR'])]
    completed.append(X)

stats = buildStats(completed)
printStats(stats)

----
Filtered 0/1: 0
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
----
Filtered 0/2: 0
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
----
Filtered 0/3: 0
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
----
Filtered 0/4: 0
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
----
Filtered 0/5: 0
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
----
Filtered 0/6: 0
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
Empty DataFrame
Columns: [NAMEADDR, NET_UNITS, UNITS]
Index: []
----
----
Filtered 0/

In [8]:
# for i in range(len(data)-1):
#     X = data[i]
#     X2 = data[i+1]
#     X = X[X.index.isin(X2.index)]
#     X2 = X2[X2.index.isin(X.index)]
#     print(i)
#     if len(X) != len(X2):
#         if len(X) > len(X2):
#             Y = X2[~X2.index.isin(X.index)]
#         else:
#             Y = X[~X.index.isin(X2.index)]
#         print(Y)
#     for index, row in X.iterrows():
#         if X.loc[index]['NET_UNITS'] != X2.loc[index]['NET_UNITS']:
#             print(True)
#    Y = X[X['NET_UNITS'] != X2['NET_UNITS']]
 #   print(Y[['NAMEADDR','NET_UNITS']])
print(data[5].index)  



Index(['N201003107932', 'N200505273609', 'N200809252660', 'N201207124717',
       'N201306250394', 'N201304023626', 'N201307303137', 'N201212246822',
       'N201312234917', 'N201211284953',
       ...
       'N201404072588', 'N200611178045', 'N200011024683', 'N201108011461',
       'N200711097802', 'N201110066246', 'N200810275193', 'N201011084497',
       'N201203287036', 'N200810245075'],
      dtype='object', name='PERMIT_ID', length=220)


# Detection of completed buildings, p.4
Однако, тк стройки могут пропадать а потом еще раз появляться то весьма вероятно что некоторые стройки все еще не отфильтрованы. Мы произвели ручную фильтрацию данных и нашли одну большую стройку (57 TEHAMA ST) которая еще не завершена (мы прошлись пешком до них и увидели что они все еще в процессе). Наверняка есть еще.

In [9]:
for i in range(len(completed)):
    if '3736078A' in completed[i].index:
        completed[i] = completed[i].drop('3736078A')

stats = buildStats(completed)
printStats(stats)

Buildings: [19, 15, 29, 49, 19, 14, 19, 24, 80, 29, 38, 45, 54]
Net Units: [1672.0, 328.0, 732.0, 1355.0, 479, 174, 604, 633, 1524, 901, 865, 768, 977]
Total Units: 11012.0
Net Affordable Units: [0, 10, 34, 167, 64, 96, 175, 23, 71]
Total Affordable Units: 640


# Improve units

In [15]:
# Folsom 201 st

src = completed[y16q1].loc['N201207124717']
src['UNITS'] = 669
src['NET_UNITS'] = 669
completed[y16q1].loc['N201207124717']=src

# 268 MADISON ST
src = completed[y16q4].loc['N200711077587']
src['UNITS'] = 77 
src['NET_UNITS'] = 77
completed[y16q4].loc['N200711077587N201207124717']=src

# 1 HENRY ADAMS ST 
src = completed[y16q3].loc['N201306250394']
src['UNITS'] = 560
src['NET_UNITS'] = 560
completed[y16q3].loc['N201306250394']=src

stats = buildStats(completed)
printStats(stats)

Buildings: [19, 15, 29, 49, 19, 14, 19, 24, 80, 29, 38, 45, 56]
Net Units: [1672.0, 328.0, 732.0, 1355.0, 479, 174, 604, 633, 1524, 1285, 865, 1089, 1131]
Total Units: 11871.0
Net Affordable Units: [0, 10, 34, 167, 64, 96, 175, 23, 71]
Total Affordable Units: 640


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in 

# Adding buildings wihhout permit

In [16]:

# Stip unwanted columns
truncated = []
for i in range(len(completed)):
    truncated.append(completed[i][['NET_UNITS', 'UNITS', 'NAMEADDR']])

# HP Shipyard Phase I: 229 units, Q3 2016
stats["units"][11] = stats["units"][11] + 229
stats["netUnits"][11] = stats["netUnits"][11] + 229
truncated[11].loc['N<SHIPYARD>'] = [229, 229, 'HP Shipyard Phase I']

# 800 Brotherhood Way 182 units completion in Q4 2016
stats["units"][12] = stats["units"][12] + 182
stats["netUnits"][12] = stats["netUnits"][12] + 182
truncated[12].loc['N<brotherhood>'] = [182, 182, '800 Brotherhood Way']

printStats(stats)

Buildings: [19, 15, 29, 49, 19, 14, 19, 24, 80, 29, 38, 45, 56]
Net Units: [1672.0, 328.0, 732.0, 1355.0, 479, 174, 604, 633, 1524, 1285, 865, 1318, 1313]
Total Units: 12282.0
Net Affordable Units: [0, 10, 34, 167, 64, 96, 175, 23, 71]
Total Affordable Units: 640


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


# Percent of units in top 10 projects for each quartal
Теперь мы посмотрим количество топ 10 завершенных проектов в каждом квартале и то сколько процентов построенных квартир они содержат от общего числа построенных квартир.

In [17]:
for i in range(len(labels)):
    topUnits = truncated[i].sort_values('NET_UNITS', ascending=False).head(10)['NET_UNITS'].values.sum()
    units = truncated[i]['NET_UNITS'].values.sum()
    print(labels[i])
    print(topUnits/units)

2013'Q4
0.995215311005
2014'Q1
0.984756097561
2014'Q2
0.959016393443
2014'Q3
0.935793357934
2014'Q4
0.985386221294
2015'Q1
0.98275862069
2015'Q2
0.990066225166
2015'Q3
0.976303317536
2015'Q4
0.892388451444
2016'Q1
0.984435797665
2016'Q2
0.954913294798
2016'Q3
0.96282245827
2016'Q4
0.90860624524


# Group Stats by Year

In [18]:
for i in range(3):
    yearCount = int(sum(stats['netUnits'][1+i*4:1+i*4+4]))
    print("Year {}: {}".format(2014 + i, yearCount))

Year 2014: 2894
Year 2015: 2935
Year 2016: 4781


# Result Table (top 10)
Как мы видим больше 90% квартир построены в топ-10 проектов, составим отчет по этим строениям за каждый квартал

In [19]:
cols = ['NET_UNITS', 'UNITS', 'NAMEADDR']
for i in range(len(labels)):
    print(labels[i])
    print(truncated[i].sort_values('NET_UNITS', ascending=False)[cols].head(10))

2013'Q4
               NET_UNITS  UNITS             NAMEADDR
PERMIT_ID                                           
N200607207084      754.0  754.0       1401 Market St
N201104224606      315.0  315.0       185 Channel St
N201207054130      273.0  273.0         1155 04th St
N201012217106      115.0  115.0  1960-1998 Market St
N200506246051       88.0   88.0       333 Fremont St
N200912223711       52.0   52.0     63 West Point Rd
N201109074027       24.0   24.0       1600 Market St
N200701051074       19.0   19.0         246 Ritch St
N200912183521       15.0   15.0         1266 09th Av
N200711137944        9.0    9.0         3135 24th St
2014'Q1
               NET_UNITS  UNITS              NAMEADDR
PERMIT_ID                                            
N201106017202      182.0  182.0           260 05th St
N201111038205       40.0   40.0          1501 15th St
N201110146841       38.0   38.0  1645-1661 Pacific Av
N200608290880       35.0   35.0        1080 Sutter St
N201202154236       20.0