In [126]:
import pandas as pd
from datetime import datetime
from datetime import date
from os import listdir
from os.path import isfile, join
from functools import reduce
import numpy as np

In [127]:
df_sales = pd.read_csv('../data/project/NYC Residential Sales Master.csv', index_col=0)

In [128]:
def category_date(d):
    if date(2009,1,1) <= d <= date(2010,12,31):
        return 'p1'
    if date(2011,1,1) <= d <= date(2012,12,31):
        return 'p2'
    if date(2013,1,1) <= d <= date(2014,12,31):
        return 'p3'
    if date(2015,1,1) <= d <= date(2016,12,31):
        return 'p4'
    if date(2017,1,1) <= d <= date(2018,12,31):
        return 'p5'
    if d >= date(2019,1,1):
        return 'p6'
    return None

dict_p = {
    'p1': '2009-2010',
    'p2': '2011-2012',
    'p3': '2013-2014',
    'p4': '2015-2016',
    'p5': '2017-2018',
    'p6': '2019'
}

In [129]:
def group_sales(period, df):
    df_sales_p = df[df['period']==period]
    df_sales_p = df_sales_p.drop_duplicates(keep='first')\
                           .reset_index(drop=True)\
                           .groupby(['ZIP'])\
                           .agg({'SALE PRICE': ['mean', 
                                                'median', 
                                                'count']})

    df_sales_p.columns = [f"average sales price {dict_p[period]}", 
                          f"median sales price {dict_p[period]}", 
                          f"number of transactions {dict_p[period]}"]
    
    df_sales_p = df_sales_p.reset_index()\
                           .astype(dtype={'ZIP': str})\
                           .reset_index(drop=True)
    
    return df_sales_p

In [130]:
def gen_sales_summary(df_sales, export_path):
    df_sales['SALE DATE'] = df_sales['SALE DATE'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y'))
    df_sales['period'] = df_sales['SALE DATE'].apply(category_date)
    
    groups = [p for p in df_sales['period'].unique() if p is not None]
    
    for period in groups:
        df_sales_p = group_sales(period, df_sales)
        df_sales_p.to_csv(f'{export_path}/sales_summary_{period}.csv', index=False)

In [131]:
gen_sales_summary(df_sales, '../data/project/sales_summary')

In [116]:
def merge_dfs(export_path, on, key):
    filenames = [f for f in listdir(export_path) if isfile(join(export_path, f)) and f != '.DS_Store']
    dfs = []
    
    for fn in filenames:
        df = pd.read_csv(f'{export_path}/{fn}')
        dfs.append(df)
    
    df_merged = reduce(lambda left,right: pd.merge(left,right,on=[on],
                                            how='outer'), dfs).fillna('void')
    
    columns = df_merged.columns
    accepted = [col for col in columns if key in col]
    
    return df_merged[[on]+accepted]

In [117]:
sales_summary = merge_dfs('../data/project/sales_summary', 'ZIP', 'average')
sales_summary

Unnamed: 0,ZIP,average sales price 2011-2012,average sales price 2013-2014,average sales price 2009-2010,average sales price 2015-2016,average sales price 2017-2018,average sales price 2019
0,0,6.955782e+05,6.955782e+05,6.955782e+05,6.955782e+05,6.955782e+05,6.955782e+05
1,10001,7.864643e+06,7.864643e+06,7.864643e+06,7.864643e+06,7.864643e+06,7.864643e+06
2,10002,6.234941e+06,6.234941e+06,6.234941e+06,6.234941e+06,6.234941e+06,6.234941e+06
3,10003,7.963247e+06,7.963247e+06,7.963247e+06,7.963247e+06,7.963247e+06,7.963247e+06
4,10004,3.938000e+06,3.938000e+06,3.938000e+06,3.938000e+06,3.938000e+06,3.938000e+06
...,...,...,...,...,...,...,...
172,11436,3.533441e+05,3.533441e+05,3.533441e+05,3.533441e+05,3.533441e+05,3.533441e+05
173,11691,7.646252e+05,7.646252e+05,7.646252e+05,7.646252e+05,7.646252e+05,7.646252e+05
174,11692,4.718308e+05,4.718308e+05,4.718308e+05,4.718308e+05,4.718308e+05,4.718308e+05
175,11693,3.354517e+05,3.354517e+05,3.354517e+05,3.354517e+05,3.354517e+05,3.354517e+05


In [70]:
donations_summary = merge_dfs('../data/project/Political_contributions_summary', 'zip code', 'total').rename(columns={'zip code': 'ZIP'})
donations_summary

Unnamed: 0,ZIP,total donations 2011-2012,total donations 2017-2018,total donations 2019,total donations 2013-2014,total donations 2009-2010,total donations 2015-2016
0,1,3900,17441,4614,2210,9031,5989
1,2,600,12080,2425,1000,19700,1561
2,3,300,6690,425,void,3000,1350
3,4,1300,4163,717,1115,13125,140
4,5,600,4618,250,void,2200,void
...,...,...,...,...,...,...,...
47726,99778,void,void,void,void,void,315
47727,99825,void,void,void,void,void,372
47728,99830,void,void,void,void,void,50
47729,99832,void,void,void,void,void,35


In [71]:
sales_zipcodes = sales_summary['ZIP'].unique()
donations_zipcodes = donations_summary['ZIP'].unique()
zipcodes = list(set(sales_zipcodes).intersection(set(donations_zipcodes)))

In [79]:
sales_data = sales_summary.loc[sales_summary.ZIP.isin(zipcodes)][sales_summary.columns].reset_index(drop=True)
donations_data = donations_summary.loc[donations_summary.ZIP.isin(zipcodes)][donations_summary.columns].reset_index(drop=True)
full_data = pd.merge(sales_data, donations_data, on='ZIP', how='outer').astype(dtype={'ZIP': str})


In [80]:
full_data

Unnamed: 0,ZIP,average sales price 2011-2012,average sales price 2013-2014,average sales price 2009-2010,average sales price 2015-2016,average sales price 2017-2018,average sales price 2019,total donations 2011-2012,total donations 2017-2018,total donations 2019,total donations 2013-2014,total donations 2009-2010,total donations 2015-2016
0,0,6.955782e+05,6.955782e+05,6.955782e+05,6.955782e+05,6.955782e+05,6.955782e+05,void,1.07814e+06,516518,87200,1338,683850
1,10001,7.864643e+06,7.864643e+06,7.864643e+06,7.864643e+06,7.864643e+06,7.864643e+06,679053,6.90089e+06,1.01036e+06,1.46631e+06,287000,4.88778e+06
2,10002,6.234941e+06,6.234941e+06,6.234941e+06,6.234941e+06,6.234941e+06,6.234941e+06,68310,602952,270415,99263,45405,935572
3,10003,7.963247e+06,7.963247e+06,7.963247e+06,7.963247e+06,7.963247e+06,7.963247e+06,1.33528e+06,7.09708e+06,2.54968e+06,1.69273e+06,905278,5.85799e+06
4,10004,3.938000e+06,3.938000e+06,3.938000e+06,3.938000e+06,3.938000e+06,3.938000e+06,315941,1.34976e+06,409908,435572,721756,1.39469e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,11436,3.533441e+05,3.533441e+05,3.533441e+05,3.533441e+05,3.533441e+05,3.533441e+05,1790,24064,16548,3358,250,14282
173,11691,7.646252e+05,7.646252e+05,7.646252e+05,7.646252e+05,7.646252e+05,7.646252e+05,10767,50501,23576,11012,17326,99930
174,11692,4.718308e+05,4.718308e+05,4.718308e+05,4.718308e+05,4.718308e+05,4.718308e+05,250,9583,6447,800,250,12257
175,11693,3.354517e+05,3.354517e+05,3.354517e+05,3.354517e+05,3.354517e+05,3.354517e+05,700,8733,8926,989,7200,14044


In [81]:
sub_markets = pd.read_csv('../data/project/All_Properties with Political_contributions PLUTO_New_Data.csv')
dict_submarkets = dict(zip(sub_markets['ZIP'].astype(str), sub_markets['REIS SUBMARKET']))

In [82]:
def assign_submarket(zipcode, dict_submarkets):
    try:
        return dict_submarkets[zipcode]
    except:
        return None

In [83]:
full_data['submarket'] = full_data['ZIP'].apply(lambda x: assign_submarket(x, dict_submarkets))

In [84]:
full_data

Unnamed: 0,ZIP,average sales price 2011-2012,average sales price 2013-2014,average sales price 2009-2010,average sales price 2015-2016,average sales price 2017-2018,average sales price 2019,total donations 2011-2012,total donations 2017-2018,total donations 2019,total donations 2013-2014,total donations 2009-2010,total donations 2015-2016,submarket
0,0,6.955782e+05,6.955782e+05,6.955782e+05,6.955782e+05,6.955782e+05,6.955782e+05,void,1.07814e+06,516518,87200,1338,683850,Kings County
1,10001,7.864643e+06,7.864643e+06,7.864643e+06,7.864643e+06,7.864643e+06,7.864643e+06,679053,6.90089e+06,1.01036e+06,1.46631e+06,287000,4.88778e+06,Midtown West
2,10002,6.234941e+06,6.234941e+06,6.234941e+06,6.234941e+06,6.234941e+06,6.234941e+06,68310,602952,270415,99263,45405,935572,West Village/Downtown
3,10003,7.963247e+06,7.963247e+06,7.963247e+06,7.963247e+06,7.963247e+06,7.963247e+06,1.33528e+06,7.09708e+06,2.54968e+06,1.69273e+06,905278,5.85799e+06,West Village/Downtown
4,10004,3.938000e+06,3.938000e+06,3.938000e+06,3.938000e+06,3.938000e+06,3.938000e+06,315941,1.34976e+06,409908,435572,721756,1.39469e+06,West Village/Downtown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,11436,3.533441e+05,3.533441e+05,3.533441e+05,3.533441e+05,3.533441e+05,3.533441e+05,1790,24064,16548,3358,250,14282,Queens County
173,11691,7.646252e+05,7.646252e+05,7.646252e+05,7.646252e+05,7.646252e+05,7.646252e+05,10767,50501,23576,11012,17326,99930,Queens County
174,11692,4.718308e+05,4.718308e+05,4.718308e+05,4.718308e+05,4.718308e+05,4.718308e+05,250,9583,6447,800,250,12257,Queens County
175,11693,3.354517e+05,3.354517e+05,3.354517e+05,3.354517e+05,3.354517e+05,3.354517e+05,700,8733,8926,989,7200,14044,Queens County


In [87]:
submarkets = [sub for sub in full_data.submarket.unique() if sub is not None]
submarkets

['Kings County',
 'Midtown West',
 'West Village/Downtown',
 'Staten Island',
 'West village/Downtown',
 'Upper East Side',
 'Upper West Side',
 'Morningside/Washington',
 'Bronx County',
 'Queens County']

In [93]:
data = full_data[full_data.submarket==submarkets[0]]
data

Unnamed: 0,ZIP,average sales price 2011-2012,average sales price 2013-2014,average sales price 2009-2010,average sales price 2015-2016,average sales price 2017-2018,average sales price 2019,total donations 2011-2012,total donations 2017-2018,total donations 2019,total donations 2013-2014,total donations 2009-2010,total donations 2015-2016,submarket
0,0,695578.2,695578.2,695578.2,695578.2,695578.2,695578.2,void,1078140.0,516518.0,87200.0,1338,683850.0,Kings County
7,10007,12975140.0,12975140.0,12975140.0,12975140.0,12975140.0,12975140.0,379961,1660630.0,478061.0,355207.0,194284,1493000.0,Kings County
9,10010,14942150.0,14942150.0,14942150.0,14942150.0,14942150.0,14942150.0,2.43704e+06,24303100.0,2600140.0,1977930.0,794876,19932400.0,Kings County
14,10016,5656355.0,5656355.0,5656355.0,5656355.0,5656355.0,5656355.0,963440,3459980.0,1228560.0,824090.0,605135,3732130.0,Kings County
15,10017,9156126.0,9156126.0,9156126.0,9156126.0,9156126.0,9156126.0,2.17215e+06,9614160.0,3340880.0,2580910.0,2.02131e+06,11384300.0,Kings County
19,10022,7869954.0,7869954.0,7869954.0,7869954.0,7869954.0,7869954.0,7.95732e+06,57335600.0,6121910.0,14879400.0,3.56121e+06,71581900.0,Kings County
88,11201,3694279.0,3694279.0,3694279.0,3694279.0,3694279.0,3694279.0,964474,6972400.0,2515200.0,731658.0,460685,4868870.0,Kings County
89,11203,486842.5,486842.5,486842.5,486842.5,486842.5,486842.5,13305,59393.0,24616.0,16533.0,18553,123283.0,Kings County
90,11204,915431.6,915431.6,915431.6,915431.6,915431.6,915431.6,28007,114673.0,67895.0,107875.0,37900,179625.0,Kings County
91,11205,1336148.0,1336148.0,1336148.0,1336148.0,1336148.0,1336148.0,67982,446245.0,275229.0,33132.0,16181,315104.0,Kings County
