In [1]:
import pandas as pd
import numpy as np
#from shapely import wkt
import geopandas as gpd

# Zoning Data

In [2]:
multifamily = {'R-M', 'R-M(PD)', 'R-M(CL)', 'R-2', 'R-2(PD)', 'UV', 'CP', 
               'CP(PD)', 'MUC', 'MUN', 'DC', 'DC-NT1', 'DC(PD)'}
#confirm what R-M(CL) means
#make sure we don't need to include others: also include UR, TR

In [3]:
sj_zips = gpd.read_file('Zip_Code_Boundary.geojson')
#sj_zips['geometry'] = sj_zips['geometry'].apply(wkt.loads)
#sj_zips = sj_zips.to_crs(epsg = 26910) #converting crs to planar

sj_zones = gpd.read_file('zoning_data/Zoning_2022.geojson')
#sj_zones['geometry'] = sj_zones['geometry'].apply(wkt.loads)

#sj_zones = sj_zones.to_crs(epsg = 26910) #converting crs to planar


In [4]:
#sj_zones['area'] = sj_zones.area #creating area column

In [5]:
#sj_zones

In [6]:
# def calculate_percentage(group):
#     total_area = group['area'].sum()
#     rm_area = group[group['ZONINGABBREV'].isin(multifamily)]['area'].sum()
#     percentage = rm_area / total_area
#     return percentage

def calculate_percentage(group):
    total = len(group)
    rm_rows = len(group[group['ZONINGABBREV'].isin(multifamily)])
    
    return rm_rows / total
    

In [7]:
def calc_area(group):
    return group['area'].sum()

**Final Year**

In [8]:
joined_data = gpd.sjoin(sj_zips, sj_zones, how='inner', predicate='intersects')


percentages = (
    joined_data.groupby('ZIPCODE').apply(calculate_percentage)
    .reset_index(name='multifam_percentage')
)

# areas = (
#     joined_data.groupby('ZIPCODE').apply(calc_area)
#     .reset_index(name='total_area')
# )

joined_data = joined_data.merge(percentages, on = 'ZIPCODE', how = 'left')
#joined_data = joined_data.merge(areas, on = 'ZIPCODE', how = 'left')


abbrev_data = joined_data[['ZIPCODE', 'FACILITYID', 'ZONINGABBREV', 'multifam_percentage']] #would add total area here
abbrev_data['ZIPCODE'] = abbrev_data['ZIPCODE'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abbrev_data['ZIPCODE'] = abbrev_data['ZIPCODE'].astype('int64')


**Early Year (2009)**

In [9]:
base_zones = gpd.read_file('zoning_data/Zoning_2009.geojson')
base_zones = base_zones.to_crs(sj_zones.crs)

#base_zones['area'] = base_zones.area #creating area column


base_joined = gpd.sjoin(sj_zips, base_zones, how='inner', predicate='intersects')

base_percentages = (
    base_joined.groupby('ZIPCODE').apply(calculate_percentage)
    .reset_index(name='multifam_percentage_2009')
)


base_joined = base_joined.merge(base_percentages, on = 'ZIPCODE', how = 'left')


abbrev_base = base_joined[['ZIPCODE', 'FACILITYID', 'ZONINGABBREV', 'multifam_percentage_2009']]
abbrev_base['ZIPCODE'] = abbrev_base['ZIPCODE'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abbrev_base['ZIPCODE'] = abbrev_base['ZIPCODE'].astype('int64')


**Base Year (2015)**

In [10]:
zones_15 = gpd.read_file('zoning_data/Zoning_2015.geojson')

zones_15 = zones_15.to_crs(sj_zones.crs)
#zones_15['area'] = zones_15.area #creating area column


joined_15 = gpd.sjoin(sj_zips, zones_15, how='inner', predicate='intersects')

percentages_15 = (
    joined_15.groupby('ZIPCODE').apply(calculate_percentage)
    .reset_index(name='multifam_percentage_2015')
)



joined_15 = joined_15.merge(percentages_15, on = 'ZIPCODE', how = 'left')


abbrev_15 = joined_15[['ZIPCODE', 'FACILITYID', 'ZONINGABBREV', 'multifam_percentage_2015']] 
abbrev_15['ZIPCODE'] = abbrev_15['ZIPCODE'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abbrev_15['ZIPCODE'] = abbrev_15['ZIPCODE'].astype('int64')


In [11]:
#sorted(abbrev_data['ZIPCODE'].unique())

**Joining**

In [12]:
to_join_base = abbrev_base[['ZIPCODE', 'multifam_percentage_2009']].groupby('ZIPCODE').max()
to_join_15 = abbrev_15[['ZIPCODE', 'multifam_percentage_2015']].groupby('ZIPCODE').max()
to_join_final = abbrev_data[['ZIPCODE', 'multifam_percentage']].groupby('ZIPCODE').max() #would add total area

temp = to_join_base.merge(to_join_15, how = 'inner', left_index = True, right_index = True)
zip_panel = temp.merge(to_join_final, how = 'inner', left_index = True, right_index = True)

In [13]:
#creating independent variables
zip_panel['initial_diff'] = zip_panel['multifam_percentage_2015'] - zip_panel['multifam_percentage_2009']
zip_panel['final_diff'] = zip_panel['multifam_percentage'] - zip_panel['multifam_percentage_2015']

# Rent Data

In [14]:
#final year rents
rents = pd.read_excel('fy2021_safmrs_revised.xlsx')

rents.rename({
    'ZIP\nCode': 'ZIP Code', 
    'SAFMR\n0BR': 'SAFMR 0BR',
    'SAFMR\n0BR - 90%\nPayment\nStandard': 'SAFMR 0BR - 90% Payment Standard',
    'SAFMR\n0BR - 110%\nPayment\nStandard': 'SAFMR 0BR - 110% Payment Standard',
    'SAFMR\n1BR': 'SAFMR 1BR',
       'SAFMR\n1BR - 90%\nPayment\nStandard': 'SAFMR 1BR - 90% Payment Standard',
       'SAFMR\n1BR - 110%\nPayment\nStandard': 'SAFMR 1BR - 110% Payment Standard', 
    'SAFMR\n2BR': 'SAFMR 2BR',
       'SAFMR\n2BR - 90%\nPayment\nStandard': 'SAFMR 2BR - 90% Payment Standard',
       'SAFMR\n2BR - 110%\nPayment\nStandard': 'SAFMR 2BR - 110% Payment Standard', 
    'SAFMR\n3BR': 'SAFMR 3BR',
       'SAFMR\n3BR - 90%\nPayment\nStandard': 'SAFMR 3BR - 90% Payment Standard',
       'SAFMR\n3BR - 110%\nPayment\nStandard': 'SAFMR 3BR - 110% Payment Standard', 
    'SAFMR\n4BR': 'SAFMR 4BR',
       'SAFMR\n4BR - 90%\nPayment\nStandard': 'SAFMR 4BR - 90% Payment Standard',
       'SAFMR\n4BR - 110%\nPayment\nStandard': 'SAFMR 4BR - 110% Payment Standard'
}, axis = 1, inplace = True)

abbrev_rents = rents[['ZIP Code', 'SAFMR 0BR', 'SAFMR 1BR',
                     'SAFMR 2BR', 'SAFMR 3BR', 'SAFMR 4BR']]
del rents

In [15]:
#base year rents
rents_2015 = pd.read_excel('small_area_fmrs_fy2015.xls')

rents_2015 = rents_2015[rents_2015['cntyname'] == 'Santa Clara County']
rents_2015 = rents_2015[['zipcode', 'area_rent_br0', 'area_rent_br1', 'area_rent_br2', 'area_rent_br3', 'area_rent_br4']]

rents_2015.rename({
    'area_rent_br0': '0BR_base', 'area_rent_br1': '1BR_base', 
    'area_rent_br2': '2BR_base', 'area_rent_br3': '3BR_base', 
    'area_rent_br4': '4BR_base'
}, axis = 1, inplace = True)

rents_2015['zipcode'] = rents_2015['zipcode'].astype('int64')

In [16]:
#adjusting for inflation
rents_2015['0BR_base'] = rents_2015['0BR_base'] * 1.12
rents_2015['1BR_base'] = rents_2015['1BR_base'] * 1.12
rents_2015['2BR_base'] = rents_2015['2BR_base'] * 1.12
rents_2015['3BR_base'] = rents_2015['3BR_base'] * 1.12
rents_2015['4BR_base'] = rents_2015['4BR_base'] * 1.12

**Calculating change in rent**

In [17]:
rents_2015.columns

Index(['zipcode', '0BR_base', '1BR_base', '2BR_base', '3BR_base', '4BR_base'], dtype='object')

In [18]:
merged_rents = rents_2015.merge(abbrev_rents, left_on = 'zipcode', right_on = 'ZIP Code')

## 'SAFMR xBR' is the convention used in 2021, compared to xBR_base being used in 2012
merged_rents['0BR_change'] = merged_rents['SAFMR 0BR'] - merged_rents['0BR_base']
merged_rents['1BR_change'] = merged_rents['SAFMR 1BR'] - merged_rents['1BR_base']
merged_rents['2BR_change'] = merged_rents['SAFMR 2BR'] - merged_rents['2BR_base']
merged_rents['3BR_change'] = merged_rents['SAFMR 3BR'] - merged_rents['4BR_base']
merged_rents['4BR_change'] = merged_rents['SAFMR 4BR'] - merged_rents['4BR_base']

# log changes

merged_rents['log_0BR_change'] = np.log(merged_rents['SAFMR 0BR']) - np.log(merged_rents['0BR_base'])
merged_rents['log_1BR_change'] = np.log(merged_rents['SAFMR 1BR']) - np.log(merged_rents['1BR_base'])
merged_rents['log_2BR_change'] = np.log(merged_rents['SAFMR 2BR']) - np.log(merged_rents['2BR_base'])
merged_rents['log_3BR_change'] = np.log(merged_rents['SAFMR 3BR']) - np.log(merged_rents['4BR_base'])
merged_rents['log_4BR_change'] = np.log(merged_rents['SAFMR 4BR']) - np.log(merged_rents['4BR_base'])



changes = merged_rents[['zipcode', '0BR_change', '1BR_change', 
                        '2BR_change', '3BR_change', '4BR_change',
                        'log_0BR_change', 'log_1BR_change', 'log_2BR_change',
                        'log_3BR_change', 'log_4BR_change',
                       '0BR_base', '1BR_base', '2BR_base', '3BR_base', '4BR_base']]

# Base Demographic Data (controls)

In [19]:
test_demo = pd.read_csv('controls_data/full_2015_demographics.csv')

test_demo['Label (Grouping)'] = test_demo['Label (Grouping)'].apply(lambda x: str(x).replace(u'\xa0', u''))
test_demo.columns = test_demo.columns.str.replace('!!', ' ')


test_demo['Label (Grouping)'] = test_demo['Label (Grouping)'].apply(lambda x: str(x).replace(u'\xa0', u''))
test_demo.columns = test_demo.columns.str.replace('!!', ' ')


og = test_demo.copy()


#to get the rows outside of "one race" dropdown
rows = [2] + list(range(60, 75))
test_demo = test_demo.iloc[rows]


#note that median age and total housing units are in 'estimate' columns
vals_to_keep = {'Male', 'White', 'Black or African American',
       'American Indian and Alaska Native', 'Asian', 'Native Hawaiian and Other Pacific Islander',
                'Some Other Race', 'Hispanic or Latino (of any race)',
               }

test_demo = test_demo[test_demo['Label (Grouping)'].isin(vals_to_keep)]


test_demo = test_demo.filter(regex='^(?=.*(?i)percent|label)(?!.*(?i)percent margin of error)')

pivoted = test_demo.set_index('Label (Grouping)').T
pivoted.reset_index(inplace = True)
pivoted.rename({'index': 'ZIP'}, axis = 1, inplace = True)
pivoted['ZIP'] = pivoted['ZIP'].str.extract(r'ZCTA5 (\d+) Percent')[0]

In [20]:
# Getting the non-percent values
og = og.filter(regex='^(?=.*(?i)estimate|label)(?!.*(?i)estimate margin of error)')

og = og.set_index('Label (Grouping)')
og_pivot = og[(og.index == 'Total housing units') | (og.index == 'Median age (years)') | 
              (og.index == 'Total population')].T

og_pivot = og_pivot.T.drop_duplicates(keep = 'first').T

og_pivot.reset_index(inplace = True)
og_pivot.rename({'index': 'ZIP'}, axis = 1, inplace = True)
og_pivot['ZIP'] = og_pivot['ZIP'].str.extract(r'ZCTA5 (\d+) Estimate')[0]

no_income = pivoted.merge(og_pivot, on = 'ZIP')

In [21]:
#2015 income data

raw_med_15 = pd.read_csv('controls_data/sj_med_income_2015.csv')
raw_med_15.columns = raw_med_15.columns.str.replace('!!', ' ')

mean_med = raw_med_15[(raw_med_15['Label (Grouping)'] == 'Median income (dollars)') 
           | (raw_med_15['Label (Grouping)'] == 'Mean income (dollars)')]

mean_med = mean_med.filter(regex=r'^(ZCTA5 \d{5} Households Estimate|Label \(Grouping\))$')


pivoted_income = mean_med.set_index('Label (Grouping)').T


pivoted_income.reset_index(inplace = True)
pivoted_income.rename({'index': 'ZIP'}, axis = 1, inplace = True)
pivoted_income['ZIP'] = pivoted_income['ZIP'].str.extract(r'ZCTA5 (\d+) Households Estimate')[0]
pivoted_income.replace({'-': np.nan}, inplace = True)

pivoted_income['Median income (dollars)'] = (
    pivoted_income['Median income (dollars)'].str.replace(',', '').astype('float64')
)

pivoted_income['Mean income (dollars)'] = (
    pivoted_income['Mean income (dollars)'].str.replace(',', '').astype('float64')
)

full_controls = no_income.merge(pivoted_income, on = 'ZIP')

In [22]:
#cleaning

full_controls.replace('-', np.nan, inplace = True)

def clean_pct(column):
    return column.str.rstrip('%').astype(float) / 100

def clean_totals(val):
    if isinstance(val, str):
        return float(val.replace(',', ''))
    else:
        return val

to_clean = ['Male', 'White', 'Black or African American', 'American Indian and Alaska Native',
           'Asian', 'Native Hawaiian and Other Pacific Islander', 'Hispanic or Latino (of any race)']

full_controls[to_clean] = full_controls[to_clean].apply(clean_pct)

full_controls['Total population'] = full_controls['Total population'].apply(clean_totals)
full_controls['Total housing units'] = full_controls['Total housing units'].apply(clean_totals)



full_controls['Density per unit'] = full_controls['Total population'] / full_controls['Total housing units']
#careful: should I do this so that they're technically mutually exclusive?
full_controls['Minority'] = full_controls['Black or African American'] + full_controls['Hispanic or Latino (of any race)']

full_controls['ZIP'] = full_controls['ZIP'].astype('int64')


# Base year Housing Characteristics

In [23]:
housing_2015 = pd.read_csv('controls_data/2015_housing_characteristics.csv')

housing_2015['Label (Grouping)'] = housing_2015['Label (Grouping)'].apply(lambda x: str(x).replace(u'\xa0', u''))
housing_2015.columns = housing_2015.columns.str.replace('!!', ' ')

In [24]:
housing_chars = {'Vacant housing units', 'Occupied housing units', 'Rental vacancy rate',
'Average household size of renter-occupied unit', 'Owner-occupied units', 
                 'Renter-occupied', 'Average household size of renter-occupied unit'}

selected_housing = housing_2015[housing_2015['Label (Grouping)'].isin(housing_chars)]

In [25]:
pivoted_housing = selected_housing.set_index('Label (Grouping)').T


pivoted_housing.reset_index(inplace = True)
pivoted_housing.rename({'index': 'ZIP'}, axis = 1, inplace = True)
pivoted_housing['ZIP'] = pivoted_housing['ZIP'].str.extract(r'ZCTA5 (\d+) Estimate')[0]
pivoted_housing.replace({'-': np.nan}, inplace = True)
pivoted_housing = pivoted_housing[~pivoted_housing['ZIP'].isna()].reset_index().drop(columns = ['index'])

In [26]:
column_numbers = [0, 1, 2, 3, 5, 6, 12]
pivoted_housing = pivoted_housing.iloc[:, column_numbers]

In [27]:
selected_housing_pct = selected_housing.set_index('Label (Grouping)').T
selected_housing_pct = selected_housing_pct[selected_housing_pct.index.str.contains('Percent')]

selected_housing_pct = selected_housing_pct[~selected_housing_pct.index.str.contains('Margin')]

In [28]:
column_numbers_pct = [0, 1, 4]
selected_housing_pct = selected_housing_pct.iloc[:, column_numbers_pct]

In [29]:
pivoted_housing_pct = selected_housing_pct
pivoted_housing_pct.reset_index(inplace = True)
pivoted_housing_pct.rename({'index': 'ZIP'}, axis = 1, inplace = True)
pivoted_housing_pct['ZIP'] = pivoted_housing_pct['ZIP'].str.extract(r'ZCTA5 (\d+) Percent')[0]
pivoted_housing_pct.replace({'-': np.nan}, inplace = True)
pivoted_housing_pct = pivoted_housing_pct[~pivoted_housing_pct['ZIP'].isna()].reset_index().drop(columns = ['index'])

In [30]:
pivoted_housing_pct = pivoted_housing_pct.set_index('ZIP').add_suffix('_pct')

In [31]:
full_housing = pivoted_housing_pct.merge(pivoted_housing, left_index = True, right_on = 'ZIP').set_index('ZIP')

In [32]:
def clean_int(val):
    return pd.to_numeric(val.str.replace(',', ''), errors='coerce')

In [33]:
pct_cols = ['Occupied housing units_pct', 'Vacant housing units_pct', 'Renter-occupied_pct']

non_pct_cols = ['Occupied housing units', 'Vacant housing units', 'Renter-occupied',
               'Average household size of renter-occupied unit', 'Owner-occupied units',
               'Rental vacancy rate']

full_housing[pct_cols] = full_housing[pct_cols].apply(clean_pct)
full_housing[non_pct_cols] = full_housing[non_pct_cols].apply(clean_int)

full_housing['Rental vacancy rate'] = full_housing['Rental vacancy rate'] / 100

In [34]:
full_housing = full_housing.reset_index()
full_housing['ZIP'] = full_housing['ZIP'].astype('int64')

In [35]:
#merging demographic controls with housing characteristics
full_controls = full_controls.merge(full_housing, left_on = 'ZIP', right_on = 'ZIP')

# Final year demographic data

In [36]:
#pd.read_csv('controls_data/full_2012_demographics.csv')

In [37]:
demo_2021 = pd.read_csv('controls_data/full_2021_demographics.csv')

demo_2021['Label (Grouping)'] = demo_2021['Label (Grouping)'].apply(lambda x: str(x).replace(u'\xa0', u''))
demo_2021.columns = demo_2021.columns.str.replace('!!', ' ')


demo_2021['Label (Grouping)'] = demo_2021['Label (Grouping)'].apply(lambda x: str(x).replace(u'\xa0', u''))
demo_2021.columns = demo_2021.columns.str.replace('!!', ' ')


og_2021 = demo_2021.copy()

#to get the rows outside of "one race" dropdown
rows = [2] + [38, 39, 40, 45, 53, 58, 74]
demo_2021 = demo_2021.iloc[rows]


#note that median age and total housing units are in 'estimate' columns
vals_to_keep = {'Male', 'White', 'Black or African American',
       'American Indian and Alaska Native', 'Asian', 'Native Hawaiian and Other Pacific Islander',
                'Some Other Race', 'Hispanic or Latino (of any race)',
               }

demo_2021 = demo_2021[demo_2021['Label (Grouping)'].isin(vals_to_keep)]


demo_2021 = demo_2021.filter(regex='^(?=.*(?i)percent|label)(?!.*(?i)percent margin of error)')

pivoted_21 = demo_2021.set_index('Label (Grouping)').T
pivoted_21.reset_index(inplace = True)
pivoted_21.rename({'index': 'ZIP'}, axis = 1, inplace = True)
pivoted_21['ZIP'] = pivoted_21['ZIP'].str.extract(r'ZCTA5 (\d+) Percent')[0]

In [38]:
# Getting the non-percent values
og_2021 = og_2021.filter(regex='^(?=.*(?i)estimate|label)(?!.*(?i)estimate margin of error)')

og_2021 = og_2021.set_index('Label (Grouping)')
og_21_pivot = og_2021[(og_2021.index == 'Total housing units') | (og_2021.index == 'Median age (years)') | 
              (og_2021.index == 'Total population')].T

og_21_pivot = og_21_pivot.T.drop_duplicates(keep = 'first').T

og_21_pivot.reset_index(inplace = True)
og_21_pivot.rename({'index': 'ZIP'}, axis = 1, inplace = True)
og_21_pivot['ZIP'] = og_21_pivot['ZIP'].str.extract(r'ZCTA5 (\d+) Estimate')[0]

no_income_21 = pivoted_21.merge(og_21_pivot, on = 'ZIP')

In [39]:
raw_med_21 = pd.read_csv('controls_data/sj_med_income_2021.csv')
raw_med_21.columns = raw_med_21.columns.str.replace('!!', ' ')

mean_med_21 = raw_med_21[(raw_med_21['Label (Grouping)'] == 'Median income (dollars)') 
           | (raw_med_21['Label (Grouping)'] == 'Mean income (dollars)')]

mean_med_21 = mean_med_21.filter(regex=r'^(ZCTA5 \d{5} Households Estimate|Label \(Grouping\))$')


pivoted_income_21 = mean_med_21.set_index('Label (Grouping)').T


pivoted_income_21.reset_index(inplace = True)
pivoted_income_21.rename({'index': 'ZIP'}, axis = 1, inplace = True)
pivoted_income_21['ZIP'] = pivoted_income_21['ZIP'].str.extract(r'ZCTA5 (\d+) Households Estimate')[0]
pivoted_income_21.replace({'-': np.NaN, 'N': np.NaN}, inplace = True)



pivoted_income_21['Median income (dollars)'] = (
     pivoted_income_21['Median income (dollars)'].str.replace(',', '').astype('float64')
)

pivoted_income_21['Mean income (dollars)'].str.replace(',', '').unique()
pivoted_income_21['Mean income (dollars)'] = (
     pivoted_income_21['Mean income (dollars)'].str.replace(',', '').astype('float64')
)

full_controls_21 = no_income_21.merge(pivoted_income_21, on = 'ZIP')

In [40]:
#cleaning

full_controls_21.replace('-', np.nan, inplace = True)

def clean_pct(column):
    return column.str.rstrip('%').astype(float) / 100

def clean_totals(val):
    if isinstance(val, str):
        return float(val.replace(',', ''))
    else:
        return val

to_clean = ['Male', 'White', 'Black or African American', 'American Indian and Alaska Native',
           'Asian', 'Native Hawaiian and Other Pacific Islander', 'Hispanic or Latino (of any race)']

full_controls_21[to_clean] = full_controls_21[to_clean].apply(clean_pct)

full_controls_21['Total population'] = full_controls_21['Total population'].apply(clean_totals)
full_controls_21['Total housing units'] = full_controls_21['Total housing units'].apply(clean_totals)


full_controls_21['ZIP'] = full_controls_21['ZIP'].astype('int64')

full_controls_21['Minority'] = full_controls_21['Black or African American'] + full_controls_21['Hispanic or Latino (of any race)']
full_controls_21['Density per unit'] = full_controls_21['Total population'] / full_controls_21['Total housing units']



# Creating Final Panel

In [41]:
#merge zoning df with rent df
temp_full = zip_panel.merge(changes, left_index = True, right_on = 'zipcode').set_index('zipcode')

In [42]:
#merge zoning+rent df with demographic controls
final_panel = (
    temp_full.merge(full_controls, 
                    left_index = True, right_on = 'ZIP')
    .set_index('ZIP')
)

final_panel['Median age (years)'] = final_panel['Median age (years)'].astype(float)

In [43]:
#log scaling columns
final_panel['log_0BR_base'] = np.log(final_panel['0BR_base'])
final_panel['log_1BR_base'] = np.log(final_panel['1BR_base'])
final_panel['log_2BR_base'] = np.log(final_panel['2BR_base'])
final_panel['log_3BR_base'] = np.log(final_panel['3BR_base'])
final_panel['log_4BR_base'] = np.log(final_panel['4BR_base'])

final_panel['log_population'] = np.log(final_panel['Total population'])

final_panel['log_housing_units'] = np.log(final_panel['Total housing units'])

final_panel['log_med_income'] = np.log(final_panel['Median income (dollars)'])

final_panel.replace(float('-inf'), np.NaN, inplace = True)

In [44]:
#doesn't work, area calculation incorrect
# final_panel['Population density (per km)'] = (
#     final_panel['Total population'] / (final_panel['total_area'] / 10**6)
# )

In [45]:
#sj_zips[sj_zips['ZIPCODE'] == '95014']['geometry'].plot()

In [46]:
final_panel.to_csv('new_full_data.csv')

# EDA

In [45]:
corr = final_panel.corr(numeric_only = False)
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,multifam_percentage_2012,multifam_percentage_2015,multifam_percentage,initial_diff,final_diff,0BR_change,1BR_change,2BR_change,3BR_change,4BR_change,log_0BR_change,log_1BR_change,log_2BR_change,log_3BR_change,log_4BR_change,0BR_base,1BR_base,2BR_base,3BR_base,4BR_base,Male,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Hispanic or Latino (of any race),Total population,Median age (years),Total housing units,Median income (dollars),Mean income (dollars),Density per unit,Minority,Occupied housing units_pct,Vacant housing units_pct,Renter-occupied_pct,Occupied housing units,Vacant housing units,Rental vacancy rate,Renter-occupied,Average household size of renter-occupied unit,Owner-occupied units,log_0BR_base,log_1BR_base,log_2BR_base,log_3BR_base,log_4BR_base,log_population,log_housing_units,log_med_income
multifam_percentage_2012,1.0,0.997958,0.706342,0.203954,-0.127921,-0.048581,0.011465,-0.01341,0.109368,0.04342,0.183604,0.192503,0.193352,0.200607,0.194218,-0.505435,-0.509624,-0.507493,-0.508635,-0.508174,0.049382,0.134743,-0.02515,0.238672,-0.195304,-0.251798,0.037081,0.27738,0.014022,0.363167,-0.306445,-0.217615,-0.232459,0.033937,-0.119921,0.119921,0.354718,0.366947,0.230982,-0.27177,0.418582,-0.120838,0.223513,-0.507109,-0.511208,-0.50943,-0.510715,-0.510063,0.238499,0.299242,-0.305776
multifam_percentage_2015,0.997958,1.0,0.698071,0.266073,-0.141616,-0.061406,-0.004356,-0.028236,0.089302,0.025874,0.158451,0.166896,0.167671,0.174853,0.168481,-0.481046,-0.485041,-0.483092,-0.484281,-0.483803,0.041428,0.133738,-0.056598,0.212976,-0.189321,-0.2474,0.036293,0.28984,0.006198,0.379206,-0.284996,-0.200633,-0.216156,0.029291,-0.083219,0.083219,0.339454,0.383152,0.241196,-0.269224,0.434863,-0.11182,0.235096,-0.481499,-0.485358,-0.483813,-0.485163,-0.484487,0.259258,0.316336,-0.274944
multifam_percentage,0.706342,0.698071,1.0,0.039405,0.609954,0.125422,0.206588,0.178489,0.331735,0.254542,0.446081,0.459313,0.463517,0.474133,0.468722,-0.652187,-0.65341,-0.651097,-0.651145,-0.65118,0.212991,0.054108,0.244978,0.264175,-0.278002,-0.337014,0.292006,-0.016581,-0.167955,0.08664,-0.576023,-0.466458,-0.407618,0.321607,-0.458128,0.458128,0.695748,0.082595,0.13966,-0.031677,0.385085,-0.310218,-0.175598,-0.68799,-0.689551,-0.687603,-0.687539,-0.687738,-0.12089,-0.036845,-0.660117
initial_diff,0.203954,0.266073,0.039405,1.0,-0.239988,-0.207985,-0.239783,-0.230374,-0.281801,-0.258694,-0.34229,-0.347164,-0.348097,-0.347511,-0.348753,0.254881,0.256863,0.254579,0.253593,0.253961,-0.110288,0.016296,-0.48788,-0.337675,0.045747,0.008164,-0.003351,0.256202,-0.116606,0.331244,0.256628,0.209069,0.195175,-0.063213,0.534287,-0.534287,-0.150492,0.334679,0.210892,-0.024905,0.34799,0.109793,0.230097,0.273197,0.275905,0.272762,0.271457,0.271977,0.374257,0.332385,0.400583
final_diff,-0.127921,-0.141616,0.609954,-0.239988,1.0,0.241356,0.290433,0.278014,0.359798,0.323274,0.441354,0.450303,0.455256,0.461985,0.461556,-0.369281,-0.366549,-0.36551,-0.36426,-0.364838,0.248614,-0.073204,0.401324,0.129523,-0.174819,-0.192127,0.363536,-0.343692,-0.23906,-0.29989,-0.480953,-0.422843,-0.324316,0.41221,-0.541271,0.541271,0.586206,-0.309849,-0.073853,0.254159,0.051119,-0.30513,-0.50295,-0.418277,-0.416165,-0.415182,-0.413599,-0.414621,-0.454055,-0.401031,-0.608339
0BR_change,-0.048581,-0.061406,0.125422,-0.207985,0.241356,1.0,0.991141,0.993926,0.937559,0.971514,0.88513,0.871366,0.862211,0.847345,0.849402,0.068897,0.072954,0.073843,0.074756,0.07543,0.025892,-0.117895,0.193499,-0.269554,0.26392,-0.403357,-0.506226,-0.331899,0.45912,-0.257009,0.261298,0.359256,-0.488986,-0.481685,-0.374841,0.374841,0.064808,-0.255635,-0.232654,0.147787,-0.30929,-0.45366,-0.141978,0.044206,0.047051,0.048493,0.049811,0.050043,-0.242992,-0.155015,0.063427
1BR_change,0.011465,-0.004356,0.206588,-0.239783,0.290433,0.991141,1.0,0.998959,0.975173,0.993866,0.934172,0.925159,0.919073,0.908121,0.909683,-0.059083,-0.055095,-0.054238,-0.05323,-0.052646,0.071344,-0.082327,0.203912,-0.22391,0.201417,-0.445909,-0.44557,-0.341458,0.43834,-0.265182,0.187292,0.288893,-0.508939,-0.419846,-0.398334,0.398334,0.136344,-0.263952,-0.236855,0.101482,-0.275498,-0.483823,-0.180658,-0.083892,-0.081045,-0.079725,-0.078332,-0.078175,-0.271645,-0.179891,-0.019732
2BR_change,-0.01341,-0.028236,0.178489,-0.230374,0.278014,0.993926,0.998959,1.0,0.967899,0.991068,0.91947,0.91032,0.904557,0.893442,0.895276,-0.0235,-0.019293,-0.018604,-0.017589,-0.01699,0.065388,-0.091811,0.188504,-0.242621,0.218154,-0.443286,-0.454597,-0.341965,0.445893,-0.271146,0.217349,0.314092,-0.488868,-0.43075,-0.386221,0.386221,0.108554,-0.270026,-0.239819,0.112635,-0.293584,-0.470458,-0.175692,-0.048464,-0.04541,-0.044251,-0.042846,-0.042679,-0.273662,-0.18612,0.009952
3BR_change,0.109368,0.089302,0.331735,-0.281801,0.359798,0.937559,0.975173,0.967899,1.0,0.992148,0.979025,0.977406,0.976937,0.973111,0.973661,-0.272449,-0.268335,-0.267735,-0.266719,-0.266186,0.14275,-0.020915,0.208179,-0.138741,0.088551,-0.497038,-0.319866,-0.338226,0.386136,-0.265772,0.05705,0.15978,-0.511824,-0.293816,-0.416359,0.416359,0.242069,-0.264705,-0.234544,0.022154,-0.212671,-0.504469,-0.230578,-0.295835,-0.292686,-0.291782,-0.290437,-0.290287,-0.303075,-0.210389,-0.155142
4BR_change,0.04342,0.025874,0.254542,-0.258694,0.323274,0.971514,0.993866,0.991068,0.992148,1.0,0.956047,0.951172,0.948595,0.94147,0.943167,-0.152233,-0.147892,-0.147327,-0.146292,-0.145728,0.119757,-0.047033,0.191634,-0.200333,0.146874,-0.483482,-0.386412,-0.350388,0.425503,-0.279708,0.146393,0.242484,-0.500641,-0.362288,-0.399444,0.399444,0.172191,-0.278663,-0.245514,0.062931,-0.261891,-0.496731,-0.21322,-0.176443,-0.173164,-0.172215,-0.170818,-0.170665,-0.306635,-0.217371,-0.068301


# 0br 

In [46]:
final_panel.columns

Index(['multifam_percentage_2012', 'multifam_percentage_2015',
       'multifam_percentage', 'initial_diff', 'final_diff', '0BR_change',
       '1BR_change', '2BR_change', '3BR_change', '4BR_change',
       'log_0BR_change', 'log_1BR_change', 'log_2BR_change', 'log_3BR_change',
       'log_4BR_change', '0BR_base', '1BR_base', '2BR_base', '3BR_base',
       '4BR_base', 'Male', 'White', 'Black or African American',
       'American Indian and Alaska Native', 'Asian',
       'Native Hawaiian and Other Pacific Islander',
       'Hispanic or Latino (of any race)', 'Total population',
       'Median age (years)', 'Total housing units', 'Median income (dollars)',
       'Mean income (dollars)', 'Density per unit', 'Minority',
       'Occupied housing units_pct', 'Vacant housing units_pct',
       'Renter-occupied_pct', 'Occupied housing units', 'Vacant housing units',
       'Rental vacancy rate', 'Renter-occupied',
       'Average household size of renter-occupied unit',
       'Owner-occupi

In [47]:
indep = ['multifam_percentage_2012', 'initial_diff', 'final_diff',
        'White', 'Minority',
       'Black or African American', 'American Indian and Alaska Native',
       'Asian', 'Native Hawaiian and Other Pacific Islander',
       'Hispanic or Latino (of any race)', 'Total population',
       'Median age (years)', 'Total housing units', 'Median income (dollars)',
       'Mean income (dollars)', 'Density per unit', 'log_0BR_base',
       'log_1BR_base', 'log_2BR_base', 'log_3BR_base', 'log_4BR_base',
       'log_population', 'log_med_income', 'log_housing_units', 'Occupied housing units_pct', 'Vacant housing units_pct',
       'Renter-occupied_pct', 'Occupied housing units', 'Vacant housing units',
       'Rental vacancy rate', 'Renter-occupied',
       'Average household size of renter-occupied unit',
       'Owner-occupied units']

In [48]:
for var in indep:

    print(f"The correlation between 0br_change and {var} is: {final_panel['0BR_change'].corr(final_panel[var])}")

The correlation between 0br_change and multifam_percentage_2012 is: -0.04858070459454216
The correlation between 0br_change and initial_diff is: -0.20798496249064907
The correlation between 0br_change and final_diff is: 0.24135639725606994
The correlation between 0br_change and White is: -0.11789479731978526
The correlation between 0br_change and Minority is: -0.48168469668567787
The correlation between 0br_change and Black or African American is: 0.19349938484426657
The correlation between 0br_change and American Indian and Alaska Native is: -0.2695543438476864
The correlation between 0br_change and Asian is: 0.26392033329725256
The correlation between 0br_change and Native Hawaiian and Other Pacific Islander is: -0.40335710480469816
The correlation between 0br_change and Hispanic or Latino (of any race) is: -0.5062260420345247
The correlation between 0br_change and Total population is: -0.331899459803246
The correlation between 0br_change and Median age (years) is: 0.4591200251016593

# Rezoning correlations

In [49]:
rezone_factors = ['multifam_percentage_2012', 'initial_diff',
        'White', 'Minority',
       'Black or African American', 'American Indian and Alaska Native',
       'Asian', 'Native Hawaiian and Other Pacific Islander',
       'Hispanic or Latino (of any race)', 'Total population',
       'Median age (years)', 'Total housing units', 'Median income (dollars)',
       'Mean income (dollars)', 'Density per unit', 'log_0BR_base',
       'log_1BR_base', 'log_2BR_base', 'log_3BR_base', 'log_4BR_base',
       'log_population', 'log_med_income', 'log_housing_units', 'Occupied housing units_pct', 'Vacant housing units_pct',
       'Renter-occupied_pct', 'Occupied housing units', 'Vacant housing units',
       'Rental vacancy rate', 'Renter-occupied',
       'Average household size of renter-occupied unit',
       'Owner-occupied units']

In [50]:
for var in rezone_factors:

    print(f"The correlation between final_diff and {var} is: {final_panel['final_diff'].corr(final_panel[var])}")

The correlation between final_diff and multifam_percentage_2012 is: -0.1279206001746311
The correlation between final_diff and initial_diff is: -0.23998790536651218
The correlation between final_diff and White is: -0.07320381808987696
The correlation between final_diff and Minority is: 0.412209866992705
The correlation between final_diff and Black or African American is: 0.40132361914535625
The correlation between final_diff and American Indian and Alaska Native is: 0.12952335040284413
The correlation between final_diff and Asian is: -0.17481920426595696
The correlation between final_diff and Native Hawaiian and Other Pacific Islander is: -0.1921265857770428
The correlation between final_diff and Hispanic or Latino (of any race) is: 0.36353637441066566
The correlation between final_diff and Total population is: -0.3436922154603921
The correlation between final_diff and Median age (years) is: -0.23905960076221172
The correlation between final_diff and Total housing units is: -0.29988969

In [51]:
for var in rezone_factors:

    print(f"The correlation between initial_diff and {var} is: {final_panel['initial_diff'].corr(final_panel[var])}")

The correlation between initial_diff and multifam_percentage_2012 is: 0.20395436047743057
The correlation between initial_diff and initial_diff is: 0.9999999999999998
The correlation between initial_diff and White is: 0.01629598811614267
The correlation between initial_diff and Minority is: -0.06321323343407614
The correlation between initial_diff and Black or African American is: -0.4878803408951493
The correlation between initial_diff and American Indian and Alaska Native is: -0.33767496820595266
The correlation between initial_diff and Asian is: 0.04574652101404406
The correlation between initial_diff and Native Hawaiian and Other Pacific Islander is: 0.008163824369713711
The correlation between initial_diff and Hispanic or Latino (of any race) is: -0.0033512138155624357
The correlation between initial_diff and Total population is: 0.2562019367237853
The correlation between initial_diff and Median age (years) is: -0.11660625713392732
The correlation between initial_diff and Total ho

# Rent model

regress br_change final_diff initial_diff blackorafricanamerican hispanicorlatinoofanyrace log_0br_base medianageyears medianincomedollars averagehouseholdsizeofrenteroccu log_housing_units

- higher adjusted r^2 (0.41 vs 0.39)
- changing to log_med_income brings this down to 0.37 adjusted
- different effect for hispanic (-779) vs black (2340), but black isn't statistically significant


regress br_change final_diff initial_diff minority log_0br_base medianageyears log_med_income rentalvacancyrate renteroccupied_pct log_housing_units
- adjusted r^2 of 0.39
- final diff p val: 0.014, initial_diff 0.195

**old best:** regress br_change final_diff initial_diff blackorafricanamerican hispanicorlatinoofanyrace log_0br_base medianageyears log_med_income rentalvacancyrate renteroccupied_pct log_housing_units multifam_percentage_2012

- adjusted r^2 of 0.378
- final_diff p-val: 0.028
- but, multifam_pct_2012 is least statistically significant, 0.798 pval

regress br_change final_diff initial_diff blackorafricanamerican hispanicorlatinoofanyrace medianageyears log_med_income rentalvacancyrate renteroccupied_pct log_housing_units

- adj r^2 of 0.397
- final_diff pval 0.011


**Best:** regress br_change final_diff initial_diff minority medianageyears log_med_income rentalvacancyrate renteroccupied_pct log_housing_units

- adj r^2 of 0.41
- final_diff pval 0.010


# Log rent model

**Best:** regress log_0br_change final_diff initial_diff minority medianageyears log_med_income log_housing_units rentalvacancyrate renteroccupied_pct log_0br_base

- final diff coeff 0.7, p-value 0.010
- adjusted r^2 0.54


regress log_0br_change final_diff initial_diff minority log_0br_base medianageyears log_med_income averagehouseholdsizeofrenteroccu log_housing_units rentalvacancyrate renteroccupied_pct

- final diff coeff: 0.72, p-value 0.01
- adjusted r^2 0.53


regress log_0br_change final_diff initial_diff minority log_0br_base medianageyears medianincomedollars averagehouseholdsizeofrenteroccu log_housing_units rentalvacancyrate renteroccupied_pct

- final diff coeff: 0.7, p value 0.007
- adjusted r^2 0.55


# Rezoning model

# Demographic model

In [47]:
demo_2021 = pd.read_csv('controls_data/full_2021_demographics.csv')

demo_2021['Label (Grouping)'] = demo_2021['Label (Grouping)'].apply(lambda x: str(x).replace(u'\xa0', u''))
demo_2021.columns = demo_2021.columns.str.replace('!!', ' ')


demo_2021['Label (Grouping)'] = demo_2021['Label (Grouping)'].apply(lambda x: str(x).replace(u'\xa0', u''))
demo_2021.columns = demo_2021.columns.str.replace('!!', ' ')


og_2021 = demo_2021.copy()

#to get the rows outside of "one race" dropdown
rows = [2] + [38, 39, 40, 45, 53, 58, 74]
demo_2021 = demo_2021.iloc[rows]


#note that median age and total housing units are in 'estimate' columns
vals_to_keep = {'Male', 'White', 'Black or African American',
       'American Indian and Alaska Native', 'Asian', 'Native Hawaiian and Other Pacific Islander',
                'Some Other Race', 'Hispanic or Latino (of any race)',
               }

demo_2021 = demo_2021[demo_2021['Label (Grouping)'].isin(vals_to_keep)]


demo_2021 = demo_2021.filter(regex='^(?=.*(?i)percent|label)(?!.*(?i)percent margin of error)')

pivoted_21 = demo_2021.set_index('Label (Grouping)').T
pivoted_21.reset_index(inplace = True)
pivoted_21.rename({'index': 'ZIP'}, axis = 1, inplace = True)
pivoted_21['ZIP'] = pivoted_21['ZIP'].str.extract(r'ZCTA5 (\d+) Percent')[0]

In [48]:
# Getting the non-percent values
og_2021 = og_2021.filter(regex='^(?=.*(?i)estimate|label)(?!.*(?i)estimate margin of error)')

og_2021 = og_2021.set_index('Label (Grouping)')
og_21_pivot = og_2021[(og_2021.index == 'Total housing units') | (og_2021.index == 'Median age (years)') | 
              (og_2021.index == 'Total population')].T

og_21_pivot = og_21_pivot.T.drop_duplicates(keep = 'first').T

og_21_pivot.reset_index(inplace = True)
og_21_pivot.rename({'index': 'ZIP'}, axis = 1, inplace = True)
og_21_pivot['ZIP'] = og_21_pivot['ZIP'].str.extract(r'ZCTA5 (\d+) Estimate')[0]

no_income_21 = pivoted_21.merge(og_21_pivot, on = 'ZIP')

In [49]:
raw_med_21 = pd.read_csv('controls_data/sj_med_income_2021.csv')
raw_med_21.columns = raw_med_21.columns.str.replace('!!', ' ')

mean_med_21 = raw_med_21[(raw_med_21['Label (Grouping)'] == 'Median income (dollars)') 
           | (raw_med_21['Label (Grouping)'] == 'Mean income (dollars)')]

mean_med_21 = mean_med_21.filter(regex=r'^(ZCTA5 \d{5} Households Estimate|Label \(Grouping\))$')


pivoted_income_21 = mean_med_21.set_index('Label (Grouping)').T


pivoted_income_21.reset_index(inplace = True)
pivoted_income_21.rename({'index': 'ZIP'}, axis = 1, inplace = True)
pivoted_income_21['ZIP'] = pivoted_income_21['ZIP'].str.extract(r'ZCTA5 (\d+) Households Estimate')[0]
pivoted_income_21.replace({'-': np.NaN, 'N': np.NaN}, inplace = True)



pivoted_income_21['Median income (dollars)'] = (
     pivoted_income_21['Median income (dollars)'].str.replace(',', '').astype('float64')
)

pivoted_income_21['Mean income (dollars)'].str.replace(',', '').unique()
pivoted_income_21['Mean income (dollars)'] = (
     pivoted_income_21['Mean income (dollars)'].str.replace(',', '').astype('float64')
)

full_controls_21 = no_income_21.merge(pivoted_income_21, on = 'ZIP')

In [50]:
#cleaning

full_controls_21.replace('-', np.nan, inplace = True)

def clean_pct(column):
    return column.str.rstrip('%').astype(float) / 100

def clean_totals(val):
    if isinstance(val, str):
        return float(val.replace(',', ''))
    else:
        return val

to_clean = ['Male', 'White', 'Black or African American', 'American Indian and Alaska Native',
           'Asian', 'Native Hawaiian and Other Pacific Islander', 'Hispanic or Latino (of any race)']

full_controls_21[to_clean] = full_controls_21[to_clean].apply(clean_pct)

full_controls_21['Total population'] = full_controls_21['Total population'].apply(clean_totals)
full_controls_21['Total housing units'] = full_controls_21['Total housing units'].apply(clean_totals)


full_controls_21['ZIP'] = full_controls_21['ZIP'].astype('int64')

full_controls_21['Density per unit'] = full_controls_21['Total population'] / full_controls_21['Total housing units']

full_controls_21['Minority'] = full_controls_21['Black or African American'] + full_controls_21['Hispanic or Latino (of any race)']


In [51]:
#get these for 2015 dp05

In [52]:
set(full_controls_21['ZIP'].unique()) - set(full_controls['ZIP'].unique())

{95013, 95050, 95134, 95135}

In [None]:
#get these for 2021 dp05

In [73]:
set(full_controls['ZIP'].unique()) - set(full_controls_21['ZIP'].unique())

{95140}

In [65]:
controls_21_copy = full_controls_21.copy()

controls_21_copy.columns = [col + ' 2021' for col in controls_21_copy.columns]

base_controls_copy = full_controls.copy()

base_controls_copy.columns = [col + ' base' for col in base_controls_copy.columns]
base_controls_copy['ZIP base'] = base_controls_copy['ZIP base'].astype(int)

In [66]:
diffs_and_base = (
    zip_panel.merge(base_controls_copy, left_index = True, 
                    right_on = 'ZIP base')
)

all_diffs = diffs_and_base.merge(controls_21_copy, left_on = 'ZIP base', 
                    right_on = 'ZIP 2021')

In [67]:
all_diffs['Median age (years) 2021'] = all_diffs['Median age (years) 2021'].astype(float)
all_diffs['Median age (years) base'] = all_diffs['Median age (years) base'].astype(float)


In [68]:
#change variables
all_diffs['change white'] = all_diffs['White 2021'] - all_diffs['White base']
all_diffs['change black'] = (
    all_diffs['Black or African American 2021'] - 
    all_diffs['Black or African American base']
)

all_diffs['change native'] = (
    all_diffs['American Indian and Alaska Native 2021'] - 
    all_diffs['American Indian and Alaska Native base']
)

all_diffs['change asian'] = all_diffs['Asian 2021'] - all_diffs['Asian base']

all_diffs['change hispanic'] = (
    all_diffs['Hispanic or Latino (of any race) 2021'] - 
    all_diffs['Hispanic or Latino (of any race) base']
)

all_diffs['change minority'] = (
    all_diffs['Minority 2021'] - 
    all_diffs['Minority base']
)

all_diffs['change median years'] = (
    all_diffs['Median age (years) 2021'] - 
    all_diffs['Median age (years) base']
)

all_diffs['change median income'] = (
    all_diffs['Median income (dollars) 2021'] - 
    all_diffs['Median income (dollars) base']
)

all_diffs['change mean income'] = (
    all_diffs['Mean income (dollars) 2021'] - 
    all_diffs['Mean income (dollars) base']
)

all_diffs['change housing'] = (
    all_diffs['Total housing units 2021'] - 
    all_diffs['Total housing units base']
)

In [69]:
all_diffs = all_diffs.merge(rents_2015, left_on = 'ZIP base', right_on = 'zipcode', how = 'inner')

In [70]:
all_diffs['log_0br_base'] = np.log(all_diffs['0BR_base'])

all_diffs['log_med_income_base'] = np.log(all_diffs['Median income (dollars) base'])

In [71]:
all_diffs.to_csv('demographic_changes.csv')

## Modeling

regress changemedianincome final_diff initial_diff minoritybase medianageyearsbase medianincomedollarsbase rentalvacancyratebase renteroccupied_pctbase

- adj r^2 of 0.487
- final diff p value of 0.04, positive coeff
- initial diff negative coeff, p val of 0.45



**same set:** regress changemedianincome final_diff initial_diff minoritybase medianageyearsbase log_med_income_base rentalvacancyratebase renteroccupied_pctbase log_housing_units_base
- adj r^2 of 0.497
- final diff p value 0.16, positive coeff
- minority is very negative and very statsitically significant!

**Final**:
gen log_0br_base = log(br_base)

gen log_med_income_base = log(medianincomedollarsbase)

gen log_med_income = log(medianincomedollars2021)

gen change_log_income = log_med_income - log_med_income_base


gen log_housing_units_base = log(totalhousingunitsbase)

regress change_log_income final_diff initial_diff minoritybase medianageyearsbase log_med_income_base log_housing_units_base rentalvacancyratebase renteroccupied_pctbase log_0br_base


regress changeminority final_diff initial_diff minoritybase medianageyearsbase log_med_income_base log_housing_units_base rentalvacancyratebase renteroccupied_pctbase log_0br_base
