In [1]:
import numpy as np
import pandas as pd
import requests
import geopandas as gpd
from shapely import wkt
from shapely.geometry import MultiPoint

### Subway Data Clean Up

In [2]:
# load subway data
joined_df = pd.read_csv('subway ridership - joined.csv').drop('s_table_station_name', axis = 1)
unjoined_df = pd.read_excel('subway unjoined - fix.xlsx', sheet_name = 'final').drop('s_table_station_name', axis = 1)
comp_id_df = pd.read_csv('unique complexes.csv').drop('count_', axis = 1)

In [3]:
# make sure points are geometry format not strings
unjoined_df[['first_point','second_point','third_point','fourth_point']] = unjoined_df[['first_point','second_point','third_point','fourth_point']
                                                                                        ].applymap(lambda x:wkt.loads(x) if isinstance(x, str) else x)

  ].applymap(lambda x:wkt.loads(x) if isinstance(x, str) else x)


In [4]:
geom_cols = ['first_point','second_point','third_point','fourth_point']

def get_centroid(row):
    # filter out nulls
    points = [p for p in row[geom_cols] if p is not None and not pd.isna(p)]
    if len(points) == 0:
        return None
    if len(points) == 1:
        return points[0]
    # create centroid point
    return MultiPoint(points).centroid

unjoined_df['station_geom'] = unjoined_df.apply(get_centroid, axis = 1)

# drop unnecessary cols
unjoined_df = unjoined_df.drop(['first_point','second_point','third_point','fourth_point'], axis = 1)

In [5]:
# concat the two dataframes
sub_df = pd.concat([joined_df, unjoined_df])
sub_df = sub_df.rename(columns = {'r_table_station_name': 'station_complex_name'})

# add unique copmlex ids → determine if station is local stop vs transfer hub
sub_df = pd.merge(sub_df, comp_id_df, on = 'complex_id', how = 'left')

In [6]:
# check for nulls
sub_df.isna().sum()

station_complex_name    0
routes                  0
ridership_2013          5
ridership_2018          1
complex_id              0
borough                 0
station_geom            0
complex_id_unique       0
dtype: int64

In [7]:
# all our merged and created columns have no nulls
# nulls for ridership → likely has something to do with stops that may not have been open during 2013 or 2018 → drop them
sub_df = sub_df.dropna().reset_index(drop = True)

In [8]:
# remove extra characters from routes col
sub_df['routes'] = sub_df['routes'].str.replace(',', '')
sub_df['routes'] = sub_df['routes'].str.replace('/', '')
sub_df['routes'] = sub_df['routes'].str.replace(' ', '')

# now we can count the characters to count the number of subway lines
sub_df['route_count'] = sub_df['routes'].str.len()

# add back spaces between routes for better readability
sub_df['routes'] = sub_df['routes'].apply(lambda x: ' '.join(x))

### API Calls

        Census — Zip Codes Business Patterns: https://www.census.gov/data/developers/data-sets/cbp-zbp/zbp-api.2018.html#list-tab-353702932
                2013 variables: https://api.census.gov/data/2013/zbp/variables.html
                2018 variables: https://api.census.gov/data/2018/zbp/variables.html

In [9]:
years = [2013,2018]

# dictionary to store separate year df
zbp_dict = {}

for year in years:
    url = f'https://api.census.gov/data/{year}/zbp'

    params = {
        'get': 'EMP,ESTAB',     # employee count and establishment count
        'for': 'zipcode:*'
    }

    response = requests.get(url, params = params)

    if response.status_code == 200:
        data = response.json()
        df = pd.DataFrame(data[1:], columns = data[0])

            # fix column headers
        df.columns = df.columns.str.lower()
        df = df.rename(columns = {'emp':'employee_count','estab':'business_count','zip code':'zip_code'})
        
            # filter zichangeodes → 10s & 11s grabs NYC + lower tier → https://simple.wikipedia.org/wiki/List_of_ZIP_Code_prefixes
        df = df[df['zip_code'].str.startswith(('10', '11'))].reset_index(drop = True)
        zbp_dict[year] = df
    else:
        print(f"Error for {year}: {response.status_code}, {response.text}")

zbp13_df = zbp_dict[2013]
zbp18_df = zbp_dict[2018]

        Census — American Community Survey 5-Year: https://www.census.gov/data/developers/data-sets/acs-5year.2018.html#list-tab-1806015614
                2013 variables: https://api.census.gov/data/2013/acs/acs5/variables.html
                2018 variables: https://api.census.gov/data/2018/acs/acs5/variables.html

In [10]:
# dictionary to store separate year df
acs_dict = {}

acs_variables = ['B01003_001E','B01001_001E','B01001_026E','B01002_001E',
                 'B01001_007E','B01001_008E','B01001_009E','B01001_010E','B01001_011E','B01001_012E','B01001_013E',
                 'B01001_031E','B01001_032E','B01001_033E','B01001_034E','B01001_035E','B01001_036E','B01001_037E',
                 'B19013_001E','B25064_001E','B25001_001E','B25003_003E','B25003_001E']

for year in years:
    url = f'https://api.census.gov/data/{year}/acs/acs5'

    params = {
        'get':','.join(acs_variables),                   # total population estimate
        'for':'zip code tabulation area:*',    # ZCTAs not ZIPs
        'in':'state:36'                        # 36 = new york 
    }

    response = requests.get(url, params = params)

    if response.status_code == 200:
        data = response.json()
        df = pd.DataFrame(data[1:], columns = data[0])
        
            # fix column headers
        df.columns = df.columns.str.lower()
        df = df.rename(columns = {'zip code tabulation area':'zcta','b01003_001e':'population','b01001_001e':'male_pop','b01001_026e':'female_pop','b01002_001e':'median_age',
                                  'b01001_007e':'male_18_19','b01001_008e':'male_20','b01001_009e':'male_21','b01001_010e':'male_22_24',
                                  'b01001_011e':'male_25_29','b01001_012e':'male_30_34','b01001_013e':'male_35_39',
                                  'b01001_031e':'female_18_19','b01001_032e':'female_20','b01001_033e':'female_21','b01001_034e':'female_22_24',
                                  'b01001_035e':'female_25_29','b01001_036e':'female_30_34','b01001_037e':'female_35_39',
                                  'b19013_001e':'median_household_income','b25064_001e':'median_gross_rent','b25001_001e':'housing_units',
                                  'b25003_003e':'renter_occupied','b25003_001e':'occupied_total'})
        acs_dict[year] = df
    else:
        print(f'Error for {year}: {response.status_code}, {response.text}')

acs13_df = acs_dict[2013]
acs18_df = acs_dict[2018]

        Employment data uses ZIP and population data uses ZCTA so we need to use a crosswalk to merge

        HRSA ZIP to ZCTA Crosswalk: https://data.hrsa.gov/DataDownload/GeoCareNavigator/ZIP%20Code%20to%20ZCTA%20Crosswalk.xlsx


In [11]:
cross_df = pd.read_excel('ZIP Code to ZCTA Crosswalk.xlsx')
cross_df.columns = cross_df.columns.str.lower()
cross_df = cross_df[cross_df['state'] == 'NY']
cross_df['zcta'] = cross_df['zcta'].astype(int)
cross_df = cross_df[['zip_code','zcta','zip_join_type']].reset_index(drop = True)

### Merging

In [12]:
df_list = [acs13_df,acs18_df]

acs_df_dict = {}

for year, df in zip(years, df_list):
    df['m_18_24'] = df['male_18_19'].astype(int) + df['male_20'].astype(int) + df['male_21'].astype(int) + df['male_22_24'].astype(int)
    df['f_18_24'] = df['female_18_19'].astype(int) + df['female_20'].astype(int) + df['female_21'].astype(int) + df['female_22_24'].astype(int)
    df['m_25_39'] = df['male_25_29'].astype(int) + df['male_30_34'].astype(int) + df['male_35_39'].astype(int)
    df['f_25_39'] = df['female_25_29'].astype(int) + df['female_30_34'].astype(int) + df['female_35_39'].astype(int)
    df['age_18_24'] = df['m_18_24'].astype(int) + df['f_18_24'].astype(int)
    df['age_25_39'] = df['m_25_39'].astype(int) + df['f_25_39'].astype(int)

    acs_df_dict[year] = df.copy()


acs13_df = acs_df_dict[2013]
acs18_df = acs_df_dict[2018]

In [13]:
# drop unnecessary cols
acs13_df = acs13_df[['zcta','population', 'male_pop', 'female_pop', 'median_age','age_18_24', 'age_25_39',
          'median_household_income', 'median_gross_rent', 'housing_units','renter_occupied', 'occupied_total']].astype(float)
acs18_df = acs18_df[['zcta','population', 'male_pop', 'female_pop', 'median_age','age_18_24', 'age_25_39',
          'median_household_income', 'median_gross_rent', 'housing_units','renter_occupied', 'occupied_total']].astype(float)

In [14]:
# rename cols
acs13_df = acs13_df.rename(columns = {'population':'pop_2013', 'male_pop':'male_pop_2013', 'female_pop':'female_pop_2013',
                           'median_age':'median_age_2013','age_18_24':'college_early_prof_2013', 'age_25_39':'young_prof_2013',
                           'median_household_income':'median_income_2013', 'median_gross_rent':'median_rent_2013',
                           'housing_units':'housing_units_2013','renter_occupied':'renter_occupied_2013', 'occupied_total':'occupied_total_2013'})

acs18_df = acs18_df.rename(columns = {'population':'pop_2018', 'male_pop':'male_pop_2018', 'female_pop':'female_pop_2018',
                           'median_age':'median_age_2018','age_18_24':'college_early_prof_2018', 'age_25_39':'young_prof_2018',
                           'median_household_income':'median_income_2018', 'median_gross_rent':'median_rent_2018',
                           'housing_units':'housing_units_2018','renter_occupied':'renter_occupied_2018', 'occupied_total':'occupied_total_2018'})

In [15]:
# merge to ACS to get one row per ZCTA
acs_merged = pd.merge(acs18_df, acs13_df, on = 'zcta', how = 'left')

# there are 0s and negative values → make them nulls for now
acs_merged[acs_merged <= 0] = np.nan

In [16]:
# merge to ZBP to get one row per ZIP
zbp_merged = pd.merge(zbp18_df, zbp13_df, on = 'zip_code', how = 'left')
zbp_merged = zbp_merged.rename(columns = {'employee_count_x':'employee_count_2018','business_count_x':'business_count_2018','employee_count_y':'employee_count_2013','business_count_y':'business_count_2013'})
zbp_merged = zbp_merged[['zip_code','employee_count_2013','employee_count_2018','business_count_2013','business_count_2018']].astype(int)

In [17]:
# use crosswalk to convert ZCTA to ZIP
acs_merged['zcta'] = acs_merged['zcta'].astype(int)
acs_merged = pd.merge(acs_merged, cross_df, on = 'zcta', how = 'left')
acs_merged = acs_merged[acs_merged['zip_code'].astype(str).str.startswith(('10', '11'))].reset_index(drop = True)

# some duplicated ZIPs → keep 'ZIP matches ZCTA' but if that isn't true for a zip, then use 'Spaital join to ZCTA'"ZIP Code to ZCTA Crosswalk.xlsx"
acs_merged['priority'] = (acs_merged['zip_join_type'] == 'Zip matches ZCTA').astype(int)
acs_merged = acs_merged.sort_values(by = ['zip_code','priority'], ascending = [True, False]).drop_duplicates(subset = 'zip_code', keep = 'first').reset_index(drop = True)

In [18]:
# merge ZBP and ACS
zip_df = pd.merge(zbp_merged, acs_merged, on = 'zip_code', how = 'left')
zip_df = zip_df.drop(['zcta','priority'], axis = 1)
zip_df = zip_df.dropna().reset_index(drop = True)     # some rows did not merge with population data → drop

### BigQuery public data

        Zip Code Geometry
        Subway Station Ridership and Geometry

In [19]:
zip_geo_df = pd.read_csv('NYC zip geometry.csv')

#separate geometry and boroughs
sub_geo = sub_df[['station_complex_name','borough','station_geom']]

In [20]:
# ZIP level population and employment still contains ZIPs that are not in NYC
# geometry data contains only NYC, excluding Staten Island
nyc_df = pd.merge(zip_geo_df, zip_df, on = 'zip_code', how = 'left')
nyc_df = nyc_df.dropna().reset_index(drop = True)       # 1 row did not merge with population data → drop

In [21]:
# turn into geo dataframe
sub_df['station_geom'] = sub_df['station_geom'].astype(str).apply(wkt.loads)                # there seems to be mixed classes → force everything into a string before converting to geometry
sub_gdf = gpd.GeoDataFrame(sub_df, geometry = 'station_geom', crs = 'EPSG:4326')
nyc_df['zip_code_geom'] = nyc_df['zip_code_geom'].apply(wkt.loads)
nyc_gdf = gpd.GeoDataFrame(nyc_df, geometry = 'zip_code_geom', crs = 'EPSG:4326')

### Spatial Buffering → what zip codes are within 0.5 miles of a station

In [22]:
# change crs for better accuracy → feet instead of degrees
sub_gdf = sub_gdf.to_crs(epsg = 2263)
nyc_gdf = nyc_gdf.to_crs(epsg = 2263)

# create a 0.5 mile buffer & set this as the active geometry column
sub_gdf['buffer_geom'] = sub_gdf.geometry.buffer(2640)      # 1 mile = 5280 feet
sub_gdf = sub_gdf.set_geometry('buffer_geom')

# only keep station, borough and the buffer geom
sub_gdf = sub_gdf[['station_complex_name','borough','buffer_geom']]

# spatial join → get zip codes that intersect with the buffer
zips_near_station = gpd.sjoin(nyc_gdf, sub_gdf[['station_complex_name','borough','buffer_geom']], how = 'inner', predicate = 'intersects')

In [23]:
# spatial join row multiplies → a row for each station to zip combination → groupby on stations
spatial_df = zips_near_station.groupby('station_complex_name').agg({
    
    # SUM variables that are counts
    'employee_count_2013':'sum',
    'employee_count_2018':'sum',
    'business_count_2013':'sum',
    'business_count_2018':'sum',
    'pop_2013':'sum',
    'pop_2018':'sum',
    'male_pop_2013':'sum',
    'male_pop_2018':'sum',
    'female_pop_2013':'sum',
    'female_pop_2018':'sum',
    'college_early_prof_2013':'sum',
    'college_early_prof_2018':'sum',
    'young_prof_2013':'sum',
    'young_prof_2018':'sum',
    'housing_units_2013':'sum',
    'housing_units_2018':'sum',
    'renter_occupied_2013':'sum',
    'renter_occupied_2018':'sum',
    'occupied_total_2013':'sum',
    'occupied_total_2018':'sum',

    # use MEAN for variables that are medians
    'median_age_2013':'mean',
    'median_age_2018':'mean',
    'median_income_2013':'mean',
    'median_income_2018':'mean',
    'median_rent_2013':'mean',
    'median_rent_2018':'mean'
}).reset_index()

In [24]:
# add back ridership data and add borough for control
spatial_df = pd.merge(spatial_df, sub_df, on = 'station_complex_name', how = 'left')

In [25]:
# the WTC Cortlandt station was still being rebuilt and didn't reopen until 2018 → drop any rows with 0 values for ridership, population, employees
print(spatial_df[spatial_df['ridership_2013'] == 0])
print(spatial_df[spatial_df['ridership_2018'] == 0])

spatial_df = spatial_df[(spatial_df['ridership_2013'] > 0) & (spatial_df['ridership_2018'] > 0)]
spatial_df = spatial_df.reset_index(drop = True)

    station_complex_name  employee_count_2013  employee_count_2018  \
404        WTC Cortlandt             267468.0             301829.0   

     business_count_2013  business_count_2018  pop_2013  pop_2018  \
404              12709.0              12520.0   79710.0   88822.0   

     male_pop_2013  male_pop_2018  female_pop_2013  ...  median_rent_2013  \
404        79710.0        88822.0          40084.0  ...          1866.375   

     median_rent_2018  routes  ridership_2013  ridership_2018  complex_id  \
404            2985.5       1             0.0          3558.0         328   

       borough                  station_geom  complex_id_unique  route_count  
404  Manhattan  POINT (-74.012188 40.711835)                  Y            1  

[1 rows x 35 columns]
Empty DataFrame
Columns: [station_complex_name, employee_count_2013, employee_count_2018, business_count_2013, business_count_2018, pop_2013, pop_2018, male_pop_2013, male_pop_2018, female_pop_2013, female_pop_2018, college_early

### Add Subway Line Group Indicators

In [26]:
# Define route groups
route_groups = {
    'line_ACE': ['A', 'C', 'E'],
    'line_123': ['1', '2', '3'],
    'line_BDFM': ['B', 'D', 'F', 'M'],
    'line_456': ['4', '5', '6'],
    'line_NQRW': ['N', 'Q', 'R', 'W'],
    'line_7': ['7'],
    'line_L': ['L'],
    'line_G': ['G'],
    'line_JZ': ['J', 'Z']
}

# Split routes column into lists
spatial_df['route_list'] = spatial_df['routes'].str.split(' ')

# Create dummy columns
for group_name, lines in route_groups.items():
    spatial_df[group_name] = spatial_df['route_list'].apply(lambda x: any(route in lines for route in x))

In [27]:
# change boolean calues to binary
bool_cols = spatial_df.select_dtypes(include=['bool']).columns
spatial_df[bool_cols] = spatial_df[bool_cols].astype(int)

### Separate datasets for Absoulte Change Model and Percent Change Model

In [28]:
# create new dfs
perc_reg_df = spatial_df[['station_complex_name','borough','routes','route_count','station_geom',
                          'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ']]

absol_reg_df = spatial_df[['station_complex_name','borough','routes','route_count','station_geom',
                           'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ','ridership_2013']]     # include baseline ridership for absolute model for normalization

In [29]:
# percent change

# add growth / decline
perc_reg_df['rider_change'] = (spatial_df['ridership_2018'] - spatial_df['ridership_2013']) / spatial_df['ridership_2013'] * 100

# employees + business
perc_reg_df['emp_change'] = (spatial_df['employee_count_2018'] - spatial_df['employee_count_2013']) / spatial_df['employee_count_2013'] * 100
perc_reg_df['bus_change'] = (spatial_df['business_count_2018'] - spatial_df['business_count_2013']) / spatial_df['business_count_2013'] * 100

# population
perc_reg_df['pop_change'] = (spatial_df['pop_2018'] - spatial_df['pop_2013']) / spatial_df['pop_2013'] * 100
perc_reg_df['women_share_change'] = ((spatial_df['female_pop_2018'] / spatial_df['pop_2018']) - (spatial_df['female_pop_2013'] / spatial_df['pop_2013'])) * 100

# age
perc_reg_df['med_age_change'] = (spatial_df['median_age_2018'] - spatial_df['median_age_2013']) / spatial_df['median_age_2013'] * 100
perc_reg_df['college_age_change'] = (spatial_df['college_early_prof_2018'] - spatial_df['college_early_prof_2013']) / spatial_df['college_early_prof_2013'] * 100
perc_reg_df['young_prof_age_change'] = (spatial_df['young_prof_2018'] - spatial_df['young_prof_2013']) / spatial_df['young_prof_2013'] * 100

# income
perc_reg_df['med_income_change'] = (spatial_df['median_income_2018'] - spatial_df['median_income_2013']) / spatial_df['median_income_2013'] * 100

# housing
perc_reg_df['med_rent_change'] = (spatial_df['median_rent_2018'] - spatial_df['median_rent_2013']) / spatial_df['median_rent_2013'] * 100
perc_reg_df['housing_units_change'] = (spatial_df['housing_units_2018'] - spatial_df['housing_units_2013']) / spatial_df['housing_units_2013'] * 100
perc_reg_df['renter_share_change'] = ((spatial_df['renter_occupied_2018'] / spatial_df['occupied_total_2018']) - (spatial_df['renter_occupied_2013'] / spatial_df['occupied_total_2013'])) * 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  perc_reg_df['rider_change'] = (spatial_df['ridership_2018'] - spatial_df['ridership_2013']) / spatial_df['ridership_2013'] * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  perc_reg_df['emp_change'] = (spatial_df['employee_count_2018'] - spatial_df['employee_count_2013']) / spatial_df['employee_count_2013'] * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs

In [30]:
# add growth / decline
absol_reg_df['rider_change'] = spatial_df['ridership_2018'] - spatial_df['ridership_2013']

# employees + business
absol_reg_df['emp_change'] = spatial_df['employee_count_2018'] - spatial_df['employee_count_2013']
absol_reg_df['bus_change'] = spatial_df['business_count_2018'] - spatial_df['business_count_2013']

# population
absol_reg_df['pop_change'] = spatial_df['pop_2018'] - spatial_df['pop_2013']
absol_reg_df['women_share_change'] = ((spatial_df['female_pop_2018'] / spatial_df['pop_2018']) - (spatial_df['female_pop_2013'] / spatial_df['pop_2013'])) * 100

# age
absol_reg_df['med_age_change'] = spatial_df['median_age_2018'] - spatial_df['median_age_2013']
absol_reg_df['college_age_change'] = spatial_df['college_early_prof_2018'] - spatial_df['college_early_prof_2013']
absol_reg_df['young_prof_age_change'] = spatial_df['young_prof_2018'] - spatial_df['young_prof_2013']

# income
absol_reg_df['med_income_change'] = spatial_df['median_income_2018'] - spatial_df['median_income_2013']

# housing
absol_reg_df['med_rent_change'] = spatial_df['median_rent_2018'] - spatial_df['median_rent_2013']
absol_reg_df['housing_units_change'] = spatial_df['housing_units_2018'] - spatial_df['housing_units_2013']
absol_reg_df['renter_share_change'] = ((spatial_df['renter_occupied_2018'] / spatial_df['occupied_total_2018']) - (spatial_df['renter_occupied_2013'] / spatial_df['occupied_total_2013'])) * 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  absol_reg_df['rider_change'] = spatial_df['ridership_2018'] - spatial_df['ridership_2013']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  absol_reg_df['emp_change'] = spatial_df['employee_count_2018'] - spatial_df['employee_count_2013']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  absol_reg_df['b

### Save off dataframes

In [31]:
# dataset for regressions
perc_reg_df.to_csv('percent change regression data.csv', index = False)
absol_reg_df.to_csv('absolute change regression data.csv', index = False)

In [32]:
# zip codes that are within the buffer zone → for mapping
buffer_zip_list = zips_near_station['zip_code'].unique()
buffer_zip_df = nyc_df[nyc_df['zip_code'].isin(buffer_zip_list)]

buffer_zip_df = buffer_zip_df[['zip_code','zip_code_geom','pop_2013','pop_2018','employee_count_2013','employee_count_2018']]
buffer_zip_df[['pop_2013','pop_2018','employee_count_2013','employee_count_2018']] = buffer_zip_df[['pop_2013','pop_2018','employee_count_2013','employee_count_2018']].astype(int)

buffer_zip_df['pop_perc_change'] = round((buffer_zip_df['pop_2018'] - buffer_zip_df['pop_2013']) / buffer_zip_df['pop_2013'] * 100, 1)
buffer_zip_df['emp_perc_change'] = round((buffer_zip_df['employee_count_2018'] - buffer_zip_df['employee_count_2013']) / buffer_zip_df['employee_count_2013'] * 100, 1)

buffer_zip_df.to_csv('zip codes - mapping.csv', index = False)

# simplified station df → for mapping
stations_df = spatial_df[['station_complex_name','station_geom','ridership_2013','ridership_2018','routes']]

stations_df[['ridership_2013','ridership_2018']] = stations_df[['ridership_2013','ridership_2018']].astype(int)
stations_df['ridership_perc_change'] = round((stations_df['ridership_2018'] - stations_df['ridership_2013']) / stations_df['ridership_2013'] * 100, 1)

stations_df.to_csv('stations - mapping.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stations_df[['ridership_2013','ridership_2018']] = stations_df[['ridership_2013','ridership_2018']].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stations_df['ridership_perc_change'] = round((stations_df['ridership_2018'] - stations_df['ridership_2013']) / stations_df['ridership_2013'] * 100, 1)
