In [251]:
import pandas as pd
import numpy as np
import geopandas as gpd

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn import linear_model
from sklearn import metrics

In [2]:
mergedf = gpd.read_file("MERGED_DATA.geojson")

In [102]:
# fixing percent change numbers
mergedf["%_change_new"] = ((mergedf["new"] - mergedf["prev_yr_new"]) / mergedf["prev_yr_new"])
mergedf["%_change_active"] = ((mergedf["active"] - mergedf["prev_yr_active"]) / mergedf["prev_yr_active"])

mergedf.loc[mergedf["%_change_new"] > 100, "%_change_new"] = None

Unnamed: 0,year,month,prior_year,Population,pop_change,Median Income,income_change,Median Age,age_change,WorkTransitCount,...,rt_count_change,MonthTotal,active,new,prev_yr_active,prev_yr_new,%_change_active,%_change_new,train_rides,transport_total
count,92644.0,92644.0,92644.0,92644.0,92308.0,88300.0,86544.0,92296.0,92272.0,92644.0,...,92644.0,92644.0,130136.0,130136.0,102765.0,102765.0,102765.0,18259.0,130704.0,92644.0
mean,2016.501965,6.501727,2015.501965,1268.201891,0.010171,55437.62333,0.046543,36.113448,0.013174,518.093001,...,0.013206,10941.783995,20.787,0.241301,21.25245,0.24628,0.000649,-0.632865,307677.7,328556.9
std,1.118367,3.452404,1.118367,633.088241,0.232013,31605.824615,0.186108,8.396701,0.111691,428.731532,...,0.093691,15241.648279,48.827148,0.773264,49.727477,0.790264,0.232062,0.678618,629324.3,667571.9
min,2015.0,1.0,2014.0,0.0,-1.0,5000.0,-0.671916,13.9,-0.561753,0.0,...,-0.5,0.0,0.0,0.0,1.0,0.0,-0.75,-1.0,0.0,0.0
25%,2016.0,4.0,2015.0,854.0,-0.077071,32316.0,-0.03766,30.5,-0.031339,245.0,...,0.0,3980.567239,6.0,0.0,6.0,0.0,-0.1,-1.0,0.0,9498.143
50%,2017.0,7.0,2016.0,1173.0,0.002375,47578.0,0.024805,34.4,0.003185,398.0,...,0.0,7430.690684,12.0,0.0,13.0,0.0,0.0,-1.0,119080.0,134273.0
75%,2018.0,10.0,2017.0,1562.0,0.082855,70882.0,0.113413,40.6,0.048561,634.0,...,0.0,13190.70962,24.0,0.0,24.0,0.0,0.058824,-0.10101,346129.0,358414.5
max,2018.0,12.0,2017.0,10317.0,15.4,218281.0,4.010504,86.6,1.104603,3755.0,...,2.0,313410.241488,1862.0,47.0,1862.0,47.0,5.0,11.0,5993499.0,6012302.0


In [34]:
# adding bus and train ridership
mergedf["train_rides"].fillna(0, inplace=True)
mergedf["transport_total"] = mergedf["MonthTotal"] + mergedf["train_rides"]

In [242]:
# dropping unneeded columns for regression and null year/month data
smaller = mergedf[mergedf["year"].notna()].set_index(["year", "blockgroup"])
smaller = smaller.drop(columns=["prior_year", "MonthTotal", "month-year", 
                                "prev_month-year", "prev_yr_active", "prev_yr_new",
                                "train_rides", "geometry"])

smaller.month = smaller.month.astype(str)

print(smaller.shape)
smaller.info()

(92644, 19)
<class 'geopandas.geodataframe.GeoDataFrame'>
MultiIndex: 92644 entries, (2018.0, '170310101002') to (2015.0, '170319801001')
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   month               92644 non-null  object 
 1   pri_neigh           92644 non-null  object 
 2   Population          92644 non-null  float64
 3   pop_change          92308 non-null  float64
 4   Median Income       88300 non-null  float64
 5   income_change       86544 non-null  float64
 6   Median Age          92296 non-null  float64
 7   age_change          92272 non-null  float64
 8   WorkTransitCount    92644 non-null  float64
 9   wt_count_change     92500 non-null  float64
 10  WorkTransitPercent  92644 non-null  float64
 11  wt_perc_change      92500 non-null  float64
 12  count_of_routes     92644 non-null  float64
 13  rt_count_change     92644 non-null  float64
 14  active              92076 non-null  floa

In [306]:
def scale_and_impute(train, test, exclude_list):
    '''
    complete standard data normalization.
    '''
    
    for col in train.columns:
    
        if train[col].dtype in [int, float] and col not in exclude_list :
            
            data_scaler = StandardScaler()
            train[col] = data_scaler.fit_transform(train[[col]])
            test[col] = data_scaler.transform(test[[col]])
        
            imputer = SimpleImputer()
            train[col] = imputer.fit_transform(train[[col]])
            test[col] = imputer.transform(test[[col]])
        
    return (train, test)


def cat_to_dummies(xtrain, xtest):
        '''
        Uses pd.get_dummies to perform one-hot encoding of categorical variables.

        '''
        
        train = pd.get_dummies(xtrain)
        test = pd.get_dummies(xtest)
        
        for col in train.columns:
            if col not in test.columns:
                test[col] = np.array([0]*test.shape[0])
                
        for col in test.columns:
            if col not in train.columns:
                test = test.drop(columns=col)
 
        return (train, test)

In [307]:
# breaking up CV temporal holdouts

cv1_train = smaller.loc[2015, :]
cv1_test = smaller.loc[2016, :]

cv1_train, cv1_test = scale_and_impute(cv1_train, cv1_test, ["transport_total"])
cv1_train, cv1_test = cat_to_dummies(cv1_train, cv1_test)

cv2_train = smaller.loc[2016, :]
cv2_test = smaller.loc[2017, :]

cv2_train, cv2_test = scale_and_impute(cv2_train, cv2_test, ["transport_total"])
cv2_train, cv2_test = cat_to_dummies(cv2_train, cv2_test)

final_train = smaller.loc[2017, :]
final_test = smaller.loc[2018, :]

final_train, final_test = scale_and_impute(final_train, final_test, ["transport_total"])
final_train, final_test = cat_to_dummies(final_train, final_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [310]:
cv1_x_train = cv1_train.drop(columns=["transport_total"])
cv1_y_train = cv1_train["transport_total"]
cv1_x_test = cv1_test.drop(columns=["transport_total"])
cv1_y_test = cv1_test["transport_total"]

cv2_x_train = cv2_train.drop(columns=["transport_total"])
cv2_y_train = cv2_train["transport_total"]
cv2_x_test = cv2_test.drop(columns=["transport_total"])
cv2_y_test = cv2_test["transport_total"]

final_x_train = final_train.drop(columns=["transport_total"])
final_y_train = final_train["transport_total"]
final_x_test = final_test.drop(columns=["transport_total"])
final_y_test = final_test["transport_total"]

In [311]:
print("CV 1:")
print(cv1_x_train.shape)
print(cv1_y_train.shape)
print(cv1_x_test.shape)
print(cv1_y_test.shape)
print()

print("CV 2:")
print(cv2_x_train.shape)
print(cv2_y_train.shape)
print(cv2_x_test.shape)
print(cv2_y_test.shape)
print()

print("Eval Data:")
print(final_x_train.shape)
print(final_y_train.shape)
print(final_x_test.shape)
print(final_y_test.shape)

CV 1:
(23124, 119)
(23124,)
(23124, 119)
(23124,)

CV 2:
(23124, 119)
(23124,)
(23164, 119)
(23164,)

Eval Data:
(23164, 119)
(23164,)
(23232, 119)
(23232,)


### Notes:

Lots of places where there is zero for new businesses. This creates a situation where growth from zero is being calculated (division by zero errors).

Needed to recalc the busn lic data for both % fields.

L ridership total much higher should be - duplictive blockgroup ownership of total station rides.

For some reason I though 2014 was prior year data, not included data...
may need to recreate asc/cta to include one extra year (2013, to allow 2014 data in cvs).

