# Data Exploration

In [26]:
# Import libraries in alphabetical order
import numpy as np
import pandas as pd


## GeoFeatures_Zurich_provided_by_UrbanDataLabs

### Fields:

- **x, y:** Coordinates in EPSG 2056

- **lat, lng:** Coordinates in latitude, longitude

- **hh_ha, pers_ha:** Density of households and persons per hectare

- **pt_class:** Public transport quality class according to Federal Office for Spatial Development

- **pt_dis:** Distance to next stop (as the crow flies)

- **station_dis:** Distance to railway station

- **noise_street:** Road noise according to BAFU

- **bldg_foot:** Building footprint in m²

- **bldg_corner:** Building is located at an intersection

- **bldg_500:** Number of buildings in 500m radius

- **net_acc:** Size of catchment area in 2 min driving time

In [27]:
# Read GeoFeatures_Zurich_provided_by_UrbanDataLabs file
df = pd.read_csv("GeoFeatures_Zurich_provided_by_UrbanDataLabs.csv")
df.head()

Unnamed: 0,x,y,geom,hh_ha,pers_ha,pt_class,pt_dis,station_dis,noise_street,bldg_foot,bldg_corner,bldg_500,net_acc,lat,lng
0,2682762.5,1246387.5,01010000200808000000000040C577444100000080B304...,8,10,1.0,55,1725,45,4204,t,1402050,1043508.0,47.3631,8.53425
1,2682762.5,1246412.5,01010000200808000000000040C577444100000080CC04...,53,100,1.0,50,1700,69,4204,t,1402050,1043508.0,47.3634,8.53425
2,2682787.5,1246337.5,010100002008080000000000C0D1774441000000808104...,8,10,1.0,75,1770,58,4204,t,1402050,1043508.0,47.3627,8.53457
3,2682787.5,1246362.5,010100002008080000000000C0D1774441000000809A04...,8,10,1.0,55,1745,56,4204,t,1402050,1043508.0,47.3629,8.53457
4,2682787.5,1246387.5,010100002008080000000000C0D177444100000080B304...,8,10,1.0,35,1720,63,4204,t,1402050,1043508.0,47.3631,8.53458


In [28]:
# Select columns
df = df[["lat", "lng", "hh_ha", "pers_ha", "pt_dis", "station_dis", "bldg_foot", "bldg_500", "net_acc"]]
df = df.rename(columns={"hh_ha": "hh_den",
                        "pers_ha": "pop_den",
                        "station_dis": "ts_dist",
                        "bldg_foot" : "building_den",
                        "bldg_500": "builing_den_500",
                        "net_acc": "catchment_area_driv"})

In [38]:
# Null values

# Summary by column
summary_nulls = df.isnull().sum()/len(df)*100
summary_nulls

lat                    0.0
lng                    0.0
hh_den                 0.0
pop_den                0.0
pt_dis                 0.0
ts_dist                0.0
building_den           0.0
builing_den_500        0.0
catchment_area_driv    0.0
migros_boolean         0.0
dtype: float64

In [29]:
df_corr = df[["hh_den", "pop_den", "pt_dis", "ts_dist", "building_den", "builing_den_500", "catchment_area_driv"]]
df_corr.describe()

Unnamed: 0,hh_den,pop_den,pt_dis,ts_dist,building_den,builing_den_500,catchment_area_driv
count,126331.0,126331.0,126331.0,126331.0,126331.0,126331.0,123468.0
mean,25.42961,52.510904,208.818659,4146.849111,1745.523862,699074.9,719546.9
std,33.31709,66.065373,148.929058,2066.859348,4378.82948,605285.9,380161.8
min,0.0,0.0,0.0,0.0,23.0,284.0,181.0
25%,0.0,0.0,110.0,2615.0,220.0,259348.0,418081.0
50%,11.0,26.0,175.0,3950.0,467.0,506199.0,714058.0
75%,41.0,87.0,255.0,5335.0,1220.0,972424.5,1029945.0
max,255.0,546.0,1290.0,10230.0,58985.0,3350409.0,1657424.0


In [30]:
# Correlation?
df_corr.corr(method="pearson")

Unnamed: 0,hh_den,pop_den,pt_dis,ts_dist,building_den,builing_den_500,catchment_area_driv
hh_den,1.0,0.961563,-0.276513,-0.254892,-0.069816,0.333425,0.4155
pop_den,0.961563,1.0,-0.281431,-0.234213,-0.079519,0.303619,0.403119
pt_dis,-0.276513,-0.281431,1.0,0.356661,-0.084605,-0.351942,-0.447405
ts_dist,-0.254892,-0.234213,0.356661,1.0,-0.145825,-0.647647,-0.43419
building_den,-0.069816,-0.079519,-0.084605,-0.145825,1.0,0.311981,0.044833
builing_den_500,0.333425,0.303619,-0.351942,-0.647647,0.311981,1.0,0.599545
catchment_area_driv,0.4155,0.403119,-0.447405,-0.43419,0.044833,0.599545,1.0


# Creating fake data

In [31]:
# Migros column (1/0)
np.random.seed(22)

df['migros_boolean'] = np.random.choice([0, 1], df.shape[0])
df.head()

Unnamed: 0,lat,lng,hh_den,pop_den,pt_dis,ts_dist,building_den,builing_den_500,catchment_area_driv,migros_boolean
0,47.3631,8.53425,8,10,55,1725,4204,1402050,1043508.0,1
1,47.3634,8.53425,53,100,50,1700,4204,1402050,1043508.0,0
2,47.3627,8.53457,8,10,75,1770,4204,1402050,1043508.0,0
3,47.3629,8.53457,8,10,55,1745,4204,1402050,1043508.0,0
4,47.3631,8.53458,8,10,35,1720,4204,1402050,1043508.0,0


In [32]:
df = df.dropna()

# Logistic Regression

In [33]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


In [35]:
# Dependant variable: "migros_boolean"
dep_var = ["migros_boolean"]

# Independent variables:
indep_var = ["hh_den", "pop_den", "pt_dis", "ts_dist", "building_den", "builing_den_500", "catchment_area_driv"]

# Model:
x = df[indep_var]
y = df[dep_var]

logit_model = sm.Logit(y,x)
result = logit_model.fit()
print(result.summary2())


Optimization terminated successfully.
         Current function value: 0.693121
         Iterations 3
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.000      
Dependent Variable: migros_boolean   AIC:              171170.5076
Date:               2021-09-03 13:56 BIC:              171238.5737
No. Observations:   123468           Log-Likelihood:   -85578.    
Df Model:           6                LL-Null:          -85581.    
Df Residuals:       123461           LLR p-value:      0.40833    
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     3.0000                                        
------------------------------------------------------------------
                     Coef.  Std.Err.    z    P>|z|   [0.025 0.975]
------------------------------------------------------------------
hh_den              -0.0000   0.0006 -0.0113 0.9910 -0.0012 0.0012
pop_den              0.0002   0.0003  0.7214 0.4706 -