# Data Exploration

In [1]:
# Import libraries in alphabetical order
import numpy as np
import pandas as pd


## GeoFeatures_Zurich_provided_by_UrbanDataLabs

### Fields:

- **x, y:** Coordinates in EPSG 2056
- **lat, lng:** Coordinates in latitude, longitude
- **hh_ha, pers_ha:** Density of households and persons per hectare
- **pt_class:** Public transport quality class according to Federal Office for Spatial Development
- **pt_dis:** Distance to next stop (as the crow flies)
- **station_dis:** Distance to railway station
- **noise_street:** Road noise according to BAFU
- **bldg_foot:** Building footprint in m²
- **bldg_corner:** Building is located at an intersection
- **bldg_500:** Number of buildings in 500m radius
- **net_acc:** Size of catchment area in 2 min driving time

In [2]:
# Read GeoFeatures_Zurich_provided_by_UrbanDataLabs file
df = pd.read_csv("GeoFeatures_Zurich_and_supermarkets.csv")
df.head()

Unnamed: 0,hh_ha,pers_ha,pt_dis,station_dis,lat,lng,geometry,migros,coop,discounter,other,competitors,all_supermarkets
0,8,10,55,1725,47.3631,8.53425,POINT (47.3631 8.53425),,,,,,
1,53,100,50,1700,47.3634,8.53425,POINT (47.3634 8.53425),,,,,,
2,8,10,75,1770,47.3627,8.53457,POINT (47.3627 8.53457),,,,,,
3,8,10,55,1745,47.3629,8.53457,POINT (47.3629 8.53457),,,,,,
4,8,10,35,1720,47.3631,8.53458,POINT (47.3631 8.53458),,,,,,


In [3]:
# Num of supermarkets

test = df[df["all_supermarkets"] >= 1]
test.shape

(210, 13)

In [4]:
# Select columns
df = df[["hh_ha", "pers_ha", "pt_dis", "station_dis", "migros", "coop", "discounter", "other", "competitors", "all_supermarkets"]]


In [5]:
df.dtypes

hh_ha                 int64
pers_ha               int64
pt_dis                int64
station_dis           int64
migros              float64
coop                float64
discounter          float64
other               float64
competitors         float64
all_supermarkets    float64
dtype: object

In [6]:
# Null values

# Summary by column
summary_nulls = df.isnull().sum() / len(df)*100
print(summary_nulls)

# Replace nulls with 0's
df = df.fillna(0)

summary_nulls = df.isnull().sum() / len(df)*100
summary_nulls

hh_ha                0.000000
pers_ha              0.000000
pt_dis               0.000000
station_dis          0.000000
migros              99.794983
coop                99.794983
discounter          99.794983
other               99.794983
competitors         99.794983
all_supermarkets    99.794983
dtype: float64


hh_ha               0.0
pers_ha             0.0
pt_dis              0.0
station_dis         0.0
migros              0.0
coop                0.0
discounter          0.0
other               0.0
competitors         0.0
all_supermarkets    0.0
dtype: float64

In [None]:
# Data cleaning 

df[df["migros"] == 2] = 1


In [7]:
round(df.describe(),2)

Unnamed: 0,hh_ha,pers_ha,pt_dis,station_dis,migros,coop,discounter,other,competitors,all_supermarkets
count,126331.0,126331.0,126331.0,126331.0,126331.0,126331.0,126331.0,126331.0,126331.0,126331.0
mean,25.43,52.51,208.82,4146.85,0.0,0.0,0.0,0.0,0.0,0.0
std,33.32,66.07,148.93,2066.86,0.02,0.03,0.02,0.02,0.04,0.05
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,110.0,2615.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,11.0,26.0,175.0,3950.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,41.0,87.0,255.0,5335.0,0.0,0.0,0.0,0.0,0.0,0.0
max,255.0,546.0,1290.0,10230.0,2.0,3.0,2.0,1.0,3.0,3.0


In [8]:
# Add plots (boxplots?) for main metrics

In [9]:
# Correlation?
df.corr(method="pearson")

Unnamed: 0,hh_ha,pers_ha,pt_dis,station_dis,migros,coop,discounter,other,competitors,all_supermarkets
hh_ha,1.0,0.961563,-0.276513,-0.254892,0.002083,0.007235,-0.005097,0.000543,0.002336,0.002989
pers_ha,0.961563,1.0,-0.281431,-0.234213,0.001278,0.007424,-0.005607,-0.000693,0.001664,0.002039
pt_dis,-0.276513,-0.281431,1.0,0.356661,-0.009304,-0.013048,0.007448,0.000819,-0.004407,-0.007969
station_dis,-0.254892,-0.234213,0.356661,1.0,-0.017346,-0.017684,0.005793,-0.001171,-0.009285,-0.01581
migros,0.002083,0.001278,-0.009304,-0.017346,1.0,0.014199,-0.000349,0.023183,0.019191,0.450685
coop,0.007235,0.007424,-0.013048,-0.017684,0.014199,1.0,-0.000496,-0.00044,0.675556,0.609324
discounter,-0.005097,-0.005607,0.007448,0.005793,-0.000349,-0.000496,1.0,0.145869,0.607415,0.542178
other,0.000543,-0.000693,0.000819,-0.001171,0.023183,-0.00044,0.145869,1.0,0.501338,0.45767
competitors,0.002336,0.001664,-0.004407,-0.009285,0.019191,0.675556,0.607415,0.501338,1.0,0.901167
all_supermarkets,0.002989,0.002039,-0.007969,-0.01581,0.450685,0.609324,0.542178,0.45767,0.901167,1.0


In [10]:
# Plot correlations?

In [11]:
migros_df = df[df["migros"] == 1]
print(migros_df.shape)
migros_df.head()

(41, 10)


Unnamed: 0,hh_ha,pers_ha,pt_dis,station_dis,migros,coop,discounter,other,competitors,all_supermarkets
7395,0,0,225,2190,1.0,0.0,0.0,0.0,0.0,1.0
15653,86,127,160,4050,1.0,0.0,0.0,0.0,0.0,1.0
16178,0,0,110,2450,1.0,0.0,0.0,0.0,0.0,1.0
23561,3,3,125,3260,1.0,0.0,0.0,0.0,0.0,1.0
28491,53,94,215,4350,1.0,0.0,0.0,0.0,0.0,1.0


In [15]:
competitors = df[(df["all_supermarkets"] >= 1) & (df["migros"] == 0)]
print(competitors.shape)
competitors.head()

(167, 10)


Unnamed: 0,hh_ha,pers_ha,pt_dis,station_dis,migros,coop,discounter,other,competitors,all_supermarkets
1618,106,143,110,3010,0.0,0.0,0.0,1.0,1.0,1.0
1636,3,3,160,4540,0.0,0.0,1.0,0.0,1.0,1.0
4349,117,204,145,1835,0.0,1.0,0.0,0.0,1.0,1.0
4766,54,87,160,8510,0.0,0.0,0.0,1.0,1.0,1.0
6617,14,64,225,3000,0.0,1.0,0.0,0.0,1.0,1.0


# Logistic Regression

In [16]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


In [25]:
df.corr(method="pearson")

Unnamed: 0,hh_ha,pers_ha,pt_dis,station_dis,migros,coop,discounter,other,competitors,all_supermarkets
hh_ha,1.0,0.961561,-0.276488,-0.254844,0.000432,0.006741,-0.005533,-0.000132,0.002041,0.002288
pers_ha,0.961561,1.0,-0.281405,-0.234164,-0.000448,0.006902,-0.006067,-0.001377,0.001354,0.00132
pt_dis,-0.276488,-0.281405,1.0,0.356681,-0.009299,-0.013711,0.006349,-0.00046,-0.004928,-0.007801
station_dis,-0.254844,-0.234164,0.356681,1.0,-0.018859,-0.018646,0.00429,-0.002948,-0.010019,-0.016135
migros,0.000432,-0.000448,-0.009299,-0.018859,1.0,0.045995,0.037775,0.072957,0.041475,0.429422
coop,0.006741,0.006902,-0.013711,-0.018646,0.045995,1.0,0.024885,0.032076,0.679371,0.622788
discounter,-0.005533,-0.006067,0.006349,0.00429,0.037775,0.024885,1.0,0.179861,0.612238,0.555537
other,-0.000132,-0.001377,-0.00046,-0.002948,0.072957,0.032076,0.179861,1.0,0.50807,0.471044
competitors,0.002041,0.001354,-0.004928,-0.010019,0.041475,0.679371,0.612238,0.50807,1.0,0.916019
all_supermarkets,0.002288,0.00132,-0.007801,-0.016135,0.429422,0.622788,0.555537,0.471044,0.916019,1.0


In [37]:
# Dependant variable: "migros_boolean"
dep_var = ["migros"]

# Independent variables:
indep_var = ["hh_ha", "pers_ha", "pt_dis", "competitors"]

# Model:
x = df[indep_var]
y = df[dep_var]

logit_model = sm.Logit(y,x)
result = logit_model.fit()
print(result.summary2())


Optimization terminated successfully.
         Current function value: 0.006648
         Iterations 14
                         Results: Logit
Model:              Logit            Pseudo R-squared: -1.174   
Dependent Variable: migros           AIC:              1687.7808
Date:               2021-09-03 18:16 BIC:              1726.7674
No. Observations:   126331           Log-Likelihood:   -839.89  
Df Model:           3                LL-Null:          -386.37  
Df Residuals:       126327           LLR p-value:      1.0000   
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     14.0000                                     
----------------------------------------------------------------
                 Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
----------------------------------------------------------------
hh_ha            0.0443   0.0294   1.5056 0.1322 -0.0134  0.1019
pers_ha         -0.0733   0.0154  -4.7559 0.0000 -0.1035 -0.0431
pt_dis      