# Data Exploration

In [6]:
# Import libraries in alphabetical order
import numpy as np
import pandas as pd
import random


## GeoFeatures_Zurich_provided_by_UrbanDataLabs

### Fields:

- **x, y:** Coordinates in EPSG 2056
- **lat, lng:** Coordinates in latitude, longitude
- **hh_ha, pers_ha:** Density of households and persons per hectare
- **pt_class:** Public transport quality class according to Federal Office for Spatial Development
- **pt_dis:** Distance to next stop (as the crow flies)
- **station_dis:** Distance to railway station
- **noise_street:** Road noise according to BAFU
- **bldg_foot:** Building footprint in m²
- **bldg_corner:** Building is located at an intersection
- **bldg_500:** Number of buildings in 500m radius
- **net_acc:** Size of catchment area in 2 min driving time

In [38]:
# Dataframe with Migros supermarket
df = pd.read_csv("C:/Users/RCOLL/OneDrive/Documents/GitHub/migros_data_challange/data/counter_500m.csv")
df.head()

# Filter dataframe with migros
df_migros = df[(df["migros"] >= 1) | (df["coop"] >= 1)].reset_index(drop=True)
num_obs = len(df_migros)

# Take a sample from the df where there are no migros
random.seed(10)
df_no_sup = df[(df["migros"] == 0) & (df["coop"] == 0)]
df_no_migros = df_no_sup.sample(num_obs*2).reset_index(drop=True)

# Concat the dataframes above
model_df = pd.concat([df_migros, df_no_migros]).reset_index(drop=True)
model_df

# Select columns
model_df = model_df[["hh_ha", "pers_ha", "pt_dis", "station_dis", "migros", "coop", "discounter", "other", "competitors", "all_supermarkets",
                     "number_companies", "migros_500m", "coop_500m", "discounter_500m", "other_500m", "number_companies_500m"]]

model_df.head(5)

Unnamed: 0,hh_ha,pers_ha,pt_dis,station_dis,migros,coop,discounter,other,competitors,all_supermarkets,number_companies,migros_500m,coop_500m,discounter_500m,other_500m,number_companies_500m
0,117,204,145,1835,0.0,1.0,0.0,0.0,1.0,1.0,0,0.0,0.0,0.0,0.0,7
1,14,64,225,3000,0.0,1.0,0.0,0.0,1.0,1.0,0,0.0,1.0,0.0,0.0,0
2,0,0,225,2190,1.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,2.0,0.0,3
3,63,107,110,1060,0.0,1.0,0.0,0.0,1.0,1.0,0,1.0,1.0,0.0,0.0,2
4,99,142,90,1890,0.0,1.0,0.0,0.0,1.0,1.0,0,0.0,0.0,0.0,0.0,0


In [39]:
model_df.dtypes

hh_ha                      int64
pers_ha                    int64
pt_dis                     int64
station_dis                int64
migros                   float64
coop                     float64
discounter               float64
other                    float64
competitors              float64
all_supermarkets         float64
number_companies           int64
migros_500m              float64
coop_500m                float64
discounter_500m          float64
other_500m               float64
number_companies_500m      int64
dtype: object

In [40]:
# Null values
summary_nulls = model_df.isnull().sum() / len(model_df)*100
print(summary_nulls)

hh_ha                    0.0
pers_ha                  0.0
pt_dis                   0.0
station_dis              0.0
migros                   0.0
coop                     0.0
discounter               0.0
other                    0.0
competitors              0.0
all_supermarkets         0.0
number_companies         0.0
migros_500m              0.0
coop_500m                0.0
discounter_500m          0.0
other_500m               0.0
number_companies_500m    0.0
dtype: float64


In [41]:
model_df["migros"] = model_df["migros"] + model_df["coop"]
model_df["others"] = model_df["discounter"] + model_df["other"]

model_df["others_500m"] = model_df["discounter_500m"] + model_df["other_500m"]
model_df["migros_500m"] = model_df["migros_500m"] + model_df["coop_500m"]

model_df = model_df[["hh_ha", "pers_ha", "pt_dis", "station_dis", "migros", "others", "number_companies", "migros_500m", "others_500m", "number_companies_500m"]]
model_df.head()

Unnamed: 0,hh_ha,pers_ha,pt_dis,station_dis,migros,others,number_companies,migros_500m,others_500m,number_companies_500m
0,117,204,145,1835,1.0,0.0,0,0.0,0.0,7
1,14,64,225,3000,1.0,0.0,0,1.0,0.0,0
2,0,0,225,2190,1.0,0.0,0,1.0,2.0,3
3,63,107,110,1060,1.0,0.0,0,2.0,0.0,2
4,99,142,90,1890,1.0,0.0,0,0.0,0.0,0


In [42]:
# Data cleaning 
def f(row):
    val = 0
    if row['migros'] >= 1:
        val = 1
    return val

model_df['migros'] = model_df.apply(f, axis=1)
model_df["migros"].unique()

array([1, 0], dtype=int64)

In [43]:
round(model_df.describe(),2)

Unnamed: 0,hh_ha,pers_ha,pt_dis,station_dis,migros,others,number_companies,migros_500m,others_500m,number_companies_500m
count,387.0,387.0,387.0,387.0,387.0,387.0,387.0,387.0,387.0,387.0
mean,27.05,55.31,177.11,3546.25,0.33,0.0,0.02,1.02,0.37,2.96
std,33.74,68.67,117.65,1957.24,0.47,0.05,0.19,1.61,0.7,4.29
min,0.0,0.0,0.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,105.0,1990.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10.0,24.0,155.0,3530.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,44.5,80.5,212.5,4742.5,1.0,0.0,0.0,1.0,1.0,4.0
max,147.0,323.0,1085.0,9485.0,1.0,1.0,2.0,9.0,5.0,31.0


In [44]:
# Correlation?
round(model_df.corr(method="pearson"), 2)

Unnamed: 0,hh_ha,pers_ha,pt_dis,station_dis,migros,others,number_companies,migros_500m,others_500m,number_companies_500m
hh_ha,1.0,0.96,-0.21,-0.18,0.11,0.05,0.03,-0.01,-0.08,0.05
pers_ha,0.96,1.0,-0.21,-0.16,0.1,0.04,0.03,-0.0,-0.08,0.03
pt_dis,-0.21,-0.21,1.0,0.26,-0.26,-0.05,-0.02,-0.25,-0.14,-0.25
station_dis,-0.18,-0.16,0.26,1.0,-0.37,-0.08,-0.02,-0.54,-0.14,-0.47
migros,0.11,0.1,-0.26,-0.37,1.0,0.07,0.1,0.44,0.21,0.35
others,0.05,0.04,-0.05,-0.08,0.07,1.0,-0.01,0.16,-0.03,0.05
number_companies,0.03,0.03,-0.02,-0.02,0.1,-0.01,1.0,0.03,-0.02,0.06
migros_500m,-0.01,-0.0,-0.25,-0.54,0.44,0.16,0.03,1.0,0.45,0.54
others_500m,-0.08,-0.08,-0.14,-0.14,0.21,-0.03,-0.02,0.45,1.0,0.36
number_companies_500m,0.05,0.03,-0.25,-0.47,0.35,0.05,0.06,0.54,0.36,1.0


# Logistic Regression

In [45]:
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [48]:
# Dependant variable: "migros_boolean"
dep_var = ["migros"]

# Independent variables:
indep_var = ["hh_ha", "pers_ha", "pt_dis", "station_dis"]

# Model:
x = model_df[indep_var]
y = model_df[dep_var]

logit_model = sm.Logit(y,x)
result = logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.565531
         Iterations 5
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.112     
Dependent Variable: migros           AIC:              445.7207  
Date:               2021-09-08 19:37 BIC:              461.5544  
No. Observations:   387              Log-Likelihood:   -218.86   
Df Model:           3                LL-Null:          -246.33   
Df Residuals:       383              LLR p-value:      7.0672e-12
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     5.0000                                       
------------------------------------------------------------------
               Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
------------------------------------------------------------------
hh_ha          0.0065    0.0113   0.5705  0.5683  -0.0158   0.0287
pers_ha        0.0008    0.0056   0.1450  0.8847  -0.0102   0.

In [None]:
# https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

In [49]:
y_pred = logreg.predict(df_no_sup)



NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.