## Correlation and Variance Inflation, Trip Distance (agg)

This is a notebook to compute variance inflation factors and correlation coefficients between urban form and selected other features for models of aggregate trip distance in 19 European cities.

In general, we try to avoid VIF > 5 and correlation coefficients above 0.7 in absolute value.

In [1]:
# script to check collinearity (VIF, correlations) of the features used to model average trip distances in all cities
# last update Peter Berrill 20 Nov 2023

# load in required packages
import numpy as np
import pandas as pd
import shap
import re
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, cross_validate, GroupKFold, StratifiedGroupKFold, RepeatedKFold, StratifiedKFold, GridSearchCV, KFold
from sklearn import metrics, linear_model
from xgboost import XGBClassifier, XGBRegressor
import os
import sys
import matplotlib.pyplot as plt
import pickle
import statsmodels.formula.api as smf
from datetime import datetime
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import seaborn as sns

# All samples handled by this notebook: the individual cities followed by the
# two pooled "rest of country" samples.
cities_all = [
    # Germany
    'Berlin', 'Dresden', 'Düsseldorf', 'Frankfurt am Main', 'Kassel', 'Leipzig', 'Magdeburg', 'Potsdam',
    # France
    'Clermont', 'Dijon', 'Lille', 'Lyon', 'Montpellier', 'Nantes', 'Nimes', 'Paris', 'Toulouse',
    # Spain, Austria
    'Madrid', 'Wien',
    # pooled samples
    'France_other', 'Germany_other',
]
# Country of each entry of cities_all, position by position
# (looked up via cities_all.index(city)).
countries = ['Germany'] * 8 + ['France'] * 9 + ['Spain', 'Austria', 'France', 'Germany']
# Cities other than Berlin, Paris, Madrid and Wien.
cities_small = [
    'Dresden', 'Düsseldorf', 'Frankfurt am Main', 'Kassel', 'Leipzig', 'Magdeburg', 'Potsdam',
    'Clermont', 'Dijon', 'Lille', 'Lyon', 'Montpellier', 'Nantes', 'Nimes', 'Toulouse',
]
# The four large cities plus the two pooled samples.
cities_main = ['Berlin', 'Paris', 'Madrid', 'Wien', 'Germany_other', 'France_other']


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [2]:
# Columns taken from each city's combined trip file: the residence geocode used
# for aggregation, the candidate features, and the target (Trip_Distance).
# Candidates currently left out: 'PopDensity_res', 'BuildDensity_res',
# 'bike_lane_share_res', 'LU_Road_res', 'LU_Urban_res'.
_UF_COLUMNS = ['Res_geocode', 'DistSubcenter_res', 'DistCenter_res',
               'UrbPopDensity_res', 'UrbBuildDensity_res',
               'IntersecDensity_res', 'street_length_res', 'LU_UrbFab_res',
               'LU_Comm_res', 'Commute_Trip', 'Age', 'Trip_Distance']

# Member cities of the two pooled samples, in the order they are stacked.
_POOLED_CITIES = {
    'Germany_other': ['Dresden', 'Leipzig', 'Magdeburg', 'Potsdam', 'Frankfurt am Main', 'Düsseldorf', 'Kassel'],
    'France_other': ['Clermont', 'Toulouse', 'Montpellier', 'Lyon', 'Nantes', 'Nimes', 'Lille', 'Dijon'],
}


def _read_city_features(city):
    """Read '../outputs/Combined/<city>_UF.csv' and return the _UF_COLUMNS subset.

    A 0/1 'Commute_Trip' flag is added first: 1 where 'Trip_Purpose_Agg' equals
    'Home↔Work', 0 otherwise.
    """
    # Geocodes are read as strings for every city (the single-city path already
    # did this); leaving them to type inference is the likely source of the
    # mixed-type DtypeWarning seen on the pooled samples.
    trips = pd.read_csv('../outputs/Combined/' + city + '_UF.csv',
                        dtype={'Ori_geocode': str, 'Des_geocode': str, 'Res_geocode': str})
    trips['Commute_Trip'] = 0
    trips.loc[trips['Trip_Purpose_Agg'] == 'Home↔Work', 'Commute_Trip'] = 1
    return trips.loc[:, _UF_COLUMNS]


def vif_corr(city):
    """Build the aggregated feature matrix used for the VIF / correlation checks.

    Trips are averaged per residence geocode ('Res_geocode'); geocodes with 4 or
    fewer recorded trip distances are discarded.

    Parameters
    ----------
    city : str
        An entry of ``cities_all``. 'Germany_other' and 'France_other' pool the
        trip files of the cities listed in ``_POOLED_CITIES``; any other value
        loads that single city's file.

    Returns
    -------
    pandas.DataFrame
        One row per retained geocode, containing the per-geocode means of all
        columns in ``_UF_COLUMNS`` except 'Res_geocode' and 'Trip_Distance'.

    Raises
    ------
    ValueError
        If ``city`` is not in ``cities_all``.
    """
    country=countries[cities_all.index(city)]
    print(city, country)

    if city in _POOLED_CITIES:
        # Every frame comes from the same loader, so the columns are identical
        # by construction; stack them once instead of concatenating in a loop.
        # NOTE(review): geocodes are not prefixed with the city name here (the
        # all-cities cell further down does prefix them), so equal codes in two
        # different cities would be averaged together - confirm codes are unique.
        df_UF=pd.concat([_read_city_features(member) for member in _POOLED_CITIES[city]])
    else:
        df_UF=_read_city_features(city)

    # Number of non-missing trip distances per residence geocode.
    count=df_UF.groupby('Res_geocode')['Trip_Distance'].count().reset_index()
    count.rename(columns={'Trip_Distance':'count'},inplace=True)

    # numeric_only=True: pandas >= 2.0 raises on non-numeric columns instead of
    # silently dropping them as earlier versions did.
    df_UF=df_UF.groupby('Res_geocode').mean(numeric_only=True).drop_duplicates()
    df_UF.reset_index(inplace=True)

    df_UF=df_UF.merge(count,on='Res_geocode')
    df_UF=df_UF.loc[df_UF['count']>4,]
    df_agg=df_UF.copy()

    df_agg.sort_values(by='Res_geocode',inplace=True)
    df_agg.dropna(subset=['Trip_Distance'],inplace=True)
    if city in ['Leipzig','Germany_other']:
        # presumably removes implausibly large built-up density values in the
        # Leipzig data - TODO confirm the origin of the 1e8 threshold
        df_agg=df_agg.loc[df_agg['UrbBuildDensity_res']<1e8,:]

    target='Trip_Distance'

    X=df_agg.drop(columns=['Res_geocode','count',target])
    return X

In [3]:
# Berlin, full feature set: VIF per column (intercept included) and pairwise correlations
X = vif_corr('Berlin')
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Berlin Germany
                Feature         VIF
0                 const  329.902832
1     DistSubcenter_res    1.388953
2        DistCenter_res    2.857265
3     UrbPopDensity_res    4.038368
4   UrbBuildDensity_res    3.456792
5   IntersecDensity_res    3.340469
6     street_length_res    2.082586
7         LU_UrbFab_res    2.353261
8           LU_Comm_res    1.816269
9          Commute_Trip    1.332942
10                  Age    1.672473


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,UrbBuildDensity_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.427,-0.434,-0.413,-0.301,0.199,-0.247,-0.191,-0.126,0.179
DistCenter_res,0.427,1.0,-0.614,-0.72,-0.549,0.516,-0.241,-0.287,-0.158,0.441
UrbPopDensity_res,-0.434,-0.614,1.0,0.716,0.638,-0.473,0.596,-0.115,0.165,-0.443
UrbBuildDensity_res,-0.413,-0.72,0.716,1.0,0.554,-0.503,0.347,0.28,0.205,-0.353
IntersecDensity_res,-0.301,-0.549,0.638,0.554,1.0,-0.69,0.635,-0.168,-0.086,-0.211
street_length_res,0.199,0.516,-0.473,-0.503,-0.69,1.0,-0.411,-0.026,0.052,0.205
LU_UrbFab_res,-0.247,-0.241,0.596,0.347,0.635,-0.411,1.0,-0.352,-0.095,-0.037
LU_Comm_res,-0.191,-0.287,-0.115,0.28,-0.168,-0.026,-0.352,1.0,0.148,-0.081
Commute_Trip,-0.126,-0.158,0.165,0.205,-0.086,0.052,-0.095,0.148,1.0,-0.42
Age,0.179,0.441,-0.443,-0.353,-0.211,0.205,-0.037,-0.081,-0.42,1.0


In Berlin, VIFs are acceptable, although all 3 density metrics are moderately high. Built-up density has a high correlation (0.716) with population density.

Repeating without built-up density leads to notably lower VIFs for population and intersection density.

In [4]:
# Berlin without built-up density
X = vif_corr('Berlin').drop(columns='UrbBuildDensity_res')
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Berlin Germany
               Feature         VIF
0                const  329.608467
1    DistSubcenter_res    1.387893
2       DistCenter_res    2.640121
3    UrbPopDensity_res    3.088421
4  IntersecDensity_res    3.281482
5    street_length_res    2.069237
6        LU_UrbFab_res    2.351799
7          LU_Comm_res    1.548786
8         Commute_Trip    1.305474
9                  Age    1.646900


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.427,-0.434,-0.301,0.199,-0.247,-0.191,-0.126,0.179
DistCenter_res,0.427,1.0,-0.614,-0.549,0.516,-0.241,-0.287,-0.158,0.441
UrbPopDensity_res,-0.434,-0.614,1.0,0.638,-0.473,0.596,-0.115,0.165,-0.443
IntersecDensity_res,-0.301,-0.549,0.638,1.0,-0.69,0.635,-0.168,-0.086,-0.211
street_length_res,0.199,0.516,-0.473,-0.69,1.0,-0.411,-0.026,0.052,0.205
LU_UrbFab_res,-0.247,-0.241,0.596,0.635,-0.411,1.0,-0.352,-0.095,-0.037
LU_Comm_res,-0.191,-0.287,-0.115,-0.168,-0.026,-0.352,1.0,0.148,-0.081
Commute_Trip,-0.126,-0.158,0.165,-0.086,0.052,-0.095,0.148,1.0,-0.42
Age,0.179,0.441,-0.443,-0.211,0.205,-0.037,-0.081,-0.42,1.0


In [5]:
# Paris, full feature set: VIF per column (intercept included) and pairwise correlations
X = vif_corr('Paris')
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Paris France
                Feature         VIF
0                 const  531.074906
1     DistSubcenter_res    1.811977
2        DistCenter_res    3.793421
3     UrbPopDensity_res    4.008735
4   UrbBuildDensity_res    3.210486
5   IntersecDensity_res    4.229459
6     street_length_res    1.932611
7         LU_UrbFab_res    2.849552
8           LU_Comm_res    2.034762
9          Commute_Trip    1.196867
10                  Age    1.335738


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,UrbBuildDensity_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.618,-0.429,-0.514,-0.424,0.084,-0.092,-0.173,-0.081,0.004
DistCenter_res,0.618,1.0,-0.75,-0.725,-0.635,0.169,-0.325,0.009,0.028,-0.18
UrbPopDensity_res,-0.429,-0.75,1.0,0.728,0.669,-0.323,0.498,-0.248,-0.041,0.181
UrbBuildDensity_res,-0.514,-0.725,0.728,1.0,0.506,-0.165,0.146,0.085,-0.138,0.195
IntersecDensity_res,-0.424,-0.635,0.669,0.506,1.0,-0.588,0.624,-0.26,0.011,0.138
street_length_res,0.084,0.169,-0.323,-0.165,-0.588,1.0,-0.326,0.334,-0.019,-0.178
LU_UrbFab_res,-0.092,-0.325,0.498,0.146,0.624,-0.326,1.0,-0.589,-0.015,0.118
LU_Comm_res,-0.173,0.009,-0.248,0.085,-0.26,0.334,-0.589,1.0,0.009,-0.266
Commute_Trip,-0.081,0.028,-0.041,-0.138,0.011,-0.019,-0.015,0.009,1.0,-0.343
Age,0.004,-0.18,0.181,0.195,0.138,-0.178,0.118,-0.266,-0.343,1.0


In Paris, VIFs are acceptable, although all 3 density metrics and distance to center are moderately high. Built-up density has a high correlation (0.728) with population density.

Repeating without built-up density leads to notably lower VIF for population density, although it remains almost as high for intersection density.

In [6]:
# Paris without built-up density
X = vif_corr('Paris').drop(columns='UrbBuildDensity_res')
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Paris France
               Feature         VIF
0                const  528.835042
1    DistSubcenter_res    1.790058
2       DistCenter_res    3.624931
3    UrbPopDensity_res    2.951846
4  IntersecDensity_res    4.175236
5    street_length_res    1.928134
6        LU_UrbFab_res    2.681047
7          LU_Comm_res    1.988665
8         Commute_Trip    1.165444
9                  Age    1.322872


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.618,-0.429,-0.424,0.084,-0.092,-0.173,-0.081,0.004
DistCenter_res,0.618,1.0,-0.75,-0.635,0.169,-0.325,0.009,0.028,-0.18
UrbPopDensity_res,-0.429,-0.75,1.0,0.669,-0.323,0.498,-0.248,-0.041,0.181
IntersecDensity_res,-0.424,-0.635,0.669,1.0,-0.588,0.624,-0.26,0.011,0.138
street_length_res,0.084,0.169,-0.323,-0.588,1.0,-0.326,0.334,-0.019,-0.178
LU_UrbFab_res,-0.092,-0.325,0.498,0.624,-0.326,1.0,-0.589,-0.015,0.118
LU_Comm_res,-0.173,0.009,-0.248,-0.26,0.334,-0.589,1.0,0.009,-0.266
Commute_Trip,-0.081,0.028,-0.041,0.011,-0.019,-0.015,0.009,1.0,-0.343
Age,0.004,-0.18,0.181,0.138,-0.178,0.118,-0.266,-0.343,1.0


In [7]:
# Madrid, full feature set: VIF per column (intercept included) and pairwise correlations
X = vif_corr('Madrid')
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Madrid Spain
                Feature         VIF
0                 const  232.830446
1     DistSubcenter_res    1.338943
2        DistCenter_res    1.280247
3     UrbPopDensity_res    2.065712
4   UrbBuildDensity_res    1.125301
5   IntersecDensity_res    2.480232
6     street_length_res    2.130265
7         LU_UrbFab_res    2.168301
8           LU_Comm_res    1.338040
9          Commute_Trip    1.178086
10                  Age    1.385659


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,UrbBuildDensity_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.192,-0.384,-0.162,-0.39,0.318,-0.187,-0.151,-0.058,-0.121
DistCenter_res,0.192,1.0,-0.166,-0.22,-0.27,-0.003,-0.176,0.086,0.066,-0.285
UrbPopDensity_res,-0.384,-0.166,1.0,0.227,0.548,-0.451,0.615,-0.143,-0.136,0.331
UrbBuildDensity_res,-0.162,-0.22,0.227,1.0,0.09,-0.07,0.206,-0.042,-0.079,0.171
IntersecDensity_res,-0.39,-0.27,0.548,0.09,1.0,-0.671,0.442,0.03,-0.081,0.332
street_length_res,0.318,-0.003,-0.451,-0.07,-0.671,1.0,-0.381,-0.126,0.129,-0.22
LU_UrbFab_res,-0.187,-0.176,0.615,0.206,0.442,-0.381,1.0,-0.365,-0.286,0.433
LU_Comm_res,-0.151,0.086,-0.143,-0.042,0.03,-0.126,-0.365,1.0,0.219,-0.162
Commute_Trip,-0.058,0.066,-0.136,-0.079,-0.081,0.129,-0.286,0.219,1.0,-0.285
Age,-0.121,-0.285,0.331,0.171,0.332,-0.22,0.433,-0.162,-0.285,1.0


In Madrid, VIFs are all low. No correlations are > 0.7.

In [8]:
# Wien, full feature set: VIF per column (intercept included) and pairwise correlations
X = vif_corr('Wien')
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Wien Austria
                Feature         VIF
0                 const  433.199771
1     DistSubcenter_res    3.490305
2        DistCenter_res    7.239788
3     UrbPopDensity_res    9.384325
4   UrbBuildDensity_res    3.645296
5   IntersecDensity_res   10.073036
6     street_length_res    4.362897
7         LU_UrbFab_res    7.545909
8           LU_Comm_res    1.661924
9          Commute_Trip    1.563596
10                  Age    1.399181


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,UrbBuildDensity_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.767,-0.483,-0.541,-0.616,0.443,-0.588,-0.305,-0.117,-0.176
DistCenter_res,0.767,1.0,-0.789,-0.726,-0.821,0.654,-0.749,-0.194,-0.179,-0.137
UrbPopDensity_res,-0.483,-0.789,1.0,0.645,0.808,-0.661,0.874,0.047,0.389,0.042
UrbBuildDensity_res,-0.541,-0.726,0.645,1.0,0.71,-0.57,0.591,0.445,0.154,0.041
IntersecDensity_res,-0.616,-0.821,0.808,0.71,1.0,-0.852,0.852,0.097,0.155,0.297
street_length_res,0.443,0.654,-0.661,-0.57,-0.852,1.0,-0.745,-0.058,0.023,-0.345
LU_UrbFab_res,-0.588,-0.749,0.874,0.591,0.852,-0.745,1.0,0.094,0.206,0.167
LU_Comm_res,-0.305,-0.194,0.047,0.445,0.097,-0.058,0.094,1.0,0.044,0.052
Commute_Trip,-0.117,-0.179,0.389,0.154,0.155,0.023,0.206,0.044,1.0,-0.206
Age,-0.176,-0.137,0.042,0.041,0.297,-0.345,0.167,0.052,-0.206,1.0


Vienna is very problematic in terms of correlations and VIF, possibly due to its relatively small sample size. 

We can drop many features outside of our two main urban form features of interest, distance to center and population density.
This helps a lot, although distance to center and population density have a high correlation (0.789). 

In [9]:
# Wien without intersection density, urban fabric, built-up density,
# street length and distance to subcenter
X = vif_corr('Wien').drop(
    columns=['IntersecDensity_res','LU_UrbFab_res','UrbBuildDensity_res','street_length_res','DistSubcenter_res']
)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Wien Austria
             Feature         VIF
0              const  212.357275
1     DistCenter_res    3.054534
2  UrbPopDensity_res    3.323917
3        LU_Comm_res    1.082402
4       Commute_Trip    1.313317
5                Age    1.080270


Unnamed: 0,DistCenter_res,UrbPopDensity_res,LU_Comm_res,Commute_Trip,Age
DistCenter_res,1.0,-0.789,-0.194,-0.179,-0.137
UrbPopDensity_res,-0.789,1.0,0.047,0.389,0.042
LU_Comm_res,-0.194,0.047,1.0,0.044,0.052
Commute_Trip,-0.179,0.389,0.044,1.0,-0.206
Age,-0.137,0.042,0.052,-0.206,1.0


Trying different model specifications, the linear model worked much better in terms of accuracy (r2) when commercial land-use and age were not included.
Again perhaps due to the small sample size (44), the model is maybe not capable of handling too many input features.

In [10]:
# Wien reduced further: commercial land use and age are dropped as well
X = vif_corr('Wien').drop(
    columns=['IntersecDensity_res','LU_UrbFab_res','UrbBuildDensity_res','street_length_res','DistSubcenter_res','LU_Comm_res','Age']
)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Wien Austria
             Feature        VIF
0              const  30.287055
1     DistCenter_res   2.798870
2  UrbPopDensity_res   3.194024
3       Commute_Trip   1.243340


Unnamed: 0,DistCenter_res,UrbPopDensity_res,Commute_Trip
DistCenter_res,1.0,-0.789,-0.179
UrbPopDensity_res,-0.789,1.0,0.389
Commute_Trip,-0.179,0.389,1.0


In [11]:
# Rest of France (pooled), full feature set: VIF per column (intercept included) and pairwise correlations
X = vif_corr('France_other')
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

France_other France
                Feature         VIF
0                 const  134.033249
1     DistSubcenter_res    1.466370
2        DistCenter_res    1.458918
3     UrbPopDensity_res    2.792524
4   UrbBuildDensity_res    2.447014
5   IntersecDensity_res    4.021977
6     street_length_res    1.742977
7         LU_UrbFab_res    2.228169
8           LU_Comm_res    1.774276
9          Commute_Trip    1.071516
10                  Age    1.154597


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,UrbBuildDensity_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.459,-0.297,-0.293,-0.451,0.344,-0.307,-0.245,0.028,0.059
DistCenter_res,0.459,1.0,-0.375,-0.291,-0.462,0.267,-0.278,-0.193,-0.071,0.036
UrbPopDensity_res,-0.297,-0.375,1.0,0.523,0.691,-0.331,0.53,0.08,-0.007,-0.167
UrbBuildDensity_res,-0.293,-0.291,0.523,1.0,0.457,-0.206,0.041,0.564,0.034,-0.234
IntersecDensity_res,-0.451,-0.462,0.691,0.457,1.0,-0.629,0.627,0.249,-0.02,-0.117
street_length_res,0.344,0.267,-0.331,-0.206,-0.629,1.0,-0.413,-0.148,-0.002,0.047
LU_UrbFab_res,-0.307,-0.278,0.53,0.041,0.627,-0.413,1.0,-0.13,-0.016,0.066
LU_Comm_res,-0.245,-0.193,0.08,0.564,0.249,-0.148,-0.13,1.0,0.054,-0.195
Commute_Trip,0.028,-0.071,-0.007,0.034,-0.02,-0.002,-0.016,0.054,1.0,-0.216
Age,0.059,0.036,-0.167,-0.234,-0.117,0.047,0.066,-0.195,-0.216,1.0


In rest of France, VIF is highest for intersection density, which has high correlation (although < 0.7) with population density.

Without intersection density, max VIF is low (2.4) and no individual correlation is problematic.

In [12]:
# Rest of France (pooled) without intersection density
X = vif_corr('France_other').drop(columns=['IntersecDensity_res'])
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

France_other France
               Feature         VIF
0                const  104.189437
1    DistSubcenter_res    1.457221
2       DistCenter_res    1.415082
3    UrbPopDensity_res    2.419412
4  UrbBuildDensity_res    2.406285
5    street_length_res    1.334422
6        LU_UrbFab_res    1.858778
7          LU_Comm_res    1.711509
8         Commute_Trip    1.066359
9                  Age    1.152301


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,UrbBuildDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.459,-0.297,-0.293,0.344,-0.307,-0.245,0.028,0.059
DistCenter_res,0.459,1.0,-0.375,-0.291,0.267,-0.278,-0.193,-0.071,0.036
UrbPopDensity_res,-0.297,-0.375,1.0,0.523,-0.331,0.53,0.08,-0.007,-0.167
UrbBuildDensity_res,-0.293,-0.291,0.523,1.0,-0.206,0.041,0.564,0.034,-0.234
street_length_res,0.344,0.267,-0.331,-0.206,1.0,-0.413,-0.148,-0.002,0.047
LU_UrbFab_res,-0.307,-0.278,0.53,0.041,-0.413,1.0,-0.13,-0.016,0.066
LU_Comm_res,-0.245,-0.193,0.08,0.564,-0.148,-0.13,1.0,0.054,-0.195
Commute_Trip,0.028,-0.071,-0.007,0.034,-0.002,-0.016,0.054,1.0,-0.216
Age,0.059,0.036,-0.167,-0.234,0.047,0.066,-0.195,-0.216,1.0


In [13]:
# Rest of Germany (pooled), full feature set: VIF per column (intercept included) and pairwise correlations
X = vif_corr('Germany_other')
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Germany_other Germany
                Feature         VIF
0                 const  223.647581
1     DistSubcenter_res    1.277411
2        DistCenter_res    2.019739
3     UrbPopDensity_res    2.792178
4   UrbBuildDensity_res    1.664990
5   IntersecDensity_res    6.274010
6     street_length_res    2.326492
7         LU_UrbFab_res    2.977671
8           LU_Comm_res    1.377117
9          Commute_Trip    1.591730
10                  Age    1.230517


Columns (4,5) have mixed types. Specify dtype option on import or set low_memory=False.


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,UrbBuildDensity_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.298,-0.207,-0.383,-0.246,0.155,-0.117,-0.289,-0.039,0.018
DistCenter_res,0.298,1.0,-0.431,-0.44,-0.64,0.356,-0.451,-0.239,-0.273,0.285
UrbPopDensity_res,-0.207,-0.431,1.0,0.317,0.697,-0.525,0.748,0.006,0.127,-0.292
UrbBuildDensity_res,-0.383,-0.44,0.317,1.0,0.392,-0.226,0.244,0.443,0.276,-0.092
IntersecDensity_res,-0.246,-0.64,0.697,0.392,1.0,-0.733,0.729,0.073,0.467,-0.365
street_length_res,0.155,0.356,-0.525,-0.226,-0.733,1.0,-0.507,-0.033,-0.331,0.318
LU_UrbFab_res,-0.117,-0.451,0.748,0.244,0.729,-0.507,1.0,-0.054,0.176,-0.285
LU_Comm_res,-0.289,-0.239,0.006,0.443,0.073,-0.033,-0.054,1.0,0.19,-0.049
Commute_Trip,-0.039,-0.273,0.127,0.276,0.467,-0.331,0.176,0.19,1.0,-0.28
Age,0.018,0.285,-0.292,-0.092,-0.365,0.318,-0.285,-0.049,-0.28,1.0


In the rest of Germany, intersection density VIF is high (6.3), and it is highly correlated with population density, street length and urban fabric land use.

Dropping intersection density makes the VIF and correlations much better. One problematic correlation remains between population density and urban fabric land use (0.748), so we also drop urban fabric land use

After dropping these two features, all VIFs are low, and no problematic correlations remain.

In [14]:
# Rest of Germany (pooled) without intersection density and urban fabric land use
X = vif_corr('Germany_other').drop(columns=['IntersecDensity_res','LU_UrbFab_res'])
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Germany_other Germany
               Feature         VIF
0                const  140.529973
1    DistSubcenter_res    1.256266
2       DistCenter_res    1.540123
3    UrbPopDensity_res    1.661801
4  UrbBuildDensity_res    1.658051
5    street_length_res    1.578462
6          LU_Comm_res    1.336953
7         Commute_Trip    1.290401
8                  Age    1.229880


Columns (4,5) have mixed types. Specify dtype option on import or set low_memory=False.


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,UrbBuildDensity_res,street_length_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.298,-0.207,-0.383,0.155,-0.289,-0.039,0.018
DistCenter_res,0.298,1.0,-0.431,-0.44,0.356,-0.239,-0.273,0.285
UrbPopDensity_res,-0.207,-0.431,1.0,0.317,-0.525,0.006,0.127,-0.292
UrbBuildDensity_res,-0.383,-0.44,0.317,1.0,-0.226,0.443,0.276,-0.092
street_length_res,0.155,0.356,-0.525,-0.226,1.0,-0.033,-0.331,0.318
LU_Comm_res,-0.289,-0.239,0.006,0.443,-0.033,1.0,0.19,-0.049
Commute_Trip,-0.039,-0.273,0.127,0.276,-0.331,0.19,1.0,-0.28
Age,0.018,0.285,-0.292,-0.092,0.318,-0.049,-0.28,1.0


In [15]:
# now check for all cities data combined

# Columns kept from every city's combined trip file: residence geocode,
# candidate features and the target (Trip_Distance).
# Candidates currently left out: 'PopDensity_res', 'BuildDensity_res',
# 'bike_lane_share_res', 'LU_Road_res', 'LU_Urban_res'.
uf_columns = ['Res_geocode', 'DistSubcenter_res', 'DistCenter_res',
              'UrbPopDensity_res', 'UrbBuildDensity_res',
              'IntersecDensity_res', 'street_length_res', 'LU_UrbFab_res',
              'LU_Comm_res', 'Commute_Trip', 'Age', 'Trip_Distance']


def _load_city_trips(city, country):
    """Read '../outputs/Combined/<city>_UF.csv' and return the uf_columns subset.

    Adds a 0/1 'Commute_Trip' flag (1 where 'Trip_Purpose_Agg' is 'Home↔Work')
    before selecting, and 'City' / 'Country' label columns afterwards.
    """
    # Geocodes are read as strings for all cities (previously only for Madrid
    # and Wien); leaving them to type inference is the likely source of the
    # mixed-type DtypeWarning on the German and French files.
    trips = pd.read_csv('../outputs/Combined/' + city + '_UF.csv',
                        dtype={'Ori_geocode': str, 'Des_geocode': str, 'Res_geocode': str})
    trips['Commute_Trip'] = 0
    trips.loc[trips['Trip_Purpose_Agg'] == 'Home↔Work', 'Commute_Trip'] = 1
    # .copy() so the label columns are written to an independent frame,
    # not to a slice of the full trip table
    trips = trips.loc[:, uf_columns].copy()
    trips['City'] = city
    trips['Country'] = country
    return trips


def _pool_country(cities, country, report_columns=False):
    """Stack the trip tables of `cities` (all in `country`) into one frame.

    The first city seeds the frame silently; each further city is announced and
    the running row total is printed. A city whose columns differ from the
    first city's is skipped with a message.
    """
    frames = [_load_city_trips(cities[0], country)]
    n_rows = len(frames[0])
    for name in cities[1:]:
        print(name)
        trips = _load_city_trips(name, country)
        if report_columns:
            # number of columns selected from the file, before City/Country are added
            print(len(uf_columns), 'columns in the data for ', name)
        # The previous guard, len(a.columns == b.columns), was always truthy;
        # Index.equals actually compares the column labels.
        if trips.columns.equals(frames[0].columns):
            frames.append(trips)
            n_rows += len(trips)
            print(name, 'added.')
            print(n_rows, 'rows in the combined dataframe')
        else:
            print(name, 'skipped: columns differ from', cities[0])
    # single concat instead of growing the frame inside the loop
    return pd.concat(frames)


df_DE = _pool_country(
    ['Berlin', 'Dresden', 'Leipzig', 'Magdeburg', 'Potsdam', 'Frankfurt am Main', 'Düsseldorf', 'Kassel'],
    'Germany')
# FR, other
df_FR = _pool_country(
    ['Clermont', 'Toulouse', 'Montpellier', 'Lyon', 'Nantes', 'Nimes', 'Lille', 'Dijon', 'Paris'],
    'France', report_columns=True)
df_Madrid = _load_city_trips('Madrid', 'Spain')
df_Wien = _load_city_trips('Wien', 'Austria')

df_UF = pd.concat([df_DE, df_FR, df_Madrid, df_Wien], ignore_index=True)
# Prefix geocodes with the city name so equal codes in different cities stay distinct.
df_UF['Res_geocode'] = df_UF['City'] + '_' + df_UF['Res_geocode'].astype(str)
df_UF = df_UF.drop(columns='City')


Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.


Dresden
Dresden added.
126573 rows in the combined dataframe
Leipzig
Leipzig added.
138169 rows in the combined dataframe
Magdeburg
Magdeburg added.
147545 rows in the combined dataframe
Potsdam
Potsdam added.
153368 rows in the combined dataframe
Frankfurt am Main
Frankfurt am Main added.
159403 rows in the combined dataframe
Düsseldorf
Düsseldorf added.
183247 rows in the combined dataframe
Kassel
Kassel added.
192614 rows in the combined dataframe


Columns (4,5) have mixed types. Specify dtype option on import or set low_memory=False.


Toulouse
12 columns in the data for  Toulouse
Toulouse added.
49864 rows in the combined dataframe
Montpellier
12 columns in the data for  Montpellier
Montpellier added.
80127 rows in the combined dataframe
Lyon
12 columns in the data for  Lyon
Lyon added.
132884 rows in the combined dataframe
Nantes
12 columns in the data for  Nantes
Nantes added.
167926 rows in the combined dataframe
Nimes
12 columns in the data for  Nimes
Nimes added.
179143 rows in the combined dataframe
Lille
12 columns in the data for  Lille
Lille added.
215992 rows in the combined dataframe
Dijon
12 columns in the data for  Dijon
Dijon added.
230246 rows in the combined dataframe
Paris
12 columns in the data for  Paris
Paris added.
293160 rows in the combined dataframe


In [16]:
# Aggregate the combined trip table to one row per (city-prefixed) residence geocode.

# Number of non-missing trip distances recorded for each geocode.
count = (
    df_UF.groupby('Res_geocode')['Trip_Distance']
    .count()
    .rename('count')
    .reset_index()
)

# Per-geocode means of the numeric columns. numeric_only=True is required
# because df_UF still carries the string 'Country' column: pandas >= 2.0 raises
# a TypeError on it, where earlier versions dropped it silently.
df_UF = (
    df_UF.groupby('Res_geocode')
    .mean(numeric_only=True)
    .drop_duplicates()
    .reset_index()
)

# Keep only geocodes with more than 4 recorded trips.
df_UF = df_UF.merge(count, on='Res_geocode')
df_UF = df_UF.loc[df_UF['count'] > 4,]
df_agg = df_UF.copy()

No correlations or VIFs are high enough for concern with all data combined.


In [17]:
# All cities combined: VIF per column (intercept included) and pairwise correlations
X = df_agg.drop(columns=['Res_geocode','count','Trip_Distance'])
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, col_idx)
            for col_idx in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

                Feature         VIF
0                 const  127.579704
1     DistSubcenter_res    1.392080
2        DistCenter_res    1.250864
3     UrbPopDensity_res    2.068669
4   UrbBuildDensity_res    1.205950
5   IntersecDensity_res    3.281989
6     street_length_res    1.926918
7         LU_UrbFab_res    1.731523
8           LU_Comm_res    1.229449
9          Commute_Trip    1.140095
10                  Age    1.064541


Unnamed: 0,DistSubcenter_res,DistCenter_res,UrbPopDensity_res,UrbBuildDensity_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res,Commute_Trip,Age
DistSubcenter_res,1.0,0.363,-0.334,-0.196,-0.384,0.225,-0.254,-0.229,-0.057,-0.004
DistCenter_res,0.363,1.0,-0.106,-0.135,-0.256,0.104,-0.183,-0.123,0.12,0.029
UrbPopDensity_res,-0.334,-0.106,1.0,0.382,0.623,-0.33,0.492,-0.028,0.245,0.035
UrbBuildDensity_res,-0.196,-0.135,0.382,1.0,0.264,-0.111,0.151,0.09,0.115,0.052
IntersecDensity_res,-0.384,-0.256,0.623,0.264,1.0,-0.676,0.557,0.089,0.181,-0.036
street_length_res,0.225,0.104,-0.33,-0.111,-0.676,1.0,-0.382,-0.045,-0.105,0.069
LU_UrbFab_res,-0.254,-0.183,0.492,0.151,0.557,-0.382,1.0,-0.202,0.002,0.064
LU_Comm_res,-0.229,-0.123,-0.028,0.09,0.089,-0.045,-0.202,1.0,0.063,-0.185
Commute_Trip,-0.057,0.12,0.245,0.115,0.181,-0.105,0.002,0.063,1.0,-0.111
Age,-0.004,0.029,0.035,0.052,-0.036,0.069,0.064,-0.185,-0.111,1.0
