## Correlation and Variance Inflation, Trip Distance (commute)

This is a notebook to compute variance inflation factors and correlation coefficients between urban form and selected other features for models of commute trip distance in 19 European cities.

In general, we try to avoid VIF > 5 and correlation coefficients with an absolute value > 0.7.

In [1]:
# script to model average trip distances for commute trips in all cities
# last update Peter Berrill Nov 20 2023

# load in required packages
import numpy as np
import pandas as pd
import shap
import re
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, cross_validate, GroupKFold, StratifiedGroupKFold, RepeatedKFold, StratifiedKFold, GridSearchCV, KFold
from sklearn import metrics, linear_model
from xgboost import XGBClassifier, XGBRegressor
import os
import sys
import matplotlib.pyplot as plt
import pickle
import statsmodels.formula.api as smf
from datetime import datetime
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import seaborn as sns

# Cities (and pooled "other" groups) covered by the analysis, grouped by country.
cities_all = (
    ['Berlin', 'Dresden', 'Düsseldorf', 'Frankfurt am Main', 'Kassel', 'Leipzig', 'Magdeburg', 'Potsdam']  # Germany
    + ['Clermont', 'Dijon', 'Lille', 'Lyon', 'Montpellier', 'Nantes', 'Nimes', 'Paris', 'Toulouse']  # France
    + ['Madrid', 'Wien']  # Spain, Austria
    + ['France_other', 'Germany_other']  # pooled groups of smaller cities
)
# Country of each entry in cities_all; the two lists are index-aligned.
countries = ['Germany'] * 8 + ['France'] * 9 + ['Spain', 'Austria', 'France', 'Germany']


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [3]:
# Columns kept from every '<city>_UF.csv' file: six identifier columns first,
# then the candidate features, and the target 'Trip_Distance' last.
# ('PopDensity_res', 'BuildDensity_res' and 'bike_lane_share_res' are deliberately left out.)
_UF_COLUMNS = ['HH_P_WNR', 'HH_PNR', 'HHNR', 'Ori_geocode', 'Des_geocode', 'Res_geocode',
               'Trip_Time', 'Season', 'Trip_Purpose_Agg', 'HHSize',
               'Sex', 'Occupation', 'Education', 'Age',
               'UrbPopDensity_res', 'UrbBuildDensity_res', 'DistSubcenter_res', 'DistCenter_res',
               'IntersecDensity_res', 'street_length_res', 'LU_UrbFab_res',
               'LU_Comm_res', 'Trip_Distance']

# Geocode columns are always read as strings so that pandas does not infer mixed types.
_GEOCODE_DTYPES = {'Ori_geocode': str, 'Des_geocode': str, 'Res_geocode': str}

# Member cities of the two pooled groups, in the order in which their files are stacked.
_POOLED_CITIES = {
    'Germany_other': ['Dresden', 'Leipzig', 'Magdeburg', 'Potsdam', 'Frankfurt am Main', 'Düsseldorf', 'Kassel'],
    'France_other': ['Clermont', 'Toulouse', 'Montpellier', 'Lyon', 'Nantes', 'Nimes', 'Lille', 'Dijon'],
}

# Harmonised education levels used across all surveys.
_EDUCATION_GROUPS = {'University': 'University', 'Secondary': 'Secondary', 'Secondary+BAC': 'Secondary', 'Secondary+Matura': 'Secondary',
                     'Apprenticeship': 'Apprenticeship',
                     'Elementary': 'Primary/None', 'Pre-School': 'Primary/None', 'No diploma yet': 'Primary/None', 'Unknown': 'Primary/None', 'Other': 'Primary/None'}


def _load_city_trips(city, recode_apprenticeship):
    """Read one city's combined trip file and return a copy restricted to _UF_COLUMNS."""
    trips = pd.read_csv('../outputs/Combined/' + city + '_UF.csv', dtype=_GEOCODE_DTYPES)
    if recode_apprenticeship:
        # vocational training without a university degree counts as 'Apprenticeship'
        is_apprentice = (trips['Training'].isin(['Apprenticeship/Business', 'Craftsman/Technical'])) & (trips['Education'] != 'University')
        trips.loc[is_apprentice, 'Education'] = 'Apprenticeship'
    return trips.loc[:, _UF_COLUMNS].copy()


def _load_pooled_trips(member_cities, recode_apprenticeship):
    """Stack the trip files of several cities, prefixing the id columns with the city name."""
    frames = []
    for member in member_cities:
        trips = _load_city_trips(member, recode_apprenticeship)
        trips['City'] = member
        # No column-compatibility check is needed here: every frame is built from the
        # same _UF_COLUMNS selection, which raises KeyError if a column is missing.
        frames.append(trips)
    pooled = pd.concat(frames)
    # household / person / trip ids are only unique within a survey, so prefix them with the city
    pooled['HHNR'] = pooled['City'] + '_' + pooled['HHNR'].astype(int).astype(str)
    pooled['HH_PNR'] = pooled['City'] + '_' + pooled['HH_PNR'].astype(int).astype(str)
    pooled['HH_P_WNR'] = pooled['City'] + '_' + pooled['HH_P_WNR'].astype(str)
    return pooled.drop(columns='City')


def vif_corr(city):
    """Build the feature matrix used for the VIF and correlation checks of one city.

    The function itself does not compute VIFs or correlations; it loads the
    commute trips of `city` from '../outputs/Combined/<city>_UF.csv' (or of all
    member cities when `city` is 'Germany_other' or 'France_other'), cleans
    them and returns the model features.

    Parameters
    ----------
    city : str
        An entry of the file-level list `cities_all`.

    Returns
    -------
    pandas.DataFrame
        One row per retained trip. Columns are the feature columns, prefixed
        with 'FeatureD_', with object-typed features replaced by dummy columns.

    Raises
    ------
    ValueError
        If `city` is not in `cities_all` (raised by `list.index`).
    """
    country = countries[cities_all.index(city)]
    print(city, country)

    # the apprenticeship recoding relies on the 'Training' column and is applied to German surveys only
    recode = country == 'Germany'
    if city in _POOLED_CITIES:
        df_UF = _load_pooled_trips(_POOLED_CITIES[city], recode)
    else:
        df_UF = _load_city_trips(city, recode)

    # restrict to trips between home and work (commuting trips)
    df_UF = df_UF.loc[df_UF['Trip_Purpose_Agg'] == 'Home↔Work'].drop(columns='Trip_Purpose_Agg')
    # restrict to those in employment; copy so the assignments below act on an independent frame
    df_UF = df_UF.loc[df_UF['Occupation'].isin(['Trainee', 'Employed_FullTime', 'Employed_PartTime', 'Employed'])].copy()

    df_UF['Education'] = df_UF['Education'].map(_EDUCATION_GROUPS)
    if city in ['Clermont', 'Nimes']:
        df_UF.loc[df_UF['Education'] == 'Apprenticeship', 'Education'] = 'Secondary'

    df = df_UF.dropna().copy()
    df['Sex'] = df['Sex'] - 1  # change from [1,2] to [0,1], for plotting purposes
    # remove high building density outliers (For Leipzig)
    df = df.loc[df['UrbBuildDensity_res'] < 1e8].reset_index(drop=True)

    # identify the feature columns: everything between the id columns and the target
    N_non_feature = 6  # how many non-features are at the start of the df
    cols = df.columns
    # plain attribute assignment works on every pandas version (set_axis lost `inplace` in 2.0)
    df.columns = (cols[:N_non_feature].tolist()
                  + ('FeatureD' + '_' + cols[N_non_feature:-1]).tolist()
                  + cols[-1:].tolist())

    # convert all categorical feature columns to dummies and drop the originals
    cat_cols = [col for col in df.select_dtypes('object').columns if "FeatureD" in col]
    dummies = [pd.get_dummies(df[[col]]) for col in cat_cols]
    df = pd.concat([df] + dummies, axis=1).drop(columns=cat_cols)

    X = df[[col for col in df.columns if "FeatureD" in col]]
    return X

In [4]:
# Feature matrix for Berlin, with the 'FeatureD_' prefix stripped for display
X = vif_corr('Berlin')
X_disp = [re.sub('FeatureD_', '', name) for name in X.columns]
X.columns = X_disp
# Remove every column whose name matches the Education, Season, Trip_Time or Occupation groups
X = X.drop(columns=X.filter(regex='Education').columns)
X = X.drop(columns=X.filter(regex='Season').columns)
X = X.drop(columns=X.filter(regex='Trip_Time').columns)
X = X.drop(columns=X.filter(regex='Occupation').columns)
# VIF of each column of the design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
# Pairwise correlations between the retained features
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Berlin Germany
                Feature         VIF
0                 const  227.031972
1                HHSize    1.049988
2                   Sex    1.004526
3                   Age    1.084612
4     UrbPopDensity_res    4.185069
5   UrbBuildDensity_res    4.025003
6     DistSubcenter_res    1.408009
7        DistCenter_res    3.046970
8   IntersecDensity_res    2.917028
9     street_length_res    1.935022
10        LU_UrbFab_res    1.930529
11          LU_Comm_res    1.959696


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res
HHSize,1.0,-0.063,-0.127,-0.084,-0.101,0.085,0.094,-0.022,0.035,0.029,-0.076
Sex,-0.063,1.0,0.003,-0.001,0.0,0.005,0.009,0.003,-0.006,0.003,-0.004
Age,-0.127,0.003,1.0,-0.172,-0.151,0.105,0.166,-0.049,0.077,0.019,-0.115
UrbPopDensity_res,-0.084,-0.001,-0.172,1.0,0.768,-0.482,-0.668,0.62,-0.444,0.482,-0.04
UrbBuildDensity_res,-0.101,0.0,-0.151,0.768,1.0,-0.441,-0.749,0.534,-0.459,0.259,0.326
DistSubcenter_res,0.085,0.005,0.105,-0.482,-0.441,1.0,0.43,-0.337,0.178,-0.226,-0.148
DistCenter_res,0.094,0.009,0.166,-0.668,-0.749,0.43,1.0,-0.523,0.506,-0.143,-0.34
IntersecDensity_res,-0.022,0.003,-0.049,0.62,0.534,-0.337,-0.523,1.0,-0.655,0.561,-0.145
street_length_res,0.035,-0.006,0.077,-0.444,-0.459,0.178,0.506,-0.655,1.0,-0.298,-0.043
LU_UrbFab_res,0.029,0.003,0.019,0.482,0.259,-0.226,-0.143,0.561,-0.298,1.0,-0.395


In Berlin, VIFs are acceptable, although built-up density has a high correlation (0.768) with population density.

Repeating without built-up density leads to notably lower VIFs for population density.

In [5]:
# Berlin: VIFs improve once built-up density is left out
X.drop(columns='UrbBuildDensity_res', inplace=True)
# VIF of each column of the reduced design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

                Feature         VIF
0                 const  226.606180
1                HHSize    1.049856
2                   Sex    1.004491
3                   Age    1.082793
4     UrbPopDensity_res    2.870375
5     DistSubcenter_res    1.407901
6        DistCenter_res    2.872738
7   IntersecDensity_res    2.881996
8     street_length_res    1.932668
9         LU_UrbFab_res    1.929822
10          LU_Comm_res    1.599221


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,DistSubcenter_res,DistCenter_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res
HHSize,1.0,-0.063,-0.127,-0.084,0.085,0.094,-0.022,0.035,0.029,-0.076
Sex,-0.063,1.0,0.003,-0.001,0.005,0.009,0.003,-0.006,0.003,-0.004
Age,-0.127,0.003,1.0,-0.172,0.105,0.166,-0.049,0.077,0.019,-0.115
UrbPopDensity_res,-0.084,-0.001,-0.172,1.0,-0.482,-0.668,0.62,-0.444,0.482,-0.04
DistSubcenter_res,0.085,0.005,0.105,-0.482,1.0,0.43,-0.337,0.178,-0.226,-0.148
DistCenter_res,0.094,0.009,0.166,-0.668,0.43,1.0,-0.523,0.506,-0.143,-0.34
IntersecDensity_res,-0.022,0.003,-0.049,0.62,-0.337,-0.523,1.0,-0.655,0.561,-0.145
street_length_res,0.035,-0.006,0.077,-0.444,0.178,0.506,-0.655,1.0,-0.298,-0.043
LU_UrbFab_res,0.029,0.003,0.019,0.482,-0.226,-0.143,0.561,-0.298,1.0,-0.395
LU_Comm_res,-0.076,-0.004,-0.115,-0.04,-0.148,-0.34,-0.145,-0.043,-0.395,1.0


In [6]:
# Feature matrix for Paris, with the 'FeatureD_' prefix stripped for display
X = vif_corr('Paris')
X_disp = [re.sub('FeatureD_', '', name) for name in X.columns]
X.columns = X_disp
# Remove every column whose name matches the Education, Season, Trip_Time or Occupation groups
X = X.drop(columns=X.filter(regex='Education').columns)
X = X.drop(columns=X.filter(regex='Season').columns)
X = X.drop(columns=X.filter(regex='Trip_Time').columns)
X = X.drop(columns=X.filter(regex='Occupation').columns)
# VIF of each column of the design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
# Pairwise correlations between the retained features
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Paris France
                Feature         VIF
0                 const  376.262980
1                HHSize    1.046552
2                   Sex    1.007918
3                   Age    1.009086
4     UrbPopDensity_res    5.519999
5   UrbBuildDensity_res    4.168286
6     DistSubcenter_res    1.489145
7        DistCenter_res    3.615204
8   IntersecDensity_res    4.151349
9     street_length_res    2.128021
10        LU_UrbFab_res    2.504173
11          LU_Comm_res    1.811359


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res
HHSize,1.0,-0.075,-0.06,-0.166,-0.154,0.066,0.168,-0.157,0.095,-0.111,0.076
Sex,-0.075,1.0,0.042,0.008,0.01,0.001,0.0,0.0,-0.006,-0.008,0.011
Age,-0.06,0.042,1.0,0.03,0.046,-0.053,-0.043,0.016,-0.004,0.009,-0.012
UrbPopDensity_res,-0.166,0.008,0.03,1.0,0.821,-0.378,-0.772,0.684,-0.397,0.514,-0.307
UrbBuildDensity_res,-0.154,0.01,0.046,0.821,1.0,-0.447,-0.769,0.554,-0.247,0.278,-0.031
DistSubcenter_res,0.066,0.001,-0.053,-0.378,-0.447,1.0,0.528,-0.397,0.114,-0.107,-0.044
DistCenter_res,0.168,0.0,-0.043,-0.772,-0.769,0.528,1.0,-0.612,0.235,-0.294,0.108
IntersecDensity_res,-0.157,0.0,0.016,0.684,0.554,-0.397,-0.612,1.0,-0.64,0.612,-0.329
street_length_res,0.095,-0.006,-0.004,-0.397,-0.247,0.114,0.235,-0.64,1.0,-0.337,0.391
LU_UrbFab_res,-0.111,-0.008,0.009,0.514,0.278,-0.107,-0.294,0.612,-0.337,1.0,-0.57


In Paris, VIFs are high for population density and for the other density measures.

Repeating without built-up density leads to a notably lower VIF for population density. The VIF for intersection density is still a bit high (4.1) but acceptable.

Correlation remains high between population density and distance to center (-0.772), but we retain these variables due to their high importance.

In [7]:
# Paris: VIFs improve once built-up density is left out
X.drop(columns='UrbBuildDensity_res', inplace=True)
# VIF of each column of the reduced design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

                Feature         VIF
0                 const  374.397246
1                HHSize    1.045705
2                   Sex    1.007918
3                   Age    1.008345
4     UrbPopDensity_res    3.450744
5     DistSubcenter_res    1.476201
6        DistCenter_res    3.407399
7   IntersecDensity_res    4.149978
8     street_length_res    2.127007
9         LU_UrbFab_res    2.485582
10          LU_Comm_res    1.691672


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,DistSubcenter_res,DistCenter_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res
HHSize,1.0,-0.075,-0.06,-0.166,0.066,0.168,-0.157,0.095,-0.111,0.076
Sex,-0.075,1.0,0.042,0.008,0.001,0.0,0.0,-0.006,-0.008,0.011
Age,-0.06,0.042,1.0,0.03,-0.053,-0.043,0.016,-0.004,0.009,-0.012
UrbPopDensity_res,-0.166,0.008,0.03,1.0,-0.378,-0.772,0.684,-0.397,0.514,-0.307
DistSubcenter_res,0.066,0.001,-0.053,-0.378,1.0,0.528,-0.397,0.114,-0.107,-0.044
DistCenter_res,0.168,0.0,-0.043,-0.772,0.528,1.0,-0.612,0.235,-0.294,0.108
IntersecDensity_res,-0.157,0.0,0.016,0.684,-0.397,-0.612,1.0,-0.64,0.612,-0.329
street_length_res,0.095,-0.006,-0.004,-0.397,0.114,0.235,-0.64,1.0,-0.337,0.391
LU_UrbFab_res,-0.111,-0.008,0.009,0.514,-0.107,-0.294,0.612,-0.337,1.0,-0.57
LU_Comm_res,0.076,0.011,-0.012,-0.307,-0.044,0.108,-0.329,0.391,-0.57,1.0


In [8]:
# Feature matrix for Madrid, with the 'FeatureD_' prefix stripped for display
X = vif_corr('Madrid')
X_disp = [re.sub('FeatureD_', '', name) for name in X.columns]
X.columns = X_disp
# Remove every column whose name matches the Education, Season, Trip_Time or Occupation groups
X = X.drop(columns=X.filter(regex='Education').columns)
X = X.drop(columns=X.filter(regex='Season').columns)
X = X.drop(columns=X.filter(regex='Trip_Time').columns)
X = X.drop(columns=X.filter(regex='Occupation').columns)
# VIF of each column of the design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
# Pairwise correlations between the retained features
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Madrid Spain



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


                Feature         VIF
0                 const  222.577266
1                HHSize    1.029913
2                   Sex    1.001847
3                   Age    1.017704
4     UrbPopDensity_res    2.380048
5   UrbBuildDensity_res    1.248022
6     DistSubcenter_res    1.282889
7        DistCenter_res    1.594868
8   IntersecDensity_res    2.718169
9     street_length_res    2.072874
10        LU_UrbFab_res    2.098204
11          LU_Comm_res    1.311277


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res
HHSize,1.0,-0.035,-0.089,-0.11,-0.06,0.065,0.083,-0.106,0.06,-0.042,-0.005
Sex,-0.035,1.0,-0.004,-0.004,0.0,0.008,-0.011,0.001,-0.001,0.006,-0.012
Age,-0.089,-0.004,1.0,-0.006,0.023,0.025,-0.061,0.014,0.014,0.047,-0.02
UrbPopDensity_res,-0.11,-0.004,-0.006,1.0,0.216,-0.353,-0.278,0.542,-0.419,0.658,-0.175
UrbBuildDensity_res,-0.06,0.0,0.023,0.216,1.0,-0.225,-0.338,0.047,-0.002,0.148,-0.045
DistSubcenter_res,0.065,0.008,0.025,-0.353,-0.225,1.0,0.164,-0.289,0.289,-0.126,-0.142
DistCenter_res,0.083,-0.011,-0.061,-0.278,-0.338,0.164,1.0,-0.429,0.039,-0.203,0.096
IntersecDensity_res,-0.106,0.001,0.014,0.542,0.047,-0.289,-0.429,1.0,-0.635,0.402,-0.1
street_length_res,0.06,-0.001,0.014,-0.419,-0.002,0.289,0.039,-0.635,1.0,-0.269,-0.111
LU_UrbFab_res,-0.042,0.006,0.047,0.658,0.148,-0.126,-0.203,0.402,-0.269,1.0,-0.393


No concerns surrounding VIF or correlations in Madrid. 
However, the highest correlation coefficient, between population density and urban fabric land use (0.658), does coincide with suspicious coefficients in the linear regression models (a positive coefficient for pop density and a very large negative coefficient for LU UF).
Therefore, we will drop urban fabric land use, for more reliable regression coefficient estimates.

The VIF for pop density then drops from 2.38 to 1.65.

In [9]:
# Madrid: urban fabric land use is left out of this model (similar for DE, other)
X.drop(columns=['LU_UrbFab_res'], inplace=True)
# VIF of each column of the reduced design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

                Feature         VIF
0                 const  216.650848
1                HHSize    1.028034
2                   Sex    1.001762
3                   Age    1.013341
4     UrbPopDensity_res    1.653727
5   UrbBuildDensity_res    1.246587
6     DistSubcenter_res    1.272258
7        DistCenter_res    1.594677
8   IntersecDensity_res    2.712061
9     street_length_res    2.069157
10          LU_Comm_res    1.148918


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,IntersecDensity_res,street_length_res,LU_Comm_res
HHSize,1.0,-0.035,-0.089,-0.11,-0.06,0.065,0.083,-0.106,0.06,-0.005
Sex,-0.035,1.0,-0.004,-0.004,0.0,0.008,-0.011,0.001,-0.001,-0.012
Age,-0.089,-0.004,1.0,-0.006,0.023,0.025,-0.061,0.014,0.014,-0.02
UrbPopDensity_res,-0.11,-0.004,-0.006,1.0,0.216,-0.353,-0.278,0.542,-0.419,-0.175
UrbBuildDensity_res,-0.06,0.0,0.023,0.216,1.0,-0.225,-0.338,0.047,-0.002,-0.045
DistSubcenter_res,0.065,0.008,0.025,-0.353,-0.225,1.0,0.164,-0.289,0.289,-0.142
DistCenter_res,0.083,-0.011,-0.061,-0.278,-0.338,0.164,1.0,-0.429,0.039,0.096
IntersecDensity_res,-0.106,0.001,0.014,0.542,0.047,-0.289,-0.429,1.0,-0.635,-0.1
street_length_res,0.06,-0.001,0.014,-0.419,-0.002,0.289,0.039,-0.635,1.0,-0.111
LU_Comm_res,-0.005,-0.012,-0.02,-0.175,-0.045,-0.142,0.096,-0.1,-0.111,1.0


In [11]:
# Feature matrix for Wien (Vienna), with the 'FeatureD_' prefix stripped for display
X = vif_corr('Wien')
X_disp = [re.sub('FeatureD_', '', name) for name in X.columns]
X.columns = X_disp
# Remove every column whose name matches the Education, Season, Trip_Time or Occupation groups
X = X.drop(columns=X.filter(regex='Education').columns)
X = X.drop(columns=X.filter(regex='Season').columns)
X = X.drop(columns=X.filter(regex='Trip_Time').columns)
X = X.drop(columns=X.filter(regex='Occupation').columns)
# VIF of each column of the design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
# Pairwise correlations between the retained features
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Wien Austria
                Feature         VIF
0                 const  334.483550
1                HHSize    1.063261
2                   Sex    1.022925
3                   Age    1.015436
4     UrbPopDensity_res    7.474097
5   UrbBuildDensity_res    2.985189
6     DistSubcenter_res    2.231792
7        DistCenter_res    4.596859
8   IntersecDensity_res    9.262111
9     street_length_res    5.719510
10        LU_UrbFab_res    5.739174
11          LU_Comm_res    1.792086


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res
HHSize,1.0,-0.125,-0.0,-0.164,-0.156,0.013,0.153,-0.089,0.064,-0.09,-0.082
Sex,-0.125,1.0,0.003,-0.011,-0.006,-0.001,-0.022,-0.031,0.017,-0.026,0.048
Age,-0.0,0.003,1.0,-0.09,-0.085,0.027,0.087,-0.041,0.027,-0.058,-0.066
UrbPopDensity_res,-0.164,-0.011,-0.09,1.0,0.619,-0.197,-0.743,0.78,-0.669,0.845,-0.012
UrbBuildDensity_res,-0.156,-0.006,-0.085,0.619,1.0,-0.37,-0.684,0.558,-0.44,0.515,0.421
DistSubcenter_res,0.013,-0.001,0.027,-0.197,-0.37,1.0,0.545,-0.248,0.22,-0.354,-0.269
DistCenter_res,0.153,-0.022,0.087,-0.743,-0.684,0.545,1.0,-0.662,0.59,-0.626,-0.173
IntersecDensity_res,-0.089,-0.031,-0.041,0.78,0.558,-0.248,-0.662,1.0,-0.902,0.8,-0.169
street_length_res,0.064,0.017,0.027,-0.669,-0.44,0.22,0.59,-0.902,1.0,-0.718,0.183
LU_UrbFab_res,-0.09,-0.026,-0.058,0.845,0.515,-0.354,-0.626,0.8,-0.718,1.0,-0.053


Vienna has several problematic correlations. The highest VIF is for intersection density, which has a correlation of -0.9 with street length. LU urban fabric also has a high VIF and a high correlation with pop density.
After dropping intersection density and LU urban fabric, VIFs are much lower; the VIF for distance to center remains somewhat high at 4.0, but acceptable.
Correlation is high (-0.743) between population density and distance to center, but we keep both features due to their importance.

In [12]:
# Wien: leave out intersection density and urban fabric land use, then recompute.
# (The 'Occupation' columns were already removed when X was built in the previous
# cell, so the leftover filter on them was a no-op and has been removed.)
X.drop(columns=['IntersecDensity_res', 'LU_UrbFab_res'], inplace=True)
# VIF of each column of the reduced design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

               Feature         VIF
0                const  112.804796
1               HHSize    1.061433
2                  Sex    1.022084
3                  Age    1.015182
4    UrbPopDensity_res    3.381877
5  UrbBuildDensity_res    2.679027
6    DistSubcenter_res    1.702374
7       DistCenter_res    4.022401
8    street_length_res    2.116933
9          LU_Comm_res    1.636831


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,street_length_res,LU_Comm_res
HHSize,1.0,-0.125,-0.0,-0.164,-0.156,0.013,0.153,0.064,-0.082
Sex,-0.125,1.0,0.003,-0.011,-0.006,-0.001,-0.022,0.017,0.048
Age,-0.0,0.003,1.0,-0.09,-0.085,0.027,0.087,0.027,-0.066
UrbPopDensity_res,-0.164,-0.011,-0.09,1.0,0.619,-0.197,-0.743,-0.669,-0.012
UrbBuildDensity_res,-0.156,-0.006,-0.085,0.619,1.0,-0.37,-0.684,-0.44,0.421
DistSubcenter_res,0.013,-0.001,0.027,-0.197,-0.37,1.0,0.545,0.22,-0.269
DistCenter_res,0.153,-0.022,0.087,-0.743,-0.684,0.545,1.0,0.59,-0.173
street_length_res,0.064,0.017,0.027,-0.669,-0.44,0.22,0.59,1.0,0.183
LU_Comm_res,-0.082,0.048,-0.066,-0.012,0.421,-0.269,-0.173,0.183,1.0


In [16]:
# Feature matrix for the pooled 'Germany_other' group, with the 'FeatureD_' prefix stripped for display
X = vif_corr('Germany_other')
X_disp = [re.sub('FeatureD_', '', name) for name in X.columns]
X.columns = X_disp
# Remove every column whose name matches the Education, Season, Trip_Time or Occupation groups
X = X.drop(columns=X.filter(regex='Education').columns)
X = X.drop(columns=X.filter(regex='Season').columns)
X = X.drop(columns=X.filter(regex='Trip_Time').columns)
X = X.drop(columns=X.filter(regex='Occupation').columns)
# VIF of each column of the design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
# Pairwise correlations between the retained features
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

Germany_other Germany


Columns (4,5) have mixed types. Specify dtype option on import or set low_memory=False.


                Feature         VIF
0                 const  185.006054
1                HHSize    1.047870
2                   Sex    1.005770
3                   Age    1.042088
4     UrbPopDensity_res    3.263240
5   UrbBuildDensity_res    2.040155
6     DistSubcenter_res    1.199650
7        DistCenter_res    1.597555
8   IntersecDensity_res    3.866636
9     street_length_res    1.921117
10        LU_UrbFab_res    3.261613
11          LU_Comm_res    1.472240


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res
HHSize,1.0,-0.072,-0.109,-0.105,-0.108,0.031,0.117,-0.104,0.049,-0.075,-0.078
Sex,-0.072,1.0,0.013,-0.01,-0.01,0.006,0.008,-0.01,0.009,-0.012,0.004
Age,-0.109,0.013,1.0,-0.097,-0.084,0.068,0.128,-0.101,0.077,-0.069,-0.077
UrbPopDensity_res,-0.105,-0.01,-0.097,1.0,0.468,-0.15,-0.392,0.691,-0.514,0.772,0.064
UrbBuildDensity_res,-0.108,-0.01,-0.084,0.468,1.0,-0.373,-0.386,0.361,-0.255,0.274,0.511
DistSubcenter_res,0.031,0.006,0.068,-0.15,-0.373,1.0,0.247,-0.181,0.155,-0.088,-0.27
DistCenter_res,0.117,0.008,0.128,-0.392,-0.386,0.247,1.0,-0.552,0.328,-0.36,-0.217
IntersecDensity_res,-0.104,-0.01,-0.101,0.691,0.361,-0.181,-0.552,1.0,-0.683,0.739,0.063
street_length_res,0.049,0.009,0.077,-0.514,-0.255,0.155,0.328,-0.683,1.0,-0.513,-0.078
LU_UrbFab_res,-0.075,-0.012,-0.069,0.772,0.274,-0.088,-0.36,0.739,-0.513,1.0,0.007


VIFs are acceptably low for the rest of Germany. There is some high correlation of urban fabric land use with pop density (0.772) and with intersection density (0.739). Intersection density is also quite highly correlated with street length and population density.

Dropping urban fabric land use and intersection density leads to all very low VIFs and correlations.

In [17]:
# Germany_other: leave out urban fabric land use and intersection density, then recompute
X.drop(columns=['LU_UrbFab_res', 'IntersecDensity_res'], inplace=True)
# VIF of each column of the reduced design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

               Feature         VIF
0                const  101.048895
1               HHSize    1.046360
2                  Sex    1.005740
3                  Age    1.041378
4    UrbPopDensity_res    1.790977
5  UrbBuildDensity_res    1.985419
6    DistSubcenter_res    1.198307
7       DistCenter_res    1.335819
8    street_length_res    1.404958
9          LU_Comm_res    1.459573


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,street_length_res,LU_Comm_res
HHSize,1.0,-0.072,-0.109,-0.105,-0.108,0.031,0.117,0.049,-0.078
Sex,-0.072,1.0,0.013,-0.01,-0.01,0.006,0.008,0.009,0.004
Age,-0.109,0.013,1.0,-0.097,-0.084,0.068,0.128,0.077,-0.077
UrbPopDensity_res,-0.105,-0.01,-0.097,1.0,0.468,-0.15,-0.392,-0.514,0.064
UrbBuildDensity_res,-0.108,-0.01,-0.084,0.468,1.0,-0.373,-0.386,-0.255,0.511
DistSubcenter_res,0.031,0.006,0.068,-0.15,-0.373,1.0,0.247,0.155,-0.27
DistCenter_res,0.117,0.008,0.128,-0.392,-0.386,0.247,1.0,0.328,-0.217
street_length_res,0.049,0.009,0.077,-0.514,-0.255,0.155,0.328,1.0,-0.078
LU_Comm_res,-0.078,0.004,-0.077,0.064,0.511,-0.27,-0.217,-0.078,1.0


In [21]:
# Feature matrix for the pooled 'France_other' group, with the 'FeatureD_' prefix stripped for display
X = vif_corr('France_other')
X_disp = [re.sub('FeatureD_', '', name) for name in X.columns]
X.columns = X_disp
# Remove every column whose name matches the Education, Season, Trip_Time or Occupation groups
X = X.drop(columns=X.filter(regex='Education').columns)
X = X.drop(columns=X.filter(regex='Season').columns)
X = X.drop(columns=X.filter(regex='Trip_Time').columns)
X = X.drop(columns=X.filter(regex='Occupation').columns)
# VIF of each column of the design matrix (features plus intercept)
X_selc = add_constant(X)
vif = pd.DataFrame({
    'Feature': X_selc.columns,
    'VIF': [variance_inflation_factor(X_selc.values, i) for i in range(X_selc.shape[1])],
})
# Pairwise correlations between the retained features
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

France_other France



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


                Feature         VIF
0                 const  118.898692
1                HHSize    1.055082
2                   Sex    1.009441
3                   Age    1.032519
4     UrbPopDensity_res    3.345826
5   UrbBuildDensity_res    2.868368
6     DistSubcenter_res    1.535069
7        DistCenter_res    1.838679
8   IntersecDensity_res    4.004131
9     street_length_res    1.839450
10        LU_UrbFab_res    1.959435
11          LU_Comm_res    1.381690


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,IntersecDensity_res,street_length_res,LU_UrbFab_res,LU_Comm_res
HHSize,1.0,-0.076,-0.019,-0.136,-0.14,0.097,0.194,-0.176,0.111,-0.078,-0.058
Sex,-0.076,1.0,0.053,-0.011,-0.014,-0.003,-0.013,0.006,-0.019,0.01,-0.012
Age,-0.019,0.053,1.0,-0.129,-0.135,0.08,0.09,-0.124,0.07,-0.053,-0.09
UrbPopDensity_res,-0.136,-0.011,-0.129,1.0,0.697,-0.331,-0.47,0.667,-0.302,0.532,-0.025
UrbBuildDensity_res,-0.14,-0.014,-0.135,0.697,1.0,-0.334,-0.428,0.601,-0.258,0.151,0.283
DistSubcenter_res,0.097,-0.003,0.08,-0.331,-0.334,1.0,0.505,-0.483,0.373,-0.261,-0.261
DistCenter_res,0.194,-0.013,0.09,-0.47,-0.428,0.505,1.0,-0.615,0.356,-0.321,-0.192
IntersecDensity_res,-0.176,0.006,-0.124,0.667,0.601,-0.483,-0.615,1.0,-0.641,0.507,0.117
street_length_res,0.111,-0.019,0.07,-0.302,-0.258,0.373,0.356,-0.641,1.0,-0.315,-0.073
LU_UrbFab_res,-0.078,0.01,-0.053,0.532,0.151,-0.261,-0.321,0.507,-0.315,1.0,-0.255


Rest of France has acceptably low VIFs and correlations; the highest VIF is 4.0 for intersection density, which has a moderately high correlation with population density.
Dropping intersection density would also improve comparability between rest of France and rest of Germany, and would maintain consistency between dist agg and dist commute models for rest of France.

In [22]:
# Recompute the VIF table and correlation matrix without intersection density.
X = X.drop(columns=['IntersecDensity_res'])
X_selc = add_constant(X)  # adds the 'const' column that appears as row 0 of the VIF table
n_terms = X_selc.shape[1]
# one VIF per column of X_selc, labelled with the column name
vif = pd.DataFrame({'Feature': list(X_selc.columns),
                    'VIF': [variance_inflation_factor(X_selc.values, k) for k in range(n_terms)]})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

                Feature        VIF
0                 const  72.721763
1                HHSize   1.053835
2                   Sex   1.009433
3                   Age   1.032183
4     UrbPopDensity_res   3.233387
5   UrbBuildDensity_res   2.624639
6     DistSubcenter_res   1.526662
7        DistCenter_res   1.667172
8     street_length_res   1.282810
9         LU_UrbFab_res   1.818256
10          LU_Comm_res   1.380271


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,street_length_res,LU_UrbFab_res,LU_Comm_res
HHSize,1.0,-0.076,-0.019,-0.136,-0.14,0.097,0.194,0.111,-0.078,-0.058
Sex,-0.076,1.0,0.053,-0.011,-0.014,-0.003,-0.013,-0.019,0.01,-0.012
Age,-0.019,0.053,1.0,-0.129,-0.135,0.08,0.09,0.07,-0.053,-0.09
UrbPopDensity_res,-0.136,-0.011,-0.129,1.0,0.697,-0.331,-0.47,-0.302,0.532,-0.025
UrbBuildDensity_res,-0.14,-0.014,-0.135,0.697,1.0,-0.334,-0.428,-0.258,0.151,0.283
DistSubcenter_res,0.097,-0.003,0.08,-0.331,-0.334,1.0,0.505,0.373,-0.261,-0.261
DistCenter_res,0.194,-0.013,0.09,-0.47,-0.428,0.505,1.0,0.356,-0.321,-0.192
street_length_res,0.111,-0.019,0.07,-0.302,-0.258,0.373,0.356,1.0,-0.315,-0.073
LU_UrbFab_res,-0.078,0.01,-0.053,0.532,0.151,-0.261,-0.321,-0.315,1.0,-0.255
LU_Comm_res,-0.058,-0.012,-0.09,-0.025,0.283,-0.261,-0.192,-0.073,-0.255,1.0


In [23]:
# all city model: pool the combined '<city>_UF.csv' files of every city into df_UF
city='Berlin'
country=countries[cities_all.index(city)]
print(city, country)

# columns retained from every city file (commented-out names are deliberately excluded)
UF_COLS=['HH_P_WNR','HH_PNR', 'HHNR','Ori_geocode', 'Des_geocode','Res_geocode',
         'Trip_Time', 'Season','Trip_Purpose_Agg','HHSize',
         'Sex', 'Occupation', 'Education','Age',
         #'PopDensity_res','BuildDensity_res',
         'UrbPopDensity_res', 'UrbBuildDensity_res','DistSubcenter_res', 'DistCenter_res',
         'IntersecDensity_res', 'street_length_res', 'LU_UrbFab_res',#'bike_lane_share_res',
         'LU_Comm_res' ,'Trip_Distance']
ID_COLS=['HHNR','HH_PNR','HH_P_WNR']


def _load_city_uf(city_name, country_name, recode_apprenticeship=False, **read_csv_kwargs):
    """Read one city's combined file and return UF_COLS plus 'City' and 'Country' columns.

    city_name: city whose '../outputs/Combined/<city_name>_UF.csv' file is read.
    country_name: value written to the 'Country' column.
    recode_apprenticeship: if True, set Education to 'Apprenticeship' for rows whose
        'Training' is an apprenticeship/craft category and whose Education is not
        'University' (requires a 'Training' column; used for the German files).
    read_csv_kwargs: passed through to pd.read_csv (e.g. dtype=...).
    Raises KeyError if the file lacks any of the required columns.
    """
    df_city=pd.read_csv('../outputs/Combined/' + city_name + '_UF.csv', **read_csv_kwargs)
    if recode_apprenticeship:
        # add to make consistent with definition of apprenticeship within Education in other countries
        # (both halves of the mask come from this city's own rows)
        is_apprentice=(df_city['Training'].isin(['Apprenticeship/Business','Craftsman/Technical'])) & (df_city['Education']!='University')
        df_city.loc[is_apprentice,'Education']='Apprenticeship'
    df_city=df_city.loc[:,UF_COLS].copy()
    df_city['City']=city_name
    df_city['Country']=country_name
    return df_city


def _prefix_ids(df_in, integer_ids=False):
    """Return a copy with the ID columns prefixed by '<City>_' and the 'City' column dropped.

    integer_ids: if True, HHNR and HH_PNR are cast to int before being turned into
        strings (as done for the German files; presumably they are read as floats
        there - TODO confirm). HH_P_WNR is always converted with astype(str) only.
    """
    out=df_in.copy()
    for id_col in ID_COLS:
        ids=out[id_col]
        if integer_ids and id_col!='HH_P_WNR':
            ids=ids.astype(int)
        out[id_col]=out['City']+'_'+ids.astype(str)
    return out.drop(columns='City')


def _pool_country(first_city, other_cities, country_name, recode_apprenticeship=False, integer_ids=False):
    """Load and stack all cities of one country, then make their IDs unique across cities.

    Progress is printed for every city in other_cities. A city is only pooled when its
    columns are identical (names and order) to those of first_city.
    """
    frames=[_load_city_uf(first_city, country_name, recode_apprenticeship)]
    for city_name in other_cities:
        print(city_name)
        df_city=_load_city_uf(city_name, country_name, recode_apprenticeship)
        if df_city.columns.equals(frames[0].columns):
            frames.append(df_city)
            print(city_name, 'added.')
        else:
            print(city_name, 'skipped: columns differ from', first_city)
    # a single concat keeps each file's original row index, as repeated pairwise concat would
    return _prefix_ids(pd.concat(frames), integer_ids=integer_ids)


df_DE=_pool_country('Berlin',
                    ['Dresden', 'Leipzig','Magdeburg','Potsdam','Frankfurt am Main','Düsseldorf','Kassel'],
                    'Germany', recode_apprenticeship=True, integer_ids=True)
df_FR=_pool_country('Clermont',
                    ['Toulouse','Montpellier','Lyon','Nantes','Nimes','Lille','Dijon','Paris'],
                    'France')

# Madrid and Wien are single-city countries; their geocodes are read as strings
geocode_dtypes={'Ori_geocode': str, 'Des_geocode': str,'Res_geocode': str }
df_Madrid=_prefix_ids(_load_city_uf('Madrid','Spain',dtype=geocode_dtypes))
df_Wien=_prefix_ids(_load_city_uf('Wien','Austria',dtype=geocode_dtypes))

df_UF=pd.concat([df_DE,df_FR,df_Madrid,df_Wien],ignore_index=True)
df_UF['Trip_Purpose_Agg'].value_counts()

Berlin Germany


Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.


Dresden
Dresden added.
Leipzig
Leipzig added.
Magdeburg
Magdeburg added.
Potsdam
Potsdam added.
Frankfurt am Main
Frankfurt am Main added.
Düsseldorf
Düsseldorf added.
Kassel
Kassel added.


Columns (4,5) have mixed types. Specify dtype option on import or set low_memory=False.


Toulouse
Toulouse added.
Montpellier
Montpellier added.
Lyon
Lyon added.
Nantes
Nantes added.
Nimes
Nimes added.
Lille
Lille added.
Dijon
Dijon added.
Paris
Paris added.


Home↔Leisure      156611
Other             132906
Home↔Work         130623
Home↔Shopping     108008
Home↔School        83984
Home↔Companion     62002
Name: Trip_Purpose_Agg, dtype: int64

In [5]:
# Build the commute-trip feature matrix X from the pooled country dataframes.
# The pooled frame is rebuilt here so the cell does not depend on an earlier df_UF.
df_UF=pd.concat([df_DE,df_FR,df_Madrid,df_Wien],ignore_index=True)
# keep commute trips only; the purpose column is constant afterwards, so drop it
df_UF=df_UF.loc[df_UF['Trip_Purpose_Agg']=='Home↔Work',].drop(columns='Trip_Purpose_Agg')

# harmonise occupation and education categories across the national surveys;
# categories missing from a dict map to NaN and are removed by dropna() below
Occ_dict={'Employed_FullTime':'Employed','Employed_PartTime':'Employed','Employed':'Employed','Trainee':'Employed',
          'Student_School':'Student_School','Student_3rdLevel':'Student_3rdLevel','Pre-School':'Pre-School','Retired':'Retired',
          'Unemployed':'Unemployed/Other','Other':'Unemployed/Other','Home_Partner':'Unemployed/Other'}
Edu_dict={'University':'University','Secondary':'Secondary','Secondary+BAC':'Secondary','Secondary+Matura':'Secondary',
          'Apprenticeship':'Apprenticeship',
          'Elementary':'Primary/None','Pre-School':'Primary/None','No diploma yet':'Primary/None','Unknown':'Primary/None','Other':'Primary/None'}

df_UF['Occupation']=df_UF['Occupation'].map(Occ_dict)
df_UF['Education']=df_UF['Education'].map(Edu_dict)

# move the target (Trip_Distance) to the last column
df_UF=pd.concat([df_UF.drop(columns='Trip_Distance'),df_UF['Trip_Distance']],axis=1)

# explicit copy so the column assignment below does not hit a slice of df_UF
# (this is what raised SettingWithCopyWarning)
df=df_UF.dropna().copy()
df['Sex']=df['Sex']-1 # change from [1,2] to [0,1], for plotting purposes
df=df.loc[df['UrbBuildDensity_res']<1e8,]   # remove high building density outliers (For Leipzig)

# identify the feature columns
N_non_feature=6 # how many non-features are at the start of the df
cols=df.columns
# leading non-feature columns and the final target column keep their names;
# every column in between gets the 'FeatureD_' prefix
newcols=(cols[:N_non_feature].tolist()) + ['FeatureD' +'_'+ c for c in cols[N_non_feature:-1]] + (cols[-1:].tolist())
# change column names (set_axis without inplace: that keyword was removed in pandas 2.0)
df=df.set_axis(newcols,axis=1).reset_index(drop=True)
df0=df.copy()

# convert  all categorical variables to dummies
df_Cat=df.select_dtypes('object')[[col for col in df.select_dtypes('object').columns if "FeatureD" in col]]
dummy_frames=[pd.get_dummies(df[[col]]) for col in df_Cat]
# append the dummy columns and remove the original categorical columns
df=pd.concat([df]+dummy_frames, axis = 1).drop(columns=df_Cat.columns.tolist())

# independent copy: later cells rename and drop columns of X
X=df[[col for col in df.columns if "FeatureD" in col]].copy()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


No VIF or correlation concerns for the pooled all-cities model. Built-up density and population density are strongly correlated (0.697, just under the 0.7 threshold), but the VIFs are fine.

In [24]:
# VIF table and correlation matrix for the pooled all-city features (no correlation concerns).
# NOTE(review): the output saved with this cell is identical to the France_other table and
# has no IntersecDensity_res row, so X may not have been rebuilt from the pooled data before
# the last run - confirm by re-running the data-preparation cell first.
X.columns=[re.sub('FeatureD_','', name) for name in X.columns]  # strip the feature prefix for display
# drop the dummy-encoded groups; only numeric person and urban-form features are checked
for pattern in ['Education','Season','Trip_Time','Occupation','Country']:
    X = X.drop(columns=list(X.filter(regex=pattern)))
X_selc = add_constant(X)  # adds the 'const' column that appears as row 0 of the VIF table
n_terms = X_selc.shape[1]
vif = pd.DataFrame({'Feature': list(X_selc.columns),
                    'VIF': [variance_inflation_factor(X_selc.values, k) for k in range(n_terms)]})
corr = X.corr()
print(vif)
corr.style.format(precision=3).background_gradient(cmap='coolwarm')

                Feature        VIF
0                 const  72.721763
1                HHSize   1.053835
2                   Sex   1.009433
3                   Age   1.032183
4     UrbPopDensity_res   3.233387
5   UrbBuildDensity_res   2.624639
6     DistSubcenter_res   1.526662
7        DistCenter_res   1.667172
8     street_length_res   1.282810
9         LU_UrbFab_res   1.818256
10          LU_Comm_res   1.380271


Unnamed: 0,HHSize,Sex,Age,UrbPopDensity_res,UrbBuildDensity_res,DistSubcenter_res,DistCenter_res,street_length_res,LU_UrbFab_res,LU_Comm_res
HHSize,1.0,-0.076,-0.019,-0.136,-0.14,0.097,0.194,0.111,-0.078,-0.058
Sex,-0.076,1.0,0.053,-0.011,-0.014,-0.003,-0.013,-0.019,0.01,-0.012
Age,-0.019,0.053,1.0,-0.129,-0.135,0.08,0.09,0.07,-0.053,-0.09
UrbPopDensity_res,-0.136,-0.011,-0.129,1.0,0.697,-0.331,-0.47,-0.302,0.532,-0.025
UrbBuildDensity_res,-0.14,-0.014,-0.135,0.697,1.0,-0.334,-0.428,-0.258,0.151,0.283
DistSubcenter_res,0.097,-0.003,0.08,-0.331,-0.334,1.0,0.505,0.373,-0.261,-0.261
DistCenter_res,0.194,-0.013,0.09,-0.47,-0.428,0.505,1.0,0.356,-0.321,-0.192
street_length_res,0.111,-0.019,0.07,-0.302,-0.258,0.373,0.356,1.0,-0.315,-0.073
LU_UrbFab_res,-0.078,0.01,-0.053,0.532,0.151,-0.261,-0.321,-0.315,1.0,-0.255
LU_Comm_res,-0.058,-0.012,-0.09,-0.025,0.283,-0.261,-0.192,-0.073,-0.255,1.0
