In [1]:
# Compute summary statistics of travel for the combined "DE other" and "FR other" city groups

In [13]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from math import trunc

In [3]:
# Harmonisation dictionaries for the German surveys, stored as pickle files.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path,'rb') as handle:
        return pickle.load(handle)

# var_all / value_all / na_all are indexed by table ('HH', 'P', 'W') in get_sP_sH_sW;
# city_plz is indexed by city name and used there to filter on postcodes.
var_all = _read_pickle('../dictionaries/Germany_var.pkl')
value_all = _read_pickle('../dictionaries/Germany_val.pkl')
na_all = _read_pickle('../dictionaries/Germany_na.pkl')
city_plz = _read_pickle('../dictionaries/city_postcode_DE.pkl')

In [5]:
def get_sP_sH_sW(city):
    """Load, harmonise and filter the SrV 2018 survey tables of one German city.

    The household (H), person (P) and trip (W) CSV files of `city` are read and recoded
    with the module-level dictionaries:

    * ``var_all[table][k]``   - raw column that harmonised column ``k`` is taken from,
    * ``value_all[table][k]`` - mapping applied to that raw column (used as-is when empty),
    * ``na_all[table]``       - fill values passed to ``DataFrame.fillna``.

    Only the harmonised columns are kept. The tables are then restricted to households
    whose ``Res_geocode`` is listed in ``city_plz[city]``.

    Parameters
    ----------
    city : str
        City name as it appears in the SrV folder/file names and as key of ``city_plz``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sP, sH, sW)``, each with a ``City`` column set to `city`:

        * ``sP`` - persons with ``Age >= 0`` living in the city postcodes,
        * ``sH`` - households in the city postcodes (company cars count as car ownership),
        * ``sW`` - trips with ``Trip_Valid == 1`` made by those persons that start or end
          in the city postcodes.
    """
    fn_hh='../../MSCA_data/SrV/' + city + '/SrV2018_Einzeldaten_' + city + '_SciUse_H2018.csv'
    fn_p='../../MSCA_data/SrV/' + city + '/SrV2018_Einzeldaten_' + city + '_SciUse_P2018.csv'
    fn_t='../../MSCA_data/SrV/' + city + '/SrV2018_Einzeldaten_' + city + '_SciUse_W2018.csv'

    def _recode(frame, table):
        # add one harmonised column per dictionary key, mapping values where a mapping exists
        for k in var_all[table].keys():
            if len(value_all[table][k])>0:
                frame[k]=frame[var_all[table][k]].map(value_all[table][k])
            else:
                frame[k]=frame[var_all[table][k]]

    def _comma_to_float(series):
        # the raw files use ',' as decimal separator
        return series.map(lambda x: x.replace(',','.')).astype('float')

    def _ids_first(frame, first):
        # move the id columns in `first` to the left; copy so later edits act on an independent frame
        return frame[first + [c for c in frame.columns.tolist() if c not in first]].copy()

    # NOTE(review): the recorded cell output echoes the P and W read_csv lines below, which
    # looks like pandas' mixed-dtype warning - confirm and consider passing explicit dtypes.
    sH=pd.read_csv(fn_hh,encoding='latin_1',sep=';',dtype={'PLZ':str,'GEWICHT_HH':str})
    sH.dropna(subset=['HHNR'],inplace=True)
    sP=pd.read_csv(fn_p,encoding='latin_1',sep=';')
    sW=pd.read_csv(fn_t,encoding='latin_1',sep=';',dtype={'V_START_PLZ':str,'V_ZIEL_PLZ':str})

    _recode(sH,'HH')
    _recode(sP,'P')
    _recode(sW,'W')

    # fill in na's as necessary
    sH.fillna(value=na_all['HH'],inplace=True)
    sP.fillna(value=na_all['P'],inplace=True)
    sW.fillna(value=na_all['W'],inplace=True)

    # keep only variables needed, i.e. the variables that are included as dictionary keys
    # (.copy() so the assignments below do not write into a slice of the raw frame)
    sH=sH[list(value_all['HH'].keys())].copy()
    sP=sP[list(value_all['P'].keys())].copy()
    sW=sW[list(value_all['W'].keys())].copy()

    # change decimal from , to . and convert those variables from string to float.
    # Plain column assignment replaces the column, so the result has float dtype;
    # `frame.loc[:, col] = ...` may instead write into the existing object column in place.
    sH['HH_Weight']=_comma_to_float(sH['HH_Weight'])
    sP['Per_Weight']=_comma_to_float(sP['Per_Weight'])
    for col in ['Trip_Weight','Trip_Distance','Trip_Distance_GIS']:
        sW[col]=_comma_to_float(sW[col])

    # define unique person and trip numbers
    sP['HH_PNR']=sP['HHNR'].astype('str')+'_'+sP['Person'].astype('str')
    sW['HH_PNR']=sW['HHNR'].astype('str')+'_'+sW['Person'].astype('str')
    sW['HH_P_WNR']=sW['HH_PNR']+'_'+sW['Trip'].astype('str')

    # bring the id columns to the left side of each table
    sH=_ids_first(sH,['HHNR','Res_geocode'])
    sP=_ids_first(sP,['HHNR','HH_PNR'])
    sW=_ids_first(sW,['HHNR','HH_PNR','HH_P_WNR'])

    # count company cars towards car ownership
    sH.loc[sH['CompanyCarHH']==1,'CarOwnershipHH']=1

    sW['Hour']=sW.loc[:,'Time']

    # Define trip time categories; hours not listed keep the default label
    sW['Trip_Time']='Nighttime Off-Peak'
    trip_periods={
        'AM_Rush':[6,7,8,9],
        'Lunch':[12,13],
        'PM Rush':[16,17,18],
        'Evening':[19,20,21],
        'Daytime Off-Peak':[10,11,14,15],
    }
    for label,hours in trip_periods.items():
        sW.loc[sW['Hour'].isin(hours),'Trip_Time']=label

    # remove rows with NA age
    sP=sP.loc[sP['Age']>=0,]

    # keep only valid trips; HHNR/Person are no longer needed once HH_PNR exists
    sW=sW.drop(columns=['HHNR','Person', 'Ori_Reason', 'Des_Reason','Time'])
    sW=sW.loc[sW['Trip_Valid']==1,:]

    # attach the residence geocode to persons and trips (inner merges on the shared columns,
    # so trips of persons removed above are dropped as well)
    sP=sP.merge(sH.loc[:,['HHNR','Res_geocode']])
    sW=sW.merge(sP.loc[:,['HH_PNR','Res_geocode']])

    # responses should be of people who live in the city, and trips should either start or end in the city
    plz=city_plz[city]
    sH=sH.loc[sH['Res_geocode'].isin(plz),:].copy()
    sP=sP.loc[sP['Res_geocode'].isin(plz),:].copy()
    sW=sW.loc[sW['Res_geocode'].isin(plz),:]
    sW=sW.loc[(sW['Ori_Plz'].isin(plz)) | (sW['Des_Plz'].isin(plz))].copy()

    sP['City']=city
    sH['City']=city
    sW['City']=city

    return sP, sH, sW

In [23]:
# Build the combined German data set: the harmonised trip-level files (df_*) read from
# ../outputs/Combined/ and the person / household / trip tables (sP_*, sH_*, sW_*)
# returned by get_sP_sH_sW, stacked over all cities.
city0='Dresden'
df0=pd.read_csv('../outputs/Combined/' + city0 + '_UF.csv',dtype={'Res_geocode':str})
df0['Commute_Trip']=0
df0.loc[df0['Trip_Purpose_Agg']=='Home↔Work','Commute_Trip']=1
df0['City']=city0
df_all=df0.copy()

sP0, sH0, sW0=get_sP_sH_sW(city0)
sP_all=sP0.copy()
sH_all=sH0.copy()
sW_all=sW0.copy()
cities0=['Leipzig','Magdeburg','Potsdam','Frankfurt am Main','Düsseldorf','Kassel']
for city1 in cities0:
    df1=pd.read_csv('../outputs/Combined/' + city1 + '_UF.csv',dtype={'Res_geocode':str})
    df1['Commute_Trip']=0
    df1.loc[df1['Trip_Purpose_Agg']=='Home↔Work','Commute_Trip']=1
    # label the rows with the city being added (previously every city was labelled city0)
    df1['City']=city1
    # only stack files with the same set of columns; `len(df1.columns==df_all.columns)`
    # was truthy for any equal-length column index and raised for unequal lengths
    if set(df1.columns)==set(df_all.columns):
        df_all=pd.concat([df_all,df1])
        print(city1, 'added.')
    else:
        print(city1, 'skipped, columns differ:', sorted(set(df1.columns)^set(df_all.columns)))

    sP1, sH1, sW1=get_sP_sH_sW(city1)
    # NOTE(review): only the number of person columns is compared here; the household and
    # trip tables are stacked without a check.
    if len(sP1.columns)==len(sP_all.columns):
        sP_all=pd.concat([sP_all,sP1])
        sH_all=pd.concat([sH_all,sH1])
        sW_all=pd.concat([sW_all,sW1])
        print(city1, 'added again.')

# keep the German result under its own names; df_all / s*_all are reused for France below
df_DE=df_all.copy()
sP_DE=sP_all.copy()
sH_DE=sH_all.copy()
sW_DE=sW_all.copy()

  sW=pd.read_csv(fn_t,encoding='latin_1',sep=';',dtype={'V_START_PLZ':str,'V_ZIEL_PLZ':str})


Leipzig added.


  sW=pd.read_csv(fn_t,encoding='latin_1',sep=';',dtype={'V_START_PLZ':str,'V_ZIEL_PLZ':str})


Leipzig added again.
Magdeburg added.


  sW=pd.read_csv(fn_t,encoding='latin_1',sep=';',dtype={'V_START_PLZ':str,'V_ZIEL_PLZ':str})


Magdeburg added again.
Potsdam added.
Potsdam added again.
Frankfurt am Main added.
Frankfurt am Main added again.
Düsseldorf added.


  sP=pd.read_csv(fn_p,encoding='latin_1',sep=';')
  sW=pd.read_csv(fn_t,encoding='latin_1',sep=';',dtype={'V_START_PLZ':str,'V_ZIEL_PLZ':str})


Düsseldorf added again.
Kassel added.


  df1=pd.read_csv('../outputs/Combined/' + city1 + '_UF.csv',dtype={'Res_geocode':str})
  sW=pd.read_csv(fn_t,encoding='latin_1',sep=';',dtype={'V_START_PLZ':str,'V_ZIEL_PLZ':str})


Kassel added again.


In [24]:
# Working copies for the summary statistics, so the combined German frames stay untouched.
sHPW, sP, sH, sW = (frame.copy() for frame in (df_DE, sP_DE, sH_DE, sW_DE))

In [25]:
# Weighted travel summary: daily travel per capita and mode share by mode, car ownership,
# and average trip distance for all trips and for commute trips.
# NOTE(review): HH_PNR is used as the person key across all stacked cities - confirm the
# ids are unique between cities.

# total distance per person, for persons that appear in sHPW
daily_dist=sHPW.groupby('HH_PNR')['Trip_Distance'].sum().reset_index()
# right join onto the full person list, so persons without a distance get NaN
daily_dist_all=daily_dist.merge(sP['HH_PNR'],how='right')
# persons without a distance who nevertheless have rows in the trip table, i.e. they did
# travel but their travel was recorded as invalid for some reason
no_distance_ids=daily_dist_all.loc[daily_dist_all['Trip_Distance'].isna(),'HH_PNR']
HH_PNR_na=sW.loc[sW['HH_PNR'].isin(no_distance_ids),'HH_PNR']
na_PNR=HH_PNR_na.drop_duplicates().tolist()

# person-weighted distance of every trip, leaving out the persons identified above
trip_columns=['HH_PNR','Per_Weight','Mode','Trip_Distance','Trip_Purpose_Agg']
weighted=sHPW.loc[~sHPW['HH_PNR'].isin(na_PNR),trip_columns]
weighted['Dist_Weighted_P']=weighted['Per_Weight']*weighted['Trip_Distance']

print('Person weights and trip weights are all the same: ' ,all(sHPW['Per_Weight']==sHPW['Trip_Weight']))

# the population is taken from the whole person table, so people who did not travel on
# the survey date are included in the per-capita denominator
unique_persons=sP[['HH_PNR','Per_Weight']].drop_duplicates()
unique_persons=unique_persons[~unique_persons['HH_PNR'].isin(na_PNR)]

# weighted distance per mode, scaled by 0.001, per weighted person
population_weight=unique_persons['Per_Weight'].sum()
distance_by_mode=weighted.groupby('Mode')['Dist_Weighted_P'].sum()
weight_daily_travel=(0.001*distance_by_mode/population_weight).rename('Daily_Travel_cap').reset_index()

# weighted mean trip distance: all trips and Home↔Work trips
is_commute=weighted['Trip_Purpose_Agg']=='Home↔Work'
commute_avg=0.001*weighted.loc[is_commute,'Dist_Weighted_P'].sum()/weighted.loc[is_commute,'Per_Weight'].sum()
trip_avg=0.001*weighted['Dist_Weighted_P'].sum()/weighted['Per_Weight'].sum()
weight_trip_avg=pd.DataFrame(data={'Mode':['All','Commute'],'Avg_trip_dist':[trip_avg,commute_avg]})

weight_daily_travel['Mode_Share']=weight_daily_travel['Daily_Travel_cap']/weight_daily_travel['Daily_Travel_cap'].sum()

# household-weighted car ownership over all households, reported on the 'Car' row
carown=sH[['HHNR','HH_Weight','CarOwnershipHH']].drop_duplicates()
car_share=sum(carown['CarOwnershipHH']*carown['HH_Weight'])/sum(carown['HH_Weight'])
own=pd.DataFrame(data={'Mode':['Car'],'Ownership':car_share})

# assemble the table: ownership on the 'Car' row, then the 'All' / 'Commute' rows
weight_daily_travel=weight_daily_travel.merge(own,how='left').merge(weight_trip_avg,how='outer')
# the 'All' row carries the sum over the modes
weight_daily_travel.loc[weight_daily_travel['Mode']=='All','Daily_Travel_cap']=weight_daily_travel['Daily_Travel_cap'].sum()

Person weights and trip weights are all the same:  True


In [26]:
weight_daily_travel

Unnamed: 0,Mode,Daily_Travel_cap,Mode_Share,Ownership,Avg_trip_dist
0,2_3_Wheel,0.101793,0.00679,,
1,Bike,1.598363,0.106624,,
2,Car,8.902532,0.59387,0.707317,
3,Foot,0.743414,0.049592,,
4,Transit,3.644605,0.243124,,
5,All,14.990707,,,5.016166
6,Commute,,,,9.885428


In [38]:
# Survey files per French city: 'hh' households, 'p' persons, 't' trips and, only for some
# cities, 'hh0' the original household file that holds the household income variable.
_FR_SURVEY_FILES = {
    'Clermont': {
        'hh': '../../MSCA_data/FranceRQ/lil-0924_Clermont.csv/Csv/Fichiers_Standard_Face_a_face/clermontfer_2012_std_faf_men.csv',
        'p': '../../MSCA_data/FranceRQ/lil-0924_Clermont.csv/Csv/Fichiers_Standard_Face_a_face/clermontfer_2012_std_faf_pers.csv',
        't': '../../MSCA_data/FranceRQ/lil-0924_Clermont.csv/Csv/Fichiers_Standard_Face_a_face/clermontfer_2012_std_faf_depl.csv',
        'hh0': '../../MSCA_data/FranceRQ/lil-0924_Clermont.csv/Csv/Fichiers_Original_Face_a_face/clermontfer_2012_ori_faf_men.csv',
    },
    'Toulouse': {
        'hh': '../../MSCA_data/FranceRQ/lil-0933_Toulouse.csv/Csv/Fichiers_Standard/toulouse_2013_std_men.csv',
        'p': '../../MSCA_data/FranceRQ/lil-0933_Toulouse.csv/Csv/Fichiers_Standard/toulouse_2013_std_pers.csv',
        't': '../../MSCA_data/FranceRQ/lil-0933_Toulouse.csv/Csv/Fichiers_Standard/toulouse_2013_std_depl.csv',
        'hh0': '../../MSCA_data/FranceRQ/lil-0933_Toulouse.csv/Csv/Fichiers_Original/toulouse_2013_ori_men.csv',
    },
    'Montpellier': {
        'hh': '../../MSCA_data/FranceRQ/lil-0937_Montpellier.csv/Csv/Fichiers_Standard_Face_a_face/montpellier_2014_std_faf_men.csv',
        'p': '../../MSCA_data/FranceRQ/lil-0937_Montpellier.csv/Csv/Fichiers_Standard_Face_a_face/montpellier_2014_std_faf_pers.csv',
        't': '../../MSCA_data/FranceRQ/lil-0937_Montpellier.csv/Csv/Fichiers_Standard_Face_a_face/montpellier_2014_std_faf_depl.csv',
    },
    'Lyon': {
        'hh': '../../MSCA_data/FranceRQ/lil-1023_Lyon.csv/Csv/Fichiers_Standard_Face_a_face/lyon_2015_std_faf_men.csv',
        'p': '../../MSCA_data/FranceRQ/lil-1023_Lyon.csv/Csv/Fichiers_Standard_Face_a_face/lyon_2015_std_faf_pers.csv',
        't': '../../MSCA_data/FranceRQ/lil-1023_Lyon.csv/Csv/Fichiers_Standard_Face_a_face/lyon_2015_std_faf_depl.csv',
    },
    'Nantes': {
        'hh': '../../MSCA_data/FranceRQ/lil-1024_Nantes.csv/Csv/Fichiers_Standard_Face_a_face/nantes_2015_std_faf_men.csv',
        'p': '../../MSCA_data/FranceRQ/lil-1024_Nantes.csv/Csv/Fichiers_Standard_Face_a_face/nantes_2015_std_faf_pers.csv',
        't': '../../MSCA_data/FranceRQ/lil-1024_Nantes.csv/Csv/Fichiers_Standard_Face_a_face/nantes_2015_std_faf_depl.csv',
    },
    'Nimes': {
        'hh': '../../MSCA_data/FranceRQ/lil-1135_Nimes.csv/Csv/Fichiers_Standard/nimes_2015_std_men.csv',
        'p': '../../MSCA_data/FranceRQ/lil-1135_Nimes.csv/Csv/Fichiers_Standard/nimes_2015_std_pers.csv',
        't': '../../MSCA_data/FranceRQ/lil-1135_Nimes.csv/Csv/Fichiers_Standard/nimes_2015_std_depl.csv',
    },
    'Lille': {
        'hh': '../../MSCA_data/FranceRQ/lil-1152_Lille.csv/Csv/Fichiers_Standard/lille_2016_std_men.csv',
        'p': '../../MSCA_data/FranceRQ/lil-1152_Lille.csv/Csv/Fichiers_Standard/lille_2016_std_pers.csv',
        't': '../../MSCA_data/FranceRQ/lil-1152_Lille.csv/Csv/Fichiers_Standard/lille_2016_std_depl.csv',
    },
    'Dijon': {
        'hh': '../../MSCA_data/FranceRQ/lil-1214_Dijon.csv/Csv/Fichiers_Standard_Face_a_face/dijon_2016_std_faf_men.csv',
        'p': '../../MSCA_data/FranceRQ/lil-1214_Dijon.csv/Csv/Fichiers_Standard_Face_a_face/dijon_2016_std_faf_pers.csv',
        't': '../../MSCA_data/FranceRQ/lil-1214_Dijon.csv/Csv/Fichiers_Standard_Face_a_face/dijon_2016_std_faf_depl.csv',
    },
}


def get_FR(city):
    """Load and harmonise the household, person and trip survey tables of one French city.

    The three CSV files of `city` are read and recoded with the city's own pickled
    dictionaries in ``../dictionaries/`` (``<city>_var.pkl``, ``<city>_val.pkl``,
    ``<city>_na.pkl``). Household, person and trip ids are built from sector/zone and
    sample numbers. Household size, education/occupation fixes, a combined car-parking
    flag and harmonised trip purposes are then derived.

    Parameters
    ----------
    city : str
        One of the keys of ``_FR_SURVEY_FILES``; also used in the dictionary file names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sP, sH, sW)``: persons, households and trips, each with a ``City`` column set
        to `city`.

    Raises
    ------
    ValueError
        If no survey files are registered for `city`.
    """
    print(city)
    if city not in _FR_SURVEY_FILES:
        raise ValueError('No survey files defined for city: ' + str(city))
    files=_FR_SURVEY_FILES[city]

    sH=pd.read_csv(files['hh'],sep=';')
    sP=pd.read_csv(files['p'],sep=';')
    sW=pd.read_csv(files['t'],sep=';')

    # load pickled dictionary files (pickle can execute code on load: only use trusted files)
    with open('../dictionaries/' + city + '_var.pkl','rb') as f:
        var_all = pickle.load(f)

    with open('../dictionaries/' + city + '_val.pkl','rb') as f:
        value_all = pickle.load(f)

    with open('../dictionaries/' + city + '_na.pkl','rb') as f:
        na_all = pickle.load(f)

    def _recode(frame, table):
        # add one harmonised column per dictionary key, mapping values where a mapping exists
        for k in var_all[table].keys():
            if len(value_all[table][k])>0:
                frame[k]=frame[var_all[table][k]].map(value_all[table][k])
            else:
                frame[k]=frame[var_all[table][k]]

    def _ids_first(frame, first):
        # move the id columns in `first` to the left; copy so later edits act on an independent frame
        return frame[first + [c for c in frame.columns.tolist() if c not in first]].copy()

    def _set_person_trip_ids():
        # person and trip ids are the household id extended by person and trip number
        sP['HH_PNR']=sP['HHNR']+'_'+sP['Person'].astype('str')
        sW['HH_PNR']=sW['HHNR']+'_'+sW['Person'].astype('str')
        sW['HH_P_WNR']=sW['HH_PNR']+'_'+sW['Trip'].astype('str')

    _recode(sH,'HH')
    _recode(sP,'P')
    _recode(sW,'W')

    # fill in na's as necessary
    sH.fillna(value=na_all['HH'],inplace=True)
    sP.fillna(value=na_all['P'],inplace=True)
    sW.fillna(value=na_all['W'],inplace=True)

    # keep only variables needed, i.e. the variables that are included as dictionary keys
    # (.copy() so the assignments below do not write into a slice of the raw frame)
    sH=sH[list(value_all['HH'].keys())].copy()
    sP=sP[list(value_all['P'].keys())].copy()
    sW=sW[list(value_all['W'].keys())].copy()

    # define sector (geo_unit), zone and household id !! this is city specific !!
    # Sector_Zone is split as: thousands and above -> geo_unit, last three digits -> Zone
    for frame in (sH,sP,sW):
        frame['geo_unit'] = frame['Sector_Zone'].apply(lambda y: trunc(0.001*y))
        frame['Zone'] = frame['Sector_Zone'].apply(lambda y: y % 1000)
        frame['HHNR']=frame['geo_unit'].astype('str')+'_'+frame['Sample'].astype('str')
    _set_person_trip_ids()

    if not sH['HHNR'].is_unique:
        print('Unique HHNR not found with sector + sample. Defining HHNR instead with sector_zone + sample.')
        for frame in (sH,sP,sW):
            frame['HHNR']=frame['Sector_Zone'].astype('str').str.zfill(6)+'_'+frame['Sample'].astype('str')
        _set_person_trip_ids()

        if not sH['HHNR'].is_unique:
            print('Unique HHNR still not found with sector_zone + sample.')

    # merge the household income variable into the household file !! this is only for some cities !!
    if 'hh0' in files:
        sH0=pd.read_csv(files['hh0'],sep=';')
        _recode(sH0,'HH0')
        sH0=sH0[list(value_all['HH0'].keys())].copy()
        # NOTE(review): this key is always sector + sample; if the fallback above switched
        # HHNR to sector_zone + sample, the merge below would not find matches - confirm.
        sH0['HHNR']=sH0['geo_unit'].astype('str')+'_'+sH0['Sample'].astype('str')
        sH0.drop(columns=['Sample','geo_unit'],inplace=True)
        sH=sH.merge(sH0,on='HHNR')

    # bring the id columns to the left side of each table
    sH=_ids_first(sH,['HHNR','geo_unit'])
    sP=_ids_first(sP,['HHNR','HH_PNR'])
    sW=_ids_first(sW,['HHNR','HH_PNR','HH_P_WNR'])

    # calculate household size from the sP data !! This is city specific !! at least in other surveys it is included as its own variable
    # groupby().size() gives the same counts as value_counts() and does not depend on how
    # value_counts().reset_index() names its columns, which differs between pandas versions
    hhs=sP.groupby('HHNR').size().rename('HHSize').reset_index()
    sH=sH.merge(hhs,on='HHNR')

    # address inconsistencies with the Person 'Education' variable, arising from respondents who have completed a certain level of education/training responding no diploma yet, even though they have lower diplomas than the one they are currently studying for
    # these assumptions are based on French law, in which it is mandatory to go to school until age 16 (end of secondary school), so anyone in an occupation post-16 is very likely to have some education. https://www.expatica.com/fr/education/children-education/french-education-system-101147/
    # they may need to be defined differently for non-French surveys
    # the rules are applied in sequence: each one only touches rows still marked 'No diploma yet'
    sP.loc[(sP['Age']>11) & (sP['Age']<17) & (sP['Education']=='No diploma yet'),'Education']="Elementary" # if aged between 12 and 16, assume at least an Elementary education
    sP.loc[(sP['Age']>16) & (sP['Age']<20) & (sP['Education']=='No diploma yet'),'Education']="Secondary" # if aged between 17 and 19, assume at least a Secondary education.
    sP.loc[(sP['Age']>15) & (sP['Occupation']=='Student_3rdLevel') & (sP['Education']=='No diploma yet'),'Education']="Secondary+BAC" # if a 3rd level student, assume at least Secondary education with BAC
    sP.loc[(sP['Age']>15) & (sP['Occupation']=='Trainee') & (sP['Education']=='No diploma yet'),'Education']="Secondary" # If a trainee, assume at least a secondary education

    # address the NA values for Education and Occupation for children under 5
    sP.loc[sP['Age']<5,['Education','Occupation']]='Pre-School'

    # combine the two 'car parking available' variables
    sP['Work/Study_CarParkAvailable']=0
    sP.loc[(sP['Work/Study_CarParkAvailable1']==1) | (sP['Work/Study_CarParkAvailable2']==1),'Work/Study_CarParkAvailable']=1
    sP.drop(columns=['Sector_Zone','Zone','Sample','geo_unit','Work/Study_CarParkAvailable1','Work/Study_CarParkAvailable2'],inplace=True)

    # create the origin and destination id's based on what we extracted from the microdata !! this is city specific !!
    sW['Ori_geo_unit']=sW['Ori_Sec_Zone'].apply(lambda y: trunc(0.001*y))
    sW['Des_geo_unit']=sW['Des_Sec_Zone'].apply(lambda y: trunc(0.001*y))

    # The trip reasons of accompanied travellers (Ori_Reason2 / Des_Reason2) are not used:
    # not all cities specify the origin and destination of the accompanied person, so these
    # trips are simply classified as 'Accompanying/Kids'.

    # define the simplified origin and destination reasons. !! This is city specific !!
    # In Toulouse, 'Personal' refers to two categories: "Being looked after (childminder, crèche...)" and "Receiving care (health)"
    # 'Personal' was prev. classified as 'Care' but was converted for ease of harmonization with other cities.
    reason_groups={
        'Home':[1,2],
        'Work':[11,12,13,14,81],
        'Personal':[21,41],
        'School':[22,23,24,25,26,27,28,29,96,97],
        'Shopping':[30,31,32,33,34,35,82,98],
        'Leisure':[51,52,53,54],
        'Accompanying/Kids':[61,62,63,64,71,72,73,74],
    }
    for end in ['Ori','Des']:
        sW[end+'_Reason_Agg']='Other'
        for label,codes in reason_groups.items():
            sW.loc[sW[end+'_Reason1'].isin(codes),end+'_Reason_Agg']=label

    sW['trip_type_all']=sW['Ori_Reason_Agg']+'-'+sW['Des_Reason_Agg']

    # now calculate the detailed o-d trip purposes, this should be harmonized
    sW['Trip_Purpose']='Other'
    # trips whose two ends both fall in a pair of reasons; applied in order, later rules win
    pair_rules=[
        (['Home','Personal'],'Home↔Personal'),
        (['Home','Accompanying/Kids'],'Home↔Companion'),
        (['Work','Accompanying/Kids'],'Work↔Companion'),
        (['Home','Other'],'Other↔Home'),
    ]
    for reasons,label in pair_rules:
        sW.loc[(sW['Ori_Reason_Agg'].isin(reasons)) & (sW['Des_Reason_Agg'].isin(reasons)),'Trip_Purpose']=label
    # exact origin-destination combinations; these override the pair rules above
    exact_rules={
        'Home-Shopping':'Home-Shopping',
        'Shopping-Home':'Shopping-Home',
        'Home-School':'Home-School',
        'School-Home':'School-Home',
        'Home-Work':'Home-Work',
        'Work-Home':'Work-Home',
        'Home-Leisure':'Home-Leisure',
        'Leisure-Home':'Leisure-Home',
        'Shopping-Shopping':'Shopping',
        'Work-Work':'Work',
        'Work-Leisure':'Work-Leisure',
        'Work-Shopping':'Work-Shopping',
        'Leisure-Work':'Leisure-Work',
        'Leisure-Leisure':'Leisure',
        'Accompanying/Kids-Accompanying/Kids':'Companion',
    }
    for trip_type,label in exact_rules.items():
        sW.loc[sW['trip_type_all']==trip_type,'Trip_Purpose']=label

    # make the aggregated trip purpose
    sW['Trip_Purpose_Agg']='Other'
    aggregated_purposes={
        'Home↔Work':['Home-Work','Work-Home'],
        'Home↔School':['Home-School','School-Home'],
        'Home↔Shopping':['Home-Shopping','Shopping-Home'],
        'Home↔Companion':['Home↔Companion'],
        'Home↔Leisure':['Home-Leisure','Leisure-Home','Home↔Personal'],
    }
    for label,purposes in aggregated_purposes.items():
        sW.loc[sW['Trip_Purpose'].isin(purposes),'Trip_Purpose_Agg']=label

    # drop the raw id / reason columns; columns missing for this city are ignored
    sW.drop(columns=['Sector_Zone','Sample','Zone','geo_unit','HHNR','Person', 'Ori_Reason1','Ori_Reason2', 'Des_Reason1', 'Des_Reason2','Hour','Time'],inplace=True,errors='ignore')

    sP['City']=city
    sH['City']=city
    sW['City']=city

    return sP, sH, sW

In [None]:
# Build the combined French data set: the harmonised trip-level files (df_*) read from
# ../outputs/Combined/ and the person / household / trip tables (sP_*, sH_*, sW_*)
# returned by get_FR, stacked over all cities.
city0='Clermont'
df0=pd.read_csv('../outputs/Combined/' + city0 + '_UF.csv',dtype={'Res_geocode':str})
# the income columns are dropped wherever present (errors='ignore' covers files without them)
df0.drop(columns=['IncomeDetailed','IncomeHarmonised'],errors='ignore',inplace=True)
df0['Commute_Trip']=0
df0.loc[df0['Trip_Purpose_Agg']=='Home↔Work','Commute_Trip']=1
df0['City']=city0
df_all=df0.copy()

sP0, sH0, sW0=get_FR(city0)
sP_all=sP0.copy()
sH_all=sH0.copy()
sW_all=sW0.copy()
cities0=['Dijon','Lille','Lyon','Montpellier','Nantes','Nimes','Toulouse']
for city1 in cities0:
    print(city1)
    df1=pd.read_csv('../outputs/Combined/' + city1 + '_UF.csv',dtype={'Res_geocode':str})
    df1.drop(columns=['IncomeDetailed','IncomeHarmonised'],errors='ignore',inplace=True)
    df1['Commute_Trip']=0
    df1.loc[df1['Trip_Purpose_Agg']=='Home↔Work','Commute_Trip']=1
    # label the rows with the city being added (previously every city was labelled city0)
    df1['City']=city1
    # only stack files with the same set of columns; `len(df1.columns==df_all.columns)`
    # was truthy for any equal-length column index and raised for unequal lengths
    if set(df1.columns)==set(df_all.columns):
        df_all=pd.concat([df_all,df1])
        print(city1, 'added.')
    else:
        print(city1, 'skipped, columns differ:', sorted(set(df1.columns)^set(df_all.columns)))

    sP1, sH1, sW1=get_FR(city1)
    # NOTE(review): only the number of person columns is compared here; the household and
    # trip tables are stacked without a check.
    if len(sP1.columns)==len(sP_all.columns):
        sP_all=pd.concat([sP_all,sP1])
        sH_all=pd.concat([sH_all,sH1])
        sW_all=pd.concat([sW_all,sW1])
        print(city1, 'added again.')

In [40]:
# Working copies for the summary statistics, so the combined French frames stay untouched.
sHPW, sP, sH, sW = (frame.copy() for frame in (df_all, sP_all, sH_all, sW_all))

In [41]:
# Weighted travel summary: daily travel per capita and mode share by mode, car ownership,
# and average trip distance for all trips and for commute trips.
# NOTE(review): HH_PNR is used as the person key across all stacked cities - confirm the
# ids are unique between cities.

# daily distance per person
daily_dist=sHPW.groupby('HH_PNR')['Trip_Distance'].sum().to_frame().reset_index()
# daily distance per person, including people without travel distances, for a variety of reasons
daily_dist_all=daily_dist.merge(sP['HH_PNR'],how='right')
# get the household-person ids for those who did travel, but their travels were recorded as invalid for some reason
HH_PNR_na=sW.loc[sW['HH_PNR'].isin(daily_dist_all.loc[daily_dist_all['Trip_Distance'].isna(),'HH_PNR']),'HH_PNR']
na_PNR=HH_PNR_na.drop_duplicates().values.tolist()

# person-weighted distance of every trip, leaving out the persons identified above
# (columns selected with a list; .copy() so the new column is added to an independent frame)
weighted=sHPW.loc[:,['HH_PNR','Per_Weight','Mode','Trip_Distance','Trip_Purpose_Agg']]
weighted=weighted.loc[~weighted['HH_PNR'].isin(na_PNR),:].copy()
weighted['Dist_Weighted_P']=weighted['Per_Weight']*weighted['Trip_Distance']

# the weights compared below come from sHPW, so that is the frame the guard has to test
if 'Trip_Weight' in sHPW.columns:
    print('Person weights and trip weights are all the same: ' ,all(sHPW['Per_Weight']==sHPW['Trip_Weight']))
else:
    print('No trip weights, person weights used instead.')

# calculate number of persons using the whole sP file, so we can accurately calculate km/cap/day. i.e. including those who didn't travel on the survey date
unique_persons=sP.loc[:,['HH_PNR','Per_Weight']].drop_duplicates()
unique_persons=unique_persons.loc[~unique_persons['HH_PNR'].isin(na_PNR),:]

# weighted distance per mode, scaled by 0.001, per weighted person
weight_daily_travel=pd.DataFrame(0.001*weighted.groupby('Mode')['Dist_Weighted_P'].sum()/unique_persons['Per_Weight'].sum()).reset_index()
# weighted mean trip distance: Home↔Work trips and all trips
commute_avg=0.001*weighted.loc[weighted['Trip_Purpose_Agg']=='Home↔Work','Dist_Weighted_P'].sum()/weighted.loc[weighted['Trip_Purpose_Agg']=='Home↔Work','Per_Weight'].sum()
trip_avg=0.001*weighted['Dist_Weighted_P'].sum()/weighted['Per_Weight'].sum()
weight_trip_avg=pd.DataFrame(data={'Mode':['All','Commute'],'Avg_trip_dist':[trip_avg,commute_avg]})

weight_daily_travel.rename(columns={'Dist_Weighted_P':'Daily_Travel_cap'},inplace=True)
weight_daily_travel['Mode_Share']=weight_daily_travel['Daily_Travel_cap']/weight_daily_travel['Daily_Travel_cap'].sum()

# calculate household-weighted car ownership for all households, reported on the 'Car' row
carown=sH.loc[:,['HHNR','HH_Weight','CarOwnershipHH']].drop_duplicates()
own=pd.DataFrame(data={'Mode':['Car'],'Ownership':sum(carown['CarOwnershipHH']*carown['HH_Weight'])/sum(carown['HH_Weight'])})
weight_daily_travel=weight_daily_travel.merge(own,how='left')
# the outer merge appends the 'All' and 'Commute' rows
weight_daily_travel=weight_daily_travel.merge(weight_trip_avg,how='outer')
# the 'All' row carries the sum over the modes
weight_daily_travel.loc[weight_daily_travel['Mode']=='All','Daily_Travel_cap']=weight_daily_travel['Daily_Travel_cap'].sum()

No trip weights, person weights used instead.


In [42]:
weight_daily_travel

Unnamed: 0,Mode,Daily_Travel_cap,Mode_Share,Ownership,Avg_trip_dist
0,2_3_Wheel,0.183688,0.010722,,
1,Bike,0.209792,0.012246,,
2,Car,12.666457,0.739363,0.781252,
3,Foot,0.823116,0.048047,,
4,Transit,3.248538,0.189623,,
5,All,17.13159,,,4.987259
6,Commute,,,,9.281932
