In [1]:
import pandas as pd
# raise the row display limit so up to 200 rows are shown instead of the pandas default
pd.options.display.max_rows = 200

In [2]:
#load the MSOA-to-MSOA rail travel-time table from CSV
# NOTE(review): the earlier comment said this is the dis1.zip file from Maarten,
# but the file actually read is Data/dis3.csv - confirm which source is intended
travel_rail = pd.read_csv(filepath_or_buffer="Data/dis3.csv")
#display the frame to confirm the load worked
travel_rail

Unnamed: 0,origin_i,origin_msoacode,destination_j,destination_msoacode,data
0,0,E02000001,0,E02000001,1.274559
1,0,E02000001,1,E02000002,20.273945
2,0,E02000001,2,E02000003,18.789007
3,0,E02000001,3,E02000004,21.915258
4,0,E02000001,4,E02000005,18.904354
...,...,...,...,...,...
71166091,8435,S02001235,8431,S02001231,27.386349
71166092,8435,S02001235,8432,S02001232,3.891746
71166093,8435,S02001235,8433,S02001233,31.453745
71166094,8435,S02001235,8434,S02001234,1.363395


In [3]:
# Keep only England and Wales: remove every row whose origin OR destination
# MSOA code starts with 'S'
travel_rail = travel_rail[
    ~(
        travel_rail['origin_msoacode'].astype(str).str.startswith('S')
        | travel_rail['destination_msoacode'].astype(str).str.startswith('S')
    )
]
travel_rail

Unnamed: 0,origin_i,origin_msoacode,destination_j,destination_msoacode,data
0,0,E02000001,0,E02000001,1.274559
1,0,E02000001,1,E02000002,20.273945
2,0,E02000001,2,E02000003,18.789007
3,0,E02000001,3,E02000004,21.915258
4,0,E02000001,4,E02000005,18.904354
...,...,...,...,...,...
60746396,7200,W02000423,7196,W02000419,206.846510
60746397,7200,W02000423,7197,W02000420,175.686050
60746398,7200,W02000423,7198,W02000421,218.041530
60746399,7200,W02000423,7199,W02000422,1.339161


In [4]:
# drop the origin_i and destination_j columns and rename the data column to time_rail_min
# (done without inplace and with errors='ignore' so that re-running this cell is a
# no-op instead of a KeyError on the already-dropped columns)
travel_rail = (
    travel_rail
    .drop(columns=['origin_i', 'destination_j'], errors='ignore')
    .rename(columns={'data': 'time_rail_min'})
)

The data column (renamed to time_rail_min above) holds the travel time between MSOAs in minutes. We want to compute:

    1 - UNWEIGHTED Average Travel Time From MSOA to all Other MSOAs (by mode)

    2 - WEIGHTED Average Travel Time From MSOA to all Other MSOAs (by mode), where each destination is weighted by the commuter flow from the origin to that destination

1 - UNWEIGHTED Average Travel Time From MSOA to all Other MSOAs (by mode)

In [5]:
#group by origin MSOA and average the travel time over every destination row
# The numeric column is selected explicitly: the frame also holds the string column
# destination_msoacode, and pandas >= 2.0 raises on mean() instead of silently dropping it
# NOTE(review): the rows of an origin include the origin-to-itself pair, so the
# intra-MSOA time is part of this average - confirm that is intended
avg_rail = travel_rail.groupby("origin_msoacode")[['time_rail_min']].mean()
# rename the time_rail_min column
avg_rail = avg_rail.rename(columns={'time_rail_min': 'avg_time_from_origin_rail_UNWEIGHTED'})
avg_rail

Unnamed: 0_level_0,avg_time_from_origin_rail_UNWEIGHTED
origin_msoacode,Unnamed: 1_level_1
E02000001,106.993596
E02000002,123.991276
E02000003,122.518744
E02000004,124.616367
E02000005,122.390085
...,...
W02000419,184.373645
W02000420,172.831765
W02000421,281.459917
W02000422,191.265110


2 - WEIGHTED Average Travel Time From MSOA to all Other MSOAs (by mode)

In [6]:
#load the origin-destination flow table
# (downloaded from https://www.nomisweb.co.uk/census/2011/bulk/rOD1)
flow = pd.read_csv(filepath_or_buffer="Data/wu03ew_msoa.csv")
flow

Unnamed: 0,Area of residence,Area of workplace,All categories: Method of travel to work,Work mainly at or from home,"Underground, metro, light rail, tram",Train,"Bus, minibus or coach",Taxi,"Motorcycle, scooter or moped",Driving a car or van,Passenger in a car or van,Bicycle,On foot,Other method of travel to work
0,E02000001,E02000001,1506,0,73,41,32,9,1,8,1,33,1304,4
1,E02000001,E02000014,2,0,2,0,0,0,0,0,0,0,0,0
2,E02000001,E02000016,3,0,1,0,2,0,0,0,0,0,0,0
3,E02000001,E02000025,1,0,0,1,0,0,0,0,0,0,0,0
4,E02000001,E02000028,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2402196,W02000423,W02000411,6,0,0,0,3,0,0,1,0,0,2,0
2402197,W02000423,W02000412,58,0,0,0,10,0,1,29,1,4,13,0
2402198,W02000423,W02000415,3,0,0,0,0,0,0,3,0,0,0,0
2402199,W02000423,W02000422,525,0,1,2,17,2,0,125,11,31,333,3


In [7]:
# keep the residence/workplace codes plus the 'Underground, metro, light rail, tram' counts
# NOTE(review): the flow table also has a separate 'Train' column; confirm that
# 'Underground, metro, light rail, tram' is the intended weight for the rail times
flow = flow.loc[:, ['Area of residence', 'Area of workplace', 'Underground, metro, light rail, tram']]
flow

Unnamed: 0,Area of residence,Area of workplace,"Underground, metro, light rail, tram"
0,E02000001,E02000001,73
1,E02000001,E02000014,2
2,E02000001,E02000016,1
3,E02000001,E02000025,0
4,E02000001,E02000028,0
...,...,...,...
2402196,W02000423,W02000411,0
2402197,W02000423,W02000412,0
2402198,W02000423,W02000415,0
2402199,W02000423,W02000422,1


In [8]:
# left-join the flow table onto the travel times. A row matches when the origin MSOA
# equals the area of residence AND the destination MSOA equals the area of workplace;
# travel-time rows with no matching flow row are kept, with NaN in the flow columns
flow_distance = travel_rail.merge(
    flow,
    how='left',
    left_on=['origin_msoacode', 'destination_msoacode'],
    right_on=['Area of residence', 'Area of workplace'],
)
flow_distance.head(5)

Unnamed: 0,origin_msoacode,destination_msoacode,time_rail_min,Area of residence,Area of workplace,"Underground, metro, light rail, tram"
0,E02000001,E02000001,1.274559,E02000001,E02000001,73.0
1,E02000001,E02000002,20.273945,,,
2,E02000001,E02000003,18.789007,,,
3,E02000001,E02000004,21.915258,,,
4,E02000001,E02000005,18.904354,,,


In [9]:
#replace all NaN values in the flow column with 0, so origin/destination pairs with
# no matching flow row get a weight of zero
# The result is assigned back to the column: calling fillna(inplace=True) on a selected
# column is a chained in-place update, which is deprecated and has no effect on the
# frame under pandas Copy-on-Write
flow_distance['Underground, metro, light rail, tram'] = (
    flow_distance['Underground, metro, light rail, tram'].fillna(0)
)
# check
flow_distance.head(5)

Unnamed: 0,origin_msoacode,destination_msoacode,time_rail_min,Area of residence,Area of workplace,"Underground, metro, light rail, tram"
0,E02000001,E02000001,1.274559,E02000001,E02000001,73.0
1,E02000001,E02000002,20.273945,,,0.0
2,E02000001,E02000003,18.789007,,,0.0
3,E02000001,E02000004,21.915258,,,0.0
4,E02000001,E02000005,18.904354,,,0.0


In [10]:
# add the flow-weighted travel time: minutes for the pair multiplied by the flow count
flow_distance['cumulative_time_rail'] = flow_distance['time_rail_min'].mul(
    flow_distance['Underground, metro, light rail, tram']
)
#check
flow_distance.head(5)

Unnamed: 0,origin_msoacode,destination_msoacode,time_rail_min,Area of residence,Area of workplace,"Underground, metro, light rail, tram",cumulative_time_rail
0,E02000001,E02000001,1.274559,E02000001,E02000001,73.0,93.042836
1,E02000001,E02000002,20.273945,,,0.0,0.0
2,E02000001,E02000003,18.789007,,,0.0,0.0
3,E02000001,E02000004,21.915258,,,0.0,0.0
4,E02000001,E02000005,18.904354,,,0.0,0.0


In [11]:
# group by origin MSOA and sum the numeric columns only
# The columns are listed explicitly: the frame also carries string code columns, and
# pandas >= 2.0 no longer drops those automatically from sum() (it would concatenate them)
avg_rail_weighted = flow_distance.groupby("origin_msoacode")[
    ['time_rail_min', 'Underground, metro, light rail, tram', 'cumulative_time_rail']
].sum()
avg_rail_weighted.head(5)

Unnamed: 0_level_0,time_rail_min,"Underground, metro, light rail, tram",cumulative_time_rail
origin_msoacode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
E02000001,770460.884583,796.0,10536.165719
E02000002,892861.175968,321.0,8910.798812
E02000003,882257.478564,494.0,13681.823887
E02000004,897362.460281,244.0,6169.729638
E02000005,881331.002598,364.0,9579.924252


In [12]:
# weighted average rail time per origin = total flow-weighted minutes / total flow
# NOTE(review): an origin whose total flow is zero would give 0/0 here (NaN) - confirm
# how such origins should be handled downstream
avg_rail_weighted['avg_time_rail'] = avg_rail_weighted['cumulative_time_rail'].div(
    avg_rail_weighted['Underground, metro, light rail, tram']
)
avg_rail_weighted.head(5)

Unnamed: 0_level_0,time_rail_min,"Underground, metro, light rail, tram",cumulative_time_rail,avg_time_rail
origin_msoacode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
E02000001,770460.884583,796.0,10536.165719,13.236389
E02000002,892861.175968,321.0,8910.798812,27.759498
E02000003,882257.478564,494.0,13681.823887,27.696
E02000004,897362.460281,244.0,6169.729638,25.285777
E02000005,881331.002598,364.0,9579.924252,26.318473


In [13]:
#combine the unweighted averages with the weighted table, matching on origin_msoacode
# and keeping every origin present in the unweighted table (left join)
rail = avg_rail.merge(
    avg_rail_weighted,
    how='left',
    left_on=['origin_msoacode'],
    right_on=['origin_msoacode'],
)
rail

Unnamed: 0_level_0,avg_time_from_origin_rail_UNWEIGHTED,time_rail_min,"Underground, metro, light rail, tram",cumulative_time_rail,avg_time_rail
origin_msoacode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E02000001,106.993596,7.704609e+05,796.0,10536.165719,13.236389
E02000002,123.991276,8.928612e+05,321.0,8910.798812,27.759498
E02000003,122.518744,8.822575e+05,494.0,13681.823887,27.696000
E02000004,124.616367,8.973625e+05,244.0,6169.729638,25.285777
E02000005,122.390085,8.813310e+05,364.0,9579.924252,26.318473
...,...,...,...,...,...
W02000419,184.373645,1.327675e+06,1.0,186.314590,186.314590
W02000420,172.831765,1.244562e+06,7.0,925.495322,132.213617
W02000421,281.459917,2.026793e+06,2.0,441.514720,220.757360
W02000422,191.265110,1.377300e+06,3.0,474.398830,158.132943


In [14]:
# remove time_rail_min: after the grouped sum it is just the total of the times from
# each MSOA to all other MSOAs, which is not a quantity we want to keep
rail = rail.drop(columns=['time_rail_min'])
rail

Unnamed: 0_level_0,avg_time_from_origin_rail_UNWEIGHTED,"Underground, metro, light rail, tram",cumulative_time_rail,avg_time_rail
origin_msoacode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
E02000001,106.993596,796.0,10536.165719,13.236389
E02000002,123.991276,321.0,8910.798812,27.759498
E02000003,122.518744,494.0,13681.823887,27.696000
E02000004,124.616367,244.0,6169.729638,25.285777
E02000005,122.390085,364.0,9579.924252,26.318473
...,...,...,...,...
W02000419,184.373645,1.0,186.314590,186.314590
W02000420,172.831765,7.0,925.495322,132.213617
W02000421,281.459917,2.0,441.514720,220.757360
W02000422,191.265110,3.0,474.398830,158.132943


In [15]:
#write the combined rail table to CSV (the origin_msoacode index is written as well)
rail.to_csv(path_or_buf='Data/travel_time_rail.csv')