In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import geopandas as gpd
import camelot

In [None]:
import os
import fiona
# import camelot

In [None]:
from urllib.request import urlretrieve
from zipfile import ZipFile
from beis_indicators import project_dir

In [None]:
from beis_indicators.geo.coders import NutsCoder, LepCoder
from beis_indicators.geo.nuts import auto_nuts2_uk
from beis_indicators.indicators import points_to_indicator, save_indicator

In [None]:
# Load the LSOA boundary shapefiles: the December 2001 layer and the
# December 2011 layer (vintages as given in the file names).
lsoa_shp_01 = gpd.read_file('../../data/raw/travel/Lower_Layer_Super_Output_Areas__December_2001__EW_BGC-shp/Lower_Layer_Super_Output_Areas__December_2001__EW_BGC.shp')
lsoa_shp_11 = gpd.read_file('../../data/raw/travel/Lower_Layer_Super_Output_Areas__December_2011__Boundaries_EW_BGC_v3-shp/Lower_Layer_Super_Output_Areas__December_2011__Boundaries_EW_BGC_v3.shp')

In [None]:
# Reproject both LSOA boundary layers to longitude/latitude (EPSG:4326)
lsoa_shp_geo_01, lsoa_shp_geo_11 = (
    boundaries.to_crs(epsg=4326) for boundaries in (lsoa_shp_01, lsoa_shp_11)
)

In [None]:
lsoa_shp_geo_11.head(2)

In [None]:
# Represent each LSOA by the centroid of its polygon. The frames are already
# in EPSG:4326, so 'lon' and 'lat' are in degrees.
# The centroid is computed once per frame (vectorised) and both coordinates
# are read from it, instead of recomputing it per row for each column.
# NOTE(review): a centroid taken on lon/lat coordinates is a planar
# approximation, and GeoPandas may emit a warning about it. If that matters,
# take the centroid in the source projected CRS and reproject the points.
centroids_01 = lsoa_shp_geo_01.geometry.centroid
lsoa_shp_geo_01['lon'] = centroids_01.x
lsoa_shp_geo_01['lat'] = centroids_01.y

centroids_11 = lsoa_shp_geo_11.geometry.centroid
lsoa_shp_geo_11['lon'] = centroids_11.x
lsoa_shp_geo_11['lat'] = centroids_11.y

In [None]:
# Give both boundary vintages the same key column name ('LSOA_code') so they
# can be joined to the travel-time tables on it.
lsoa_shp_geo_re_01 = lsoa_shp_geo_01.rename({'LSOA01CD': 'LSOA_code'}, axis='columns')
lsoa_shp_geo_re_11 = lsoa_shp_geo_11.rename({'LSOA11CD': 'LSOA_code'}, axis='columns')

The 2011 travel-time data (England and Wales) is keyed on 2001 LSOA boundaries. <br />
The 2013 travel-time data (England only) is keyed on 2011 LSOA boundaries.

- Read more here: http://data.dft.gov.uk.s3.amazonaws.com/connectivity-data/Brief-guide-to-connectivity-travel-time-data_v2.pdf

## Road Junctions

Method: Averaging time over the nearest 5 road junctions 

Time of day: AM

Mode of transport: Car

In [None]:
# 2011 release of the road-junction travel-time table
df_road = pd.read_csv('../../data/raw/travel/Road-junctions-travel-times/Junctions_HW_AM.csv')
# 2013 release of the same table
df_road_13 = pd.read_csv('../../data/raw/travel/2013_Junctions_HW_AM.csv')

In [None]:
print(len(df_road))
print(len(df_road_13))

In [None]:
df_road['NearOrder']

In [None]:
# Keep rows whose NearOrder is at most 4.
# NOTE(review): presumably NearOrder is a zero-based rank of destinations by
# proximity, making this the "nearest 5 road junctions" named in the section
# heading - confirm against the DfT guide.
df_road_filter = df_road[df_road['NearOrder'] <= 4]
df_road_filter_13 = df_road_13[df_road_13['NearOrder'] <= 4]

In [None]:
# Row counts left after the NearOrder filter. Both are printed explicitly:
# a cell only displays its final expression, so a bare first len() is lost.
print(len(df_road_filter))
print(len(df_road_filter_13))

In [None]:
# Renumber the filtered rows from 0, discarding the pre-filter index.
df_road_filter = df_road_filter.reset_index(drop=True)
df_road_filter_13 = df_road_filter_13.reset_index(drop=True)

In [None]:
# Attach LSOA centroid coordinates: the 2011 release joins to the 2001
# boundaries, the 2013 release to the 2011 boundaries. A left join keeps
# every travel-time row even if its LSOA code has no boundary match.
df_road_lat_lon = df_road_filter.merge(lsoa_shp_geo_re_01, on='LSOA_code', how='left')
df_road_lat_lon_13 = df_road_filter_13.merge(lsoa_shp_geo_re_11, on='LSOA_code', how='left')

In [None]:
# Trim both releases to the same seven columns. The 2013 file spells the id
# column 'UID', so it is renamed to 'uid' to line up with the 2011 release.
df_road_lat_lon = df_road_lat_lon[
    ['LSOA_code', 'RepTime', 'Percentage Services', 'uid', 'NearOrder', 'lon', 'lat']
]
df_road_lat_lon_13 = df_road_lat_lon_13[
    ['LSOA_code', 'RepTime', 'Percentage Services', 'UID', 'NearOrder', 'lon', 'lat']
].rename(columns={'UID': 'uid'})

In [None]:
# Tag each release with its year before the two are stacked. assign() returns
# a new frame, so no column is written into a column-subset selection, which
# can raise SettingWithCopyWarning.
df_road_lat_lon = df_road_lat_lon.assign(year=2011)
df_road_lat_lon_13 = df_road_lat_lon_13.assign(year=2013)

In [None]:
# Stack the 2011 and 2013 releases into one table with a fresh 0..n-1 index.
df_road_latlon_11_13 = pd.concat(
    [df_road_lat_lon, df_road_lat_lon_13],
    ignore_index=True,
)

### NUTS 2

In [None]:
# Aggregate the LSOA-level RepTime values to NUTS 2 regions using the mean.
# NOTE(review): points_to_indicator is a project helper; presumably it assigns
# each lon/lat point to a region via the coder and groups by region and year -
# confirm in beis_indicators.indicators.
df_road_nuts2 = points_to_indicator(df_road_latlon_11_13, value_col='RepTime', coder=NutsCoder(level=2),
                    aggfunc=np.mean, value_rename='RepTime',
                    projection='EPSG:4326', x_col='lon', y_col='lat')

In [None]:
df_road_nuts2.rename(columns={"RepTime": "travel_time_to_road_junctions"}, inplace=True)

In [None]:
df_road_nuts2 = df_road_nuts2.sort_values(by='nuts_id')

In [None]:
df_road_nuts2.to_csv('../../data/processed/travel/travel_time_to_road_junctions.nuts2.csv', index=False)

### NUTS 3

In [None]:
df_road_nuts3 = points_to_indicator(df_road_latlon_11_13, value_col='RepTime', coder=NutsCoder(level=3),
                    aggfunc=np.mean, value_rename='RepTime',
                    projection='EPSG:4326', x_col='lon', y_col='lat')

In [None]:
df_road_nuts3.rename(columns={"RepTime": "travel_time_to_road_junctions"}, inplace=True)

In [None]:
df_road_nuts3 = df_road_nuts3.sort_values(by='nuts_id')

In [None]:
df_road_nuts3.to_csv('../../data/processed/travel/travel_time_to_road_junctions.nuts3.csv', index=False)

### LEP

In [None]:
df_road_lep = points_to_indicator(df_road_latlon_11_13, value_col='RepTime', coder=LepCoder(),
                    aggfunc=np.mean, value_rename='RepTime',
                    projection='EPSG:4326', x_col='lon', y_col='lat')

In [None]:
df_road_lep.rename(columns={"RepTime": "travel_time_to_road_junctions"}, inplace=True)

In [None]:
df_road_lep = df_road_lep.sort_values(by='lep_id')

In [None]:
df_road_lep.to_csv('../../data/processed/travel/travel_time_to_road_junctions.lep.csv', index=False)

## Airport

Method: Closest airport 

Time of Day: AM

Mode of transport: Car

In [None]:
df_air_11 = pd.read_csv('../../data/raw/travel/Airports-travel-times/Airports_HW_AM.csv')
df_air_13 = pd.read_csv('../../data/raw/travel/2013_Airports_HW_AM.csv')

In [None]:
# Keep rows whose NearOrder is at most 0.
# NOTE(review): presumably NearOrder 0 marks the closest airport, matching
# the "Closest airport" method in the section heading - confirm.
df_air_filter_11 = df_air_11[df_air_11['NearOrder'] <= 0]
df_air_filter_13 = df_air_13[df_air_13['NearOrder'] <= 0]

In [None]:
print(len(df_air_filter_11))
print(len(df_air_filter_13))

In [None]:
df_air_filter_11.reset_index(inplace=True, drop=True)
df_air_filter_13.reset_index(inplace=True, drop=True)

In [None]:
df_air_lat_lon = df_air_filter_11.merge(lsoa_shp_geo_re_01, on='LSOA_code', how='left')
df_air_lat_lon_13 = df_air_filter_13.merge(lsoa_shp_geo_re_11, on='LSOA_code', how='left')

In [None]:
# Trim both releases to the same seven columns. The 2013 file spells the id
# column 'UID', so it is renamed to 'uid' to line up with the 2011 release.
df_air_lat_lon = df_air_lat_lon[
    ['LSOA_code', 'RepTime', 'Percentage Services', 'uid', 'NearOrder', 'lon', 'lat']
]
df_air_lat_lon_13 = df_air_lat_lon_13[
    ['LSOA_code', 'RepTime', 'Percentage Services', 'UID', 'NearOrder', 'lon', 'lat']
].rename(columns={'UID': 'uid'})

In [None]:
# Tag each release with its year before the two are stacked. assign() returns
# a new frame, so no column is written into a column-subset selection, which
# can raise SettingWithCopyWarning.
df_air_lat_lon = df_air_lat_lon.assign(year=2011)
df_air_lat_lon_13 = df_air_lat_lon_13.assign(year=2013)

In [None]:
df_air_latlon_11_13 = pd.concat([df_air_lat_lon, df_air_lat_lon_13]).reset_index(drop=True)

### NUTS 2

In [None]:
df_air_nuts2 = points_to_indicator(df_air_latlon_11_13, value_col='RepTime', coder=NutsCoder(level=2),
                    aggfunc=np.mean, value_rename='RepTime',
                    projection='EPSG:4326', x_col='lon', y_col='lat')

In [None]:
df_air_nuts2.rename(columns={"RepTime": "travel_time_to_airport"}, inplace=True)

In [None]:
df_air_nuts2 = df_air_nuts2.sort_values(by='nuts_id')

In [None]:
df_air_nuts2.to_csv('../../data/processed/travel/travel_time_to_airport.nuts2.csv', index=False)

### NUTS 3

In [None]:
df_air_nuts3 = points_to_indicator(df_air_latlon_11_13, value_col='RepTime', coder=NutsCoder(level=3),
                    aggfunc=np.mean, value_rename='RepTime',
                    projection='EPSG:4326', x_col='lon', y_col='lat')

In [None]:
df_air_nuts3.rename(columns={"RepTime": "travel_time_to_airport"}, inplace=True)

In [None]:
df_air_nuts3 = df_air_nuts3.sort_values(by='nuts_id')

In [None]:
df_air_nuts3.to_csv('../../data/processed/travel/travel_time_to_airport.nuts3.csv', index=False)

### LEP

In [None]:
df_air_lep = points_to_indicator(df_air_latlon_11_13, value_col='RepTime', coder=LepCoder(),
                    aggfunc=np.mean, value_rename='RepTime',
                    projection='EPSG:4326', x_col='lon', y_col='lat')

In [None]:
df_air_lep.rename(columns={"RepTime": "travel_time_to_airport"}, inplace=True)

In [None]:
df_air_lep = df_air_lep.sort_values(by='lep_id')

In [None]:
df_air_lep.to_csv('../../data/processed/travel/travel_time_to_airport.lep.csv', index=False)

## Rail Stations

Method: Closest rail station 

Time of Day: AM

Mode of transport: Car

In [None]:
df_rail_11 = pd.read_csv('../../data/raw/travel/Rail-stations-travel-times/Stations_HW_AM.csv')
df_rail_13 = pd.read_csv('../../data/raw/travel/2013_Stations_HW_AM.csv')

In [None]:
# Keep rows whose NearOrder is at most 0.
# NOTE(review): presumably NearOrder 0 marks the closest rail station,
# matching the "Closest rail station" method in the section heading - confirm.
df_rail_filter_11 = df_rail_11[df_rail_11['NearOrder'] <= 0]
df_rail_filter_13 = df_rail_13[df_rail_13['NearOrder'] <= 0]

In [None]:
print(len(df_rail_filter_11))
print(len(df_rail_filter_13))

In [None]:
df_rail_filter_11.reset_index(inplace=True, drop=True)
df_rail_filter_13.reset_index(inplace=True, drop=True)

In [None]:
df_rail_lat_lon = df_rail_filter_11.merge(lsoa_shp_geo_re_01, on='LSOA_code', how='left')
df_rail_lat_lon_13 = df_rail_filter_13.merge(lsoa_shp_geo_re_11, on='LSOA_code', how='left')

In [None]:
# Trim both releases to the same seven columns. The 2013 file spells the id
# column 'UID', so it is renamed to 'uid' to line up with the 2011 release.
df_rail_lat_lon = df_rail_lat_lon[
    ['LSOA_code', 'RepTime', 'Percentage Services', 'uid', 'NearOrder', 'lon', 'lat']
]
df_rail_lat_lon_13 = df_rail_lat_lon_13[
    ['LSOA_code', 'RepTime', 'Percentage Services', 'UID', 'NearOrder', 'lon', 'lat']
].rename(columns={'UID': 'uid'})

In [None]:
# Tag each release with its year before the two are stacked. assign() returns
# a new frame, so no column is written into a column-subset selection, which
# can raise SettingWithCopyWarning.
df_rail_lat_lon = df_rail_lat_lon.assign(year=2011)
df_rail_lat_lon_13 = df_rail_lat_lon_13.assign(year=2013)

In [None]:
df_rail_latlon_11_13 = pd.concat([df_rail_lat_lon, df_rail_lat_lon_13]).reset_index(drop=True)

### NUTS 2

In [None]:
df_rail_nuts2 = points_to_indicator(df_rail_latlon_11_13, value_col='RepTime', coder=NutsCoder(level=2),
                    aggfunc=np.mean, value_rename='RepTime',
                    projection='EPSG:4326', x_col='lon', y_col='lat')

In [None]:
df_rail_nuts2.rename(columns={"RepTime": "travel_time_to_rail"}, inplace=True)

In [None]:
df_rail_nuts2 = df_rail_nuts2.sort_values(by='nuts_id')

In [None]:
df_rail_nuts2.to_csv('../../data/processed/travel/travel_time_to_rail.nuts2.csv', index=False)

### NUTS 3

In [None]:
df_rail_nuts3 = points_to_indicator(df_rail_latlon_11_13, value_col='RepTime', coder=NutsCoder(level=3),
                    aggfunc=np.mean, value_rename='RepTime',
                    projection='EPSG:4326', x_col='lon', y_col='lat')

In [None]:
df_rail_nuts3.rename(columns={"RepTime": "travel_time_to_rail"}, inplace=True)

In [None]:
df_rail_nuts3 = df_rail_nuts3.sort_values(by='nuts_id')

In [None]:
df_rail_nuts3.to_csv('../../data/processed/travel/travel_time_to_rail.nuts3.csv', index=False)

### LEP

In [None]:
# Aggregate rail-station travel times to LEP level using the mean.
# The input must be the rail table (df_rail_latlon_11_13), not the airport
# one: this result is renamed to 'travel_time_to_rail' and written to
# travel_time_to_rail.lep.csv.
df_rail_lep = points_to_indicator(df_rail_latlon_11_13, value_col='RepTime', coder=LepCoder(),
                    aggfunc=np.mean, value_rename='RepTime',
                    projection='EPSG:4326', x_col='lon', y_col='lat')

In [None]:
df_rail_lep.rename(columns={"RepTime": "travel_time_to_rail"}, inplace=True)

In [None]:
df_rail_lep = df_rail_lep.sort_values(by='lep_id')

In [None]:
df_rail_lep.to_csv('../../data/processed/travel/travel_time_to_rail.lep.csv', index=False)

## Travel to Work

In [None]:
xl_16 = pd.ExcelFile('../../data/raw/travel/travel_to_work_2016.xls')
xl_17 = pd.ExcelFile('../../data/raw/travel/travel_to_work_2017.xls')
xl_18 = pd.ExcelFile('../../data/raw/travel/travel_to_work_2018.xls')

In [None]:
xl_16.sheet_names

In [None]:
df_16 = xl_16.parse('OD16').drop('Office For National Statistics', axis=1)
df_17 = xl_17.parse('OD17').drop('Office For National Statistics', axis=1)
df_18 = xl_18.parse('OD18').drop('Office For National Statistics', axis=1)

In [None]:
df_16.columns = ['UALADGB UA / LAD of residence', 'Mean']
df_17.columns = ['UALADGB UA / LAD of residence', 'Mean']
df_18.columns = ['UALADGB UA / LAD of residence', 'Mean']

In [None]:
# Keep positional rows 9-418 of each sheet and renumber them from 0.
# NOTE(review): the bounds are hard-coded and identical for all three years;
# presumably they strip the header/footer rows surrounding the LAD table in
# the workbook - confirm they hold for each file.
df_16 = df_16[9:419].reset_index(drop=True)
df_17 = df_17[9:419].reset_index(drop=True)
df_18 = df_18[9:419].reset_index(drop=True)

In [None]:
'47UD Redditch'.split()[1]

In [None]:
# Each label looks like "<code> <district name>" (e.g. '47UD Redditch').
# Split on the first space only, so multi-word district names stay intact:
# the part before it becomes 'Code', the remainder becomes 'LAD'.
for lad_df in (df_16, df_17, df_18):
    label = lad_df['UALADGB UA / LAD of residence']
    lad_df['Code'] = label.apply(lambda text: text.strip().split(" ", 1)[0])
    lad_df['LAD'] = label.apply(lambda text: text.strip().split(" ", 1)[1])

In [None]:
del df_16['UALADGB UA / LAD of residence']
del df_17['UALADGB UA / LAD of residence']
del df_18['UALADGB UA / LAD of residence']

In [None]:
# df = df[['LAD', 'Mean']]

In [None]:
len(df_18)

In [None]:
# Prefix codes of at most two characters with '00'; longer codes are left
# unchanged.
for lad_df in (df_16, df_17, df_18):
    lad_df['Code'] = lad_df['Code'].apply(
        lambda code: code if len(code) > 2 else '00' + code
    )

In [None]:
equivs = pd.read_csv('../../data/aux/equivalents_regions.csv',encoding='cp1252')

In [None]:
equivs.head(2)

In [None]:
len(df_16['Code'].tolist())

In [None]:
# Look up the current GEOGCD for every old-style code (GEOGCDO) that appears
# in any of the three years. The lookup is merged into all of df_16, df_17 and
# df_18, so it is filtered on the union of their codes rather than on the
# 2016 codes alone; a code present only in 2017 or 2018 would otherwise be
# left without a GEOGCD.
lad_codes = set(df_16['Code']) | set(df_17['Code']) | set(df_18['Code'])
equiv_df = equivs[equivs.GEOGCDO.isin(lad_codes)][['GEOGCD','GEOGCDO', 'STATUS']]

In [None]:
equiv_df

In [None]:
equiv_df.columns = ['GEOGCD', 'Code', 'Status']

In [None]:
df_16 = df_16.merge(equiv_df, on='Code', how='left')
df_17 = df_17.merge(equiv_df, on='Code', how='left')
df_18 = df_18.merge(equiv_df, on='Code', how='left')

In [None]:
# The lookup merge can yield several rows per 'Code'; keep the first of each.
df_16 = df_16.drop_duplicates(subset='Code')
df_17 = df_17.drop_duplicates(subset='Code')
df_18 = df_18.drop_duplicates(subset='Code')

In [None]:
df_16.reset_index(drop=True,inplace=True)
df_17.reset_index(drop=True,inplace=True)
df_18.reset_index(drop=True,inplace=True)

Reading in PDF table to get updated codes

In [None]:
file = "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/migrationwithintheuk/methodologies/interalmigrationmethodology/internalmigrationmethodology2016.pdf"

In [None]:
# Extract the tables found from page 15 to the end of the PDF.
# `file` is a URL, so running this cell requires network access to fetch it.
tables = camelot.read_pdf(file, pages = "15-end")

In [None]:
tables

In [None]:
# Stack the first two extracted tables and drop the first row of the combined
# frame (presumably the header row of tables[0] - confirm). Note that the
# first row of tables[1] is kept.
changes_1 = pd.concat([tables[0].df,tables[1].df]).iloc[1:] #,tables[2].df,tables[3].df

In [None]:
# Remove the line breaks embedded in the cell text of the first four columns.
for col in (0, 1, 2, 3):
    changes_1[col] = changes_1[col].apply(lambda cell: cell.replace('\n', ''))

In [None]:
# Stack tables 2 and 3 and drop every row whose index label is 0. concat keeps
# each table's own index, so (assuming each table has a default 0..n index)
# this removes the first row of both tables - presumably their header rows.
changes_2 = pd.concat([tables[2].df,tables[3].df]).drop([0,0])

In [None]:
# Remove the line breaks embedded in the cell text of the first three columns.
for col in (0, 1, 2):
    changes_2[col] = changes_2[col].apply(lambda cell: cell.replace('\n', ''))

In [None]:
convert_dict = dict(zip(changes_1[2], changes_1[3]))

In [None]:
convert_dict.update(dict(zip(changes_2[1], changes_2[2])))

In [None]:
# import json

# json.dump(convert_dict, open("../../data/aux/terminated_to_active_lad_lookup.json", "w"))

In [None]:
df_16

Update old codes to new

In [None]:
df_16['GEOGCD'] = df_16['GEOGCD'].replace(convert_dict)
df_17['GEOGCD'] = df_17['GEOGCD'].replace(convert_dict)
df_18['GEOGCD'] = df_18['GEOGCD'].replace(convert_dict)

In [None]:
df_16[df_16['LAD']=='Kennet']

In [None]:
# Drop rows with no mean travel time and make the column numeric.
# dropna()/astype() each return a new frame, so nothing is assigned into a
# boolean-filtered selection, which raises SettingWithCopyWarning.
df_16 = df_16.dropna(subset=['Mean']).astype({'Mean': float})

df_17 = df_17.dropna(subset=['Mean']).astype({'Mean': float})

df_18 = df_18.dropna(subset=['Mean']).astype({'Mean': float})

In [None]:
# After the code update several rows can share one GEOGCD, so average 'Mean'
# per code. 'Mean' is selected explicitly because the frames still carry text
# columns ('Code', 'LAD', 'Status'): pandas >= 2.0 raises a TypeError when
# asked to average those, where older versions silently dropped them.
# The result has exactly the columns 'GEOGCD' and 'Mean'. Rows whose GEOGCD is
# missing are excluded by groupby.
df_16 = df_16.groupby('GEOGCD', as_index=False)['Mean'].mean()
df_17 = df_17.groupby('GEOGCD', as_index=False)['Mean'].mean()
df_18 = df_18.groupby('GEOGCD', as_index=False)['Mean'].mean()

In [None]:
df_16['year'] = 2016
df_17['year'] = 2017
df_18['year'] = 2018

In [None]:
len(df_18)

In [None]:
# Load the Local Authority District (December 2016) table. Its 'long'/'lat'
# columns are used below as the point location of each district.

lad_lat_lon = pd.read_csv('../../data/raw/travel/Local_Authority_Districts__December_2016__Boundaries_UK.csv')

In [None]:
lad_lat_lon.rename(columns={'lad16cd': 'GEOGCD'}, inplace = True)

In [None]:
df_16_geo = df_16.merge(lad_lat_lon, on='GEOGCD', how='left').drop_duplicates(subset='GEOGCD').reset_index(drop=True)
df_17_geo = df_17.merge(lad_lat_lon, on='GEOGCD', how='left').drop_duplicates(subset='GEOGCD').reset_index(drop=True)
df_18_geo = df_18.merge(lad_lat_lon, on='GEOGCD', how='left').drop_duplicates(subset='GEOGCD').reset_index(drop=True)

In [None]:
df_16_geo = df_16_geo[['Mean', 'year', 'long', 'lat']]
df_17_geo = df_17_geo[['Mean', 'year', 'long', 'lat']]
df_18_geo = df_18_geo[['Mean', 'year', 'long', 'lat']]

In [None]:
df_geo = pd.concat([df_16_geo, df_17_geo, df_18_geo])
# df_lep.rename(columns = {'Mean': 'average_travel_to_work_times'}, inplace=True)
df_geo.columns

In [None]:
df_geo

### NUTS 2

In [None]:
df_nuts2 = points_to_indicator(df_geo, value_col='Mean', coder=NutsCoder(level=2),
                    aggfunc=np.mean, value_rename='Mean',
                    projection='EPSG:4326', x_col='long', y_col='lat')

In [None]:
df_nuts2 = df_nuts2.rename(columns = {'Mean': 'travel_time_to_work'}).sort_values(['nuts_id', 'year']).reset_index(drop=True)

In [None]:
save_indicator(df_nuts2, 'travel', 'nuts2')

### NUTS 3

In [None]:
df_nuts3 = points_to_indicator(df_geo, value_col='Mean', coder=NutsCoder(level=3),
                    aggfunc=np.mean, value_rename='Mean',
                    projection='EPSG:4326', x_col='long', y_col='lat')

In [None]:
df_nuts3 = df_nuts3.rename(columns = {'Mean': 'travel_time_to_work'}).sort_values(['nuts_id', 'year']).reset_index(drop=True)

In [None]:
save_indicator(df_nuts3, 'travel', 'nuts3')

### LEP

In [None]:
df_lep_final = points_to_indicator(df_geo, value_col='Mean', coder=LepCoder(),
                    aggfunc=np.mean, value_rename='Mean',
                    projection='EPSG:4326', x_col='long', y_col='lat')
# centroid coords used - may fall in overlapping regions (chance of over/underrep)

In [None]:
df_lep_final = df_lep_final.rename(columns = {'Mean': 'travel_time_to_work'}).sort_values(['lep_id', 'year']).reset_index(drop=True)

In [None]:
# df_lep_final['travel_to_work_times_average'] = df_lep_final['travel_mean_time_to_work'].round(2)


In [None]:
save_indicator(df_lep_final, 'travel', 'lep')