# Preamble

In [None]:
import numpy as np
import pandas as pd
import glob
import ast
import geopandas as gpd

import beis_indicators.utils.nuts_utils

# Data Collection

"Postcode to coordinates" data

In [None]:
# Collecting multiple postcode to lat/lon datasets to lessen the chance of missing data.
# Each file is loaded into its own DataFrame; the columns needed from each one are
# picked out in the "Mapping coordinates to postcodes" section.
postcode_lat_lon_1 = pd.read_csv('../../data/raw/ukpostcodes.csv')
# This file is read with header=None, so its columns are labelled 0, 1, 2, ... by position.
postcode_lat_lon_2 = pd.read_csv('../../data/raw/open_postcode_geo.csv', header=None)
postcode_lat_lon = pd.read_csv('../../data/raw/postcodes/postcodes.csv')

Raw broadband speed data

In [None]:
raw_2014 = pd.read_csv('../../data/raw/Fixed_postcode_2014/fixed_postcode_2014.csv')

In [None]:
raw_2015 = pd.read_csv('../../data/raw/Fixed_Postcode_2015/Fixed_Postcode_updated_01022016.csv')

In [None]:
files = glob.glob("../../data/raw/2016_fixed_pc_r01/*.csv")

dfs = [pd.read_csv(f) for f in files]
# dfs
raw_2016 = pd.concat(dfs,ignore_index=True)

In [None]:
files = glob.glob("../../data/raw/fixed-postcode-2017/*.csv")

dfs = [pd.read_csv(f) for f in files]
raw_2017 = pd.concat(dfs, ignore_index=True)

In [None]:
raw_2018 = pd.read_csv('../../data/raw/2018_fixed_pc_r03/201805_fixed_pc_performance_r03.csv')

In [None]:
files = glob.glob('../../data/raw/connected-nations-2019-fixed-postcode-data/201905_fixed_pc_performance/*.csv')

dfs = [pd.read_csv(f) for f in files]
raw_2019 = pd.concat(dfs, ignore_index=True)

---

# Formatting speed values

In [None]:
# The speed column may hold the string '<4'; it is replaced by 4 so that the whole
# column can be converted to float.
# NOTE(review): this treats '<4' as exactly 4 Mbit/s — presumably an upper bound for the
# real value; confirm this is the intended convention.
for frame in (raw_2014, raw_2015):
    speed = frame['Average download speed (Mbit/s) by PC']
    speed = speed.mask(speed == '<4', 4)
    # pd.to_numeric parses numeric strings and leaves existing numbers untouched; it raises
    # on any other unparsable text, as the per-element float() call did.
    frame['Average download speed (Mbit/s) by PC'] = pd.to_numeric(speed).astype(float)

In [None]:
raw_2016['Average download speed (Mbit/s)'] = raw_2016['Average download speed (Mbit/s)'].apply(lambda x: float(x))

In [None]:
raw_2017['Average download speed (Mbit/s)'] = raw_2017['Average download speed (Mbit/s)'].apply(lambda x: float(x))

In [None]:
raw_2018['Average download speed (Mbit/s)'] = raw_2018['Average download speed (Mbit/s)'].apply(lambda x: float(x))

In [None]:
raw_2019['Average download speed (Mbit/s)'] = raw_2019['Average download speed (Mbit/s)'].apply(lambda x: float(x))

# Mapping coordinates to postcodes

In [None]:
# Reduce every lookup table to the same three columns: postcode, latitude, longitude.
# `.copy()` makes each result an independent frame, so the in-place clean-up below does
# not write to a slice of the source frame (which raises SettingWithCopyWarning).
postcode_to_latlon_1 = postcode_lat_lon_1[['postcode', 'latitude', 'longitude']].copy()

# This table was read with header=None; positions 0, 7 and 8 are used as
# postcode, latitude and longitude.
postcode_to_latlon_2 = postcode_lat_lon_2[[0, 7, 8]].copy()
postcode_to_latlon_2.columns = ['postcode', 'latitude', 'longitude']

postcode_to_latlon = postcode_lat_lon[['Postcode', 'Latitude', 'Longitude']].copy()
postcode_to_latlon.columns = ['postcode', 'latitude', 'longitude']

# Remove the spaces inside postcodes (presumably to match the format used in the
# broadband files — TODO confirm). The vectorised literal replace keeps missing
# postcodes as NaN instead of raising on them.
for lookup in (postcode_to_latlon_1, postcode_to_latlon_2, postcode_to_latlon):
    lookup['postcode'] = lookup['postcode'].str.replace(" ", "", regex=False)

In [None]:
postcode_final = pd.concat([postcode_to_latlon_1, postcode_to_latlon_2, postcode_to_latlon], ignore_index=True).drop_duplicates(subset='postcode')

In [None]:
postcode_final = postcode_final.reset_index(drop=True)

In [None]:
# removal of invalid postcodes found in 2014
x = raw_2014['postcode'].values#
y = postcode_to_latlon['postcode'].values

In [None]:
x_= list(set(x).difference(set(y)))

In [None]:
raw_2014 = raw_2014[~raw_2014['postcode'].isin(x_)].reset_index(drop=True)

In [None]:
set(raw_2014['postcode'].values).difference(set(y))

In [None]:
# merging lat/lon coordinates to postcode
# The merge uses pandas' default inner join, so rows whose postcode is not in the
# lookup table are dropped from the yearly frames.
# NOTE(review): only postcode_to_latlon is joined here; the combined postcode_final
# table built earlier is not used — confirm this is intended.
raw_2014 = raw_2014.merge(postcode_to_latlon, on="postcode")
raw_2015 = raw_2015.merge(postcode_to_latlon, on="postcode")
raw_2016 = raw_2016.merge(postcode_to_latlon, on="postcode")
raw_2017 = raw_2017.merge(postcode_to_latlon, on="postcode")
raw_2018 = raw_2018.merge(postcode_to_latlon, on="postcode")
raw_2019 = raw_2019.merge(postcode_to_latlon, on="postcode")

In [None]:
# Keep only the columns needed downstream: postcode, average speed and coordinates.
# The speed column is named differently in the 2014/2015 files than in 2016-2019.
raw_2014_s = raw_2014.loc[:, ['postcode', 'Average download speed (Mbit/s) by PC', 'latitude', 'longitude']]
raw_2015_s = raw_2015.loc[:, ['postcode', 'Average download speed (Mbit/s) by PC', 'latitude', 'longitude']]
raw_2016_s = raw_2016.loc[:, ['postcode', 'Average download speed (Mbit/s)', 'latitude', 'longitude']]
raw_2017_s = raw_2017.loc[:, ['postcode', 'Average download speed (Mbit/s)', 'latitude', 'longitude']]
raw_2018_s = raw_2018.loc[:, ['postcode', 'Average download speed (Mbit/s)', 'latitude', 'longitude']]
raw_2019_s = raw_2019.loc[:, ['postcode', 'Average download speed (Mbit/s)', 'latitude', 'longitude']]


# Finalising Data

## NUTS 2 regions

In [None]:
#NUTS 2 level 2010 codes for 2014
poly_gdf = gpd.read_file('../../data/raw/ref-nuts-2010-01m.shp/NUTS_RG_01M_2010_4326_LEVL_2.shp/NUTS_RG_01M_2010_4326_LEVL_2.shp')
points_gdf = gpd.GeoDataFrame(raw_2014_s, geometry=gpd.points_from_xy(raw_2014_s['longitude'], raw_2014_s['latitude']))
points_in_poly_2014 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
#NUTS 2 level 2013 codes for 2015, 2016, 2017
poly_gdf = gpd.read_file('../../data/raw/ref-nuts-2013-01m.shp/NUTS_RG_01M_2013_4326_LEVL_2.shp/NUTS_RG_01M_2013_4326_LEVL_2.shp')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2015_s, geometry=gpd.points_from_xy(raw_2015_s['longitude'], raw_2015_s['latitude']))
points_in_poly_2015 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2016_s, geometry=gpd.points_from_xy(raw_2016_s['longitude'], raw_2016_s['latitude']))
points_in_poly_2016 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2017_s, geometry=gpd.points_from_xy(raw_2017_s['longitude'], raw_2017_s['latitude']))
points_in_poly_2017 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
#NUTS 2 level 2016 codes for 2018, 2019
poly_gdf = gpd.read_file('../../data/raw/ref-nuts-2016-01m.shp/NUTS_RG_01M_2016_4326_LEVL_2.shp/NUTS_RG_01M_2016_4326_LEVL_2.shp')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2018_s, geometry=gpd.points_from_xy(raw_2018_s['longitude'], raw_2018_s['latitude']))
points_in_poly_2018 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2019_s, geometry=gpd.points_from_xy(raw_2019_s['longitude'], raw_2019_s['latitude']))
points_in_poly_2019 = gpd.sjoin(points_gdf, poly_gdf, op='within')

Grouping

In [None]:
df_2014 = points_in_poly_2014[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s) by PC']]
df_2014_reset = df_2014.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2014_reset = df_2014_reset[df_2014_reset['NUTS_ID'].str.contains('UK')]
df_2014_reset['year'] = [2014]*len(df_2014_reset)
df_2014_reset['nuts_year_spec'] = [2010]*len(df_2014_reset)

df_2014_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2014_reset = df_2014_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2015 = points_in_poly_2015[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s) by PC']]
df_2015_reset = df_2015.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2015_reset = df_2015_reset[df_2015_reset['NUTS_ID'].str.contains('UK')]
df_2015_reset['year'] = [2015]*len(df_2015_reset)
df_2015_reset['nuts_year_spec'] = [2013]*len(df_2015_reset)

df_2015_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2015_reset = df_2015_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2016 = points_in_poly_2016[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s)']]
df_2016_reset = df_2016.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2016_reset = df_2016_reset[df_2016_reset['NUTS_ID'].str.contains('UK')]
df_2016_reset['year'] = [2016]*len(df_2016_reset)
df_2016_reset['nuts_year_spec'] = [2013]*len(df_2016_reset)

df_2016_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2016_reset = df_2016_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2017 = points_in_poly_2017[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s)']]
df_2017_reset = df_2017.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2017_reset = df_2017_reset[df_2017_reset['NUTS_ID'].str.contains('UK')]
df_2017_reset['year'] = [2017]*len(df_2017_reset)
df_2017_reset['nuts_year_spec'] = [2013]*len(df_2017_reset)

df_2017_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2017_reset = df_2017_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2018 = points_in_poly_2018[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s)']]
df_2018_reset = df_2018.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2018_reset = df_2018_reset[df_2018_reset['NUTS_ID'].str.contains('UK')]
df_2018_reset['year'] = [2018]*len(df_2018_reset)
df_2018_reset['nuts_year_spec'] = [2016]*len(df_2018_reset)

df_2018_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2018_reset = df_2018_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2019 = points_in_poly_2019[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s)']]
df_2019_reset = df_2019.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2019_reset = df_2019_reset[df_2019_reset['NUTS_ID'].str.contains('UK')]
df_2019_reset['year'] = [2019]*len(df_2019_reset)
df_2019_reset['nuts_year_spec'] = [2016]*len(df_2019_reset)

df_2019_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2019_reset = df_2019_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_broadband_nuts2 = pd.concat([df_2014_reset, df_2015_reset, df_2016_reset,
          df_2017_reset, df_2018_reset, df_2019_reset]).sort_values(by = ['nuts_id', 'year']).reset_index(drop=True)

In [None]:
# rounding data to 3 decimal places
df_broadband_nuts2['broadband_download_speed_data'] = df_broadband_nuts2['broadband_download_speed_data'].round(3)

In [None]:
# save df
df_broadband_nuts2.to_csv('../../data/processed/broadband/broadband_download_speed_data.nuts2.csv',index=False)

## NUTS 3 regions

In [None]:
#NUTS 3 level 2010 codes for 2014
poly_gdf = gpd.read_file('../../data/raw/ref-nuts-2010-01m.shp/NUTS_RG_01M_2010_4326_LEVL_3.shp/NUTS_RG_01M_2010_4326_LEVL_3.shp')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2014_s, geometry=gpd.points_from_xy(raw_2014_s['longitude'], raw_2014_s['latitude']))
points_in_poly_2014 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
#NUTS 3 level 2013 codes for 2015, 2016, 2017
poly_gdf = gpd.read_file('../../data/raw/ref-nuts-2013-01m.shp/NUTS_RG_01M_2013_4326_LEVL_3.shp/NUTS_RG_01M_2013_4326_LEVL_3.shp')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2015_s, geometry=gpd.points_from_xy(raw_2015_s['longitude'], raw_2015_s['latitude']))
points_in_poly_2015 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2016_s, geometry=gpd.points_from_xy(raw_2016_s['longitude'], raw_2016_s['latitude']))
points_in_poly_2016 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2017_s, geometry=gpd.points_from_xy(raw_2017_s['longitude'], raw_2017_s['latitude']))
points_in_poly_2017 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
#NUTS 3 level 2016 codes for 2018, 2019
poly_gdf = gpd.read_file('../../data/raw/ref-nuts-2016-01m.shp/NUTS_RG_01M_2016_4326_LEVL_3.shp/NUTS_RG_01M_2016_4326_LEVL_3.shp')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2018_s, geometry=gpd.points_from_xy(raw_2018_s['longitude'], raw_2018_s['latitude']))
points_in_poly_2018 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
points_gdf = gpd.GeoDataFrame(raw_2019_s, geometry=gpd.points_from_xy(raw_2019_s['longitude'], raw_2019_s['latitude']))
points_in_poly_2019 = gpd.sjoin(points_gdf, poly_gdf, op='within')

Grouping

In [None]:
df_2014 = points_in_poly_2014[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s) by PC']]
df_2014_reset = df_2014.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2014_reset = df_2014_reset[df_2014_reset['NUTS_ID'].str.contains('UK')]
df_2014_reset['year'] = [2014]*len(df_2014_reset)
df_2014_reset['nuts_year_spec'] = [2010]*len(df_2014_reset)

df_2014_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2014_reset = df_2014_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2015 = points_in_poly_2015[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s) by PC']]
df_2015_reset = df_2015.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2015_reset = df_2015_reset[df_2015_reset['NUTS_ID'].str.contains('UK')]
df_2015_reset['year'] = [2015]*len(df_2015_reset)
df_2015_reset['nuts_year_spec'] = [2013]*len(df_2015_reset)

df_2015_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2015_reset = df_2015_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2016 = points_in_poly_2016[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s)']]
df_2016_reset = df_2016.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2016_reset = df_2016_reset[df_2016_reset['NUTS_ID'].str.contains('UK')]
df_2016_reset['year'] = [2016]*len(df_2016_reset)
df_2016_reset['nuts_year_spec'] = [2013]*len(df_2016_reset)

df_2016_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2016_reset = df_2016_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2017 = points_in_poly_2017[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s)']]
df_2017_reset = df_2017.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2017_reset = df_2017_reset[df_2017_reset['NUTS_ID'].str.contains('UK')]
df_2017_reset['year'] = [2017]*len(df_2017_reset)
df_2017_reset['nuts_year_spec'] = [2013]*len(df_2017_reset)

df_2017_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2017_reset = df_2017_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2018 = points_in_poly_2018[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s)']]
df_2018_reset = df_2018.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2018_reset = df_2018_reset[df_2018_reset['NUTS_ID'].str.contains('UK')]
df_2018_reset['year'] = [2018]*len(df_2018_reset)
df_2018_reset['nuts_year_spec'] = [2016]*len(df_2018_reset)

df_2018_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2018_reset = df_2018_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2019 = points_in_poly_2019[['NUTS_ID', 'NUTS_NAME', 'Average download speed (Mbit/s)']]
df_2019_reset = df_2019.groupby(['NUTS_ID', 'NUTS_NAME']).mean().reset_index()
df_2019_reset = df_2019_reset[df_2019_reset['NUTS_ID'].str.contains('UK')]
df_2019_reset['year'] = [2019]*len(df_2019_reset)
df_2019_reset['nuts_year_spec'] = [2016]*len(df_2019_reset)

df_2019_reset.columns = ['nuts_id','NUTS_NAME','broadband_download_speed_data', 'year', 'nuts_year_spec']
df_2019_reset = df_2019_reset[['year','nuts_id', 'nuts_year_spec', 'broadband_download_speed_data']]

In [None]:
df_broadband_nuts3 = pd.concat([df_2014_reset, df_2015_reset, df_2016_reset,
          df_2017_reset, df_2018_reset, df_2019_reset]).sort_values(by = ['nuts_id', 'year']).reset_index(drop=True)

In [None]:
# rounding data to 3 decimal places
df_broadband_nuts3['broadband_download_speed_data'] = df_broadband_nuts3['broadband_download_speed_data'].round(3)

In [None]:
# save df
df_broadband_nuts3.to_csv('../../data/processed/broadband/broadband_download_speed_data.nuts3.csv',index=False)

## LEP regions

In [None]:
#Ultra generalised LEPs for (before and) 2014 onwards
url_14 = 'https://opendata.arcgis.com/datasets/17c92615a55f4dbf945e8eaf642eaa87_4.geojson'
poly_gdf = gpd.read_file(url_14)

In [None]:
%%time
points_gdf = gpd.GeoDataFrame(raw_2014_s, geometry=gpd.points_from_xy(raw_2014_s['longitude'], raw_2014_s['latitude']))
points_gdf.crs = 'epsg:4326'
points_gdf = points_gdf.to_crs("EPSG:4326")
points_in_poly_2014 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
%%time
points_gdf = gpd.GeoDataFrame(raw_2015_s, geometry=gpd.points_from_xy(raw_2015_s['longitude'], raw_2015_s['latitude']))
points_gdf.crs = 'epsg:4326'
points_gdf = points_gdf.to_crs("EPSG:4326")
points_in_poly_2015 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
%%time
points_gdf = gpd.GeoDataFrame(raw_2016_s, geometry=gpd.points_from_xy(raw_2016_s['longitude'], raw_2016_s['latitude']))
points_gdf.crs = 'epsg:4326'
points_gdf = points_gdf.to_crs("EPSG:4326")
points_in_poly_2016 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
#Ultra generalised LEPs for 2017 onwards
url_2017 = 'https://opendata.arcgis.com/datasets/20c830d052c04862aaf0c1021d6b4b25_0.geojson'
poly_gdf = gpd.read_file(url_2017)

In [None]:
%%time
points_gdf = gpd.GeoDataFrame(raw_2017_s, geometry=gpd.points_from_xy(raw_2017_s['longitude'], raw_2017_s['latitude']))
points_gdf.crs = 'epsg:4326'
points_gdf = points_gdf.to_crs("EPSG:4326")
points_in_poly_2017 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
%%time
points_gdf = gpd.GeoDataFrame(raw_2018_s, geometry=gpd.points_from_xy(raw_2018_s['longitude'], raw_2018_s['latitude']))
points_gdf.crs = 'epsg:4326'
points_gdf = points_gdf.to_crs("EPSG:4326")
points_in_poly_2018 = gpd.sjoin(points_gdf, poly_gdf, op='within')

In [None]:
%%time
points_gdf = gpd.GeoDataFrame(raw_2019_s, geometry=gpd.points_from_xy(raw_2019_s['longitude'], raw_2019_s['latitude']))
points_gdf.crs = 'epsg:4326'
points_gdf = points_gdf.to_crs("EPSG:4326")
points_in_poly_2019 = gpd.sjoin(points_gdf, poly_gdf, op='within')

Grouping

In [None]:
df_2014 = points_in_poly_2014[['lep14cd', 'lep14nm', 'Average download speed (Mbit/s) by PC']]
df_2014_reset = df_2014.groupby(['lep14cd', 'lep14nm']).mean().reset_index()
df_2014_reset['year'] = [2014]*len(df_2014_reset)
df_2014_reset['lep_year_spec'] = [2014]*len(df_2014_reset)

df_2014_reset.columns = ['lep_id','lep14nm','broadband_download_speed_data', 'year', 'lep_year_spec']
df_2014_reset = df_2014_reset[['year','lep_id', 'lep_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2015 = points_in_poly_2015[['lep14cd', 'lep14nm', 'Average download speed (Mbit/s) by PC']]
df_2015_reset = df_2015.groupby(['lep14cd', 'lep14nm']).mean().reset_index()
df_2015_reset['year'] = [2015]*len(df_2015_reset)
df_2015_reset['lep_year_spec'] = [2014]*len(df_2015_reset)

df_2015_reset.columns = ['lep_id','lep14nm','broadband_download_speed_data', 'year', 'lep_year_spec']
df_2015_reset = df_2015_reset[['year','lep_id', 'lep_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2016 = points_in_poly_2016[['lep14cd', 'lep14nm', 'Average download speed (Mbit/s)']]
df_2016_reset = df_2016.groupby(['lep14cd', 'lep14nm']).mean().reset_index()
df_2016_reset['year'] = [2016]*len(df_2016_reset)
df_2016_reset['lep_year_spec'] = [2014]*len(df_2016_reset)

df_2016_reset.columns = ['lep_id','lep14nm','broadband_download_speed_data', 'year', 'lep_year_spec']
df_2016_reset = df_2016_reset[['year','lep_id', 'lep_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2017 = points_in_poly_2017[['lep17cd', 'lep17nm', 'Average download speed (Mbit/s)']]
df_2017_reset = df_2017.groupby(['lep17cd', 'lep17nm']).mean().reset_index()
df_2017_reset['year'] = [2017]*len(df_2017_reset)
df_2017_reset['lep_year_spec'] = [2017]*len(df_2017_reset)

df_2017_reset.columns = ['lep_id','lep17nm','broadband_download_speed_data', 'year', 'lep_year_spec']
df_2017_reset = df_2017_reset[['year','lep_id', 'lep_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2018 = points_in_poly_2018[['lep17cd', 'lep17nm', 'Average download speed (Mbit/s)']]
df_2018_reset = df_2018.groupby(['lep17cd', 'lep17nm']).mean().reset_index()
df_2018_reset['year'] = [2018]*len(df_2018_reset)
df_2018_reset['lep_year_spec'] = [2017]*len(df_2018_reset)

df_2018_reset.columns = ['lep_id','lep17nm','broadband_download_speed_data', 'year', 'lep_year_spec']
df_2018_reset = df_2018_reset[['year','lep_id', 'lep_year_spec', 'broadband_download_speed_data']]

In [None]:
df_2019 = points_in_poly_2019[['lep17cd', 'lep17nm', 'Average download speed (Mbit/s)']]
df_2019_reset = df_2019.groupby(['lep17cd', 'lep17nm']).mean().reset_index()
df_2019_reset['year'] = [2019]*len(df_2019_reset)
df_2019_reset['lep_year_spec'] = [2017]*len(df_2019_reset)

df_2019_reset.columns = ['lep_id','lep17nm','broadband_download_speed_data', 'year', 'lep_year_spec']
df_2019_reset = df_2019_reset[['year','lep_id', 'lep_year_spec', 'broadband_download_speed_data']]

In [None]:
df_broadband_lep = pd.concat([df_2014_reset, df_2015_reset, df_2016_reset,
          df_2017_reset, df_2018_reset, df_2019_reset]).sort_values(by = ['lep_id', 'year']).reset_index(drop=True)

In [None]:
# rounding data to 3 decimal places
df_broadband_lep['broadband_download_speed_data'] = df_broadband_lep['broadband_download_speed_data'].round(3)

In [None]:
# save df
df_broadband_lep.to_csv('../../data/processed/broadband/broadband_download_speed_data.lep.csv',index=False)