### TransportAPI script: data collection

TODO: description

In [None]:
import urllib.request, json
import requests
import pandas as pd
import numpy as np
import pickle
import time
import matplotlib.pyplot as plt
import os

In [None]:
# Root directories used by this notebook (absolute, machine-specific paths):
#   folder1 - local data store
#   folder2 - input data (the ONS CSV files live under it)
#   folder3 - results (figures and commute-time pickles are written under it)
#   folder4 - derived ONS data (pickled centroid tables, cached job counts)
folder1, folder2, folder3, folder4 = (
    '/Users/stefgarasto/Local-Data/',
    '/Users/stefgarasto/Google Drive/Documents/data/',
    '/Users/stefgarasto/Google Drive/Documents/results/',
    '/Users/stefgarasto/Google Drive/Documents/data/ONS/derivative-data/',
)

In [None]:
# Input CSV files; every path is relative to the data directory (folder2).
ons_pc_file, pop_density_file, ttwa_file, bres_lsoa_file = (
    folder2 + relative_path
    for relative_path in (
        'ONS/ONS-Postcode-Directory-Latest-Centroids.csv',
        'ONS/population_density1.csv',
        'ONS/Travel_to_Work_Areas_December_2011_Boundaries.csv',
        'ONS/BRES_employment_lsoa_2011_total.csv',
    )
)

In [None]:
# Read the table of Travel To Work Areas (TTWAs).
# The columns used in this notebook are 'ttwa11cd' (code) and 'ttwa11nm' (name).
ttwa_data = pd.read_csv(ttwa_file)
# Quick look at the table: show only the TTWAs whose name begins with 'S'.
name_starts_with_s = ttwa_data['ttwa11nm'].map(lambda name: name[0:1]) == 'S'
print(ttwa_data[name_starts_with_s])

In [None]:
# Load the previously extracted centroid tables for OAs and LSOAs.
# Set a flag to False to skip loading the corresponding table.
loadOA = True
loadLSOA = True
oa_centroid_path = folder4 + 'oa_centroids_dictionary.pickle'
lsoa_centroid_path = folder4 + 'lsoa_centroids_dictionary.pickle'

exists = os.path.isfile(oa_centroid_path)
if exists and loadOA:
    print('Loading the OA data')
    oa_data = pd.read_pickle(oa_centroid_path)
else:
    print('File not found or not requested')

exists = os.path.isfile(lsoa_centroid_path)
if exists and loadLSOA:
    print('Loading the LSOA data')
    lsoa_data = pd.read_pickle(lsoa_centroid_path)
else:
    # Report a skipped LSOA load as well: without this message lsoa_data is
    # silently left undefined and a later cell fails with a NameError.
    print('File not found or not requested')

In [None]:
# One-off preprocessing step, kept inside a string literal so that it does NOT run.
# The disabled code adds an 'oa_list' column to lsoa_data (for each LSOA, the OAs whose
# 'lsoa11' value equals that LSOA) and overwrites the pickle at lsoa_centroid_path.
'''
# TODO just once: augment the LSOA dataset with a list of OAs in that LSOA
tmp = []
for ii,lsoa in enumerate(lsoa_data.index):
    # get which OAs are in this LSOA
    all_oas = oa_data[oa_data['lsoa11']==lsoa].index
    tmp.append([list(all_oas)])
lsoa_tmp = pd.DataFrame(tmp, index = lsoa_data.index, columns = ['oa_list'])
# join with stafford_lsoa
lsoa_data = lsoa_data.join(lsoa_tmp)
lsoa_data.to_pickle(lsoa_centroid_path)
'''
# the only thing this cell actually executes is this placeholder print
print('None')

In [None]:
# Preview the first rows of both centroid tables (LSOA first, then OA).
for centroid_table in (lsoa_data, oa_data):
    print(centroid_table.head())

#### First, I will focus on TTWA Stafford, with code E30000271, which has 95 LSOAs
- After getting all the OAs in that area, I will get the job breakdown for each OA

- Then I sum the number of jobs across all OAs in the same LSOA
- Then I order the LSOAs by number of jobs, in descending order
- Starting from the LSOA with the most jobs, I call the TransportAPI to compute the journey time from every other LSOA to one destination LSOA. The destination is selected by going down the list of LSOAs ordered by number of jobs, until I run out of the free TransportAPI calls available this month.

In [None]:
# Pick the TTWA to analyse and keep only the OAs and LSOAs that belong to it.
# NOTE: the variables are named "stafford_*", but they hold whichever TTWA code is
# assigned to stafford_ttwa below.
t0 = time.time()
# TTWA codes used so far:
#   'E30000262' Shrewsbury
#   'E30000228' Leamington Spa
#   'E30000209' Gloucester
#   'E30000194' Corby
#   'E30000189' Cheltenham
#   'E30000271' Stafford
stafford_ttwa = 'E30000262'
stafford_oa = oa_data[oa_data['ttwa'] == stafford_ttwa]
stafford_lsoa = lsoa_data[lsoa_data['ttwa'] == stafford_ttwa]
# Lower-case, hyphenated version of the TTWA name (e.g. 'cheltenham'); used in file names.
matching_names = ttwa_data['ttwa11nm'][ttwa_data['ttwa11cd'] == stafford_ttwa]
ttwa_name = matching_names.values[0].lower().replace(' ', '-')
print('There are {} OAs in the {} TTWA'.format(len(stafford_oa), ttwa_name))
print('There are {} LSOAs in the {} TTWA'.format(len(stafford_lsoa), ttwa_name))

In [None]:
# Get, for every OA in the selected TTWA, the number of jobs, the job breakdown by SOC
# group and the number of residents: either from the cached pickle (if it exists) or by
# querying the LMI for All census API (two requests per OA).
stafford_oa_path = folder4 + '{}_oa_lsoa_jobs.pickle'.format(ttwa_name)
t0 = time.time()
exists = os.path.exists(stafford_oa_path)
if exists:
    # NOTE: unpickling can execute arbitrary code; only load cache files that were
    # written by this notebook.
    with open(stafford_oa_path, 'rb') as f:
        stafford_lsoa_ordered,stafford_oa_number_of_jobs,stafford_jobs_socGroups,\
            stafford_oa_jobs_breakdown,stafford_oa_population = pickle.load(f)
else:
    # requests has no default timeout: without one a stalled connection would block
    # the whole download forever.
    lmi_timeout = 30  # seconds
    stafford_oa_number_of_jobs = {}
    stafford_oa_jobs_breakdown = {}
    stafford_jobs_socGroups = {}
    stafford_oa_population = {}
    for ii,oa in enumerate(stafford_oa.index):
        oa_lat = stafford_oa.loc[oa]['lat']
        oa_long = stafford_oa.loc[oa]['long']

        # jobs located in this OA, in total and by SOC group
        urlname = 'http://api.lmiforall.org.uk/api/v1/census/jobs_breakdown?area={:6f}%2C{:6f}'.format(
            oa_lat,oa_long)
        response = requests.get(urlname, timeout=lmi_timeout)
        # fail with the HTTP error itself rather than with a KeyError further down
        response.raise_for_status()
        out = response.json()
        stafford_oa_number_of_jobs[oa] = out['totalJobs']
        out = out['jobsBreakdown']
        tmp = {}
        for itmp in out:
            # use the socGroup as the key (adding value or percentage), so that each SOC
            # group can later become a column
            tmp[itmp['socGroup']+'_value'] = itmp['value']
            tmp[itmp['socGroup']+'_percentage'] = itmp['percentage']
            # at the same time, keep the description associated with each socGroup
            stafford_jobs_socGroups[itmp['socGroup']] = itmp['description']
        stafford_oa_jobs_breakdown[oa] = tmp

        # number of residents (population based estimate)
        urlname = 'http://api.lmiforall.org.uk/api/v1/census/resident_occupations?area={:6f}%2C{:6f}'.format(
            oa_lat,oa_long)
        response = requests.get(urlname, timeout=lmi_timeout)
        response.raise_for_status()
        out = response.json()
        stafford_oa_population[oa] = out['totalResidents']

print('Done, in {:4f}s'.format(time.time()-t0))

In [None]:
# Recompute, for every LSOA, the population-weighted and the jobs-weighted centroid
# from the centroids of the OAs listed in its 'oa_list'.
# One value per LSOA is appended to each list, in the order of stafford_lsoa.index
# (previously the lists were overwritten with a scalar on every iteration, so all
# LSOAs ended up with the centroid of the last one).
pop_lats = []
pop_longs = []
jobs_lats = []
jobs_longs = []
for ii,lsoa in enumerate(stafford_lsoa.index):
    oa_list = stafford_lsoa['oa_list'].loc[lsoa]
    tmp = []
    tmp_jobs = []
    tmp_lat = []
    tmp_long = []
    for oa in oa_list:
        tmp.append(stafford_oa_population[oa])
        tmp_jobs.append(stafford_oa_number_of_jobs[oa])
        tmp_lat.append(stafford_oa['lat'].loc[oa])
        tmp_long.append(stafford_oa['long'].loc[oa])
    # transform into numpy arrays and normalise the weights into proportions (summing to 1)
    # NOTE: if the weights of an LSOA sum to zero, the division yields NaN and so does
    # the corresponding centroid.
    tmp = np.array(tmp)/sum(tmp)
    tmp_jobs = np.array(tmp_jobs)/sum(tmp_jobs)
    tmp_lat = np.array(tmp_lat)
    tmp_long = np.array(tmp_long)
    # weighted averages of the OA coordinates, rounded to 5 decimals
    pop_lats.append(np.around(np.sum(tmp_lat * tmp), decimals = 5))
    pop_longs.append(np.around(np.sum(tmp_long * tmp), decimals = 5))
    jobs_lats.append(np.around(np.sum(tmp_lat * tmp_jobs), decimals = 5))
    jobs_longs.append(np.around(np.sum(tmp_long * tmp_jobs), decimals = 5))

# now add the columns to the main dataframe
stafford_lsoa_aug0 = stafford_lsoa.join(pd.DataFrame({'pop_lat': pop_lats, 'pop_long': pop_longs,
                                                     'jobs_lat': jobs_lats, 'jobs_long': jobs_longs},
                                                 index = stafford_lsoa.index, 
                                                    columns = ['pop_lat', 'pop_long','jobs_lat','jobs_long']))

In [None]:
# Aggregate the OA-level job counts to LSOA level. For each LSOA we keep:
#   number_of_jobs  - total over its OAs
#   density_of_jobs - mean over its OAs
#   max_of_jobs     - largest single-OA count
stafford_lsoa_number_of_jobs = []
stafford_lsoa_density_of_jobs = []
stafford_lsoa_max_of_jobs = []
for ii,lsoa in enumerate(stafford_lsoa.index):
    oa_list = stafford_lsoa['oa_list'].loc[lsoa]
    tot_lsoa_jobs = [stafford_oa_number_of_jobs[oa] for oa in oa_list]
    stafford_lsoa_number_of_jobs.append(sum(tot_lsoa_jobs))
    stafford_lsoa_density_of_jobs.append(np.mean(tot_lsoa_jobs))
    stafford_lsoa_max_of_jobs.append(max(tot_lsoa_jobs))

# Attach the three aggregates to the LSOA table as new columns.
lsoa_job_stats = pd.DataFrame(
    {
        'number_of_jobs': stafford_lsoa_number_of_jobs,
        'density_of_jobs': stafford_lsoa_density_of_jobs,
        'max_of_jobs': stafford_lsoa_max_of_jobs,
    },
    index = stafford_lsoa.index,
    columns = ['number_of_jobs','density_of_jobs','max_of_jobs'],
)
stafford_lsoa_aug1 = stafford_lsoa_aug0.join(lsoa_job_stats)

print(stafford_lsoa_aug1.head(n=3))

In [None]:
# Add the number of jobs per LSOA from the BRES employment data, so that it can be
# compared with the census-based count computed from the OAs.
# Rows with missing values are dropped and the table is indexed by the LSOA code
# (column 'mnemonic').
# (disabled) an earlier version also dropped the last two rows by hand, as irrelevant
# information.
bres_lsoa_data = pd.read_csv(bres_lsoa_file).dropna().set_index('mnemonic')

# Earlier way of extracting the LSOA code from a longer label; kept in a string literal
# so that it does not run.
'''
bres_lsoa_data['2011 super output area - lower layer'].iloc[0][:9]

# extract the lsoa codes
def extract_code(long_code):
    try:
        output = long_code[:9]
    except:
        output = 'none'
    return output
bres_lsoa_data['lsoa'] = bres_lsoa_data['2011 super output area - lower layer']
bres_lsoa_data['lsoa'] = bres_lsoa_data['lsoa'].apply(extract_code)
# remove the rows that are 'none'
bres_lsoa_data = bres_lsoa_data[bres_lsoa_data['lsoa']!='none']
'''

# BRES employment figure for every LSOA of the selected TTWA, in the row order of
# stafford_lsoa_aug1.
bres_employment = bres_lsoa_data['Employment']
new_jobs = [int(bres_employment.loc[lsoa]) for lsoa in stafford_lsoa_aug1.index]

bres_column = pd.DataFrame(
    {'number_of_jobs_BRES': new_jobs},
    index = stafford_lsoa.index,
    columns = ['number_of_jobs_BRES'],
)
stafford_lsoa_aug = stafford_lsoa_aug1.join(bres_column)

In [None]:
# Rank the LSOAs from most to fewest jobs (census-based count).
stafford_lsoa_ordered = stafford_lsoa_aug.sort_values('number_of_jobs', ascending = False)

# Plot every job measure over the ranked LSOAs, just to see if there is a knee somewhere.
plt.figure(figsize = (20,5))
job_series = (
    ('number_of_jobs', '-x'),
    ('density_of_jobs', '-o'),
    ('max_of_jobs', '-s'),
    ('number_of_jobs_BRES', '-d'),
)
for job_column, line_format in job_series:
    tmp = plt.plot(stafford_lsoa_ordered[job_column], line_format)
plt.xlabel('LSOA in {}'.format(ttwa_name) , fontsize = 12)
plt.ylabel('Jobs', fontsize = 12)
tmp = plt.xticks(stafford_lsoa_ordered.index, rotation = 'vertical', size = 11)
ax = plt.gca()
for tick_label in ax.get_yticklabels():
    tick_label.set_fontsize(11)
# legend entries follow the plotting order above
plt.legend(
    ['Absolute job number', 'job density', 'max jobs in one OA', 'Absolute job number from BRES'],
    fontsize = 12,
)
# switch to True to write the figure to the results folder
SAVEFIG = False
if SAVEFIG:
    plt.savefig(folder3 + 'PIN/{}_jobs_per_LSOA.svg'.format(ttwa_name))


In [None]:
# Optionally cache the ranked LSOA table together with the per-OA data collected
# earlier (job counts, SOC group descriptions, job breakdown, population).
# The tuple order must match the order used when the cache is read back.
SAVE_LSOA_DATA = False
if SAVE_LSOA_DATA:
    cached_items = (
        stafford_lsoa_ordered,
        stafford_oa_number_of_jobs,
        stafford_jobs_socGroups,
        stafford_oa_jobs_breakdown,
        stafford_oa_population,
    )
    with open(stafford_oa_path, 'wb') as f:
        pickle.dump(cached_items, f)


In [None]:
# Compare the two estimates of the number of jobs per LSOA: BRES versus census-based.
# Both columns are read from the same dataframe so that each point pairs the two values
# of the SAME LSOA. (new_jobs follows the unsorted row order of stafford_lsoa_aug1, so
# plotting it against the sorted 'number_of_jobs' column would mismatch the LSOAs.)
plt.plot(stafford_lsoa_ordered['number_of_jobs_BRES'].values,
         stafford_lsoa_ordered['number_of_jobs'].values,'o')
plt.xlabel('From BRES')
plt.ylabel('From Census')

In [None]:
# Every LSOA of the TTWA is an origin; the destinations are the top 10% of LSOAs by
# number of jobs (count rounded up).
origin_lsoas = list(stafford_lsoa_ordered.index)
# ceiling of len / 10, computed with integer arithmetic
N = -(-len(origin_lsoas) // 10)
print(N)
dest_lsoas = origin_lsoas[:N]
# show the selected destination codes
stafford_lsoa_ordered.index[0:N]

In [None]:
# From here on, it's about the travel time matrix: configure the TransportAPI requests.
SETUPKEY = True
if SETUPKEY:
    # Credentials are taken from environment variables when these are set; the literals
    # are only a fallback that keeps the notebook running as before.
    # NOTE(security): the fallback values are stored in plain text in this file; they
    # should be rotated and removed once the environment variables are in place.
    app_key = os.environ.get('TRANSPORTAPI_APP_KEY', '6d207ab55f2768d85de4124b5fc4844c')
    app_id = os.environ.get('TRANSPORTAPI_APP_ID', '87edbe5c')
    app_key_jyl = os.environ.get('TRANSPORTAPI_APP_KEY_JYL', '09c50d6b59698d5cbe85b50ee758baf6')
    app_id_jyl = os.environ.get('TRANSPORTAPI_APP_ID_JYL', 'c99a83a0')
    lon_from = '{}'
    lat_from = '{}'
    lon_to = '{}'
    lat_to = '{}'
    dep_date = '2019-06-27' # set the departure date to the next working day
    dep_time = '07:30' # departure time: half past seven in the morning
    # Base url to call. The four placeholders are filled, in order, with origin
    # longitude, origin latitude, destination longitude, destination latitude
    # ('{:5f}' is a minimum width of 5 with the default 6 decimals).
    urlname_var = 'https://transportapi.com/v3/uk/public/journey/from/lonlat:{:5f},{:5f}/to/lonlat:{:5f},{:5f}/'
    # choose which pair of credentials to use
    MY_KEY = True
    if MY_KEY:
        active_id, active_key = app_id, app_key
    else:
        active_id, active_key = app_id_jyl, app_key_jyl
    urlname_fix = 'at/{}/{}.json?app_id={}&app_key={}'.format(
        dep_date,dep_time,active_id,active_key)

    # NOTE: this prints the credentials as part of the url
    print(urlname_var + urlname_fix)
    # counter of API calls made in this session and the maximum allowed
    nb_of_calls = 0
    max_calls = 98

In [None]:
# Query the TransportAPI for the journey from every LSOA of the TTWA to ONE destination
# LSOA, saving the responses to a pickle after every iteration.
# CAREFUL: This will call the TransportAPI
# NOTE: I need to get one more for Cheltenham and I need to check if I missed anything for Corby
API = True #TODO: Need to finish up the missing LSOA (missing because of the limit of 100 calls per day)
t0 = time.time()
# number of API calls made by this run of the cell (nb_of_calls counts the whole session)
calls_this_run = 0
if API:
    lsoa_destination1 = 'E01028973'
    # Other destination codes noted next to the current one (presumably already
    # processed for the same TTWA - TODO confirm):
    #   'E01028945' 'E01028885' 'E01028877' 'E01028873' 'E01028883' 'E01028962'
    #   'E01028953' 'E01028941' 'E01028942'
    #
    # This is for Gloucester:
    #   'E01022455' 'E01022247' 'E01022347' 'E01022377' 'E01022403' 'E01022318'
    #   'E01022376' 'E01022287' 'E01022277' 'E01022410' 'E01022276' 'E01022348'
    #   'E01022282' 'E01022412' 'E01022446' 'E01022341' 'E01022312'
    #
    # This is for Corby:
    #   'E01026973' 'E01026977' 'E01026962' 'E01026984' 'E01026982'
    #
    # This is for Cheltenham:
    #   'E01022420' 'E01022435' 'E01022454' 'E01022436' 'E01022116' 'E01022432'
    #   'E01022106' 'E01022107' 'E01022147' 'E01022123' 'E01022131'
    #   TODO: for Cheltenham I still need to get some missing LSOA for E01022131 and
    #   all lsoa for the 12th destination
    #
    # This is for Stafford:
    #   'E01029723' 'E01029742' 'E01029712' 'E01029734' 'E01029732' 'E01029699'
    #   'E01029744' 'E01029722' 'E01029725' 'E01029693'
    save_path_dest1 = folder3 + 'PIN/{}_all_lsoa_commute_{}.pickle'.format(ttwa_name,lsoa_destination1)
    all_commute_times = {lsoa_destination1: {}}
    for ii,lsoa in enumerate(stafford_lsoa_ordered.index):
        if lsoa == lsoa_destination1:
            # No journey is needed from the destination to itself: store an empty
            # placeholder and do NOT spend an API call on it (previously the call was
            # made anyway and overwrote the placeholder).
            all_commute_times[lsoa_destination1][lsoa] = {'routes': []}
        else:
            # Check the budget BEFORE calling, with >=, so that re-running the cell
            # once the limit has been reached cannot exceed it.
            if nb_of_calls >= max_calls:
                break
            # the API returns a json object, reading and decoding the json object returns a dict
            urlname = urlname_var.format(stafford_lsoa_ordered['long'].loc[lsoa],
                                     stafford_lsoa_ordered['lat'].loc[lsoa],
                                     stafford_lsoa_ordered['long'].loc[lsoa_destination1],
                                     stafford_lsoa_ordered['lat'].loc[lsoa_destination1]) + urlname_fix
            out = requests.get(urlname).json()
            nb_of_calls += 1
            calls_this_run += 1
            all_commute_times[lsoa_destination1][lsoa] = out
        # save at every iteration
        with open(save_path_dest1, 'wb') as f:
            pickle.dump(all_commute_times, f)
        print('Got to iteration {} for origin LSOA {}'.format(ii,lsoa))
# report the calls actually made, not the loop index (which is stale when API is False)
print('Time spent on {} calls was {:.4f}s'.format(calls_this_run,time.time()-t0))
print(nb_of_calls)

In [None]:
# Fill in the gaps of the commute files that already exist: for every destination LSOA,
# reload its pickle, work out which origin LSOAs still lack a usable response and query
# the TransportAPI for those only, while the call budget lasts.
API = True
ADJUST = True
if ADJUST:
    version_load = ''
    for dest_lsoa in dest_lsoas:
        lsoa_commute_file = folder3 + 'PIN/{}_all_lsoa_commute_{}{}.pickle'.format(
            ttwa_name,dest_lsoa,version_load)
        with open(lsoa_commute_file, 'rb') as f:
            lsoa_commute = pickle.load(f)

        # An origin counts as missing when it has no entry, when its entry has no
        # 'routes' field, or when 'routes' is empty (the last check is skipped for the
        # destination itself).
        missing_lsoas = []
        for lsoa in origin_lsoas:
            if lsoa not in lsoa_commute[dest_lsoa]:
                missing_lsoas.append(lsoa)
            elif 'routes' not in lsoa_commute[dest_lsoa][lsoa]:
                missing_lsoas.append(lsoa)
            elif dest_lsoa != lsoa and not len(lsoa_commute[dest_lsoa][lsoa]['routes']):
                missing_lsoas.append(lsoa)

        # call the API only if something is missing and the budget is not exhausted
        flag_api = len(missing_lsoas) and nb_of_calls<max_calls
        print(len(missing_lsoas), bool(flag_api))
        if not (flag_api and API):
            continue

        print('Number of LSOA to process for destination LSOA {} is {}'.format(dest_lsoa,len(missing_lsoas)))
        for ii,lsoa in enumerate(missing_lsoas):
            # the response is a json object, decoded into a dict
            origin_long = stafford_lsoa_ordered['long'].loc[lsoa]
            origin_lat = stafford_lsoa_ordered['lat'].loc[lsoa]
            dest_long = stafford_lsoa_ordered['long'].loc[dest_lsoa]
            dest_lat = stafford_lsoa_ordered['lat'].loc[dest_lsoa]
            urlname = urlname_var.format(origin_long, origin_lat, dest_long, dest_lat) + urlname_fix
            out = requests.get(urlname).json()
            nb_of_calls += 1
            lsoa_commute[dest_lsoa][lsoa] = out
            # write the file back after every call so that no response is lost
            with open(lsoa_commute_file, 'wb') as f:
                pickle.dump(lsoa_commute, f)
            print('Got to iteration {} for destination LSOA {} and origin LSOA {}'.format(ii,dest_lsoa,lsoa))
            if nb_of_calls==max_calls:
                break

In [None]:
# Display whatever was last assigned to `out` (the decoded API response from the most
# recently executed request cell), for manual inspection.
out

In [None]:
# NOTE(review): the string below keeps what is labelled as a Maps API key in plain text.
# Secrets should not be stored in the notebook: consider removing it from the file (and
# its history) and rotating the key.
'''
Maps API Key:
AIzaSyD1c5PVpgYpSh4TMLCrZ51lAiWTh3Bdg-M
'''