## Commuting distance per occupation
This notebook computes the average distance travelled to work per occupation across different Travel to Work Areas (TTWAs). For each Output Area (OA), or Lower Layer Super Output Area (LSOA), in a TTWA, it calls the LMI for All API, which returns data on commuting distance and occupation breakdown from the 2011 census.

The centroids for each OA, or LSOA, are computed in another script (get_oa_lsoa_centroids) using the ONS postcode directory (February 2019). The same dataset also offers a lookup between OAs and TTWAs.

In [None]:
import urllib.request, json
import requests
import pandas as pd
import numpy as np
import pickle
import time
import matplotlib.pyplot as plt
import os
import seaborn as sns
# use seaborn's 'darkgrid' theme as the default plotting style
sns.set_style('darkgrid')

In [None]:
# relevant folders
# NOTE: these are absolute paths on the original author's machine and must be
# adapted before the notebook can be run anywhere else.
folder1= '/Users/stefgarasto/Local-Data/'
# input data (the ONS files are read from its 'ONS/' sub-folder)
folder2 = '/Users/stefgarasto/Google Drive/Documents/data/'
# results: the data downloaded from the API is saved under this folder
folder3 = '/Users/stefgarasto/Google Drive/Documents/results/'
# derived ONS data: the OA / LSOA centroid pickles are read from here
folder4 = '/Users/stefgarasto/Google Drive/Documents/data/ONS/derivative-data/'
# folder holding the two TTWA information spreadsheets
folder5 = folder2 + 'ONS/Travel_to_Work_Areas_2011_guidance_and_information_V4/'

In [None]:
# relevant files
# ONS postcode directory; it is grouped by its 'oa11' column further down to
# find the postcodes ('pcd') belonging to a given OA
ons_pc_file = folder2 + 'ONS/ONS-Postcode-Directory-Latest-Centroids.csv'
# presumably population density figures -- not read anywhere in this notebook
pop_density_file = folder2 + 'ONS/population_density1.csv'
# list of all the TTWAs (codes and names)
ttwa_file = folder2 + 'ONS/Travel_to_Work_Areas_December_2011_Boundaries.csv'
# per-TTWA information spreadsheets (the 2011 one provides the 'ttwa11cd' and
# 'LSOAs' columns used to pick out the small TTWAs)
ttwa_info11_file = folder5 + 'TTWA-summary-statistics-2011.xls'
ttwa_info16_file = folder5 + 'TTWA-info-2016.xls'

In [None]:
# Read the TTWA reference tables.
# ttwa_data lists every TTWA: codes in the first column, names in the second.
ttwa_data = pd.read_csv(ttwa_file)
ttwa_info11 = pd.read_excel(ttwa_info11_file)
ttwa_info16 = pd.read_excel(ttwa_info16_file)

# A TTWA counts as "small" when it is made up of fewer than 40 LSOAs:
# collect the codes of all such TTWAs.
small_ttwas = list(ttwa_info11.loc[ttwa_info11['LSOAs'] < 40, 'ttwa11cd'])
print('There are {} TTWAs with less than 40 LSOAs.'.format(len(small_ttwas)))

In [None]:
# Pre-computed centroid tables (one pickle for OAs, one for LSOAs)
oa_path = folder4 + 'oa_centroids_dictionary.pickle'
lsoa_path = folder4 + 'lsoa_centroids_dictionary.pickle'

# Switches selecting which of the two tables should be loaded
loadOA, loadLSOA = True, False

# A table is only read when it was requested AND its pickle is on disk.
# NOTE: otherwise it is skipped without any message, which leaves the
# corresponding variable (oa_data / lsoa_data) undefined.
exists = os.path.isfile(oa_path)
if loadOA and exists:
    print('Loadin the OA data')
    oa_data = pd.read_pickle(oa_path)

exists = os.path.isfile(lsoa_path)
if loadLSOA and exists:
    print('Loading the LSOA data')
    lsoa_data = pd.read_pickle(lsoa_path)



In [None]:
# Pickle file used both as the final store and as the checkpoint of the API
# download: it holds the OA table, the downloaded distances / occupations /
# residents dictionaries, the SOC group names and the first / last OA index
# covered by the last run.
save_oa_file = folder3 + 'PIN/oa_distances_and_occupations_v2.pickle'

In [None]:
# Set to True to download the next batch of OAs from the LMI for All API
APICALL = False

# seconds to wait for an answer from the API before giving up on a request
_REQUEST_TIMEOUT = 60
# the full download takes multiple hours, so each run only handles this many OAs
_BATCH_SIZE = 7000


def _get_json(url):
    """Return the decoded JSON body of a GET request to ``url``.

    A timeout is always set so that a stalled connection cannot hang the
    download forever. The HTTP status is deliberately not checked: an error
    payload is handed back to the caller, which records the OA as missing.

    Raises:
        requests.RequestException: on any network problem, or when the body
            is not valid JSON.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    try:
        return response.json()
    except ValueError as err:
        # older versions of requests raise a plain ValueError here
        raise requests.RequestException(
            'Invalid JSON returned by {}: {}'.format(url, err)) from err


def _flatten_distances(distance_items):
    """Turn the API's ``distances`` list into one flat dictionary.

    Each item is a dictionary with a code, a description and a value; the
    result maps description -> value, so that it converts easily into a
    dataframe row.

    NOTE: every item writes to the same 'code' key, so only the code of the
    last item survives. This is kept as is, to stay compatible with the data
    that has already been saved.
    """
    flat = {}
    for item in distance_items:
        flat[item['description']] = item['value']
        flat['code'] = item['code']
    return flat


def _flatten_occupations(occupation_items, soc_names):
    """Turn the API's ``residentOccupations`` list into one flat dictionary.

    Each item is a dictionary with socGroup, description, value and
    percentage. The result has two keys per SOC group ('<soc>_value' and
    '<soc>_percentage'), so that each of them becomes a dataframe column.

    ``soc_names`` is updated in place with socGroup -> description.
    """
    flat = {}
    for item in occupation_items:
        soc = item['socGroup']
        flat[soc + '_value'] = item['value']
        flat[soc + '_percentage'] = item['percentage']
        soc_names[soc] = item['description']
    return flat


if APICALL:
    # data gathered during this run only: they are merged with what is
    # already on disk at the very end
    oa_distances2 = {}
    oa_occupations2 = {}
    oa_residents2 = {}

    # it will take multiple hours to gather them all. So, need to create breakpoints
    # if file exists load it and start from where it left off
    exists = os.path.isfile(save_oa_file)
    start_oa = 0
    socGroups = {}

    #TODO: actually, there is no need to have the entire pre-stored list while making the new calls, only need to join the
    # old and new dictionaries in the end.
    if exists:
        print('Loading already downloaded OAs')
        with open(save_oa_file, 'rb') as f:
            # I only need to load the indices where we left off last time
            oa_data, _, _, _, socGroups, _, start_oa = pickle.load(f)
        start_oa += 1
    end_oa = min(start_oa + _BATCH_SIZE, len(oa_data.index))
    print(start_oa, end_oa)
    if end_oa - start_oa < 1:
        # halt the cell explicitly (this used to rely on an undefined name)
        raise RuntimeError('No OAs left to download: all {} OAs have already been processed.'.format(
            len(oa_data.index)))

    t0 = time.time()
    N = 999
    # [OA codes, API answers] of the OAs for which the API returned no data
    missing_oas = [[], []]
    # position of the last OA dealt with; only lowered if the run stops early
    last_oa = end_oa - 1
    try:
        # loop through all OAs and call the LMIforALL API
        for ii, oa in enumerate(oa_data.index[start_oa:end_oa]):
            if oa[0] in ['L', 'M']:
                # exclude these two OAs. They are channel islands and isle of Man, respectively, but they seem to
                # have wrong latitude and longitude. I'll check them out better later
                continue
            lat = oa_data.loc[oa]['lat']
            lon = oa_data.loc[oa]['long']

            # mean distances ('%2C' is a URL-encoded comma; '{:6f}' prints six decimals)
            out = _get_json('http://api.lmiforall.org.uk/api/v1/census/distance?area={:6f}%2C{:6f}'.format(
                lat, lon))
            if 'distances' not in out:
                print(oa, out)
                missing_oas[0].append(oa)
                missing_oas[1].append(out)
                continue
            distances = _flatten_distances(out['distances'])

            # occupation breakdown
            out = _get_json(
                'http://api.lmiforall.org.uk/api/v1/census/resident_occupations?area={:6f}%2C{:6f}'.format(
                    lat, lon))
            if 'totalResidents' not in out or 'residentOccupations' not in out:
                # treat an error payload exactly like a missing distance answer
                # instead of failing with a KeyError and losing the whole batch
                print(oa, out)
                missing_oas[0].append(oa)
                missing_oas[1].append(out)
                continue

            # store the three pieces together, so an OA is never half-filled
            oa_distances2[oa] = distances
            oa_residents2[oa] = out['totalResidents']  # this one is just a number
            oa_occupations2[oa] = _flatten_occupations(out['residentOccupations'], socGroups)

            if ii%1000 == N:
                print('Done with the last {} OAs. It took {:4f} s'.format(N+1, time.time() - t0))
                t0 = time.time()
    except requests.RequestException as err:
        # network trouble: stop here and checkpoint what has been gathered, so
        # that the next run resumes from the OA that failed
        last_oa = start_oa + ii - 1
        print('Stopping early at OA {} because of a network problem: {}'.format(oa, err))

    print('Number of OAs without data in this run: {}'.format(len(missing_oas[0])))

    t0 = time.time()
    if exists:
        # reload the  previously filled dictionary
        with open(save_oa_file, 'rb') as f:
            _, oa_distances, oa_occupations, oa_residents, _, _, _ = pickle.load(f)
    else:
        # very first run: there is nothing on disk to merge with yet
        oa_distances, oa_occupations, oa_residents = {}, {}, {}

    # update the dictionaries
    oa_distances.update(oa_distances2)
    oa_residents.update(oa_residents2)
    oa_occupations.update(oa_occupations2)

    # the last element is the position to resume from (minus one) next time
    with open(save_oa_file, 'wb') as f:
        pickle.dump((oa_data, oa_distances, oa_occupations, oa_residents, socGroups, start_oa, last_oa), f)

    # drop the references to the big dictionaries to free memory
    oa_distances = None
    oa_residents = None
    oa_occupations = None
    print('Done. Time spent saving: {:2f} s'.format(time.time() - t0))

In [None]:
# For a handful of OAs the API answers that their LONG/LAT lies outside of the UK.
# Their longitude/latitude pair is the average across the postcodes belonging to them.
# To still get data for these OAs, the API is queried with one of their postcodes instead
# (the first that the ONS postcode directory returns).

GETMISSING = False
if GETMISSING:
    # The problematic OAs are hardcoded here for future reference
    missing_oas = ['S00128975', 'S00129086', 'S00129114', 'S00129074', 'S00129108', 'S00129102', 'S00129030',
                   'S00129125', 'S00129014', 'S00128998', 'S00128971', 'S00129038', 'S00129033', 'S00129072',
                   'S00129107', 'S00128967', 'S00128952', 'S00128970', 'S00129092', 'S00128995', 'S00129121',
                   'S00129132', 'S00129005', 'S00129105', 'S00129113', 'S00129133', 'S00128963', 'S00129085',
                   'S00129139', 'S00128953', 'S00129058', 'S00128942', 'S00129046', 'S00129009', 'S00129048',
                   'S00129035', 'S00129004', 'S00129082', 'S00129076', 'S00129028', 'S00129036', 'S00129041',
                   'S00129068', 'S00128951', 'S00129138', 'S00129078', 'S00129116', 'S00128943', 'S00129131',
                   'S00129119', 'S00128947', 'S00128976', 'L99999999', 'S00129003', 'S00128977', 'S00129090',
                   'S00129054', 'S00129011', 'S00128962', 'S00128944', 'S00129126', 'S00129002', 'S00128991',
                   'S00128985', 'S00129053', 'S00129129', 'S00129052', 'S00128973', 'S00129112', 'S00128996',
                   'S00129106', 'S00129044', 'S00129043', 'S00128972', 'S00129101', 'S00129115', 'S00129049',
                   'S00129034', 'S00129088', 'S00129070', 'S00129075', 'S00129083', 'S00129127', 'S00129095',
                   'S00129012', 'S00128948', 'S00129061', 'S00129040', 'S00129071', 'S00128997', 'S00129006',
                   'S00129089', 'S00129077', 'S00128988', 'S00129032', 'S00129099', 'S00129141', 'S00128946',
                   'S00129140', 'S00129010', 'S00129000', 'S00129122', 'S00129021', 'S00128956', 'S00129007',
                   'S00128999', 'S00128969', 'S00128945', 'S00129120', 'S00128974', 'S00129008', 'S00129103',
                   'S00129094', 'S00128992', 'S00128986', 'S00129135', 'S00129136', 'S00129020', 'S00129123',
                   'S00129042', 'S00129109', 'S00129059', 'M99999999', 'S00128954', 'S00129098', 'S00128987',
                   'S00128955', 'S00129073', 'S00129050', 'S00129093', 'S00128981', 'S00129104', 'S00129100',
                   'S00129065', 'S00129001', 'S00128968', 'S00128966', 'S00129128', 'S00128978', 'S00129026',
                   'S00129062', 'S00129066', 'S00129047', 'S00129081', 'S00128958', 'S00129060', 'S00129031',
                   'S00128980', 'S00128949', 'S00128964', 'S00129019', 'S00129080', 'S00129015', 'S00128989',
                   'S00129013', 'S00129029', 'S00129045', 'S00128957', 'S00129057', 'S00128979', 'S00129079',
                   'S00129063', 'S00129069', 'S00128982', 'S00129118', 'S00128961', 'S00129039', 'S00129022',
                   'S00129111', 'S00128950', 'S00129110', 'S00128984', 'S00129097', 'S00129023', 'S00129016',
                   'S00129024', 'S00128993', 'S00129018', 'S00129091', 'S00129017', 'S00128983', 'S00128990',
                   'S00128994', 'S00128965', 'S00129134', 'S00129027', 'S00129096', 'S00128959', 'S00128960',
                   'S00129087', 'S00129051', 'S00129055', 'S00129084', 'S00129117', 'S00129056', 'S00129037',
                   'S00129067', 'S00129064', 'S00129130', 'S00129137', 'S00129124', 'S00129025']

    print('Number of problematic OAs: ', len(missing_oas))

    # Read the ONS postcode directory and bucket its rows by OA code
    ons_data = pd.read_csv(ons_pc_file)
    groups = ons_data.groupby('oa11')

    # One list of postcodes per problematic OA, in the same order as missing_oas
    missing_oas_pcd = [list(groups.get_group(oa_code)['pcd']) for oa_code in missing_oas]

# Query the LMI for All API again for the OAs in `missing_oas`, this time
# identifying each OA by one of its postcodes (normally the first one listed
# for it) instead of by its centroid. Only runs if the postcodes were collected.
APICALL_missed = False
if APICALL_missed and GETMISSING:
    # data gathered in this run only: they are merged with what is already on
    # disk at the very end
    oa_distances2 = {}
    oa_occupations2 = {}
    oa_residents2 = {}
    # SOC group names seen in this run. They are collected separately because
    # `socGroups` may not exist yet in this session, and because it is reloaded
    # from disk below (which used to throw away anything collected in the loop)
    socGroups2 = {}

    # seconds to wait for an answer from the API before giving up on a request
    request_timeout = 60

    t0 = time.time()
    N = 49
    # [OA codes, API answers or errors] of the OAs that could not be retrieved
    failed_oas = [[], []]
    # loop through all missing OAs and call the LMIforALL API
    for ii, oa in enumerate(missing_oas):
        # still skip Channel Island and Isle of Man
        if oa[0] in ['L', 'M']:
            continue
        # the second postcode is used for this one OA, the first for all the others
        ipcd = 1 if oa == 'S00128975' else 0
        postcode = missing_oas_pcd[ii][ipcd].replace(' ', '')

        problem = None
        try:
            # mean distances
            dist_out = requests.get(
                'http://api.lmiforall.org.uk/api/v1/census/distance?area={}'.format(postcode),
                timeout=request_timeout).json()
            if 'distances' not in dist_out:
                problem = dist_out
            else:
                # occupation breakdown
                occ_out = requests.get(
                    'http://api.lmiforall.org.uk/api/v1/census/resident_occupations?area={}'.format(postcode),
                    timeout=request_timeout).json()
                if 'totalResidents' not in occ_out or 'residentOccupations' not in occ_out:
                    problem = occ_out
        except (requests.RequestException, ValueError) as err:
            # network failure, timeout, or an answer that is not valid JSON
            problem = repr(err)
        if problem is not None:
            # record the failure and move on, rather than losing the OAs
            # that were already retrieved
            print(oa, problem)
            failed_oas[0].append(oa)
            failed_oas[1].append(problem)
            continue

        # dist_out['distances'] is a list of dictionaries, each with a code, a description and a value
        tmp = {}
        for itmp in dist_out['distances']:
            # saving them this way should make it easy to then convert into a dataframe
            tmp[itmp['description']] = itmp['value']
            # NOTE: the same key is overwritten by every item, so only the last code is kept
            tmp['code'] = itmp['code']
        oa_distances2[oa] = tmp

        oa_residents2[oa] = occ_out['totalResidents'] # this one is just a number, so it will be easily convertable
        # the following is again a list of dictionaries, with socGroup, description, value and percentage
        tmp = {}
        for itmp in occ_out['residentOccupations']:
            # use the socGroup as the key (adding value or percentage), so that then each SOC will become a column
            tmp[itmp['socGroup']+'_value'] = itmp['value']
            tmp[itmp['socGroup']+'_percentage'] = itmp['percentage']
            # at the same time, keep a list of names associated with socgroups
            socGroups2[itmp['socGroup']] = itmp['description']
        oa_occupations2[oa] = tmp

        if ii%(N+1) == N:
            print('Done with the last {} OAs. It took {:4f} s'.format(N+1, time.time() - t0))
            t0 = time.time()

    # now reload the  previously filled dictionary
    t0 = time.time()
    if os.path.isfile(save_oa_file):
        with open(save_oa_file, 'rb') as f:
            _, oa_distances, oa_occupations, oa_residents, socGroups, _, _ = pickle.load(f)
    else:
        # nothing saved yet: start from empty dictionaries instead of failing
        oa_distances, oa_occupations, oa_residents, socGroups = {}, {}, {}, {}

    # update the dictionaries with the missing OAs
    oa_distances.update(oa_distances2)
    oa_residents.update(oa_residents2)
    oa_occupations.update(oa_occupations2)
    socGroups.update(socGroups2)

    # save it; the last two entries mark the whole OA table as processed
    with open(save_oa_file, 'wb') as f:
        pickle.dump((oa_data, oa_distances, oa_occupations, oa_residents, socGroups, 0, len(oa_data)), f)

    # drop the references to the big dictionaries to free memory
    oa_distances = None
    oa_residents = None
    oa_occupations = None
    print('Done. Time spent saving: {:2f} s'.format(time.time() - t0))

In [None]:
# Ideally, we would also collect data about jobs breakdown by OA, that is the number of jobs in that area, total and
# split by SOC codes: TODO


In [None]:
'''
Check memory usage of all variables
'''

import sys

# Names that IPython itself puts in the namespace (plus this very list):
# they are left out of the report
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Build (name, size in bytes) pairs for every remaining global that is neither
# private nor an imported module, and show them from the largest to the smallest.
# NOTE: sys.getsizeof is shallow, i.e. it does not count the objects contained
# inside a container.
sorted(
    (
        (name, sys.getsizeof(globals().get(name)))
        for name in dir()
        if not (name.startswith('_') or name in sys.modules or name in ipython_vars)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)
