## Jobs breakdown per OA
This notebook is to call the lmiforall API that returns data on occupation breakdown from the 2011 census.

The centroids for each OA, or LSOA, are computed in another script (get_oa_lsoa_centroids) using the ONS postcode directory (February 2019). The same dataset also offers a lookup between OAs and TTWAs.

In [1]:
import urllib.request, json
import requests
import pandas as pd
import numpy as np
import pickle
import time
import matplotlib.pyplot as plt
import os
import seaborn as sns
# set the default to darkgrid
sns.set_style('darkgrid')

In [2]:
from all_filenames import *

In [3]:
# first, load the list of all TTWA
ttwa_data = pd.read_csv(ttwa_file)
# first column is ttwa codes, second column is ttwa names
ttwa_info11 = pd.read_excel(ttwa_info11_file)
ttwa_info16 = pd.read_excel(ttwa_info16_file)
#print(ttwa_info11.tail(n=3))
#print(ttwa_info16.tail(n=3))

# get small TTWAs
small_ttwas = list(ttwa_info11['ttwa11cd'][ttwa_info11['LSOAs']<40])
print('There are {} TTWAs with less than 40 LSOAs.'.format(len(small_ttwas)))

There are 55 TTWAs with less than 40 LSOAs.


In [4]:
# load the extracted dictionaries of OA centroids
loadOA = True
loadLSOA = False
oa_path = folder4 + 'oa_centroids_dictionary.pickle'
lsoa_path = folder4 + 'lsoa_centroids_dictionary.pickle'
exists = os.path.isfile(oa_path)
if exists and loadOA:
    print('Loadin the OA data')
    oa_data = pd.read_pickle(oa_path)

exists = os.path.isfile(lsoa_path)
if exists and loadLSOA:
    print('Loading the LSOA data')
    lsoa_data = pd.read_pickle(lsoa_path)


Loadin the OA data


In [5]:
# file where I'm storing all the information
save_oa_file = folder3 + 'PIN/oa_jobs_breakdown.pickle'

In [6]:
APICALL = False
print(len(oa_data))
if APICALL:
    
    # for each OA, call the LMIforALL API:    
    oa_number_of_jobs2 = {}
    oa_jobs_breakdown2 = {}
    jobs_socGroups = {}
    
    # it will take multiple hours to gather them all. So, need to create breakpoints
    # if file exists load it and start from where it left off
    exists = os.path.isfile(save_oa_file)
    start_oa = 0

    #TODO: actually, there is no need to have the entire pre-stored list while making the new calls, 
    # only need to join the old and new dictionaries in the end.
    if exists:
        print('Loading already downloaded OAs')
        with open(save_oa_file,'rb') as f:
            # I only need to load the indices where we left off last time
            _,_,_,jobs_socGroups,_,start_oa = pickle.load(f)
        start_oa += 1
    end_oa = min(start_oa + 10000, len(oa_data.index))
    print(start_oa, end_oa)
    if end_oa - start_oa<1:
        stop
    t0 = time.time()
    # loop through all OAs and call the LMIforALL API
    N = 499
    missing_oas = []
    
    for ii,oa in enumerate(oa_data.index[start_oa:end_oa]):
        if oa[0] in ['L', 'M']:
            # exclude these two OAs. They are channel islands and isle of Man, respectively, but they seem to have 
            # wrong latitude and longitude. I'll check them out better later
            continue
        
        try:
            urlname = 'http://api.lmiforall.org.uk/api/v1/census/jobs_breakdown?area={:6f}%2C{:6f}'.format(
            oa_data.loc[oa]['lat'],oa_data.loc[oa]['long'])
            out = requests.get(urlname).json()
            oa_number_of_jobs2[oa] = out['totalJobs']
            out = out['jobsBreakdown']
            tmp = {}
            for itmp in out:
                # use the socGroup as the key (adding value or pecentage), so that then each SOC will become a column
                tmp[itmp['socGroup']+'_value'] = itmp['value']
                tmp[itmp['socGroup']+'_percentage'] = itmp['percentage']
                # at the same time, keep a list of names associated with socgroups
                jobs_socGroups[itmp['socGroup']] = itmp['description']
            oa_jobs_breakdown2[oa] = tmp
            if ii%(N+1) == N:
                print('Done with the last {} OAs. It took {:4f} s'.format(N+1, time.time() - t0))
                t0 = time.time()
        except:
            missing_oas.append(oa)
            continue

    t0 = time.time()
    # reload the  previously filled dictionary
    if exists:
        with open(save_oa_file, 'rb') as f:
            _,oa_number_of_jobs,oa_jobs_breakdown,_,_,_ = pickle.load(f)
        M = len(oa_number_of_jobs)

        # update the dictionaries
        oa_number_of_jobs.update(oa_number_of_jobs2)
        oa_jobs_breakdown.update(oa_jobs_breakdown2)
    else:
        oa_number_of_jobs = oa_number_of_jobs2
        oa_jobs_breakdown = oa_jobs_breakdown2
        
    with open(save_oa_file, 'wb') as f:
        pickle.dump((oa_data,oa_number_of_jobs,oa_jobs_breakdown,jobs_socGroups,start_oa,start_oa+ii),f)

    oa_number_of_jobs = None
    oa_jobs_breakdown = None
    print('Done. Time spent saving: {:2f} s'.format(time.time() - t0))

232034


In [7]:
missing_oas

NameError: name 'missing_oas' is not defined

In [None]:
with open(save_oa_file, 'rb') as f:
        _,oa_number_of_jobs,oa_jobs_breakdown,jobs_socGroups,start_oa,end_oa = pickle.load(f)
len(oa_number_of_jobs),len(oa_jobs_breakdown)


In [None]:
missing_oas = []
for oa in oa_data.index:
    if oa not in oa_jobs_breakdown:
        print(oa)
        missing_oas.append(oa)

In [8]:
missing_oas = [
'S22000047',
'S22000049',
'S22000051',
'S22000054',
'S22000055',
'S22000056',
'S22000057',
'S22000059',
'S22000060',
'S22000061',
'S22000063',
'S22000065',
'S22000067',
'S22000068',
'S22000069',
'S22000070',
'S22000071',
'S22000074',
'S22000075',
'S22000078',
'S22000085']

#'N12000001','N12000002','N12000003','N12000005','N12000006','N12000009','N12000010',

In [9]:
# There are some OAs that I am missing because the API returns that their LONG/LAT is outside of the UK. 
# The longitude/latitude pair is computed as the average across postcodes belonging to them
# To get the data for these OAs, I call the API using one of the postcodes in them 
# (the first that the ONS postcode directory returns)

GETMISSING = True
if GETMISSING:

    print('Number of problematic OAs: ', len(missing_oas))
    # First, I need to load the ONS postcodes directory and group them by OAs
    ons_data = pd.read_csv(ons_pc_file)
    groups = ons_data.groupby('oa11')

    # get the postcode list for the missing OAs
    missing_oas_pcd = []
    for missing_oa in missing_oas:
        group = groups.get_group(missing_oa)
        missing_oas_pcd.append(list(group['pcd']))

# now call the LMI API using the first postcode
APICALL_missed = False
if APICALL_missed and GETMISSING:
    # for each OA, call the LMIforALL API:
    oa_distances2 = {}
    oa_occupations2 = {}
    oa_residents2 = {}

    # actually, there is no need to have the entire pre-stored list while making the new calls, only need to join 
    # the old and new dictionaries in the end.
    t0 = time.time()
    # loop through all OAs and call the LMIforALL API
    N = 49
    failed_oas = [[], []]
    for ii,oa in enumerate(missing_oas):
        # still skip Channel Island and Isle of Man
        if oa[0] in ['L', 'M']:
            continue
        # mean distances
        if oa == 'S00128975':
            ipcd = 1
        else:
            ipcd = 0
        #try:
        urlname = 'http://api.lmiforall.org.uk/api/v1/census/jobs_breakdown?area={}'.format(
        missing_oas_pcd[ii][ipcd].replace(' ',''))
        print(urlname)
        out = requests.get(urlname).json()
        oa_number_of_jobs2[oa] = out['totalJobs']
        out = out['jobsBreakdown']
        tmp = {}
        for itmp in out:
            # use the socGroup as the key (adding value or pecentage), so that then each SOC will become a column
            tmp[itmp['socGroup']+'_value'] = itmp['value']
            tmp[itmp['socGroup']+'_percentage'] = itmp['percentage']
            # at the same time, keep a list of names associated with socgroups
            jobs_socGroups[itmp['socGroup']] = itmp['description']
        oa_jobs_breakdown2[oa] = tmp
        if ii%(N+1) == N:
            print('Done with the last {} OAs. It took {:4f} s'.format(N+1, time.time() - t0))
            t0 = time.time()
#        except:
#            print(oa, out, )
#            failed_oas[0].append(oa)
#            failed_oas[1].append(out)

UPDATE = False
if UPDATE and APICALLmissed and GETMISSING:
    # now reload the  previously filled dictionary
    t0 = time.time()
    with open(save_oa_file, 'rb') as f:
        _,oa_number_of_jobs,oa_jobs_breakdown,jobs_socGroups,_,_ = pickle.load(f)
    M = len(oa_number_of_jobs)

    # update the dictionaries
    oa_number_of_jobs.update(oa_number_of_jobs2)
    oa_jobs_breakdown.update(oa_jobs_breakdown2)
    #jobs_socGroups2.update(jobs_socGroups)

    # save it
    with open(save_oa_file, 'wb') as f:
        pickle.dump((oa_data,oa_number_of_jobs,oa_jobs_breakdown,jobs_socGroups,start_oa,start_oa+ii),f)

    oa_number_of_jobs = None
    oa_jobs_breakdown = None
    print('Done. Time spent saving: {:2f} s'.format(time.time() - t0))

Number of problematic OAs:  28


  interactivity=interactivity, compiler=compiler, result=result)


KeyError: 'N12000001'

In [None]:
missing_oas_pcd


In [None]:
'''
Check memory usage of all variables
'''

import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') 
        and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)
