# London Start-up Map

This notebook contains the code used for the analysis in <a href="https://medium.com/@quantscoop/london-animated-start-up-map-2011-2020-3ae5a709edf9">this blog post</a>.

The end result is a map of London that displays relative start-up formation vs cessation:

<img src="animation.gif"/>

### Packages

In [2]:
%%capture
!pip install pgeocode
!pip install plotly-express
!pip install numpy
!pip install matplotlib
!pip install pandas
!pip install imageio
!pip install visvis

In [3]:
%%capture
import pandas as pd
import datetime
from datetime import date
from dateutil.relativedelta import relativedelta
import pgeocode
import json
import numpy as np
import plotly.express as px
import copy
from matplotlib import cm
import matplotlib.pyplot as plt
import urllib.request
import requests
import string
import time
import pickle
import random
import io
import os
import sys
import collections
import imageio
import glob
import visvis as vv

## Downloading London Postcode GEO-code JSON

We will download the JSON file for the geographical boundaries of London postcodes from <a href="https://github.com/sjwhitworth/london_geojson/blob/master/london_postcodes.json">here</a>.

In [4]:
# Fetch the London postcode boundaries (GeoJSON) and load them into `data`.
london_geojson_url = "https://raw.githubusercontent.com/sjwhitworth/london_geojson/master/london_postcodes.json"
# Reuse a previously downloaded copy so that re-running the notebook does not
# hit the network again; delete the local file to force a fresh download.
if not os.path.exists("london_postcodes.json"):
    urllib.request.urlretrieve(london_geojson_url,"london_postcodes.json")
# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open("london_postcodes.json","r",encoding="utf-8") as file:
    data = json.load(file)

## Scraping Companies House Data

We will now scrape the details of London-based companies from the Companies House API.

You will need an API key, which you can set up <a href="https://developer.companieshouse.gov.uk/api/docs/">here</a>.

In [5]:
%%capture
api_key = "api_key"
url = "https://api.companieshouse.gov.uk/company/"
request_counter =  0
attempted_company_codes = []
scrape_start = 7500000
scrape_stop = 12480000
scraping_attempts = int(0) # Change to a larger number e.g 10000
current_company_id = 0
if (os.path.exists("companies_house_data.txt")):
    dataframe = pd.read_table("companies_house_data.txt",delimiter="\t")
else:
    dataframe = pd.DataFrame(columns=['number','formed','active_until','postcode'])
    dataframe.set_index('number')
for counter in range(scraping_attempts):
    while (current_company_id in attempted_company_codes):
        current_company_id = random.randint(scrape_start,scrape_stop)
    attempted_company_codes.append(current_company_id)
    str_company_id = str(current_company_id).zfill(8)
    request_result = requests.get(url = url + str_company_id, auth = (api_key,""))
    request_counter +=1
    json_fail = False
    json_result = request_result.json()
    try:
        sys.stdout = io.StringIO()
        print(json_result)
        sys.stdout = sys.__stdout__
    except:
        json_fail = True
    # Sleep when requests close to rate limit.
    if (request_counter > 550):
        request_counter = 0
        time.sleep(300)
        print("sleeping for 5 min")
    print("Counter: {}".format(counter))
    contains_registered_office = 'registered_office_address' in json_result
    is_situated_in_london = contains_registered_office and \
    'locality' in json_result['registered_office_address'] \
    and 'London' in json_result['registered_office_address']['locality']
    contains_company_status = 'company_status' in json_result
    contains_errors = 'errors' in json_result
    contains_postal_code = contains_registered_office and 'postal_code' in json_result['registered_office_address']
    is_error_free = json_fail == False and contains_errors == False
    if (is_error_free and contains_postal_code and is_situated_in_london and contains_company_status):
        cur_dict = {}
        cur_dict['number'] = json_result['company_number']
        cur_dict['formed'] = json_result['date_of_creation']
        cur_dict['active_until'] = json_result['date_of_cessation'] if 'date_of_cessation' in json_result else "2020-12-01"
        cur_dict['postcode'] = cessation,json_result['registered_office_address']['postal_code']
        dataframe.append(cur_dict,ignore_index=True)

Aggregate the set of existing postcodes in the JSON and apply reformatting for further down the line:

In [6]:
# Collect every postcode district named in the GeoJSON and copy that name into
# each feature's top-level 'id' field.
postcodes_extant = set()
for feature in data['features']:
    district_name = feature['properties']['Name']
    feature['id'] = district_name
    postcodes_extant.add(district_name)

Next, to compare the Companies House company postcode data vs. the London postcodes, we will use this utility function:

In [7]:
def get_closest_postcode(postcode, postcodes_list):
    """Map a full postcode onto the best-matching entry of ``postcodes_list``.

    If the postcode contains a space and the part before it is itself in
    ``postcodes_list``, that part is returned. Otherwise all whitespace is
    removed and the longest entry of ``postcodes_list`` that is a prefix of
    the result is returned.

    Args:
        postcode: Postcode string, e.g. ``"E1 6AN"``.
        postcodes_list: Iterable of known postcode strings, e.g. ``["E1", "E16"]``.

    Returns:
        The matching entry, or ``""`` when nothing matches (including an
        empty or whitespace-only ``postcode``).
    """
    tokens = postcode.split()
    # An empty or whitespace-only postcode has no first token to look up.
    if not tokens:
        return ""
    if " " in postcode and tokens[0] in postcodes_list:
        return tokens[0]
    compact = ''.join(tokens)
    prefixes = (pc for pc in postcodes_list if compact.startswith(pc))
    return max(prefixes, key=len, default="")

# Replace every company's postcode with the matching postcode from the GeoJSON
# ("" when none matches); postcodes are upper-cased before matching.
postcodes_list = list(postcodes_extant)
matched_postcodes = []
for raw_postcode in dataframe['postcode'].values.tolist():
    matched_postcodes.append(get_closest_postcode(raw_postcode.upper(), postcodes_list))
dataframe['postcode'] = matched_postcodes

Now, let's convert the company creation and cessation dates into Python `datetime.date` objects, truncated to the first day of the month:

In [8]:
def _month_start(iso_date):
    """Return the first day of the month of a 'YYYY-MM-...' date string."""
    parts = iso_date.split("-")
    return datetime.date(int(parts[0]), int(parts[1]), 1)

# dt1 = month the company was formed, dt2 = month it stopped being active.
dataframe['dt1'] = [_month_start(formed) for formed in dataframe['formed'].values.tolist()]
dataframe['dt2'] = [_month_start(ended) for ended in dataframe['active_until'].values.tolist()]

# The analysed window runs from the first to the last formation month.
formation_months = dataframe['dt1'].tolist()
earliest = min(formation_months)
latest = max(formation_months)

And another utility function that returns the number of months between two dates:

In [9]:
def get_months_dif(d1,d2):
    """Return the number of calendar months from d2 to d1 (days are ignored).

    The result is negative when d1 falls in an earlier month than d2.
    """
    year_gap = d1.year - d2.year
    month_gap = d1.month - d2.month
    return month_gap + 12*year_gap

Now, let's create a dictionary that maps each postcode to the number of companies
active in each month (represented as a numpy array with one entry per month):

In [10]:
# Build, for every postcode, an array holding the number of active companies in
# each month of the window [earliest, latest] (index 0 == `earliest`).
total_months = get_months_dif(latest,earliest)
postcode_dict = collections.defaultdict(lambda:np.zeros((total_months+1)))
total = np.zeros((total_months+1))   # active companies per month, all postcodes
total2 = np.zeros((total_months+1))  # companies formed per month, all postcodes
for index,row in dataframe.iterrows():
    start_months = get_months_dif(row['dt1'],earliest)
    # Cessation dates after the window are clipped to its last month.
    end_months = get_months_dif(min(row['dt2'],latest),earliest)
    # 1.0 for every month the company was active (the end month is excluded).
    matrix = np.zeros((total_months+1))
    matrix[start_months:end_months] = 1.0
    postcode_dict[row['postcode']] += matrix
    total += matrix
    total2[start_months] += 1

postcode_vals = postcode_dict.values()

# Largest monthly count over all postcodes. The square root is taken once, at
# the end, so that `max_val` matches the sqrt-transformed values that are plotted.
max_val = 0
for i in postcode_vals:
    max_val = max(max_val,np.max(i,axis=0))
#We will take the square root of the number of active companies for visual purposes
max_val = np.power(max_val,0.5)

Now let's create, for each month, a table mapping postcodes to the square root of their number of active companies:

In [11]:
# One DataFrame per month with columns 'id' (postcode) and 'val' (square root of
# the number of active companies in that postcode during the month).
postcode_ids = list(postcode_dict.keys())
plotting_values = []
for month_index in range(total_months):
    month_values = [float(np.power(counts[month_index],0.5)) for counts in postcode_dict.values()]
    month_frame = pd.DataFrame.from_dict({'id': list(postcode_ids), 'val': month_values})
    plotting_values.append(month_frame)

And finally, we will plot each month of the start-up map using the plotly express choropleth map,
and save them as PNGs, which are then combined with imageio into an animated .gif image.

In [13]:
# Render one choropleth PNG per month and stitch the PNGs into animation.gif.

# `plt.get_cmap` is used instead of `cm.get_cmap`, which recent Matplotlib
# releases no longer provide.
viridis = plt.get_cmap('viridis', int(max_val)+1)

def get_col(viridis_val):
    """Format an RGB(A) sequence with components in [0, 1] as an 'rgb(r,g,b)' string."""
    return 'rgb({},{},{})'.format(int(viridis_val[0]*255),int(viridis_val[1]*255),int(viridis_val[2]*255))

# Viridis lookup tables keyed by integer value. They are not passed to the
# figures below, which use the "RdYlGn" scale.
colorscale = {}
colorscale2 = ((0.0,get_col(viridis.colors[0])),)

for index in range(1,int(max_val)+1):
    colorscale[float(index)] = get_col(viridis.colors[index])
    colorscale2 += ((float(index),get_col(viridis.colors[index])),)

# `write_image` does not create missing directories.
os.makedirs("./images_output", exist_ok=True)

for index in range(total_months):
    day = earliest + relativedelta(months=index)
    fig = px.choropleth_mapbox(plotting_values[index], geojson=data, locations='id', color='val',
                               color_continuous_scale ="RdYlGn",
                               range_color=(0, float(max_val)),
                               mapbox_style='carto-positron',
                               zoom=10, center = {"lat": 51.5073, "lon": -0.1277},
                               opacity=0.1,
                               labels={'val':'(sqrt) Net Active'}
                              )
    fig.update_layout(title="   London: (sqrt of) Active Companies per Month, {}-{}".format(day.year,day.month), title_font_size=24)
    # Pin the colour axis so that every frame shares the same scale.
    fig.layout.coloraxis.autocolorscale= False
    fig.layout.coloraxis.cauto= False
    fig.layout.coloraxis.cmin= 0.0
    fig.layout.coloraxis.cmax= float(max_val)
    # The month is written as two separate digits so that the file names sort
    # chronologically.
    fig.write_image("./images_output/image_{}_{}_{}.png".format(day.year,day.month//10,day.month%10))

images = []
imageio.plugins.freeimage.download()
for filename in sorted(glob.glob("./images_output/*.png")):
    # Each frame is added twice; the file is decoded only once.
    frame = imageio.imread(filename)
    images.extend([frame, frame])
imageio.mimsave('animation.gif', images, 'GIF-FI')