# Create a London postcode to borough/district mapping

## Data from [Doogal](https://www.doogal.co.uk/PostcodeDownloads)

In [1]:
import pandas as pd

In [2]:
# Location of the London postcode extract, relative to this notebook
LONDON_POSTCODE_CSV = "../../data/UK/london_postcode.csv"

# Read the postcode table and preview its first ten rows
boroughs = pd.read_csv(filepath_or_buffer=LONDON_POSTCODE_CSV)
boroughs.head(n=10)

Unnamed: 0,District,Postcode,Ward,District Code,London zone
0,Bromley,BR1 1AA,Bromley Town,E09000006,5
1,Bromley,BR1 1AB,Bromley Town,E09000006,4
2,Bromley,BR1 1AD,Bromley Town,E09000006,5
3,Bromley,BR1 1AE,Bromley Town,E09000006,4
4,Bromley,BR1 1AF,Bromley Town,E09000006,5
5,Bromley,BR1 1AG,Bromley Town,E09000006,5
6,Bromley,BR1 1AH,Bromley Town,E09000006,5
7,Bromley,BR1 1AJ,Bromley Town,E09000006,5
8,Bromley,BR1 1AL,Bromley Town,E09000006,4
9,Bromley,BR1 1AN,Bromley Town,E09000006,4


In [3]:
# Summarise the loaded table: row count, column dtypes and non-null counts
boroughs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328443 entries, 0 to 328442
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   District       328443 non-null  object
 1   Postcode       328443 non-null  object
 2   Ward           328443 non-null  object
 3   District Code  328443 non-null  object
 4   London zone    328443 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 12.5+ MB


In [4]:
# Pair each row's District with the leading part of its postcode, i.e. the
# text before the first space (e.g. 'BR1' from 'BR1 1AA').
#
# The new column is attached with `assign` on a column selection, so the
# result is an independent two-column frame and `boroughs` itself is never
# modified: earlier displays of `boroughs` stay accurate and re-running this
# cell always yields the same result.
postcode_district = boroughs[['District']].assign(
    postcode_area=boroughs['Postcode'].str.split(' ').str[0]
)

In [5]:
# Gather the distinct postcode_area values once, then show the list and its size
area_codes = postcode_district['postcode_area'].unique().tolist()
print(area_codes)
print(f"Count: {len(area_codes)}")

['BR1', 'BR2', 'BR3', 'BR4', 'BR5', 'BR6', 'BR7', 'BR8', 'CM13', 'CM14', 'CM23', 'CR0', 'CR2', 'CR3', 'CR4', 'CR44', 'CR5', 'CR6', 'CR7', 'CR8', 'CR9', 'CR90', 'DA1', 'DA14', 'DA15', 'DA16', 'DA17', 'DA18', 'DA5', 'DA6', 'DA7', 'DA8', 'E1', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E1W', 'E2', 'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E77', 'E8', 'E9', 'E98', 'EC1A', 'EC1M', 'EC1N', 'EC1P', 'EC1R', 'EC1V', 'EC1Y', 'EC2A', 'EC2M', 'EC2N', 'EC2P', 'EC2R', 'EC2V', 'EC2Y', 'EC3A', 'EC3B', 'EC3M', 'EC3N', 'EC3P', 'EC3R', 'EC3V', 'EC4A', 'EC4M', 'EC4N', 'EC4P', 'EC4R', 'EC4V', 'EC4Y', 'EC50', 'EC88', 'EN1', 'EN2', 'EN3', 'EN4', 'EN5', 'EN6', 'EN7', 'EN8', 'EN9', 'HA0', 'HA1', 'HA2', 'HA3', 'HA4', 'HA5', 'HA6', 'HA7', 'HA8', 'HA9', 'IG1', 'IG11', 'IG2', 'IG3', 'IG4', 'IG5', 'IG6', 'IG7', 'IG8', 'IG9', 'KT1', 'KT17', 'KT18', 'KT19', 'KT2', 'KT22', 'KT3', 'KT4', 'KT5', 'KT6', 'KT7', 'KT8', 'KT9', 'N1', 'N10', 'N11', 'N12', 'N13', 'N14', 'N15', 'N16', 'N17', 'N18', 'N19', 'N1C'

Some `postcode_area` values are associated with more than one `District`. This is unsurprising for a city such as London, with its many historically overlapping local authorities.

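We use the `mode()` function to assign each postcode to the district that appears most frequently within its `postcode_area`. If several districts are tied for most frequent, the first value returned by `mode()` (which sorts its results) is used.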

In [6]:
# Majority vote per postcode_area: find the most frequent District in each
# group (mode() can return several values; the first one is taken) ...
top_district = (
    postcode_district
    .groupby('postcode_area')['District']
    .agg(lambda districts: districts.mode().iloc[0])
)

# ... then give every row the winning District of its postcode_area.
# `assign` returns a new frame, so `postcode_district` is left as it was.
postcode_district_clean = postcode_district.assign(
    District=postcode_district['postcode_area'].map(top_district)
)

In [7]:
# Confirm the cleaned frame still has every row and only the two expected columns
postcode_district_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328443 entries, 0 to 328442
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   District       328443 non-null  object
 1   postcode_area  328443 non-null  object
dtypes: object(2)
memory usage: 5.0+ MB


Check whether any discrepancies remain. We build a boolean Series that flags every `postcode_area` still associated with more than one unique `District`; an empty result means each area now maps to exactly one district.

In [8]:
# Number of distinct District values recorded for each postcode_area
districts_per_area = (
    postcode_district_clean
    .groupby('postcode_area')['District']
    .nunique()
)

# True where an area still has more than one District
grouped = districts_per_area.gt(1)

# Keep only the flagged areas and summarise them
discrepancies = grouped.loc[grouped]
discrepancies.info()

<class 'pandas.core.series.Series'>
Index: 0 entries
Series name: District
Non-Null Count  Dtype
--------------  -----
0 non-null      bool 
dtypes: bool(1)
memory usage: 0.0+ bytes


In [9]:
# Collapse to one row per postcode_area, keeping the first occurrence of each
postcode_district = postcode_district_clean.drop_duplicates(
    subset=['postcode_area'],
    keep='first',
)

In [10]:
# List the distinct District values in the final mapping and count them
borough_names = postcode_district['District'].unique().tolist()
print(f"Boroughs:\n{borough_names}"
      f"\nTotal = {len(borough_names)}")

Boroughs:
['Bromley', 'Havering', 'Camden', 'Croydon', 'Merton', 'Sutton', 'Bexley', 'Tower Hamlets', 'Waltham Forest', 'Newham', 'Redbridge', 'Hackney', 'City of London', 'Islington', 'Enfield', 'Barnet', 'Brent', 'Harrow', 'Hillingdon', 'Barking and Dagenham', 'Kingston upon Thames', 'Richmond upon Thames', 'Haringey', 'Westminster', 'Southwark', 'Greenwich', 'Lambeth', 'Lewisham', 'Kensington and Chelsea', 'Wandsworth', 'Hammersmith and Fulham', 'Hounslow', 'Ealing']
Total = 33


## We now export this data

In [11]:
# Destination of the postcode_area -> District lookup, relative to this notebook
POSTCODE_DISTRICT_CSV = '../../data/UK/postcode_district.csv'

# Write the mapping without the DataFrame's row index
postcode_district.to_csv(path_or_buf=POSTCODE_DISTRICT_CSV, index=False)