### GBR

+ `admin_name1` is not used for matching in this notebook
+ `admin_name2` = `region`
+ `place_name` = `city`

In [38]:
import pandas as pd
import warnings
from unidecode import unidecode

# Silence every warning for the rest of the session. NOTE(review): this also hides any
# warnings pandas raises in the cells below, so problems there will not be visible.
warnings.filterwarnings("ignore")

**Crunchbase Dataframe**

In [39]:
# Read the Crunchbase export and keep only the rows whose country_code is 'GBR',
# restricted to the identifier and location columns used by the rest of the notebook.
location_columns = ['uuid', 'country_code', 'state_code', 'region', 'city', 'address', 'postal_code']
df = pd.read_csv('input/foodtech.csv').loc[
    lambda frame: frame['country_code'] == 'GBR',
    location_columns,
]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10825 entries, 12 to 163812
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   uuid          10825 non-null  object
 1   country_code  10825 non-null  object
 2   state_code    0 non-null      object
 3   region        10825 non-null  object
 4   city          10825 non-null  object
 5   address       8500 non-null   object
 6   postal_code   8439 non-null   object
dtypes: object(7)
memory usage: 676.6+ KB


*Reformatting*

In [40]:
# Collapse every region whose name contains "yorkshire" (case-insensitive) into 'York'.
is_yorkshire_region = df['region'].str.contains('yorkshire', case=False)
df.loc[is_yorkshire_region, 'region'] = 'York'

**Postal Codes**

In [41]:
# Load the postal-code table and keep the 'GB' rows with the three columns
# needed for the lookup: the postal code plus the two name columns used as merge keys.
codes = pd.read_json('input/geonames.json')

codes = codes[codes['country_code'] == 'GB']
codes = codes[['postal_code', 'place_name', 'admin_name2']]

# Keep the first postal code seen for each (place_name, admin_name2) pair.
# A further de-duplication on (postal_code, place_name, admin_name2) cannot drop
# anything once these pairs are unique, so that redundant step has been removed.
# Reassigning (rather than inplace=True) avoids mutating a filtered slice in place.
codes = codes.drop_duplicates(subset=['place_name', 'admin_name2'])

In [42]:
# Summarise the de-duplicated lookup table (row count, dtypes, non-null counts per column).
codes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26419 entries, 6397 to 1508766
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   postal_code  26419 non-null  object
 1   place_name   26419 non-null  object
 2   admin_name2  26352 non-null  object
dtypes: object(3)
memory usage: 825.6+ KB


*Matching formatting*

In [43]:
# Rewrite selected admin_name2 / place_name values so they line up with the
# Crunchbase `region` / `city` values they are merged against further down.
codes.loc[codes['admin_name2'] == 'Nottinghamshire', 'admin_name2'] = 'Nottingham'
codes.loc[codes['admin_name2'] == 'Greater Manchester', 'admin_name2'] = 'Manchester'
codes.loc[codes['place_name'] == 'Bath', 'place_name'] = 'Bath and North East Somerset'

# Greater London rows become place_name='London', admin_name2='England'.
# The mask must be computed ONCE, before admin_name2 is overwritten: re-testing
# admin_name2 == 'Greater London' after the first assignment matches no rows, which
# previously left place_name untouched.
london_mask = codes['admin_name2'] == 'Greater London'
codes.loc[london_mask, 'admin_name2'] = 'England'
codes.loc[london_mask, 'place_name'] = 'London'

# Places whose admin_name2 is forced to a fixed value regardless of its current content.
mask = codes['place_name'] == 'Derby'
codes.loc[mask, 'admin_name2'] = 'Derby'

# Aberdeen: set admin_name2 first (while place_name still reads 'Aberdeen City'),
# then shorten the place name using the same mask.
mask = codes['place_name'] == 'Aberdeen City'
codes.loc[mask, 'admin_name2'] = 'Aberdeen City'
codes.loc[mask, 'place_name'] = 'Aberdeen'

codes.loc[codes['place_name'] == 'Edinburgh', 'admin_name2'] = 'Edinburgh, City of'

# Only admin_name2 changes for Bristol; place_name already reads 'Bristol'.
codes.loc[codes['place_name'] == 'Bristol', 'admin_name2'] = 'Bristol, City of'

# Birmingham is adjusted on both sides: the lookup table gets admin_name2='Birmingham',
# and Crunchbase rows in the 'Birmingham' region get city='Birmingham'.
codes.loc[codes['place_name'] == 'Birmingham', 'admin_name2'] = 'Birmingham'
df.loc[df['region'] == 'Birmingham', 'city'] = 'Birmingham'

codes.loc[codes['admin_name2'] == 'County Antrim', 'admin_name2'] = 'Antrim'


In [44]:
# Replace missing admin_name2 values with empty strings before the substring test,
# then collapse every value containing "yorkshire" (case-insensitive) into 'York'.
admin_filled = codes['admin_name2'].fillna('')
codes['admin_name2'] = admin_filled
is_yorkshire_admin = admin_filled.str.contains('yorkshire', case=False)
codes.loc[is_yorkshire_admin, 'admin_name2'] = 'York'

In [45]:
# Keep a single row (the first encountered) per (place_name, admin_name2) pair so that
# each pair maps to exactly one postal code in the lookup table.
codes = codes.drop_duplicates(subset=['place_name', 'admin_name2'], keep='first')

*Merging*

In [46]:
# Left-join the Crunchbase rows to the postal-code lookup on (city, region) versus
# (place_name, admin_name2); rows without a match keep NaN in the lookup columns.
merged_df = df.merge(
    codes,
    how='left',
    left_on=['city', 'region'],
    right_on=['place_name', 'admin_name2'],
)
# Drop any repeated (uuid, place_name, admin_name2) combination, keeping the first.
merged_df = merged_df.drop_duplicates(subset=['uuid', 'place_name', 'admin_name2'])


In [47]:
# Inspect the merge result: postal_code_x comes from the left frame (df) and
# postal_code_y from the right frame (codes).
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10825 entries, 0 to 10824
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           10825 non-null  object
 1   country_code   10825 non-null  object
 2   state_code     0 non-null      object
 3   region         10825 non-null  object
 4   city           10825 non-null  object
 5   address        8500 non-null   object
 6   postal_code_x  8439 non-null   object
 7   postal_code_y  7970 non-null   object
 8   place_name     7970 non-null   object
 9   admin_name2    7970 non-null   object
dtypes: object(10)
memory usage: 930.3+ KB


**Filling NaNs with Existing Postal Codes**

In [48]:
# Where the lookup produced no postal code (postal_code_y is NaN), fall back to the
# postal code already present on the Crunchbase row (postal_code_x).
# The result is assigned back to the column: calling fillna(..., inplace=True) on a
# column selection is a chained in-place update, which leaves merged_df unchanged when
# pandas Copy-on-Write is active.
merged_df['postal_code_y'] = merged_df['postal_code_y'].fillna(merged_df['postal_code_x'])

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10825 entries, 0 to 10824
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           10825 non-null  object
 1   country_code   10825 non-null  object
 2   state_code     0 non-null      object
 3   region         10825 non-null  object
 4   city           10825 non-null  object
 5   address        8500 non-null   object
 6   postal_code_x  8439 non-null   object
 7   postal_code_y  10304 non-null  object
 8   place_name     7970 non-null   object
 9   admin_name2    7970 non-null   object
dtypes: object(10)
memory usage: 930.3+ KB


*Across rows: fill the remaining NaNs from other Crunchbase rows (`df`) that share the same city and region*

In [49]:
# Remove the two lookup key columns brought in by the merge; they are no longer needed.
# Dropping by name (rather than slicing off the last two columns by position) keeps this
# cell safe to re-run: a second execution is a no-op instead of silently stripping the
# postal-code columns.
merged_df = merged_df.drop(columns=['place_name', 'admin_name2'], errors='ignore')

In [50]:
# Build a second lookup from the Crunchbase data itself: for every (region, city) pair
# that has at least one row with a postal code, keep the first such postal code.
has_postal_code = df['postal_code'].notna()
codes_df = (
    df.loc[has_postal_code, ['region', 'city', 'postal_code']]
    .drop_duplicates(subset=['region', 'city'])
)

In [51]:
# Attach the per-(city, region) postal code taken from other Crunchbase rows; the key
# columns have the same names on both sides, so a plain `on=` is sufficient.
merged_df = pd.merge(merged_df, codes_df, on=['city', 'region'], how='left')
# Use it only where postal_code_y is still missing. Assign the result back instead of
# calling fillna(..., inplace=True) on a column selection, which leaves merged_df
# unchanged when pandas Copy-on-Write is active.
merged_df['postal_code_y'] = merged_df['postal_code_y'].fillna(merged_df['postal_code'])

In [52]:
# Check how many postal_code_y values are populated after the second fill pass.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10825 entries, 0 to 10824
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           10825 non-null  object
 1   country_code   10825 non-null  object
 2   state_code     0 non-null      object
 3   region         10825 non-null  object
 4   city           10825 non-null  object
 5   address        8500 non-null   object
 6   postal_code_x  8439 non-null   object
 7   postal_code_y  10738 non-null  object
 8   postal_code    10627 non-null  object
dtypes: object(9)
memory usage: 845.7+ KB


*Checking NaNs* 

In [53]:
# Rows that still have no filled postal code, tallied by region.
still_missing = merged_df['postal_code_y'].isna()
nan_df = merged_df.loc[still_missing]
nan_df['region'].value_counts()

Nottinghamshire          4
Hertford                 4
Kirklees                 4
Scottish Borders, The    3
Essex                    3
                        ..
Wirral                   1
Central Bedfordshire     1
Dorset                   1
Cumbria                  1
Leeds                    1
Name: region, Length: 61, dtype: int64

*Save to csv*

In [54]:
# Give the two postal-code columns their output names, then keep only the identifier
# and those two columns for export.
output_names = {'postal_code_x': 'pc_crunchbase', 'postal_code_y': 'pc_filled'}
merged_df = merged_df.rename(columns=output_names)[['uuid', 'pc_crunchbase', 'pc_filled']]

In [55]:
# Write the result to disk. index=False is not passed, so the DataFrame index is
# written as an extra leading column (pandas' default for to_csv).
merged_df.to_csv('processed/GBR_processed.csv')