### General

Column mapping used for the merges (GeoNames postal-code column = Crunchbase column):

+ `admin_name1` = `region`
+ `admin_name2` = `city`
+ `place_name` = `city`

In [59]:
import pandas as pd
import warnings
from unidecode import unidecode

# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides pandas warnings (e.g. about dtype inference or
# chained assignment) that the cells below may raise -- consider narrowing it.
warnings.filterwarnings("ignore")

**ISO Codes**

In [60]:
# Country processed by this notebook: the two-letter code filters the
# postal-code table, the three-letter code filters the Crunchbase table and
# names the output file.
iso2, iso3 = 'AU', 'AUS'

**Crunchbase Dataframe**

In [61]:
# Load the Crunchbase export and keep only the location columns of the
# organisations whose country matches `iso3`.
location_columns = ['uuid', 'country_code', 'state_code', 'region', 'city', 'address', 'postal_code']

# Read `postal_code` as text: letting pandas infer the dtype can strip leading
# zeros and, on large files, leave a mix of ints and strings in one column.
df = pd.read_csv('input/foodtech.csv', dtype={'postal_code': str})

# .copy() makes `df` an independent frame, so the column assignments in the
# following cells modify it directly instead of a slice of the raw table.
df = df.loc[df['country_code'] == iso3, location_columns].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4821 entries, 122 to 163764
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   uuid          4821 non-null   object
 1   country_code  4821 non-null   object
 2   state_code    0 non-null      object
 3   region        4821 non-null   object
 4   city          4821 non-null   object
 5   address       3823 non-null   object
 6   postal_code   3827 non-null   object
dtypes: object(7)
memory usage: 301.3+ KB


*Reformatting*

In [62]:
# Normalise the merge keys: transliterate to ASCII, then lower-case.
# Non-string cells (e.g. NaN for a missing city) are passed through untouched
# instead of crashing inside unidecode, matching the guarded handling of the
# postal-code table later in the notebook.
for key_column in ['city', 'region']:
    df[key_column] = df[key_column].map(
        lambda value: unidecode(value).lower() if isinstance(value, str) else value
    )

**Postal Codes**

In [80]:
# Read the postal-code table from input/geonames.json and keep only the rows
# whose country code equals `iso2`.
codes = pd.read_json('input/geonames.json')
codes = codes.loc[codes['country_code'].eq(iso2)]

In [64]:
# Transliterate the three place-name columns to ASCII so they can be compared
# with the transliterated Crunchbase `city` / `region` values.
# Only real strings are converted; None and NaN are returned unchanged
# (an `is not None` check alone would pass NaN to unidecode, which expects text).
for name_column in ['admin_name1', 'admin_name2', 'place_name']:
    codes[name_column] = codes[name_column].map(
        lambda value: unidecode(value) if isinstance(value, str) else value
    )

`admin_name2`

In [65]:
# Independent working table for the `admin_name2` lookup: postal code plus the
# two administrative-name columns used as merge keys.
codes1 = codes.loc[:, ['postal_code', 'admin_name1', 'admin_name2']].copy()

In [66]:
# Lower-case both key columns so they compare equal to the lower-cased
# Crunchbase `region` / `city` values.
for key_column in ('admin_name1', 'admin_name2'):
    codes1[key_column] = codes1[key_column].str.lower()

# One row per (admin_name1, admin_name2) pair, so the left merge below cannot
# multiply Crunchbase rows; the first postal code seen for a pair is the one kept.
codes1 = codes1.drop_duplicates(subset=['admin_name1', 'admin_name2'])

In [67]:
# Left-join the lookup onto the Crunchbase rows, matching city -> admin_name2
# and region -> admin_name1. Both tables carry `postal_code`, so the result has
# `postal_code_x` (Crunchbase) and `postal_code_y` (lookup).
# Afterwards keep a single row per organisation.
merged_df = (
    df.merge(
        codes1,
        how='left',
        left_on=['city', 'region'],
        right_on=['admin_name2', 'admin_name1'],
    )
    .drop_duplicates(subset=['uuid'])
)

In [68]:
# Column summary after the admin_name2 merge; the non-null count of
# `postal_code_y` shows how many rows found a match.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4821 entries, 0 to 4820
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           4821 non-null   object
 1   country_code   4821 non-null   object
 2   state_code     0 non-null      object
 3   region         4821 non-null   object
 4   city           4821 non-null   object
 5   address        3823 non-null   object
 6   postal_code_x  3827 non-null   object
 7   postal_code_y  238 non-null    object
 8   admin_name1    238 non-null    object
 9   admin_name2    238 non-null    object
dtypes: object(10)
memory usage: 414.3+ KB


`place_name`

In [69]:
# Second lookup table, keyed on `place_name` instead of `admin_name2`.
codes2 = codes.loc[:, ['postal_code', 'admin_name1', 'place_name']].copy()

# Lower-case the key columns to match the lower-cased Crunchbase values.
for key_column in ('admin_name1', 'place_name'):
    codes2[key_column] = codes2[key_column].str.lower()

# One row per (admin_name1, place_name) pair so the left merge stays one-to-one
# on the Crunchbase side; the first postal code seen for a pair is kept.
codes2 = codes2.drop_duplicates(subset=['admin_name1', 'place_name'])


In [70]:
# Second left join, this time matching city -> place_name and
# region -> admin_name1. `admin_name1` now exists on both sides and is
# suffixed _x/_y; the lookup's `postal_code` keeps its plain name because the
# left frame only has `postal_code_x` / `postal_code_y`.
merged_df = merged_df.merge(
    codes2,
    how='left',
    left_on=['city', 'region'],
    right_on=['place_name', 'admin_name1'],
)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4821 entries, 0 to 4820
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           4821 non-null   object
 1   country_code   4821 non-null   object
 2   state_code     0 non-null      object
 3   region         4821 non-null   object
 4   city           4821 non-null   object
 5   address        3823 non-null   object
 6   postal_code_x  3827 non-null   object
 7   postal_code_y  238 non-null    object
 8   admin_name1_x  238 non-null    object
 9   admin_name2    238 non-null    object
 10  postal_code    4652 non-null   object
 11  admin_name1_y  4652 non-null   object
 12  place_name     4652 non-null   object
dtypes: object(13)
memory usage: 527.3+ KB


**Filling NaNs with Existing Postal Codes**

In [71]:
# Build the filled postal code in `postal_code_y`, in priority order:
#   1. the admin_name2 match (already in `postal_code_y`),
#   2. the place_name match (`postal_code`),
#   3. the postal code Crunchbase itself provided (`postal_code_x`).
# The result is assigned back explicitly: calling fillna(..., inplace=True) on
# `merged_df[col]` is chained assignment, which newer pandas versions warn
# about and, with Copy-on-Write, no longer propagate to `merged_df`.
merged_df['postal_code_y'] = (
    merged_df['postal_code_y']
    .fillna(merged_df['postal_code'])
    .fillna(merged_df['postal_code_x'])
)

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4821 entries, 0 to 4820
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           4821 non-null   object
 1   country_code   4821 non-null   object
 2   state_code     0 non-null      object
 3   region         4821 non-null   object
 4   city           4821 non-null   object
 5   address        3823 non-null   object
 6   postal_code_x  3827 non-null   object
 7   postal_code_y  4806 non-null   object
 8   admin_name1_x  238 non-null    object
 9   admin_name2    238 non-null    object
 10  postal_code    4652 non-null   object
 11  admin_name1_y  4652 non-null   object
 12  place_name     4652 non-null   object
dtypes: object(13)
memory usage: 527.3+ KB


*Across rows: reuse postal codes that other `df` rows already have for the same city and region*

In [72]:
# Remove the lookup helper columns added by the two merges, keeping the
# Crunchbase columns plus `postal_code_y`.
# They are dropped by name rather than by position ("the last five columns"),
# so re-running this cell is a no-op instead of stripping five more columns.
helper_columns = ['admin_name1_x', 'admin_name2', 'postal_code', 'admin_name1_y', 'place_name']
merged_df = merged_df.drop(columns=helper_columns, errors='ignore')

In [73]:
# Lookup built from the Crunchbase data itself: for every (region, city) pair
# that has at least one row with a postal code, keep the first such code.
has_postal_code = df['postal_code'].notna()
codes_df = (
    df.loc[has_postal_code, ['region', 'city', 'postal_code']]
    .drop_duplicates(subset=['region', 'city'])
)

In [74]:
# Attach the per-(city, region) postal code taken from other Crunchbase rows;
# it arrives as a new `postal_code` column.
merged_df = pd.merge(merged_df, codes_df, on=['city', 'region'], how='left')

# Use it only where `postal_code_y` is still empty. The result is assigned
# back because fillna(..., inplace=True) on a selected column is chained
# assignment: newer pandas warns about it and, with Copy-on-Write, leaves
# `merged_df` unchanged.
merged_df['postal_code_y'] = merged_df['postal_code_y'].fillna(merged_df['postal_code'])

In [75]:
# Column summary after the final fill; compare the non-null counts of
# `postal_code_x` (original) and `postal_code_y` (filled).
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4821 entries, 0 to 4820
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           4821 non-null   object
 1   country_code   4821 non-null   object
 2   state_code     0 non-null      object
 3   region         4821 non-null   object
 4   city           4821 non-null   object
 5   address        3823 non-null   object
 6   postal_code_x  3827 non-null   object
 7   postal_code_y  4815 non-null   object
 8   postal_code    4705 non-null   object
dtypes: object(9)
memory usage: 376.6+ KB


*Checking NaNs*

In [76]:
# Rows that still have no postal code after every fill step, counted per region.
still_missing = merged_df['postal_code_y'].isna()
nan_df = merged_df.loc[still_missing]
nan_df['region'].value_counts()

queensland         2
victoria           2
new south wales    2
Name: region, dtype: int64

*Save to csv*

In [77]:
# Give the two postal-code columns descriptive names (original Crunchbase value
# vs. filled value) and reduce the frame to the columns that are exported.
output_names = {'postal_code_x': 'pc_crunchbase', 'postal_code_y': 'pc_filled'}
merged_df = (
    merged_df
    .rename(columns=output_names)
    .loc[:, ['uuid', 'pc_crunchbase', 'pc_filled']]
)



In [78]:
# Write the result to general/<iso3>_processed.csv. The DataFrame index is
# written as the first, unnamed column (to_csv default).
output_path = f'general/{iso3}_processed.csv'
merged_df.to_csv(output_path)