### CAN

+ GeoNames `admin_name1` corresponds to Crunchbase `region`
+ GeoNames `admin_name2` corresponds to Crunchbase `city`

In [23]:
import pandas as pd
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas FutureWarning / SettingWithCopyWarning
# messages that may be raised by the cells below — consider narrowing the filter.
warnings.filterwarnings("ignore")

**Crunchbase Dataframe**

In [24]:
# Read the Crunchbase export, then keep only the rows whose country code is
# 'CAN' and only the identifier/location columns used in this notebook.
df = pd.read_csv('input/foodtech.csv')
df = df.loc[
    df['country_code'] == 'CAN',
    ['uuid', 'country_code', 'state_code', 'region', 'city', 'address', 'postal_code'],
]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6379 entries, 1 to 163780
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   uuid          6379 non-null   object
 1   country_code  6379 non-null   object
 2   state_code    6379 non-null   object
 3   region        6379 non-null   object
 4   city          6379 non-null   object
 5   address       4774 non-null   object
 6   postal_code   4617 non-null   object
dtypes: object(7)
memory usage: 398.7+ KB


*Reformatting*

In [25]:
# Rewrite selected Crunchbase city names before they are used as a join key.
# `regex=True` is passed explicitly: the patterns rely on `\b`, and pandas 2.0
# changed the default of `Series.str.replace` to literal matching, under which
# none of these substitutions would fire.
df['city'] = df['city'].str.replace(r'\bNiagara-on-the-lake\b', 'Niagara', regex=True)
df['city'] = df['city'].str.replace(r'\bVaughan\b', 'Woodbridge', regex=True)
# NOTE(review): the `.` is unescaped, so this matches "St" + any one character
# followed by a word boundary (e.g. "St John", "Ste-"), but NOT "St. John",
# where the period is followed by a space. The identical pattern is applied to
# the GeoNames columns further down, so any correction must be made to all of
# them together or the join keys will stop agreeing.
df['city'] = df['city'].str.replace(r'\bSt.\b', 'Saint ', regex=True)

# Lower-case so the join on city is case-insensitive.
df['city'] = df['city'].str.lower()

In [26]:
def format_postal_code(postal_code):
    """Normalise a postal code to the upper-case ``A1A 1A1`` layout.

    Parameters
    ----------
    postal_code : str or missing value
        Raw code in any letter case, with or without internal or
        surrounding whitespace.

    Returns
    -------
    str or missing value
        The upper-cased code with exactly one space after the third
        character. Codes of three characters or fewer are returned
        upper-cased with no trailing space. Missing values (``None`` /
        ``NaN``) are returned unchanged.
    """
    if pd.isna(postal_code):
        return postal_code

    # Remove all existing whitespace first, so input that is already
    # formatted ("m5v 2t6") does not end up with a double space.
    compact = ''.join(str(postal_code).split()).upper()
    if len(compact) <= 3:
        # Nothing to put after the separator; avoid a dangling space.
        return compact
    return compact[:3] + ' ' + compact[3:]

# Run the formatter over every Crunchbase postal code (missing values pass through).
df['postal_code'] = df['postal_code'].map(format_postal_code)

**Postal Codes**

In [27]:
# Read the GeoNames postal-code file and keep the rows whose country code is 'CA'.
codes = pd.read_json('input/geonames.json')
codes = codes.loc[codes['country_code'] == 'CA']

`admin_name2`

In [28]:
# GeoNames lookup with one row per (admin_name1, admin_name2) pair.
codes1 = codes[['postal_code', 'admin_name1', 'admin_name2']].copy()
codes1 = codes1.drop_duplicates(subset=['admin_name1', 'admin_name2'])

# Same substitution that is applied to the Crunchbase `city` column;
# `regex=True` is explicit because pandas 2.0 made literal matching the default.
codes1['admin_name2'] = codes1['admin_name2'].str.replace(r'\bSt.\b', 'Saint ', regex=True)
# Lower-case the column that was just rewritten. Reading from `codes` here
# would overwrite it with the un-substituted values and lose the line above.
codes1['admin_name2'] = codes1['admin_name2'].str.lower()


In [29]:
# Left-join the (admin_name2, admin_name1) lookup onto the Crunchbase rows,
# then keep only the first resulting row for each uuid.
merged_df = df.merge(
    codes1,
    how='left',
    left_on=['city', 'region'],
    right_on=['admin_name2', 'admin_name1'],
).drop_duplicates(subset=['uuid'])

In [30]:
# Per-column non-null counts after the `admin_name2` merge.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6379 entries, 0 to 6378
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           6379 non-null   object
 1   country_code   6379 non-null   object
 2   state_code     6379 non-null   object
 3   region         6379 non-null   object
 4   city           6379 non-null   object
 5   address        4774 non-null   object
 6   postal_code_x  4617 non-null   object
 7   postal_code_y  4581 non-null   object
 8   admin_name1    4581 non-null   object
 9   admin_name2    4581 non-null   object
dtypes: object(10)
memory usage: 548.2+ KB


`place_name`

In [31]:
# GeoNames lookup keyed on (admin_name1, place_name). An explicit copy is
# taken so the column assignments below write to an independent frame.
codes2 = codes[['postal_code', 'admin_name1', 'place_name']].copy()

# Strip parenthesised suffixes. `regex=True` is explicit because pandas 2.0
# made literal matching the default for `Series.str.replace`.
codes2['place_name'] = codes2['place_name'].str.replace(r'\s*\(.*?\)', '', regex=True)

# Collapse every place name containing the keyword to the bare city name
# within the given province; `na=False` treats missing names as non-matching.
codes2.loc[(codes2['admin_name1'] == 'Quebec') & (codes2['place_name'].str.contains('Quebec', na=False)), 'place_name'] = 'Quebec'
codes2.loc[(codes2['admin_name1'] == 'Ontario') & (codes2['place_name'].str.contains('Brantford', na=False)), 'place_name'] = 'Brantford'
codes2.loc[(codes2['admin_name1'] == 'Ontario') & (codes2['place_name'].str.contains('Niagara', na=False)), 'place_name'] = 'Niagara'

# Same substitution and lower-casing that are applied to the Crunchbase `city` column.
codes2['place_name'] = codes2['place_name'].str.replace(r'\bSt.\b', 'Saint ', regex=True)
codes2['place_name'] = codes2['place_name'].str.lower()

# De-duplicate after normalisation so each (admin_name1, place_name) key is
# unique and the left join cannot multiply rows.
codes2 = codes2.drop_duplicates(subset=['admin_name1', 'place_name'])


In [32]:
# Second lookup: left-join the (place_name, admin_name1) table onto the running result.
merged_df = merged_df.merge(
    codes2,
    how='left',
    left_on=['city', 'region'],
    right_on=['place_name', 'admin_name1'],
)

In [33]:
# Per-column non-null counts after the `place_name` merge.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6379 entries, 0 to 6378
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           6379 non-null   object
 1   country_code   6379 non-null   object
 2   state_code     6379 non-null   object
 3   region         6379 non-null   object
 4   city           6379 non-null   object
 5   address        4774 non-null   object
 6   postal_code_x  4617 non-null   object
 7   postal_code_y  4581 non-null   object
 8   admin_name1_x  4581 non-null   object
 9   admin_name2    4581 non-null   object
 10  postal_code    3709 non-null   object
 11  admin_name1_y  3709 non-null   object
 12  place_name     3709 non-null   object
dtypes: object(13)
memory usage: 697.7+ KB


*Filling NaNs*

In [34]:
# Fill gaps in the `admin_name2`-based code, first from the `place_name`
# match and then from the Crunchbase-supplied code.
# The result is assigned back rather than using `fillna(inplace=True)` on a
# column selection: that form is chained assignment, which does not update
# the parent frame when pandas Copy-on-Write is enabled.
merged_df['postal_code_y'] = (
    merged_df['postal_code_y']
    .fillna(merged_df['postal_code'])
    .fillna(merged_df['postal_code_x'])
)

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6379 entries, 0 to 6378
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           6379 non-null   object
 1   country_code   6379 non-null   object
 2   state_code     6379 non-null   object
 3   region         6379 non-null   object
 4   city           6379 non-null   object
 5   address        4774 non-null   object
 6   postal_code_x  4617 non-null   object
 7   postal_code_y  6134 non-null   object
 8   admin_name1_x  4581 non-null   object
 9   admin_name2    4581 non-null   object
 10  postal_code    3709 non-null   object
 11  admin_name1_y  3709 non-null   object
 12  place_name     3709 non-null   object
dtypes: object(13)
memory usage: 697.7+ KB


*Filling from other rows of `df` with the same region and city*

In [35]:
# Keep the Crunchbase columns plus the filled postal code, discarding the
# helper columns brought in by the two GeoNames merges. Columns are selected
# by name rather than by position so that re-running this cell, or a change
# in the merge output order, cannot silently drop the wrong columns.
merged_df = merged_df[[
    'uuid', 'country_code', 'state_code', 'region', 'city', 'address',
    'postal_code_x', 'postal_code_y',
]]

In [36]:
# From the Crunchbase rows that have a postal code, build a lookup holding the
# first code seen for each (region, city) pair.
codes_df = (
    df.loc[df['postal_code'].notna(), ['region', 'city', 'postal_code']]
    .drop_duplicates(subset=['region', 'city'])
)

In [37]:
# Attach the per-(city, region) Crunchbase code and use it for any row that
# is still missing a postal code.
merged_df = pd.merge(merged_df, codes_df, on=['city', 'region'], how='left')
# Assign back instead of `fillna(inplace=True)` on a column selection, which
# is chained assignment and does not update the frame under Copy-on-Write.
merged_df['postal_code_y'] = merged_df['postal_code_y'].fillna(merged_df['postal_code'])


In [38]:
# Per-column non-null counts after the fill from `codes_df`.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6379 entries, 0 to 6378
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           6379 non-null   object
 1   country_code   6379 non-null   object
 2   state_code     6379 non-null   object
 3   region         6379 non-null   object
 4   city           6379 non-null   object
 5   address        4774 non-null   object
 6   postal_code_x  4617 non-null   object
 7   postal_code_y  6274 non-null   object
 8   postal_code    6253 non-null   object
dtypes: object(9)
memory usage: 498.4+ KB


*Checking NaNs*

In [39]:
# Rows that still have no filled postal code, counted per region.
nan_df = merged_df.loc[merged_df['postal_code_y'].isnull()]
nan_df['region'].value_counts()

Ontario                 42
Quebec                  21
Nova Scotia              9
Alberta                  9
Saskatchewan             8
British Columbia         7
Prince Edward Island     4
Manitoba                 2
Nunavut                  1
Newfoundland             1
New Brunswick            1
Name: region, dtype: int64

*Save to CSV*

In [40]:
# Give the two postal-code columns their output names and keep only the
# identifier plus those two columns.
merged_df = merged_df.rename(
    columns={'postal_code_x': 'pc_crunchbase', 'postal_code_y': 'pc_filled'}
)[['uuid', 'pc_crunchbase', 'pc_filled']]


In [41]:
# Write the result to disk; the row index is written as the first (unnamed) column.
merged_df.to_csv('processed/CAN_processed.csv', index=True)