### BRA

+ `admin_name1` = `region`
+ `admin_name2` = `city`
+ `place_name` = `city`

In [17]:
import pandas as pd
import warnings
from unidecode import unidecode

warnings.filterwarnings("ignore")

**Crunchbase Dataframe**

In [18]:
df = pd.read_csv('input/foodtech.csv')
df = df[df['country_code'] == 'CHN']
df = df[['uuid','country_code', 'state_code', 'region', 'city', 'address', 'postal_code']]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3187 entries, 713 to 163755
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   uuid          3187 non-null   object
 1   country_code  3187 non-null   object
 2   state_code    0 non-null      object
 3   region        3187 non-null   object
 4   city          3187 non-null   object
 5   address       1503 non-null   object
 6   postal_code   741 non-null    object
dtypes: object(7)
memory usage: 199.2+ KB


*Reformatting*

In [19]:
df['city'] = df['city'].apply(unidecode)
df['region'] = df['region'].apply(unidecode)

df['city'] = df['city'].str.lower()
df['region'] = df['region'].str.lower()

**Postal Codes**

In [20]:
codes = pd.read_json('input/geonames.json')
codes = codes[codes['country_code'] == 'CN']

In [21]:
codes['admin_name1'] = codes['admin_name1'].apply(lambda x: unidecode(x) if x is not None else None)
codes['admin_name2'] = codes['admin_name2'].apply(lambda x: unidecode(x) if x is not None else None)
codes['place_name'] = codes['place_name'].apply(lambda x: unidecode(x) if x is not None else None)

`admin_name2`

In [22]:
codes1 = codes.copy()
codes1 = codes1[['postal_code','admin_name1', 'admin_name2', 'latitude', 'longitude']]

In [23]:
# codes1.loc[(codes1['admin_name1'] == 'Sao Paulo') & (codes1['admin_name2'].str.contains('Campinas')), 'admin_name2'] = 'Campina'

codes1['admin_name1'] = codes1['admin_name1'].str.lower()
codes1['admin_name2'] = codes1['admin_name2'].str.lower()

codes1.drop_duplicates(subset=['admin_name1', 'admin_name2'], inplace=True)

In [24]:
merged_df = pd.merge(df, codes1, left_on=['city', 'region'], right_on=['admin_name2', 'admin_name1'], how='left')
merged_df.drop_duplicates(subset=['uuid'], inplace=True)

In [25]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3187 entries, 0 to 3186
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uuid           3187 non-null   object 
 1   country_code   3187 non-null   object 
 2   state_code     0 non-null      object 
 3   region         3187 non-null   object 
 4   city           3187 non-null   object 
 5   address        1503 non-null   object 
 6   postal_code_x  741 non-null    object 
 7   postal_code_y  595 non-null    object 
 8   admin_name1    595 non-null    object 
 9   admin_name2    595 non-null    object 
 10  latitude       595 non-null    float64
 11  longitude      595 non-null    float64
dtypes: float64(2), object(10)
memory usage: 323.7+ KB


In [26]:
codes2 = codes.copy()
codes2 = codes2[['postal_code','admin_name1', 'place_name', 'latitude', 'longitude']]

In [27]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


merged_df = pd.merge(merged_df, codes2, left_on=['city', 'region'], right_on=['place_name', 'admin_name1'], how='left')
fuzzy_matches = merged_df.apply(lambda row: fuzz.token_sort_ratio(row['city'], row['place_name']), axis=1)
df_merge = merged_df[fuzzy_matches >= 65]
# df_merge = df_merge.drop_duplicates(subset=[left_on, right_on])

`place_name`

In [29]:
codes2 = codes.copy()
codes2 = codes2[['postal_code','admin_name1', 'place_name', 'latitude', 'longitude']]

codes2['admin_name1'] = codes2['admin_name1'].str.lower()
codes2['place_name'] = codes2['place_name'].str.lower()

codes2.drop_duplicates(subset=['admin_name1', 'place_name'], inplace=True)


In [30]:
merged_df = pd.merge(merged_df, codes2, left_on=['city', 'region'], right_on=['place_name', 'admin_name1'], how='left')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6151 entries, 0 to 6150
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uuid           6151 non-null   object 
 1   country_code   6151 non-null   object 
 2   state_code     0 non-null      object 
 3   region         6151 non-null   object 
 4   city           6151 non-null   object 
 5   address        3784 non-null   object 
 6   postal_code_x  3567 non-null   object 
 7   postal_code_y  4990 non-null   object 
 8   admin_name1_x  4990 non-null   object 
 9   admin_name2    4990 non-null   object 
 10  latitude_x     4990 non-null   float64
 11  longitude_x    4990 non-null   float64
 12  postal_code    4954 non-null   object 
 13  admin_name1_y  4954 non-null   object 
 14  place_name     4954 non-null   object 
 15  latitude_y     4954 non-null   float64
 16  longitude_y    4954 non-null   float64
dtypes: float64(4), object(13)
memory usage: 865.0+ KB


In [31]:
merged_df['postal_code_y'].fillna(merged_df['postal_code'], inplace=True)

merged_df['latitude_x'].fillna(merged_df['latitude_y'], inplace=True)
merged_df['longitude_x'].fillna(merged_df['longitude_y'], inplace=True)

merged_df = merged_df.iloc[:, 0:12]

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6151 entries, 0 to 6150
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uuid           6151 non-null   object 
 1   country_code   6151 non-null   object 
 2   state_code     0 non-null      object 
 3   region         6151 non-null   object 
 4   city           6151 non-null   object 
 5   address        3784 non-null   object 
 6   postal_code_x  3567 non-null   object 
 7   postal_code_y  5005 non-null   object 
 8   admin_name1_x  4990 non-null   object 
 9   admin_name2    4990 non-null   object 
 10  latitude_x     5005 non-null   float64
 11  longitude_x    5005 non-null   float64
dtypes: float64(2), object(10)
memory usage: 624.7+ KB


`place_name` ONLY

In [32]:
codes2.drop_duplicates(subset=['place_name'], inplace=True)

merged_df = pd.merge(merged_df, codes2, left_on=['city'], right_on=['place_name'], how='left')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6151 entries, 0 to 6150
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uuid           6151 non-null   object 
 1   country_code   6151 non-null   object 
 2   state_code     0 non-null      object 
 3   region         6151 non-null   object 
 4   city           6151 non-null   object 
 5   address        3784 non-null   object 
 6   postal_code_x  3567 non-null   object 
 7   postal_code_y  5005 non-null   object 
 8   admin_name1_x  4990 non-null   object 
 9   admin_name2    4990 non-null   object 
 10  latitude_x     5005 non-null   float64
 11  longitude_x    5005 non-null   float64
 12  postal_code    5623 non-null   object 
 13  admin_name1    5623 non-null   object 
 14  place_name     5623 non-null   object 
 15  latitude       5623 non-null   float64
 16  longitude      5623 non-null   float64
dtypes: float64(4), object(13)
memory usage: 865.0+ KB


In [33]:
merged_df['postal_code_y'].fillna(merged_df['postal_code'], inplace=True)

merged_df['latitude_x'].fillna(merged_df['latitude'], inplace=True)
merged_df['longitude_x'].fillna(merged_df['longitude'], inplace=True)

merged_df = merged_df.iloc[:, 0:12]

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6151 entries, 0 to 6150
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uuid           6151 non-null   object 
 1   country_code   6151 non-null   object 
 2   state_code     0 non-null      object 
 3   region         6151 non-null   object 
 4   city           6151 non-null   object 
 5   address        3784 non-null   object 
 6   postal_code_x  3567 non-null   object 
 7   postal_code_y  5674 non-null   object 
 8   admin_name1_x  4990 non-null   object 
 9   admin_name2    4990 non-null   object 
 10  latitude_x     5674 non-null   float64
 11  longitude_x    5674 non-null   float64
dtypes: float64(2), object(10)
memory usage: 624.7+ KB


**Filling NaNs with Existing Postal Codes**

In [34]:
merged_df['postal_code_y'].fillna(merged_df['postal_code_x'], inplace=True)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6151 entries, 0 to 6150
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uuid           6151 non-null   object 
 1   country_code   6151 non-null   object 
 2   state_code     0 non-null      object 
 3   region         6151 non-null   object 
 4   city           6151 non-null   object 
 5   address        3784 non-null   object 
 6   postal_code_x  3567 non-null   object 
 7   postal_code_y  6000 non-null   object 
 8   admin_name1_x  4990 non-null   object 
 9   admin_name2    4990 non-null   object 
 10  latitude_x     5674 non-null   float64
 11  longitude_x    5674 non-null   float64
dtypes: float64(2), object(10)
memory usage: 624.7+ KB


*Across rows: df*

In [35]:
codes_df = df[df['postal_code'].notna()]

codes_df = codes_df[['region', 'city', 'postal_code']]
codes_df.drop_duplicates(subset=['region', 'city'], inplace=True)

In [36]:
merged_df = pd.merge(merged_df, codes_df, left_on=['city', 'region'], right_on=['city', 'region'], how='left')
merged_df['postal_code_y'].fillna(merged_df['postal_code'], inplace=True)

In [37]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6151 entries, 0 to 6150
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uuid           6151 non-null   object 
 1   country_code   6151 non-null   object 
 2   state_code     0 non-null      object 
 3   region         6151 non-null   object 
 4   city           6151 non-null   object 
 5   address        3784 non-null   object 
 6   postal_code_x  3567 non-null   object 
 7   postal_code_y  6109 non-null   object 
 8   admin_name1_x  4990 non-null   object 
 9   admin_name2    4990 non-null   object 
 10  latitude_x     5674 non-null   float64
 11  longitude_x    5674 non-null   float64
 12  postal_code    5809 non-null   object 
dtypes: float64(2), object(11)
memory usage: 672.8+ KB


In [38]:
merged_df = merged_df.iloc[:, 0:12]

In [39]:
codes3 = codes.copy()
codes3 = codes3[['postal_code', 'latitude', 'longitude']]

merged_df = pd.merge(merged_df, codes3, left_on=['postal_code_y'], right_on=['postal_code'], how='left')

merged_df['latitude_x'].fillna(merged_df['latitude'], inplace=True)
merged_df['longitude_x'].fillna(merged_df['longitude'], inplace=True)

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6151 entries, 0 to 6150
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uuid           6151 non-null   object 
 1   country_code   6151 non-null   object 
 2   state_code     0 non-null      object 
 3   region         6151 non-null   object 
 4   city           6151 non-null   object 
 5   address        3784 non-null   object 
 6   postal_code_x  3567 non-null   object 
 7   postal_code_y  6109 non-null   object 
 8   admin_name1_x  4990 non-null   object 
 9   admin_name2    4990 non-null   object 
 10  latitude_x     5754 non-null   float64
 11  longitude_x    5754 non-null   float64
 12  postal_code    5754 non-null   object 
 13  latitude       5754 non-null   float64
 14  longitude      5754 non-null   float64
dtypes: float64(4), object(11)
memory usage: 768.9+ KB


*Checking NaNs*

In [None]:
nan_df = merged_df[merged_df['postal_code_y'].isna()]
nan_df['region'].value_counts()

rio de janeiro       10
minas gerais          9
sao paulo             8
bahia                 6
rio grande do sul     3
amazonas              1
mato grosso           1
rondonia              1
espirito santo        1
parana                1
para                  1
Name: region, dtype: int64

*Save to csv*

In [None]:
merged_df = merged_df.rename(columns={'postal_code_x': 'pc_crunchbase', 
                                      'postal_code_y': 'pc_filled'})

merged_df = merged_df[['uuid', 'pc_crunchbase', 'pc_filled']]


In [None]:
merged_df.to_csv('processed/BRA_processed.csv')