### ITA

+ GeoNames `admin_name1` corresponds to Crunchbase `region`
+ GeoNames `place_name` corresponds to Crunchbase `city`

In [1]:
import pandas as pd
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / SettingWithCopy warnings
# raised by later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

**Crunchbase Dataframe**

In [2]:
# Load the Crunchbase export and keep only Italian organisations together
# with the columns that describe their location.
# postal_code is forced to str so numeric-looking codes are never parsed as
# numbers (which would strip leading zeros); missing values stay NaN.
location_cols = ['uuid', 'country_code', 'state_code', 'region', 'city', 'address', 'postal_code']
df = pd.read_csv('input/foodtech.csv', dtype={'postal_code': str})
# .copy() detaches the filtered slice from the full frame: later cells assign
# to df['city'] / df['region'], which must not be writes into a view.
df = df.loc[df['country_code'] == 'ITA', location_cols].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6679 entries, 81 to 163821
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   uuid          6679 non-null   object
 1   country_code  6679 non-null   object
 2   state_code    0 non-null      object
 3   region        6679 non-null   object
 4   city          6679 non-null   object
 5   address       5680 non-null   object
 6   postal_code   5529 non-null   object
dtypes: object(7)
memory usage: 417.4+ KB


*Reformatting*

In [3]:
# Map the English / inconsistently-capitalised city names found in the
# Crunchbase data to their Italian spelling.
# Keys are regular expressions (\b = word boundary, so 'Milan' does not match
# inside 'Milano'). regex=True is passed explicitly: on pandas >= 2.0
# Series.str.replace treats the pattern as a literal string by default, in
# which case none of these replacements would ever match.
# Replacements are applied in the order listed.
city_fixes = {
    r'\bMilan\b': 'Milano',
    r'\bRome\b': 'Roma',
    r'\bPadua\b': 'Padova',
    r'\bVenice\b': 'Venezia',
    r'\bTurin\b': 'Torino',
    r'\bReggio Nell Emilia\b': 'Reggio Emilia',
    r"\bSant'ilario D'enza\b": "Sant'Ilario D'Enza",
    r"\bSant'ambrogio Di Valpolicella\b": "Sant'Ambrogio Di Valpolicella",
}
for pattern, italian_name in city_fixes.items():
    df['city'] = df['city'].str.replace(pattern, italian_name, regex=True)


In [4]:
# Lower-case both merge keys so they compare case-insensitively.
for key_col in ('city', 'region'):
    df[key_col] = df[key_col].str.lower()

**Postal Codes**

In [5]:
# Read the GeoNames postal-code table and keep only the Italian ('IT') rows.
# NOTE(review): read_json infers dtypes by default — confirm postal_code
# comes back as strings (leading zeros intact).
codes = pd.read_json('input/geonames.json').loc[
    lambda frame: frame['country_code'] == 'IT'
]

`place_name`

In [6]:
# Lookup table with one postal code per (region, city) pair:
# names are lower-cased to match the Crunchbase keys, and only the first
# postal code encountered for each pair is kept.
codes1 = (
    codes[['postal_code', 'admin_name1', 'place_name']]
    .assign(
        admin_name1=lambda tbl: tbl['admin_name1'].str.lower(),
        place_name=lambda tbl: tbl['place_name'].str.lower(),
    )
    .drop_duplicates(subset=['admin_name1', 'place_name'])
)


In [7]:
# Left-join so every Crunchbase row is kept; rows whose (city, region) pair
# is found in the lookup table receive its postal code. Both frames have a
# 'postal_code' column, so the result carries postal_code_x (Crunchbase)
# and postal_code_y (lookup).
merged_df = df.merge(
    codes1,
    how='left',
    left_on=['city', 'region'],
    right_on=['place_name', 'admin_name1'],
)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6679 entries, 0 to 6678
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           6679 non-null   object
 1   country_code   6679 non-null   object
 2   state_code     0 non-null      object
 3   region         6679 non-null   object
 4   city           6679 non-null   object
 5   address        5680 non-null   object
 6   postal_code_x  5529 non-null   object
 7   postal_code_y  6015 non-null   object
 8   admin_name1    6015 non-null   object
 9   place_name     6015 non-null   object
dtypes: object(10)
memory usage: 574.0+ KB


**Filling NaNs with Existing Postal Codes**

In [8]:
# Where the lookup gave no postal code, fall back to the one Crunchbase
# already has for that row.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x
# and leaves merged_df unchanged under Copy-on-Write.
# NOTE(review): the looked-up code takes precedence over the Crunchbase
# code whenever both exist — confirm that priority is intended.
merged_df['postal_code_y'] = merged_df['postal_code_y'].fillna(merged_df['postal_code_x'])
merged_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6679 entries, 0 to 6678
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           6679 non-null   object
 1   country_code   6679 non-null   object
 2   state_code     0 non-null      object
 3   region         6679 non-null   object
 4   city           6679 non-null   object
 5   address        5680 non-null   object
 6   postal_code_x  5529 non-null   object
 7   postal_code_y  6569 non-null   object
 8   admin_name1    6015 non-null   object
 9   place_name     6015 non-null   object
dtypes: object(10)
memory usage: 574.0+ KB


*Across rows: filling from other `df` rows that share the same region and city*

In [9]:
# Drop the two lookup key columns brought in by the GeoNames merge.
# They are removed by name (not by position) and missing names are ignored,
# so re-running this cell cannot strip additional columns.
merged_df = merged_df.drop(columns=['admin_name1', 'place_name'], errors='ignore')

# Second lookup table built from Crunchbase itself: for every (region, city)
# pair that has at least one row with a postal code, keep the first such code.
codes_df = (
    df.loc[df['postal_code'].notna(), ['region', 'city', 'postal_code']]
    .drop_duplicates(subset=['region', 'city'])
)

In [10]:
# Attach, per (city, region), the postal code found on another Crunchbase row
# (new column 'postal_code'), then use it to fill the remaining gaps.
merged_df = pd.merge(merged_df, codes_df, on=['city', 'region'], how='left')
# Explicit assignment instead of fillna(inplace=True) on a column selection:
# the chained in-place form is deprecated in pandas 2.x and is a no-op under
# Copy-on-Write.
merged_df['postal_code_y'] = merged_df['postal_code_y'].fillna(merged_df['postal_code'])

In [11]:
# Show non-null counts per column to see how many postal codes are now filled.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6679 entries, 0 to 6678
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uuid           6679 non-null   object
 1   country_code   6679 non-null   object
 2   state_code     0 non-null      object
 3   region         6679 non-null   object
 4   city           6679 non-null   object
 5   address        5680 non-null   object
 6   postal_code_x  5529 non-null   object
 7   postal_code_y  6642 non-null   object
 8   postal_code    6465 non-null   object
dtypes: object(9)
memory usage: 521.8+ KB


*Checking NaNs*

In [12]:
# Rows that still have no postal code after both fill passes,
# counted per region.
nan_df = merged_df.loc[lambda tbl: tbl['postal_code_y'].isna()]
nan_df['region'].value_counts()

emilia-romagna           6
friuli-venezia giulia    4
veneto                   4
toscana                  3
lombardia                3
trentino-alto adige      3
umbria                   2
sicilia                  2
piemonte                 2
campania                 2
molise                   2
liguria                  1
lazio                    1
abruzzi                  1
calabria                 1
Name: region, dtype: int64

*Save to csv*

In [13]:
# Give the two postal-code columns descriptive names (original Crunchbase
# value vs. filled value) and keep only the id plus those two columns.
merged_df = merged_df.rename(
    columns={'postal_code_x': 'pc_crunchbase', 'postal_code_y': 'pc_filled'}
).loc[:, ['uuid', 'pc_crunchbase', 'pc_filled']]


In [14]:
# Persist the result. No index argument is passed, so to_csv also writes the
# DataFrame index as the first, unnamed column.
# NOTE(review): confirm downstream readers expect that column (otherwise pass
# index=False) and that the 'processed/' directory exists before running.
merged_df.to_csv('processed/ITA_processed.csv')