In [1]:
import pandas as pd
import operator
import re, string
import numpy as np

# Prepare Philippine Standard Geographic Code Reference File

In [46]:
psgc = pd.read_csv("../../dataset/psgc/processed/clean-psgc.csv.gz",
                   dtype={'code':'object'},
                   compression="gzip",
                   encoding="utf-8")

In [3]:
psgc

Unnamed: 0,code,interlevel,location,original
0,010000000,Reg,REGION I,False
1,010000000,Reg,ILOCOS REGION,False
2,010000000,Reg,REGION I (ILOCOS REGION),True
3,010000000,Reg,REGION 1,False
4,012800000,Prov,ILOCOS NORTE,True
...,...,...,...,...
48717,175917004,Bgy,Paroyhog,True
48718,175917005,Bgy,Santo Niño,True
48719,175917006,Bgy,San Isidro,True
48720,050506000,City,LEGASPI,True


In [4]:
psgc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48722 entries, 0 to 48721
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   code        48722 non-null  object
 1   interlevel  48720 non-null  object
 2   location    48721 non-null  object
 3   original    48704 non-null  object
dtypes: object(4)
memory usage: 1.5+ MB


In [5]:
psgc.head()

Unnamed: 0,code,interlevel,location,original
0,10000000,Reg,REGION I,False
1,10000000,Reg,ILOCOS REGION,False
2,10000000,Reg,REGION I (ILOCOS REGION),True
3,10000000,Reg,REGION 1,False
4,12800000,Prov,ILOCOS NORTE,True


In [6]:
# totally drop places that are just "capital" or "not a province". I think in the previous cleanups of the PSGC file anything
# in parentheses was turned into a new row. As such, "Capital" was often turned into a new row.

psgc = psgc[psgc.location.isin(["CAPITAL","NOT A PROVINCE"]) == False].reset_index(drop=True)

In [7]:
# interlevel lower
psgc.interlevel = psgc.interlevel.str.lower()
psgc.interlevel = psgc.interlevel.replace({"mun": "municity", "city": "municity"}) #combine municity for now

# Clean location column a bit. but not too much because we'll use this as the "canonical" name
psgc['location'] = psgc.location.str.replace(r"NOT A PROVINCE|CAPITAL|\(|\)", "").str.strip()
psgc = psgc.drop_duplicates(subset=["code", "location", "interlevel"], keep="first")
psgc = psgc.dropna()

In [8]:
psgc.interlevel.value_counts()

bgy         46759
municity     1796
prov           85
reg            61
Name: interlevel, dtype: int64

In [9]:
# rename districts as simply "Metro Manila", "Metropolitan Manila", "National Capital Region" or "NCR"

ncr = psgc[(psgc.code == "130000000")]
ncr.loc[:,"location"] = ncr.location.str.replace("NATIONAL  REGION", "NATIONAL CAPITAL REGION")
ncr.loc[:,"location"] = ncr.location.str.replace("NATIONAL  REGION", "NATIONAL CAPITAL REGION")
ncr = ncr.append(pd.Series({"code":"130000000", "location": "METRO MANILA","interlevel":"reg","original":False}), ignore_index=True)
ncr = ncr.append(pd.Series({"code":"130000000", "location": "METROPOLITAN MANILA", "interlevel": "reg", "original": False}), ignore_index=True)
ncr.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,code,interlevel,location,original
0,130000000,reg,NCR,False
1,130000000,reg,NATIONAL CAPITAL REGION NCR,True
2,130000000,reg,NATIONAL CAPITAL REGION,False
3,130000000,reg,METRO MANILA,False
4,130000000,reg,METROPOLITAN MANILA,False


In [10]:
# remove districts and replace NCR region rows from reference file for now with cleaned up NCR rows

psgc = psgc[psgc.interlevel != 'dist'].reset_index(drop=True) # exclude districts
psgc = psgc[psgc.code != '130000000'].reset_index(drop=True) # exclude original ncr region rows
print(len(psgc))
psgc = psgc.append(ncr, ignore_index=True) # append cleaned up ncr region rows
print(len(psgc))
psgc.head()
psgc[psgc.code == "130000000"]

48698
48703


Unnamed: 0,code,interlevel,location,original
48698,130000000,reg,NCR,False
48699,130000000,reg,NATIONAL CAPITAL REGION NCR,True
48700,130000000,reg,NATIONAL CAPITAL REGION,False
48701,130000000,reg,METRO MANILA,False
48702,130000000,reg,METROPOLITAN MANILA,False


In [11]:
# add as aliases abbreviations of north, south, east, west. for example, northern samar will have an alias n. samar.

nsew = re.compile(r"^NORTH(ERN)? |^SOUTH(ERN)? |^EAST(ERN?)? |^WEST(ERN)? ")
nsew_abbrev = psgc[psgc.location.str.contains(nsew)].location.str.split().str.get(0).str.slice(0,1)
nsew_abbrev.head()

  return func(self, *args, **kwargs)


19247    W
27191    E
27193    E
29752    N
30412    W
Name: location, dtype: object

In [12]:
nsew_locs = psgc[psgc.location.str.contains(nsew)]
nsew_locs.head()

Unnamed: 0,code,interlevel,location,original
19247,60000000,reg,WESTERN VISAYAS,False
27191,80000000,reg,EASTERN VISAYAS,False
27193,82600000,prov,EASTERN SAMAR,True
29752,84800000,prov,NORTHERN SAMAR,True
30412,86000000,prov,WESTERN SAMAR,False


In [13]:
nsew_locs.loc[:, 'location'] = nsew_abbrev.str.cat(psgc[psgc.location.str.contains(nsew)].location.str.replace("^NORTH(ERN)? |^SOUTH(ERN)? |^EAST(ERN?)? |^WEST(ERN)? ","").str.strip(),sep=" ")
nsew_locs.loc[:, 'original'] = False
nsew_locs.head()

  return func(self, *args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,code,interlevel,location,original
19247,60000000,reg,W VISAYAS,False
27191,80000000,reg,E VISAYAS,False
27193,82600000,prov,E SAMAR,False
29752,84800000,prov,N SAMAR,False
30412,86000000,prov,W SAMAR,False


In [14]:
psgc = pd.concat([psgc, nsew_locs], ignore_index=True)
psgc.head()

Unnamed: 0,code,interlevel,location,original
0,10000000,reg,REGION I,False
1,10000000,reg,ILOCOS REGION,False
2,10000000,reg,REGION I ILOCOS REGION,True
3,10000000,reg,REGION 1,False
4,12800000,prov,ILOCOS NORTE,True


In [15]:
#fill interlevels for isabela, cotabato

psgc.loc[psgc.interlevel.isnull(), "interlevel"] = "municity"

What we need is reference file that contains the higher-level administrative territories in separate columns. This is so we can create a single "master string" that we will use for matching. We'll try this instead of matching each component individually.

First, create a dictionary of the rankings of various administrative levels.

In [16]:
adm_rank= {'reg': 1, 'prov': 2, 'dist': 2, 'city': 3, 'mun': 3, 'municity': 3, 'submun': 3, 'bgy': 4}
adm_rank_list = sorted(adm_rank, key=lambda k: adm_rank[k])
psgc['adm_rank'] = psgc.interlevel.map(adm_rank)

In [17]:
psgc.interlevel.value_counts()

bgy         46759
municity     1799
prov           91
reg            66
Name: interlevel, dtype: int64

We'll apply this later as a separate column.

Create a function that will add to our dataframe columns with the PSGC codes
of each location's higher level administrative territories. We'll then use this to fill the name columns 
with their corresponding place names.


In [18]:
def fill_higher_level_codes(df):
        
    # Below is a dictionary of administrative hierarchy levels ranks and the stop string positions inside the PSG code.
    
    adm_rank = {1: 2,
                2: 4,
                3: 6,
                4: 9} 
    
    # Loop through each administrative level. 
    # Create additional columns for each administrative level with the suffixes _code and _name.
    # Fill each column with the names and codes of the higher level administrative territories in which
    # a place is located.
    
    for adm_level in adm_rank.keys():
        
        # create code cols        
        adm_code_col = "adm{}_code".format(adm_level)
        
        df[adm_code_col] = None
        
        # find the administrative levels that are higher than the current one
        
        higher_adm_levels = [l for l in adm_rank.keys() if l <= adm_level]
                
        for higher_level in higher_adm_levels:
            
            # higher adm level colum names
            
            higher_level_code_col = "adm{}_code".format(higher_level)
            
            # stop position of PSG code for this adm level
            
            stop_position = adm_rank[higher_level] 
            
            # derive higher level admin codes for each row
            
            codes = df.loc[df.adm_rank >= higher_level, "code"].str.slice(start=0, stop=stop_position).str.pad(9, side="right", fillchar="0")
            df.loc[df.adm_rank >= higher_level,higher_level_code_col] = codes
            
            # derive higher level admin names for each row
            
            higher_level_name_col = "adm{}_name".format(higher_level)

    return df

In [19]:
psgc_unpivot = fill_higher_level_codes(psgc).dropna(how="all")
psgc_unpivot.head(10)

Unnamed: 0,code,interlevel,location,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code
0,10000000,reg,REGION I,False,1,10000000,,,
1,10000000,reg,ILOCOS REGION,False,1,10000000,,,
2,10000000,reg,REGION I ILOCOS REGION,True,1,10000000,,,
3,10000000,reg,REGION 1,False,1,10000000,,,
4,12800000,prov,ILOCOS NORTE,True,2,10000000,12800000.0,,
5,12801000,municity,ADAMS,True,3,10000000,12800000.0,12801000.0,
6,12801001,bgy,Adams,True,4,10000000,12800000.0,12801000.0,12801001.0
7,12802000,municity,BACARRA,True,3,10000000,12800000.0,12802000.0,
8,12802001,bgy,Bani,True,4,10000000,12800000.0,12802000.0,12802001.0
9,12802002,bgy,Buyon,True,4,10000000,12800000.0,12802000.0,12802002.0


In [20]:
all_loc_names = psgc_unpivot[['code', 'location', 'original']].rename(columns={'code': 'join_code'})
all_loc_names.head()

Unnamed: 0,join_code,location,original
0,10000000,REGION I,False
1,10000000,ILOCOS REGION,False
2,10000000,REGION I ILOCOS REGION,True
3,10000000,REGION 1,False
4,12800000,ILOCOS NORTE,True


Add Region names.

In [21]:
psgc_unpivot = psgc_unpivot.merge(all_loc_names.rename(columns={'location': 'adm1_name', 'original': 'adm1_is_orig'}),
                                  how="left", left_on="adm1_code", right_on="join_code").drop('join_code', axis=1)
psgc_unpivot.head()

Unnamed: 0,code,interlevel,location,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm1_is_orig
0,10000000,reg,REGION I,False,1,10000000,,,,REGION I,False
1,10000000,reg,REGION I,False,1,10000000,,,,ILOCOS REGION,False
2,10000000,reg,REGION I,False,1,10000000,,,,REGION I ILOCOS REGION,True
3,10000000,reg,REGION I,False,1,10000000,,,,REGION 1,False
4,10000000,reg,ILOCOS REGION,False,1,10000000,,,,REGION I,False


Add Prov names.

In [22]:
psgc_unpivot = psgc_unpivot.merge(all_loc_names.rename(columns={'location': 'adm2_name', 'original': 'adm2_is_orig'}),
                                  how="left", left_on="adm2_code", right_on="join_code").drop('join_code', axis=1)
psgc_unpivot.head()

Unnamed: 0,code,interlevel,location,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm1_is_orig,adm2_name,adm2_is_orig
0,10000000,reg,REGION I,False,1,10000000,,,,REGION I,False,,
1,10000000,reg,REGION I,False,1,10000000,,,,ILOCOS REGION,False,,
2,10000000,reg,REGION I,False,1,10000000,,,,REGION I ILOCOS REGION,True,,
3,10000000,reg,REGION I,False,1,10000000,,,,REGION 1,False,,
4,10000000,reg,ILOCOS REGION,False,1,10000000,,,,REGION I,False,,


Add MuniCity names.

In [23]:
psgc_unpivot = psgc_unpivot.merge(all_loc_names.rename(columns={'location': 'adm3_name', 'original': 'adm3_is_orig'}),
                                  how="left", left_on="adm3_code", right_on="join_code").drop('join_code', axis=1)
psgc_unpivot.head()

Unnamed: 0,code,interlevel,location,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm1_is_orig,adm2_name,adm2_is_orig,adm3_name,adm3_is_orig
0,10000000,reg,REGION I,False,1,10000000,,,,REGION I,False,,,,
1,10000000,reg,REGION I,False,1,10000000,,,,ILOCOS REGION,False,,,,
2,10000000,reg,REGION I,False,1,10000000,,,,REGION I ILOCOS REGION,True,,,,
3,10000000,reg,REGION I,False,1,10000000,,,,REGION 1,False,,,,
4,10000000,reg,ILOCOS REGION,False,1,10000000,,,,REGION I,False,,,,


Add Barangay names.

In [24]:
psgc_unpivot = psgc_unpivot.merge(all_loc_names.rename(columns={'location': 'adm4_name', 'original': 'adm4_is_orig'}),
                                  how="left", left_on="adm4_code", right_on="join_code").drop('join_code', axis=1)
psgc_unpivot.head()

Unnamed: 0,code,interlevel,location,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm1_is_orig,adm2_name,adm2_is_orig,adm3_name,adm3_is_orig,adm4_name,adm4_is_orig
0,10000000,reg,REGION I,False,1,10000000,,,,REGION I,False,,,,,,
1,10000000,reg,REGION I,False,1,10000000,,,,ILOCOS REGION,False,,,,,,
2,10000000,reg,REGION I,False,1,10000000,,,,REGION I ILOCOS REGION,True,,,,,,
3,10000000,reg,REGION I,False,1,10000000,,,,REGION 1,False,,,,,,
4,10000000,reg,ILOCOS REGION,False,1,10000000,,,,REGION I,False,,,,,,


In [25]:
# Special handling for isabela city! it's supposed to be in the province of basilan

psgc_unpivot.loc[psgc_unpivot.code.isin(["099700000", "099701000"]), "adm2_name"] = "BASILAN"

# Also, isabela is the only place with two PSGC codes -- one for province level and one for city level! lets just use one.

psgc_unpivot = psgc_unpivot[psgc_unpivot.code != "099700000"]

Create a "location tuple" that concatenates all the location components names into a single tuple. We'll use this for fuzzy matching later.

In [26]:
#append all the rows again for places in metro manila except with blank regions. 
#this enables us to accept "Fort Bonifacio, Taguig" as an exact match even if it doesn't have "Metro Manila" in it

metro_manila = psgc_unpivot[psgc_unpivot.code.str.startswith("13")]
metro_manila.loc[:, "adm1_name"] = np.nan
print(len(psgc_unpivot))
psgc_unpivot = psgc_unpivot.append(metro_manila, ignore_index=True).reset_index(drop=True)
print(len(psgc_unpivot))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


317514
327154


In [None]:
def normalize_text(item):
    replacements = {r"city of|city": "",
                    r"barangay|brgy": "bgy",
                    r"[^a-zA-Z0-9_\s]": "",
                    r"poblacion": "pob",
                    r"ñ": "n"}
    
    item = item.lower()
    
    if item not in ['bgy', 'municity', 'prov', 'reg']:
        
        for k, v in replacements.items():
            item = re.sub(k,v,item.strip())
            
    return item.strip()

In [None]:
def create_loc_tuple_with_code(row):
    
    # For now, disregard making location tuples for all regions 
    if row.interlevel == "reg": 
        return None 
    
    # Special handling for NCR:
    
    if row.code[:3] == "133" and row.interlevel != "municity": # Manila has submunicipalities so include all admin levels
        
        return tuple([normalize_text(v) for v in [row.adm4_name, row.adm3_name, row.adm2_name, row.adm1_name, row.interlevel, row.code] if (v is not None) and (v is not np.nan)])
    
    elif (row.code[:3] == "133" and row.interlevel == "municity") or (row.code[:3] == "137"): # Except when Manila City or anywhere else in NCR is the item, then exclude adm2_name
        
        return tuple([normalize_text(v) for v in [row.adm4_name, row.adm3_name, row.adm1_name, row.interlevel, row.code] if (v is not None) and (v is not np.nan)])
        
    # else, exclude region from final tuple   
    
    return tuple([normalize_text(v) for v in [row.adm4_name, row.adm3_name, row.adm2_name, row.interlevel, row.code] if (v is not None) and (v is not np.nan)])

In [None]:
psgc_unpivot.loc[:, 'loc_tuple'] = psgc_unpivot.apply(create_loc_tuple_with_code,axis=1)
psgc_unpivot.head(10)

In [None]:
psgc_unpivot = psgc_unpivot.drop_duplicates(keep="first")
len(psgc_unpivot)

## Create canonical names

In [None]:
psgc_unpivot.code.nunique()

In [None]:
#Find the original names for each unique PSGC code

psgc_unpivot['is_canonical'] = psgc_unpivot[['adm4_is_orig', 'adm3_is_orig', 'adm2_is_orig', 'adm1_is_orig']].sum(axis=1)

#does every code have a canonical name?

psgc_unpivot[psgc_unpivot.is_canonical > 0].code.nunique()

In [None]:
# get the canonical names for each PSGC

canonical_names = psgc_unpivot.sort_values(["code", "is_canonical"], ascending=False).drop_duplicates("code", keep="first")

# check if each code has one canonical name

canonical_names.groupby('code').size().value_counts()

In [None]:
canonical_names.head()

In [None]:
# drop row for metro manila where region is blank, we don't want to use these in the canonical names

null_ncr_region = canonical_names[canonical_names.code.str.startswith("13") * canonical_names.adm1_name.isnull()].index
canonical_names = canonical_names.drop(null_ncr_region,axis=0).set_index('code')

# rename columns
canonical_names = canonical_names.rename(columns={"adm4_name": "bgy", "adm3_name": "municity", "adm2_name": "prov", "adm1_name": "reg"})

# keep only those we need
canonical_names = canonical_names[['bgy', 'municity', 'prov', 'reg']]

In [None]:
print(canonical_names.info())
canonical_names.head()

In [None]:
# join the canonical names to all possible search terms

psgc_locations = psgc_unpivot[['loc_tuple', 'code']].dropna().drop_duplicates(keep="first")
psgc_locations = psgc_locations.merge(canonical_names, left_on="code", right_index=True, how="left")
psgc_locations.info()

In [None]:
def to_string(row):
    return ",".join(row.loc_tuple)

In [None]:
psgc_locations['loc_tuple'] = psgc_locations.apply(to_string,axis=1)

In [None]:
psgc_locations.head()

In [None]:
psgc_locations['candidate_terms'] = psgc_locations['loc_tuple'].str.rsplit(',', n=1).str.get(0)

In [None]:
psgc_locations = psgc_locations.set_index('loc_tuple')
psgc_locations.head()

In [None]:
# drop region column

psgc_locations = psgc_locations.drop('reg',axis=1)

In [None]:
psgc_location

In [45]:
psgc_locations.to_csv('../../dataset/psgc/processed/psgc-locations.csv.gz',compression="gzip")

In [47]:
psgc_locations.to_json('../../dataset/psgc/processed/psgc.2020-05-31.json',orient='records')