# Solution

## Importing packages


In [7]:
import pandas as pd
from difflib import SequenceMatcher

## Reading data

In [8]:
# Load the three semicolon-separated inputs: the OSM points of interest, the
# Google points of interest, and the table pairing OSM ids with Google ids.
osm_poi_df = pd.read_csv(
    "osm_poi.csv",
    low_memory=False,
    sep=";",
)
google_poi_df = pd.read_csv(
    "google_poi.csv",
    sep=";",
)
matching_df = pd.read_csv(
    "google_osm_poi_matching.csv",
    sep=";",
)


## Adding a new `confidence_score` column to the matching DataFrame (loaded from google_osm_poi_matching.csv)

In [9]:
# Give every candidate match a default score of 0.0 until it is computed.
matching_df = matching_df.assign(confidence_score=0.0)

## Computing the confidence score for one pair of OSM and Google rows

In [10]:
# Columns that have the same name in both the OSM frame and the Google frame.
identical_columns = ["latitude", "longitude", "name"]
# Column pairs compared across the two frames. Each tuple is
# (google column, osm column): compute_confidence_score reads element 0 from
# google_data and element 1 from osm_data.
# NOTE(review): confirm this orientation against the two CSV schemas.
similar_columns = [("contact_website", "website"), ("contact_phone", "phone")]
# OSM address columns; each one is compared against every part of the Google
# "address" value. Every column is listed exactly once so that no column is
# counted twice in the address evidence.
osm_address_columns = [
    "address_details_level", "address_house_nr", "address_street", "address_zip_code",
    "address_city", "address_country", "address_full", "address_region_neighborhood",
    "address_region_suburb", "address_region_district", "address_region_province",
    "address_region_state", "address_house_name", "address_place", "address_block",
    "address_details_flats",
]

# Identical columns that are rounded to 3 decimals before being compared.
_COORDINATE_COLUMNS = ("latitude", "longitude")

# Minimum difflib ratio (exclusive) for two values to count as a match.
_COORDINATE_MATCH_RATIO = 0.5
_IDENTICAL_MATCH_RATIO = 0.7
_SIMILAR_MATCH_RATIO = 0.6
_LIST_ITEM_MATCH_RATIO = 0.5  # address parts, tags and categories

# Weight of each evidence group in the final score; the weights sum to 1.
_IDENTICAL_WEIGHT = 0.4
_SIMILAR_WEIGHT = 0.3
_ADDRESS_WEIGHT = 0.15
_TAG_WEIGHT = 0.075
_CATEGORY_WEIGHT = 0.075

# Divisor applied to the address / tag / category match counts.
# NOTE(review): these counts are not bounded by 2, so one of those groups can
# contribute more than its weight; only the total score is clamped to 1.
_LIST_MATCH_DIVISOR = 2


def _first_value(frame, column):
    """Return the value stored in `column` for the first row of `frame`."""
    return frame[column].values[0]


def _similarity(left, right):
    """Return the difflib ratio (0..1) between the string forms of two values."""
    return SequenceMatcher(None, str(left), str(right)).ratio()


def _split_serialized_list(raw):
    """Drop the first and last character of `raw` and split the rest on commas."""
    return str(raw)[1:-1].split(",")


def _serialized_items(series):
    """Collect the comma-separated items of every non-missing value in `series`."""
    items = []
    for raw in series:
        if pd.isna(raw):
            continue
        items.extend(_split_serialized_list(raw))
    return items


def _values_match(osm_value, google_value, threshold, as_coordinate=False):
    """Tell whether two scalar values count as matching evidence.

    Two missing values count as a match, exactly one missing value does not.
    Otherwise the string forms must have a difflib ratio above `threshold`.
    """
    osm_missing = pd.isna(osm_value)
    google_missing = pd.isna(google_value)
    if osm_missing and google_missing:
        return True
    if osm_missing or google_missing:
        return False
    if as_coordinate:
        # NOTE(review): coordinates are compared as text after rounding to
        # 3 decimals, not by numeric distance.
        osm_value = round(float(osm_value), 3)
        google_value = round(float(google_value), 3)
    return _similarity(osm_value, google_value) > threshold


def _count_address_matches(osm_data, google_data):
    """Count (OSM address column, Google address part) pairs that are similar."""
    google_address = _first_value(google_data, "address")
    if pd.isna(google_address):
        return 0
    # Split once; the Google address does not change between OSM columns.
    address_parts = _split_serialized_list(google_address)

    matches = 0
    for column in osm_address_columns:
        osm_value = _first_value(osm_data, column)
        if pd.isna(osm_value):
            continue
        matches += sum(
            _similarity(osm_value, part) > _LIST_ITEM_MATCH_RATIO
            for part in address_parts
        )
    return matches


def _count_item_matches(osm_items, google_data, google_column):
    """Count (OSM item, Google item) pairs whose ratio exceeds the list threshold."""
    if not osm_items:
        return 0
    google_items = _serialized_items(google_data[google_column])
    return sum(
        _similarity(osm_item, google_item) > _LIST_ITEM_MATCH_RATIO
        for osm_item in osm_items
        for google_item in google_items
    )


def compute_confidence_score(osm_data, google_data):
    """Score how likely an OSM POI and a Google POI describe the same place.

    Five groups of evidence are counted, scaled and weighted:
    identical columns (0.4), similar columns (0.3), address parts (0.15),
    tags (0.075) and categories (0.075).

    Parameters
    ----------
    osm_data : pandas.DataFrame
        Rows of the OSM table selected for one OSM id. Scalar columns are read
        from the first row; "tags" and "categories" are read from every row.
    google_data : pandas.DataFrame
        Rows of the Google table selected for one internal id, read the same way.

    Returns
    -------
    float
        Confidence in the range [0.0, 1.0]. 0.0 is returned when either frame
        has no rows, because there is nothing to compare.
    """
    if osm_data.empty or google_data.empty:
        return 0.0

    identical_matches = 0
    for column in identical_columns:
        is_coordinate = column in _COORDINATE_COLUMNS
        identical_matches += _values_match(
            _first_value(osm_data, column),
            _first_value(google_data, column),
            _COORDINATE_MATCH_RATIO if is_coordinate else _IDENTICAL_MATCH_RATIO,
            as_coordinate=is_coordinate,
        )

    similar_matches = sum(
        _values_match(
            _first_value(osm_data, osm_column),
            _first_value(google_data, google_column),
            _SIMILAR_MATCH_RATIO,
        )
        for google_column, osm_column in similar_columns
    )

    address_matches = _count_address_matches(osm_data, google_data)

    # Only OSM tags written as key=value take part; the value is what is compared.
    osm_tag_values = [
        tag.split("=")[1]
        for tag in _serialized_items(osm_data["tags"])
        if "=" in tag
    ]
    tag_matches = _count_item_matches(osm_tag_values, google_data, "tags")

    category_matches = _count_item_matches(
        _serialized_items(osm_data["categories"]), google_data, "categories"
    )

    confidence_score = (
        (identical_matches / len(identical_columns)) * _IDENTICAL_WEIGHT
        + (similar_matches / len(similar_columns)) * _SIMILAR_WEIGHT
        + (address_matches / _LIST_MATCH_DIVISOR) * _ADDRESS_WEIGHT
        + (tag_matches / _LIST_MATCH_DIVISOR) * _TAG_WEIGHT
        + (category_matches / _LIST_MATCH_DIVISOR) * _CATEGORY_WEIGHT
    )

    # The list-based groups are unbounded, so cap the total at 1.
    return min(confidence_score, 1.0)




## Iterating over the rows of google_osm_poi_matching.csv and computing the confidence score for each

In [11]:

# Score every candidate pair listed in the matching table.
confidence_scores = []
for osm_id, internal_id in zip(matching_df["osm_id"], matching_df["internal_id"]):
    # Look up the POI rows that this matching entry refers to.
    osm_data = osm_poi_df.loc[osm_poi_df["osm_id"] == osm_id]
    google_data = google_poi_df.loc[google_poi_df["internal_id"] == internal_id]

    if osm_data.empty or google_data.empty:
        # An id that is absent from its POI table leaves nothing to compare;
        # use the 0.0 default instead of failing on an empty frame and
        # aborting the whole run.
        confidence_scores.append(0.0)
    else:
        confidence_scores.append(compute_confidence_score(osm_data, google_data))

# One column assignment instead of a scalar write per row.
matching_df["confidence_score"] = confidence_scores

matching_df.head()


Unnamed: 0,osm_type,osm_id,internal_id,query,confidence_score
0,way,154470603,0x130e44cd6e20475f:0x671441b8dc03be60,Kalkara,0.566667
1,node,5896564791,0x130e45014da2fa03:0xf743a120b9194c06,"Spar, Triq Tigné",0.566667
2,node,2471609507,0x130e4501edfb329b:0x9edcdba888218c47,"Bayview Hotel & Apartments, The Strand 143, Sl...",0.641667
3,node,6222651588,0x130e450a6e7438bd:0x5af47e8f69212d52,"David Hardware Store, Triq Carlo Manche",0.625
4,node,6635172974,0x130e451ad5f0c673:0x1973502978d5c025,"Chef Lee, Triq d'Argens",0.6625


## Saving the scored matches to modified_google_osm_poi_matching.csv

In [12]:
# Write the scored matching table to a new file, without the DataFrame index.
# NOTE(review): the inputs are read with sep=";" but this file is written with
# the default "," separator — confirm that downstream readers expect that.
matching_df.to_csv(
    "modified_google_osm_poi_matching.csv",
    index=False,
)