# Geonames reconciliation

### Run all cells up to "Change here", then edit the settings in the cell below it

In [None]:
!pip install pandas
!pip install requests
!pip install numpy

In [1]:
import pandas as pd
import urllib.parse , requests
import numpy as np
from IPython.display import HTML
import base64  
from datetime import datetime

In [4]:
def _fetch_json(url, timeout=10):
    """Return the decoded JSON body of a GET request to *url*, or None on failure.

    A connection error, a timeout, a non-200 status or a body that is not valid
    JSON are all reported as None so that callers can treat them as "no match".
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        return None


def geonames_iri(placename):
    """Return an IRI identifying *placename*, or '' when none is found.

    The palopenmaps.org localities API is searched first; if it yields a
    locality with a truthy id, the IRI of that locality is returned.
    Otherwise the GeoNames search API is queried (fuzzy=0.8) and the IRI of
    the first hit is returned.

    Args:
        placename: the name to look up. Anything that is not a string of at
            least two characters is not looked up at all.

    Returns:
        The IRI as a string, or '' when the input is rejected, nothing
        matches, or a lookup request fails.
    """
    if not isinstance(placename, str) or len(placename) <= 1:
        return ''

    # Percent-encode once so spaces, '&', '#' and non-ASCII characters cannot
    # corrupt either query string.
    query = urllib.parse.quote(placename)

    pomid = None
    localities = _fetch_json("https://palopenmaps.org/api/localities/?search=" + query)
    if isinstance(localities, list) and localities:
        first = localities[0]
        if isinstance(first, dict):
            pomid = first.get("id")
    if pomid:
        return "https://palopenmaps.org/api/localities/" + str(pomid)

    geonames_url = 'http://api.geonames.org/search?q=' + query + '&fuzzy=0.8&username=palread&type=json'
    data = _fetch_json(geonames_url)
    if isinstance(data, dict) and data.get("geonames"):
        first = data["geonames"][0]
        if isinstance(first, dict) and "geonameId" in first:
            return "http://www.geonames.org/" + str(first["geonameId"])

    return ''

def create_download_link(df, title="Download CSV file", filename="data.csv"):
    """Build an HTML download link that embeds *df* as a base64 CSV data URI.

    Args:
        df: the DataFrame to serialise; it is written without its index.
        title: the visible text of the link.
        filename: suffix of the suggested download name; the current POSIX
            timestamp and an underscore are prepended to it.

    Returns:
        An IPython ``HTML`` object wrapping the ``<a>`` element.
    """
    # Prefix the file name with the current time so repeated downloads differ.
    stamped_name = str(datetime.now().timestamp()) + "_" + filename

    # CSV text -> bytes -> base64 bytes -> text usable inside a data: URI.
    encoded_csv = base64.b64encode(df.to_csv(index=False).encode()).decode()

    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(template.format(payload=encoded_csv, title=title, filename=stamped_name))



def add_geonames(url, geonames_col):
    """Read an online CSV/TSV table, reconcile one column and offer it for download.

    Args:
        url: location of the table. It is parsed as tab-separated when the
            URL contains 'tsv', otherwise as comma-separated. Malformed rows
            are skipped.
        geonames_col: name of the column holding the place names.

    Returns:
        The download link produced by ``create_download_link`` for the table
        with one extra column, named ``geonames_col + '_geonames'``. Each
        value in that column is the IRI found for the place name, a space,
        and the place name itself.
    """
    separator = '\t' if 'tsv' in url else ','
    # on_bad_lines replaces error_bad_lines, which pandas 2.0 removed.
    df = pd.read_csv(url, on_bad_lines='skip', sep=separator, header=0)

    # Missing place names become empty strings, which are never looked up.
    df[geonames_col] = df[geonames_col].fillna("")
    places = df[geonames_col].tolist()

    # Query each distinct place name once; repeated names reuse the result.
    iri_by_place = {place: geonames_iri(place) for place in dict.fromkeys(places)}

    df[geonames_col + '_geonames'] = [
        str(iri_by_place[place]) + ' ' + str(place) for place in places
    ]

    # export csv
    return create_download_link(df)

# Change here

In [5]:
# 1. Set `url` to the raw location of the table to reconcile.
#    Note the prefix "https://raw.githubusercontent.com/palread/import_csv/main/".
#    add_geonames() reads the table as tab-separated when the URL contains
#    'tsv', and as comma-separated otherwise.
url = 'https://raw.githubusercontent.com/palread/import_csv/main/alsharekh%20magazine%20archive%20-%20author_sample.tsv'

# 2. Set `geonames_col` to the name of the column that holds the place names.
geonames_col = 'geonames'

# 3. Click on Run. The call must stay the last statement of the cell:
#    its return value is the download link for the enriched table.
add_geonames(url, geonames_col)