The goal of these modules is to streamline checking water fountains against existing items in Wikidata. The input should be a pandas DataFrame with the following columns: 'X' = longitude, 'Y' = latitude, 'name' = unique name of the item.

In [1]:
import pandas as pd
import import_ipynb
import warnings; warnings.simplefilter('ignore')
from pandas2quickstatements import write_query, identify_nearest_fountains, createline, process_coordinates

importing Jupyter notebook from pandas2quickstatements.ipynb
191210_223020
Python v 3.6.5


In [2]:
# Load the fountain data from the semicolon-separated CSV file into pandas.
df = pd.read_csv("fontaines-a-boire.csv", sep=";")
location = "Paris"

# Work on an explicit copy of the three columns of interest: adding columns to a
# plain column slice of `df` is chained assignment and may not behave as intended.
paris_fountain_data = df[['ID', 'Geo Point', 'A_BOIRE']].copy()

print(paris_fountain_data.head())

# 'Geo Point' holds "<Y>,<X>" as text; split it once and store both parts as floats.
geo_split = paris_fountain_data['Geo Point'].str.split(",", n=1, expand=True)
paris_fountain_data['Y'] = geo_split[0].astype(float)
paris_fountain_data['X'] = geo_split[1].astype(float)

# Display only: the result is not assigned, so 'Geo Point' stays in paris_fountain_data.
paris_fountain_data.drop(columns='Geo Point')

       ID                    Geo Point  A_BOIRE
0  1032.0  48.8455274608,2.25550660889        1
1  1019.0  48.8381521871,2.25870518213        0
2  1036.0  48.8572651165,2.26686423645        1
3   986.0   48.8530939586,2.2705187223        1
4   856.0  48.8409169077,2.27577864693        1


Unnamed: 0,ID,A_BOIRE,Y,X
0,1032.0,1,48.845527,2.255507
1,1019.0,0,48.838152,2.258705
2,1036.0,1,48.857265,2.266864
3,986.0,1,48.853094,2.270519
4,856.0,1,48.840917,2.275779
...,...,...,...,...
1417,1050.0,1,48.889639,2.298853
1418,1317.0,1,48.855768,2.400955
1419,406.0,1,48.854852,2.394567
1420,543.0,0,48.841109,2.393067


In [3]:
# Margin added on every side of the data extent, in degrees (about 20-30 meters).
buffer = 0.0003

longitudes = paris_fountain_data['X']
latitudes = paris_fountain_data['Y']

# Bounding box of all fountains, widened by the margin.
bounds = {
    'minX': longitudes.min() - buffer,
    'minY': latitudes.min() - buffer,
    'maxX': longitudes.max() + buffer,
    'maxY': latitudes.max() + buffer,
}
print(bounds)

{'minX': 2.24668232248, 'minY': 48.8181766942, 'maxX': 2.45685206475, 'maxY': 48.901302660800006}


In [5]:
# Rebind paris_fountain_data to the result of identify_nearest_fountains (imported from
# pandas2quickstatements). The displayed result carries additional nearest_* columns,
# place_label and match_found, which write_query1 reads later.
# NOTE(review): `bounds` computed in the previous cell is not passed here -- confirm
# whether the helper is expected to use it.
paris_fountain_data = identify_nearest_fountains(paris_fountain_data, location)

paris_fountain_data.head()

Unnamed: 0,ID,Geo Point,A_BOIRE,Y,X,nearest_qid,nearest_has_label_en,place_label,nearest_has_date,nearest_has_operator,nearest_has_code,nearest_has_water_type,nearest_has_described_at_url,match_found
0,1032.0,"48.8455274608,2.25550660889",1,48.845527,2.255507,Q76424782,True,Fountain,False,False,False,False,False,match
1,1019.0,"48.8381521871,2.25870518213",0,48.838152,2.258705,Q77789702,True,Fountain,False,False,False,False,False,match
2,1036.0,"48.8572651165,2.26686423645",1,48.857265,2.266864,Q77789728,True,Fountain,False,False,False,False,False,match
3,986.0,"48.8530939586,2.2705187223",1,48.853094,2.270519,Q77789750,True,Fountain,False,False,False,False,False,match
4,856.0,"48.8409169077,2.27577864693",1,48.840917,2.275779,Q77789777,True,Fountain,False,False,False,False,False,match


In [8]:
from datetime import datetime as dt
# strftime pattern (yymmdd_HHMMSS) used to timestamp the output file name in write_query1
dtFmt = "%y%m%d_%H%M%S"
import io

def process_coordinates(x, y):
    """Format a coordinate pair as '@<y>/<x>' with eight decimal places each.

    Note the order swap: the second argument (y) is written first.
    """
    return f'@{y:1.8f}/{x:1.8f}'


def createline(lines, item, prop, value, ref, qualifiers=None):
    """Append one QuickStatements v1 command to ``lines`` and return the list.

    The command is ``item``, ``prop`` and ``value`` separated by tabs, followed by
    a tab-separated prop/value pair for every qualifier, then ``ref`` verbatim and
    a newline. Nothing is appended when ``value`` is ``''`` or ``'""'``.

    Args:
        lines: List of command strings; modified in place.
        item: Entity the statement applies to (a QID or ``'LAST'``).
        prop: Property or command code, e.g. ``'P31'`` or ``'Len'``.
        value: Already formatted statement value.
        ref: Text appended after the qualifiers; pass ``''`` for no reference.
            It must carry its own leading tab because it is concatenated as is.
        qualifiers: Optional iterable of dicts with ``'prop'`` and ``'value'``
            keys. ``None`` (the default) means no qualifiers.

    Returns:
        The same ``lines`` list that was passed in.
    """
    # Empty values (bare or as an empty quoted string) produce no command.
    if value == '' or value == '""':
        return lines

    # `None` replaces the former mutable `[]` default; both mean "no qualifiers".
    qualifier_part = ''.join(
        '\t{}\t{}'.format(q['prop'], q['value']) for q in (qualifiers or ())
    )
    lines.append('{}\t{}\t{}'.format(item, prop, value) + qualifier_part + ref + '\n')
    return lines

def write_query1(df, location, ref):
    """Build QuickStatements v1 commands for the fountains in ``df`` and write them to a file.

    Each row is handled according to its ``match_found`` value:

    * ``'no match'`` -- a new entity is created (``CREATE`` / ``LAST``) with
      coordinates (P625) and an instance-of statement (P31).
    * ``'match'``    -- the existing entity in ``nearest_qid`` is updated; a short
      URL (P973) is added unless ``nearest_has_described_at_url`` is set.
    * ``'unclear'``  -- the row is printed and skipped.
    * anything else  -- the row is reported and skipped.

    An English label is added for new entities and for matched entities that
    have none.

    Args:
        df: DataFrame providing the columns ``match_found``, ``nearest_qid``,
            ``X``, ``Y``, ``A_BOIRE``, ``nearest_has_described_at_url`` and
            ``nearest_has_label_en``.
        location: Text inserted into the output file name.
        ref: Reference suffix appended to the coordinate and instance-of
            statements; it is concatenated as is, so it must start with a tab.

    Returns:
        None. The commands are written to
        ``quickstatement_commands_<location>_drink_<timestamp>.txt`` in the
        current working directory, and a summary line is printed.
    """
    lines = []
    # `i` is a 1-based row counter used only in progress messages.
    for i, (index, row) in enumerate(df.iterrows(), start=1):
        status = row['match_found']

        # either create new or edit existing entity
        if status == 'no match':
            # create a new fountain
            lines.append('CREATE\n')
            item = 'LAST'
        elif status == 'match':
            # update existing fountain
            item = row['nearest_qid']
        elif status == 'unclear':
            print('unclear match')
            print(row)
            continue
        else:
            # Without this branch `item` would be unbound on the first row or,
            # worse, silently reused from the previous row, so statements would
            # be written to the wrong entity.
            print('skipping row {}: unexpected match_found value {!r}'.format(index, status))
            continue

        if item == 'LAST':
            # Add this basic information only if creating a new entity
            # coordinates
            lines = createline(lines, item, 'P625', process_coordinates(row['X'], row['Y']), ref)
            if row['A_BOIRE'] == 1:
                # instance of drinking fountain
                lines = createline(lines, item, 'P31', 'Q1630622', ref)
            else:
                lines = createline(lines, item, 'P31', 'Q483453', ref)
        else:
            # Existing entity: add the short URL only if it has none yet
            if row['nearest_has_described_at_url']:
                print(str(i)+' '+item+' has ready a shortUrl')
            else:
                #short URL as per https://github.com/water-fountains/proximap/issues/244
                lines = createline(lines, item, 'P973', '"https://h2o.do/'+item+'"', '')

        # For other properties, add information if the entity is new or if property does not yet exist

        # label in english
        if item == 'LAST' or not row['nearest_has_label_en']:
            lines = createline(lines, item, 'Len', 'Fountain', '')

        # creation date
        #if item == 'LAST' or not row['nearest_has_date']:
        #    lines = createline(lines, item, 'P571', process_year(row['date']))

        # operated by t.b.d. operator per location
        # lines = createline(lines, item, 'P137', 'Q72936279')

        # catalog number can always be added (it is hard to check for)
        #lines = createline(lines, item, 'P528', '"{}"'.format(row['operator_id']), [{
        #    'prop': 'P972',
        #    'value': 'Q53629101'
        #}])

    # Write the data
    quickStatFileName = "quickstatement_commands_{}_drink_".format(location)+dt.now().strftime(dtFmt)+".txt"
    with io.open(quickStatFileName, "w", encoding='utf8') as f:
        f.writelines(lines)
    print("wrote '"+quickStatFileName+"' with "+str(len(lines))+" lines")

In [9]:
# Reference suffix handed to write_query1: a tab-separated S248 / Q76424180 pair,
# including the leading tab because it is concatenated onto each statement as is.
statedId = "\tS248\tQ76424180"
# Writes the QuickStatements command file and prints its name and line count.
write_query1(paris_fountain_data, location, statedId)

575 Q18146923 has ready a shortUrl
unclear match
ID                                                      296
Geo Point                       48.8673321136,2.31791349622
A_BOIRE                                                   0
Y                                                   48.8673
X                                                   2.31791
nearest_qid                                        Q3076221
nearest_has_label_en                                   True
place_label                       fontaine des Ambassadeurs
nearest_has_date                                       True
nearest_has_operator                                  False
nearest_has_code                                      False
nearest_has_water_type                                False
nearest_has_described_at_url                          False
match_found                                         unclear
Name: 1094, dtype: object
wrote 'quickstatement_commands_Paris_drink_191210_223243.txt' with 2335 lines
