# Introduction

Purpose:
  * Retrieve the feature details for select countries from the **GeoNames** download site, http://download.geonames.org/export/dump
    * The countries of interest are defined by a comma-separated list of their ISO-2 alpha codes
      * The list is specified in the dna.config parameter, countries
    * Each country's data is captured in GeoNames download site's XX.zip files (where XX is the country's 2 character ISO code)
    * The zip files are decompressed by the code below, and the XX.txt file is parsed for the data
  * Capture all the feature-level detail from the countries:
    * GeoNames id (at tab 0)
    * ASCII name and alternate names (at tabs 2 and 3)
    * Latitude and longitude (at tabs 4 and 5)
    * Feature code and feature class (at tabs 6 and 7; for example, feature code P indicates a city, village, etc. while the feature class PPLF indicates a farm village)
      * Full details on the feature classes and codes can be found at https://www.geonames.org/export/codes.html
      * Also, these details are captured in an ontology file, geonames_featureCodes_X.ttl (where X represents the feature class code)
    * "Containing" countries (at tabs 8 and 9)
    * Elevation (at tab 15)
  * Retrieve the containment hierarchy for the entities using the REST API, api.geonames.org/hierarchy?geonameId=xxxx&username=uuuu
  * Encode all the above data as triples and save to two files:
    * geonames_XX.ttl with all the location entities for the country identified by XX
    * geonames_containment_XX.ttl with the containment details for the locations (if any are defined)
    
After the files are fully encoded, they are loaded to Stardog and moved to the directory, ../Ontologies/country_data.

Before running this program, make sure that:
  * Stardog is started (via the command, stardog-admin server start) 
  * The dna.config file (used in the second executing cell) has been updated for your environment

## Imports 

In [1]:
import os, os.path
import time
from datetime import date
import zipfile, shutil
import configparser as cp
import pickle
import xml.etree.ElementTree as ET
import requests, wget
import stardog

## String data

In [2]:
commaStr = ','
dotZipStr = '.zip'
dotTTLStr = '.ttl'
geonamesContainmentStr = 'geonames_containment_'
geonamesStr = 'geonames_'
pluralStr = ' (plural)'

stardogConfig = 'StardogConfig'

today = date.today().strftime("%B %d, %Y")    

## Pickled data

In [3]:
mappingDict = {}
with open('featureCodes_mappings.pickle', 'rb') as mdHandle:
    mappingDict = pickle.load(mdHandle)

countryDict = {}
with open('continent_country_geonames.pickle', 'rb') as cdHandle:
    countryDict = pickle.load(cdHandle)

countryCodeDict = {}
with open('country_isoCodes_and_names.pickle', 'rb') as ccHandle:
    countryCodeDict = pickle.load(ccHandle)

# Get access details from dna.config

In [4]:
# Get access details from the dna.config file, stored in the same directory as the .ipynb file
config = cp.RawConfigParser()
config.read('dna.config')
        
# Set Stardog connection details
sdConnDetails = {
    'endpoint': config.get(stardogConfig, 'endpoint'),
    'username': config.get(stardogConfig, 'username'),
    'password': config.get(stardogConfig, 'password')
}

# Set Stardog database name
dbName = config.get(stardogConfig, 'dbName')

# Set path to directory where ontologies stored
ontolPath = config.get('OntologiesConfig', 'ontolPath')
if not ontolPath.endswith('/'):
    ontolPath = f'{ontolPath}/'

# Get GeoNames user id
geonamesUser = config.get('GeoNamesConfig', 'geonamesUser')

# Get the countries of interest
countries = config.get('CountriesOfInterest','countries')
countries = [country.strip() for country in countries.split(',')]

# Process the feature data from the country-specific zips

In [5]:
# Get the features .zip files
for country in countries:
    # Does the file already exist in the directory? If so, don't repeat
    if os.path.exists(f'{ontolPath}country_data/{geonamesStr}{country}.ttl'):
        break;
    # File needs to be downloaded
    with open(f'{geonamesStr}{country}{dotTTLStr}', 'w') as ttlFile:
        # Write the prefix details
        ttlFile.write('@prefix : <urn:ontoinsights:ontology:dna:> .\n'\
                      '@prefix dna: <urn:ontoinsights:ontology:dna:> .\n'\
                      '@prefix geo: <urn:ontoinsights:ontology:geonames:> .\n'\
                      '@prefix owl: <http://www.w3.org/2002/07/owl#> .\n'\
                      '@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n'\
                      '@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n'\
                      '@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n\n'\
                      '########################################################################.\n'\
                      '# File defining location data for ')
        ttlFile.write(f'{country} ({countryCodeDict[country]})\n')
        ttlFile.write('# \n') 
        ttlFile.write(f'# Created: {today}\n')
        ttlFile.write(f'# Last modified: {today}\n')
        ttlFile.write('########################################################################\n\n')
        # Get the features .zip file for the country
        wget.download(f'http://download.geonames.org/export/dump/{country}{dotZipStr}')
        # Unpack the zip and get the contents of the XX.txt file
        with zipfile.ZipFile(f'{country}{dotZipStr}', 'r') as countryZip:
            countryData = countryZip.read(f'{country}.txt').decode('utf-8')
        # Remove the zip file 
        os.remove(f'{country}{dotZipStr}')
        
        # Parse the data in countriesData
        countryLines = countryData.split('\n')
        for locLine in countryLines:  
            locDetails = [ld.strip() for ld in locLine.split('\t')]    # Data is separated by tabs
            if len(locDetails) > 8:                                    # Ignore lines w/out sufficient tabs
                geoId = locDetails[0]
                # Determine the type of the location (based on its feature class and code)
                fCode = f'{locDetails[6]}_{locDetails[7]}'
                if fCode in mappingDict:
                    fCode = mappingDict[fCode]
                plural = False
                if pluralStr in fCode:
                    plural = True
                    fCode = fCode.replace(pluralStr, '')
                # Get the names of the location 
                label = locDetails[2]               # ASCII name
                nonAsciiName = locDetails[1]
                # Create Turtle for the country's data, with the GeoNames ID as the local part of the entity's IRI
                ttlFile.write(f'geo:{geoId} rdf:type :{fCode} ; \n  rdfs:label "{label}"@en ; \n')
                ttlFile.write(f'  :latitude "{locDetails[4]}"^^xsd:decimal ; \n  :longitude "{locDetails[5]}"^^xsd:decimal ')
                synNames = ''
                if label != nonAsciiName:           # Include the non-ASCII location name in synonyms
                    synNames = nonAsciiName
                if locDetails[3]:
                    if synNames:
                        synNames += ',' + locDetails[3]
                    else:
                        synNames = locDetails[3]
                if synNames:
                    synNames = synNames.replace(label, '').replace(nonAsciiName, '')    # Get rid of duplicates
                    # Clean up the text
                    synNames = synNames.replace(',,', ',')                             
                    if synNames.startswith(','):
                        synNames = synNames[1:]                           
                    if synNames.endswith(','):
                        synNames = synNames[:-1]
                    # Make sure that there still is some text and then format as individual strings
                    if synNames:
                        synNames = synNames.replace(',', '", "')
                        ttlFile.write(f'; \n  :synonym "{synNames}" ')
                # Write 'plural True' if applicable
                if plural:
                    ttlFile.write('; \n  :plural true ')
                # Write elevation details if available
                if locDetails[15]:      
                    ttlFile.write(f'; \n  :altitude_meters "{locDetails[15]}"^^xsd:decimal .\n')
                else:
                    ttlFile.write('. \n')
                # Indicate that the location is part of a country, if defined
                containingCountrySet = set()
                if locDetails[8]:
                    containingCountrySet.add(locDetails[8])
                if locDetails[9]:
                    containingCountries = locDetails[9].split(',')
                    for containing in containingCountries:
                        containingCountrySet.add(containing)
                for cc in containingCountrySet:
                    if cc in countryDict:
                        ttlFile.write(f'geo:{countryDict[cc]} :has_component geo:{geoId} . \n')
                # Finished processing the location, add a new line
                ttlFile.write('\n') 

# Get the place hierarchy details

Note that this execution is LONG-RUNNING due to need to pace requests to the GeoNames server. For example, getting the containment data for the Ethiopian locations (extracted from ET.zip) resulted in almost 20000 locations (approx 20-21  hours elapsed time).

In [8]:
# Create a set to hold the unique container-contained pairs (to avoid duplication of data in the TTL file)
containingSet = set()
count = 0

# Get the new locations that were encoded above
for country in countries:
    # Set up the output file
    with open(f'geonames_containment_{country}.ttl', 'w') as outTTLFile:
            outTTLFile.write('@prefix : <urn:ontoinsights:ontology:dna:> .\n'\
                             '@prefix dna: <urn:ontoinsights:ontology:dna:> .\n'\
                             '@prefix geo: <urn:ontoinsights:ontology:geonames:> .\n\n'\
                             '########################################################################\n'\
                             '# File defining containment data for locations in ')
            outTTLFile.write(f'{country} ({countryCodeDict[country]})\n')
            outTTLFile.write('# \n')
            outTTLFile.write(f'# Created: {today}\n')
            outTTLFile.write(f'# Last modified: {today}\n')
            outTTLFile.write('########################################################################\n\n')
    # Process through the country-specific GeoNames file to retrieve the IDs
    lineNumber = 0
    found = False
    with open(f'geonames_{country}.ttl', 'r') as inTTLFile:
        for line in inTTLFile:
            lineNumber += 1
            if not ' rdf:type ' in line:
                continue                # Only care about retrieving the geoIds, found in the string, geo:XXXX rdf:type loc:YYYY
            geoId = line.split()[0][4:]  
            if geoId == '339927':
                found = True
            if not(found):
                continue
            try:
                response = requests.get('http://api.geonames.org/hierarchy?geonameId={}&username={}'.
                                        format(geoId, geonamesUser))
            except:                     # In-Process details when error occurred
                print(f'Country: {country}')
                print(f'Count: {count}')
                print(f'Line number: {lineNumber}')
                print(f'GeoId: {geoId}')
            count += 1
            root = ET.fromstring(response.content)
            containing = []
            for child in root:
                if child.tag == 'geoname':
                    # Skip the obvious containers (Earth, the continent and the country)
                    if child[0].text == 'Earth' or child[8].text == 'CONT' or child[8].text == 'PCLI':
                        continue               
                    containing.append(child[4].text)
            
            # If not > 1 containing entities, the location is either the country itself
            #   or is contained by the country (which is already known from above)
            if len(containing) > 1:               
                for i in range(1, len(containing)):
                    if not (containing[i-1], containing[i]) in containingSet:
                        containingSet.add((containing[i-1], containing[i]))
                        with open(f'{geonamesContainmentStr}{country}{dotTTLStr}', 'a') as containingTTLFile:
                            containingTTLFile.write(
                                f'geo:{containing[i-1]} :has_component geo:{containing[i]} . \n')
           
            # Pace requests to geonames server (< 1000/hr)
            time.sleep(3.75)            # Pace requests to geonames server (< 1000/hr)                
            # Pace requests to geonames server (< 20000/day)
            if count > 19900:              # Assuming 1 request every 3.75 seconds = 20.73 hrs to 19900 requests
                time.sleep(12600)          # Sleeping 3.5 hours resets the 24hr clock
                count = 0

# Print count details
print(f'Total GeoNames requests: {count}')

Total GeoNames requests: 7685


# Store the data in the Stardog repository and move the country files 

The files are moved to the directory specified by the ontolPath parameter in the dna.config file.

In [8]:
for country in countries:
    # Open connection to Stardog
    conn = stardog.Connection(dbName, **sdConnDetails)
    # Load ontologies
    conn.begin()
    conn.add(stardog.content.File(f'{geonamesStr}{country}{dotTTLStr}'))
    conn.add(stardog.content.File(f'{geonamesContainmentStr}{country}{dotTTLStr}'))
    conn.commit()
    # Move files
    shutil.move(f'{geonamesStr}{country}{dotTTLStr}', 
                f'{ontolPath}country_data/{geonamesStr}{country}{dotTTLStr}')
    shutil.move(f'{geonamesContainmentStr}{country}{dotTTLStr}', 
                f'{ontolPath}country_data/{geonamesContainmentStr}{country}{dotTTLStr}')