# Create GeoNames Country Ontology File

# Description

The purpose of this notebook is to create the geonames-countries.ttl file from the countryInfo.txt file, which is available at http://download.geonames.org/export/dump/. It is assumed that countryInfo.txt has already been downloaded from the GeoNames site.

The steps to create the TTL file are to:
  * Extract the following detail from countryInfo.txt:
    * A country's ISO (2), ISO3 and GeoNames ID codes (at tabs 0, 1 and 16)
    * "Containing" continent (at tab 8)
    * Currency code and name (at tabs 10 and 11)
  * Retrieve additional relationships for the countries by:
    * Obtaining neighbors using the REST API, api.geonames.org/neighbours?geonameId=xxxx&username=uuuu
  * Encode the extracted details using the Turtle syntax
  
After completion, the geonames-countries.ttl file is moved to the ../ontologies directory.

In addition, a pickled dictionary of continent and country to geonameId mappings is created and stored (in the processing directory) as continent-country-geonames.pickle.

## Imports

In [1]:
import configparser as cp
import pickle
import requests
import xml.etree.ElementTree as ET
import time

# Get configuration details from the dna.config file

In [2]:
# Load dna.config, which is expected to sit in the same directory as the notebook
config = cp.RawConfigParser()
config.read(['dna.config'])

# GeoNames account name, sent as the `username` parameter of the API requests
geonamesUser = config.get(section='GeoNamesConfig', option='geonamesUser')

# Process the countryInfo.txt file

In [3]:
# Read the whole countryInfo.txt file into memory as a single string.
# The encoding is given explicitly because the GeoNames dump files are UTF-8;
# relying on the platform default would mis-decode non-ASCII names on systems
# whose locale encoding is not UTF-8.
with open('countryInfo.txt', 'r', encoding='utf-8') as handle:
    countryInfoData = handle.read()

In [4]:
# Mapping of continent/country codes to GeoNames IDs, seeded with the continents.
# Country ISO-2 codes are added to this same dictionary while countryInfo.txt is
# processed.
# NOTE(review): some country ISO-2 codes (e.g. AF, AS, NA, SA) look like they
#   collide with the continent codes below - confirm how readers of the pickled
#   dictionary expect those keys to resolve.
continentCountryDict = {
    'AF': '6255146',    # Africa
    'AS': '6255147',    # Asia
    'EU': '6255148',    # Europe
    'NA': '6255149',    # North America
    'OC': '6255151',    # Oceania
    'SA': '6255150',    # South America
    'AN': '6255152',    # Antarctica
}

In [5]:
# Create the Turtle file holding the country and currency definitions.
#
# Input:  countryInfoData (text of countryInfo.txt) and continentCountryDict
# Output: geonames-countries.ttl (the same file name that the neighbors cell
#         appends to); continentCountryDict gains an ISO-2 -> GeoNames ID entry
#         for every country row that is processed.

# Snapshot the continent IDs before any country is added to the dictionary.
# Country ISO-2 codes are stored in continentCountryDict under the same kind of
# key as the continent codes, so a country such as 'AF' would otherwise replace
# the continent 'AF' for every row processed after it.
# (This assumes the dictionary cell was run just before this one.)
continentIds = dict(continentCountryDict)

with open('geonames-countries.ttl', 'w', encoding='utf-8') as ttlFile:
    # Write the prefix details
    ttlFile.write('@prefix : <urn:ontoinsights:ontology:dna:> . \n'
                  '@prefix dna: <urn:ontoinsights:ontology:dna:> . \n'
                  '@prefix geo: <urn:ontoinsights:ontology:geonames:> . \n'
                  '@prefix owl: <http://www.w3.org/2002/07/owl#> . \n'
                  '@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . \n'
                  '@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . \n'
                  '@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . \n\n')
    # Set of (currency code, currency name) tuples seen while parsing
    currencySet = set()
    # Parse the data from the countryInfo.txt file
    for infoLine in countryInfoData.split('\n'):
        if infoLine.startswith('#'):           # Ignore comment lines
            continue
        infoDetails = infoLine.split('\t')     # Data is separated by tabs
        if len(infoDetails) <= 16:             # Ignore lines w/out sufficient tabs
            continue
        countryId = infoDetails[16]
        currencyCode = infoDetails[10]
        # Country definition, using the GeoNames ID as the entity URI
        ttlLines = [f'geo:{countryId} a :Country ;',
                    f'  :iso_alpha2 "{infoDetails[0]}" ;',
                    f'  :iso_alpha3 "{infoDetails[1]}" .']
        # Write currency details if available
        if currencyCode.strip():
            ttlLines.append(f'geo:{countryId} :has_currency geo:currency{currencyCode} .')
            currencySet.add((currencyCode, infoDetails[11]))
        # Indicate that the country is part of a continent; an unrecognized
        # continent code is skipped rather than written out as "geo:None"
        continentId = continentIds.get(infoDetails[8])
        if continentId is not None:
            ttlLines.append(f'geo:{continentId} :has_component geo:{countryId} .')
        # Capture the ISO code to GeoName ID mapping
        continentCountryDict[infoDetails[0]] = countryId
        # Each country's triples are followed by a blank line
        ttlFile.write('\n'.join(ttlLines) + '\n\n')
    # Also save the country currency details
    for (currAbbrev, currName) in currencySet:
        ttlFile.write(f'geo:currency{currAbbrev} a :Currency ;\n'
                      f'  rdfs:label "{currName}"@en .\n\n')

In [6]:
# Persist the code -> GeoNames ID dictionary to disk so that it can be reloaded later
with open('continent-country-geonames.pickle', mode='wb') as pickleFile:
    pickle.dump(continentCountryDict, pickleFile)

# Get neighbor details using the GeoNames API

In [7]:
# Using the geonameIds stored in continentCountryDict, get each one's neighbors
#   and store them as a set (of tuples) where the lesser GeoNameId is first.
# Storing each pair in a fixed order means a border reported from both sides
#   (A->B and B->A) ends up as a single entry.
# NOTE(review): the dictionary also holds continent IDs; this assumes the
#   neighbours service simply returns no <geoname> children for them - confirm.
neighborSet = set()
for geoId in continentCountryDict.values():
    response = requests.get(f'http://api.geonames.org/neighbours?geonameId={geoId}&username={geonamesUser}')
    root = ET.fromstring(response.content)
    # Look the ID up by tag name rather than by child position, so the code
    # does not depend on the order of the elements inside <geoname>
    neighbors = [geoname.findtext('geonameId') for geoname in root.findall('geoname')]
    for neighbor in neighbors:
        if not neighbor:        # <geoname> entry without a usable geonameId
            continue
        if int(geoId) < int(neighbor):
            neighborSet.add((geoId, neighbor))
        else:
            neighborSet.add((neighbor, geoId))

In [8]:
# Append one ":meets" triple per neighbor pair to the geonames-countries.ttl file
with open('geonames-countries.ttl', 'a') as ttlFile:
    ttlFile.writelines(f'geo:{first} :meets geo:{second} .\n\n'
                       for (first, second) in neighborSet)

In [17]:
# Scratch query: search GeoNames for a place name and print the name, feature
# class and feature code of the first hit.
# The account name comes from dna.config (geonamesUser), as in the neighbours
# request, instead of being hard-coded in the URL.
response = requests.get(f'http://api.geonames.org/search?q=znojmo&maxRows=1&username={geonamesUser}')
root = ET.fromstring(response.content)
print(response.content)
# NOTE: despite its name, `country` holds the <name> element of the first hit
country = root.findall('./geoname/name')[0]
feature = root.findall('./geoname/fcl')[0]
fcode = root.findall('./geoname/fcode')[0]
print(country.text)
print(feature.text)
print(fcode.text)

b'<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<geonames style="MEDIUM">\n    <totalResultsCount>708</totalResultsCount>\n    <geoname>\n        <toponymName>Znojmo</toponymName>\n        <name>Znojmo</name>\n        <lat>48.8555</lat>\n        <lng>16.0488</lng>\n        <geonameId>3061344</geonameId>\n        <countryCode>CZ</countryCode>\n        <countryName>Czechia</countryName>\n        <fcl>P</fcl>\n        <fcode>PPLA2</fcode>\n    </geoname>\n</geonames>\n'
Znojmo
P
PPLA2


In [13]:
# Show the raw XML bytes of `response`, i.e. of whichever request cell was run last
print(response.content)

b'<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<geonames style="MEDIUM">\n    <totalResultsCount>1</totalResultsCount>\n    <geoname>\n        <toponymName>Union of Soviet Socialist Republics</toponymName>\n        <name>Soviet Union</name>\n        <lat>60</lat>\n        <lng>98</lng>\n        <geonameId>8354411</geonameId>\n        <countryCode/>\n        <countryName/>\n        <fcl>A</fcl>\n        <fcode>PCLH</fcode>\n    </geoname>\n</geonames>\n'


In [14]:

# Pull the <name>, <fcl> and <fcode> elements of the first <geoname> entry in
# `root` (parsed by an earlier cell) and print their text, one value per line.
# NOTE: despite its name, `country` holds the <name> element.
country, feature, fcode = [
    root.findall(path)[0]
    for path in ('./geoname/name', './geoname/fcl', './geoname/fcode')
]
print(country.text, feature.text, fcode.text, sep='\n')

Soviet Union
A
PCLH
