In [None]:
# !pip install pandas
# !pip install pyarrow
# !pip install geopandas
# !pip install matplotlib
# !pip install seaborn
# !pip install pyproj
# !pip install geopy
# !pip install "geopy[aiohttp]"
# !pip install tqdm
# !pip install pycountry
import logging
import os
from random import randint
from time import sleep
import pytz

import geopandas as gpd
import pandas as pd
from geopy.adapters import AioHTTPAdapter
from geopy.exc import GeocoderServiceError, GeocoderTimedOut
from geopy.geocoders import Nominatim
from shapely.geometry import Polygon
from tqdm import tqdm
import re
from ast import literal_eval

In [157]:
def transform_polygon_string_to_tuple(polygon_string):
    """Reduce a WKT-style polygon string to its first point as an int tuple.

    Takes a string such as
    ``POLYGON((x.xxx y.yyy, x.xxx y.yyy, x.xxx y.yyy, x.xxx y.yyy, x.xxx y.yyy))``
    and returns the first coordinate pair, with both numbers truncated
    toward zero by ``int(float(...))``.

    Args:
        polygon_string: text containing ``POLYGON((`` (optionally written as
            ``POLYGON ((``) followed by two whitespace-separated numbers.

    Returns:
        A tuple of two ints, in the order the numbers appear in the string.

    Raises:
        ValueError: if the string does not start a polygon with two numbers.
    """
    # A number needs at least one digit, so an empty field can no longer
    # "match" and blow up later inside float().
    number = r"-?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
    polygon_regex = r"POLYGON\s*\(\((" + number + r")\s+(" + number + r")"
    result = re.search(polygon_regex, polygon_string)
    if result is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError(
            "Cannot extract the first point from polygon string: {!r}".format(
                polygon_string))
    return tuple(int(float(part)) for part in result.groups())


def build_coordinate_db(db_file, dataset_dir='dataset'):
    """Collect the unique tile coordinates of the dataset into a CSV file.

    Steps:
    1. Write a header-only CSV (columns ``long_lat`` and ``country``) to
       ``db_file``, replacing any existing file.
    2. Walk ``dataset_dir`` and read the ``tile`` column of every
       ``.parquet`` file found.
    3. Convert each tile string with transform_polygon_string_to_tuple()
       into a tuple so the country can later be queried per tuple.
    4. Append the unique tuples of each file to ``db_file`` with the
       placeholder country ``'MBD'``.
    5. After the walk, re-read the CSV, drop duplicate rows and rewrite it.

    Args:
        db_file: path of the CSV file to create.
        dataset_dir: root directory searched recursively for parquet files.
            Defaults to ``'dataset'``.
    """
    # Start from a header-only file; the per-file results are appended below.
    pd.DataFrame(columns=['long_lat', 'country']).to_csv(
        db_file, index=False, encoding='utf-8')

    for subdir, dirs, files in os.walk(dataset_dir):
        # Sorting in place makes os.walk visit sub-directories in a stable order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.parquet'):
                continue
            file_name = os.path.join(subdir, file)
            print("Reading now from " + file_name)
            # Only the 'tile' column is used, so skip loading the others.
            source_parquet = pd.read_parquet(file_name,
                                             engine='pyarrow',
                                             columns=['tile'])

            print("Converting 5 points polygons to 1 point")
            list_of_tuples_tiles = source_parquet.tile.apply(
                transform_polygon_string_to_tuple)
            print("Getting the unique values of the polygons")
            unique_tiles = sorted(set(list_of_tuples_tiles))
            print("Writing the unique values to CSV file ")
            db = pd.DataFrame({'long_lat': unique_tiles, 'country': 'MBD'})
            db.to_csv(db_file,
                      mode='a',
                      index=False,
                      header=False,
                      encoding='utf-8')
            print("Finished extracting unique coordinate points for " +
                  file_name)
            print("\n")

    # The same tuple can come from several parquet files; keep one row each.
    print("Removing duplicate values from the csv")
    db = pd.read_csv(db_file).drop_duplicates()
    db.to_csv(db_file, index=False, encoding='utf-8')
    print("Finished creating the database")

In [158]:
def perform_geo_reverse_request(geolocator, lat_long_point):
    """Call the geocoder to fetch a country name for one coordinate.

    Args:
        geolocator: object with a ``reverse(point, timeout=..., language=...)``
            method whose result is ``None`` or has a ``raw`` dict.
        lat_long_point: the point passed to ``geolocator.reverse``.

    Returns:
        The first available of the address' ``country``, its ``locality``,
        or the country name looked up in ``pytz.country_names`` from its
        ``country_code``. Returns ``'undefined'`` when the geocoder returns
        nothing or none of these can be resolved.

    Raises:
        Whatever ``geolocator.reverse`` raises; errors are not handled here.
    """
    location = geolocator.reverse(lat_long_point, timeout=5, language="en")
    if location is None:
        return 'undefined'

    # A result without an 'address' entry is treated like an empty address.
    address = location.raw.get('address') or {}

    country = address.get('country')
    if country is None:
        # For places like Gulf of Thailand, Bermuda Triangle and
        # VETERINARY STATION MALI LOŠINJ
        country = address.get('locality')
    if country is None:
        # Sometimes only a country code is available; use pytz to turn it
        # into a country name.
        country_code = address.get('country_code')
        if country_code is not None:
            try:
                country = pytz.country_names[country_code]
            except KeyError:
                country = None

    return country if country is not None else 'undefined'

def convert_single_polygon_to_country(poly_item, geolocator=None, max_retries=3):
    """Resolve a (longitude, latitude) tuple to a country name via Nominatim.

    Args:
        poly_item: two-element tuple ``(longitude, latitude)``.
        geolocator: optional geocoder to reuse across calls. When omitted, a
            new ``Nominatim`` client with a randomised user agent is created
            for this call.
        max_retries: how many times a request is repeated after a
            ``GeocoderTimedOut`` before giving up.

    Returns:
        The value returned by perform_geo_reverse_request() (which is
        ``'undefined'`` when the geocoder has no result), or ``None`` when
        the request failed: retries exhausted, service error, or any other
        exception. All failures are written to the root logger.
    """
    logger = logging.getLogger()
    (longitude, latitude) = poly_item

    if geolocator is None:
        my_user_agent = 'UTwente_Managing_Big_Data_project_{}'.format(randint(10000, 99999))
        # OpenStreetMap API
        geolocator = Nominatim(user_agent=my_user_agent)

    # OpenStreetMap API expects first latitude and then longitude
    lat_long_point = str(latitude) + "," + str(longitude)

    for attempt in range(max_retries + 1):
        try:
            return perform_geo_reverse_request(geolocator, lat_long_point)
        except GeocoderTimedOut:
            # Must be handled before GeocoderServiceError. Every attempt is
            # inside the try, so a repeated timeout cannot escape uncaught.
            if attempt >= max_retries:
                logger.error(
                    'ERROR: GeocoderTimedOut, giving up after {} retries'.format(max_retries))
                logger.error(str(poly_item))
                return None
            logger.error('ERROR: GeocoderTimedOut, we will keep retrying...')
            # Wait a random 1-5 seconds before the next attempt.
            sleep(randint(1 * 100, 5 * 100) / 100)
        except GeocoderServiceError as e:
            logger.error('CONNECTION ERROR: GeocoderServiceError {}'.format(e))
            return None
        except Exception as e:
            logger.error('ERROR: Terminating due to an exception {}'.format(e))
            logger.error(str(poly_item))
            logger.error(e)
            return None
    return None

In [163]:
def fill_coordinate_db_with_countries(source_file, target_file):
    """Fill the ``country`` column of a coordinate CSV and save the result.

    Reads ``source_file`` (columns ``long_lat`` and ``country``), replaces
    each row's ``country`` with the result of
    convert_single_polygon_to_country() for that row's ``long_lat`` tuple,
    and writes the table to ``target_file``.

    A row whose lookup raises keeps its previous ``country`` value and the
    error is printed, so one bad row no longer aborts the run before the
    results are written.

    Args:
        source_file: path of the CSV to read.
        target_file: path of the CSV to write.
    """
    db = pd.read_csv(source_file, sep=',')
    # The tuples are stored as text such as "(4, 52)"; turn them back into tuples.
    db.long_lat = db.long_lat.apply(literal_eval)
    index = 0
    for i in tqdm(range(0, db.shape[0])):
        try:
            # The lookup itself must be inside the try for the handler to help.
            db.at[i, 'country'] = convert_single_polygon_to_country(db.at[i, 'long_lat'])
            index = index + 1
        except Exception as e:
            print('ERROR: because  {}'.format(e))
    print(str(index) + " rows have been processed")
    db.to_csv(target_file, sep=',', index=False, encoding='utf-8')


def check_if_built_coordinate_db_contains_missing_values(db_file):
    """Re-query rows of a filled coordinate CSV that still lack a country.

    Reads ``db_file``, and for every row whose ``country`` is the placeholder
    ``'MBD'`` or null calls convert_single_polygon_to_country() on the row's
    ``long_lat`` tuple. The table is then written back to ``db_file``.

    Args:
        db_file: path of the CSV to read and overwrite.
    """
    coordinates = pd.read_csv(db_file, sep=',')
    # long_lat is stored as text; convert it back to real tuples.
    coordinates.long_lat = coordinates.long_lat.apply(literal_eval)

    requeried = 0
    for row in tqdm(range(0, coordinates.shape[0])):
        current = coordinates.at[row, 'country']
        # 'MBD' was the initial placeholder; only it and null values need a
        # new lookup, everything else is skipped.
        if current != 'MBD' and not pd.isnull(current):
            continue
        try:
            coordinates.at[row, 'country'] = convert_single_polygon_to_country(
                coordinates.at[row, 'long_lat'])
            requeried = requeried + 1
        except Exception as err:
            print('ERROR: because  {}'.format(err))

    print(str(requeried) + " rows have been processed")
    coordinates.to_csv(db_file, sep=',', index=False, encoding='utf-8')

#### Build the countries database

In [160]:
# Send log records to logger.log; filemode='w' overwrites the file on each run.
logging.basicConfig(filename="logger.log", format='%(asctime)s %(message)s \n', filemode='w')
# Build the coordinate database from the parquet files of the dataset.
# Every row gets the placeholder country 'MBD' until the countries are fetched.
build_coordinate_db('dataset/long_lat_db.csv')

Reading now from dataset/2019-q1/2019-01-01_performance_fixed_tiles.parquet
Converting 5 points polygons to 1 point
Getting the unique values of the polygons
Writing the unique values to CSV file 
Finished extracting unique coordinate points for dataset/2019-q1/2019-01-01_performance_fixed_tiles.parquet


Reading now from dataset/2019-q1/2019-01-01_performance_mobile_tiles.parquet
Converting 5 points polygons to 1 point
Getting the unique values of the polygons
Writing the unique values to CSV file 
Finished extracting unique coordinate points for dataset/2019-q1/2019-01-01_performance_mobile_tiles.parquet


Reading now from dataset/2019-q2/2019-04-01_performance_fixed_tiles.parquet
Converting 5 points polygons to 1 point
Getting the unique values of the polygons
Writing the unique values to CSV file 
Finished extracting unique coordinate points for dataset/2019-q2/2019-04-01_performance_fixed_tiles.parquet


Reading now from dataset/2019-q2/2019-04-01_performance_mobile_tiles.parquet


In [None]:
# Fetch the country name for every (long, lat) tuple in the database file.
# Executing this line calls the OpenStreetMap API once per row (13200 times). Be careful :)
fill_coordinate_db_with_countries('dataset/long_lat_db.csv','dataset/long_lat_db_filled.csv')

In [162]:
# Check the newly built database file for rows whose country is still missing
# or still the 'MBD' placeholder, and query those rows again.
check_if_built_coordinate_db_contains_missing_values('dataset/long_lat_db_filled.csv')

100%|██████████| 29/29 [00:00<00:00, 12287.59it/s]

0 rows have been processed so far



