# Quality Control Considerations

* Datetime format == UTC compliant: True
* Long form: True
* ISO3: True
* Duplicate records: Under development
* Uploaded to correct location on s3: Under development
* Locations and History tables in sync: Under development
* S2 cell assigned: False


Import required libraries

In [111]:
import requests as req
import pandas as pd
import json
import numpy as np
from datetime import datetime, timedelta
from configparser import ConfigParser

Carto: Authentication and URL setup

In [118]:
# Base URL for all SQL calls
carto_url = "https://wri-rw.carto.com/api/v2/sql"

# Authentication credentials:
config = ConfigParser()
config.read("/Users/nathansuberi/Desktop/Code Portfolio/ResourceWatchCode/.env")
# FROM: https://resourcewatch.carto.com/u/wri-rw/your_apps
carto_api_token = config.get("auth", "carto_api_token")

Helper functions

In [380]:
def sql_api(url, sql, key):
    """ Execute sql request over API """
    
    params = {
        'api_key' : key,
        'q'       : sql
    }
    r = req.get(url, params=params)
    return(r)

def dump_row_contents(row, cols_and_types, cols_with_apostrophes=None):
    """ Format data from a dataframe for insert statements into a Carto table """
    
    dump = "("
    for ix in row.index:
        if ix in cols_with_apostrophes:
            dump += "'" + str(row[ix]).replace("'","") + "',"
        elif cols_and_types[ix] in ["date", "varchar"]:
            dump += "'" + str(row[ix]) + "',"
        else:
            dump += str(row[ix]) + ","
    dump = dump[:-1]+")"
    return(dump)

def update_in_batches(data_df, batch_size, target_table_name, cols_and_types, cols_with_apostrophes):
    """ 
    Send new rows for Carto in smaller batch sizes.
    A batch_size of 20 seems to work for the location data. 
    """
    
    columns = str(tuple(data_df.columns)).replace("'","")
    
    num_batches = int(data_df.shape[0] / batch_size)

    for batch in range(1,num_batches+1):
        sub_df = data_df.iloc[batch*batch_size:batch*batch_size+batch_size]
        values = ", ".join(list(sub_df.apply(lambda row: dump_row_contents(row, cols_and_types, cols_with_apostrophes), axis=1)))
        insert_value_sql = """
        INSERT INTO {table_name} {columns} VALUES {values}
        """.format(table_name=table_name, columns=columns, values=values)

        res = sql_api(carto_url, insert_value_sql, carto_api_token)

        if "error" in res.text:
            print(sub_df)
            break

        print("Completed up until index:", batch*batch_size+batch_size)

Miscellaneous helper tables

In [122]:
# Table for converting from two letter ISO to three letter ISO
isos = pd.read_csv("/Users/nathansuberi/Desktop/Code Portfolio/ResourceWatchCode/Conversion_Standards/iso_conversions.csv", sep="\t", header=None)
isos.columns = ["country", "iso2", "iso3", "num"]
iso2s = isos.set_index("iso2")

Interacting with Carto Tables
* Table creation (locations, history)
* Table destruction (locations, history... in case need to start over due to dev mistakes)
* Adding new rows (after querying an API to get new data)
* Adding new locations (if newly observed data fall in previously unlisted sensor locations)

Delete table with history of OpenAQ data

In [123]:
# ### CAREFUL ###
# # Leave commented out majority of time, unless sure you want to delete history

# # Delete table sql
# # Select all from a table
# table_name = "open_aq_history"

# delete_table_sql = """
# DROP TABLE {table_name}
# """.format(table_name=table_name)

# res = sql_api(carto_url, delete_table_sql, carto_api_token)
# print(res.text)

{"rows":[],"time":0.01,"fields":{},"total_rows":0}


Create table to store history of OpenAQ data

In [None]:
# Define the column names and types
cols_and_types_history = {
    #col name: Carto col type
    "lastUpdated":"date",
    "value":"float",
    "parameter":"varchar",
    "sourceName":"varchar",
    "location":"varchar",
    "city":"varchar",
    "iso3":"varchar",
    "unit":"varchar",
    "latitude":"float",
    "longitude":"float"
}

# Create table sql
table_name = "open_aq_history"

columns_and_data_types_history = ", ".join([col + " " + cols_and_types_history[col] for col in cols_and_types_history])

create_table_sql = """
CREATE TABLE {table_name}
 (
 {columns_and_data_types}
 );
""".format(table_name=table_name, columns_and_data_types=columns_and_data_types_history)

res = sql_api(carto_url, create_table_sql, carto_api_token)
print(res.text)

Delete table with previously observed locations of OpenAQ data

In [375]:
# ### CAREFUL ###
# # Leave commented out majority of time, unless sure you want to delete table of observed locations

# table_name = "open_aq_locations"

# delete_table_sql = """
# DROP TABLE {table_name}
# """.format(table_name=table_name)

# res = sql_api(carto_url, delete_table_sql, carto_api_token)
# print(res.text)

{"rows":[],"time":0.011,"fields":{},"total_rows":0}


Create table for observed locations in OpenAQ data

In [376]:
# Define the column names and types
cols_and_types_locations = {
    #col name: Carto col type
    "firstUpdated":"date",
    "sourceName":"varchar",
    "location":"varchar",
    "city":"varchar",
    "iso3":"varchar",
    "latitude":"float",
    "longitude":"float"
}

# Create table sql
table_name = "open_aq_locations"

columns_and_data_types_locations = ", ".join([col + " " + cols_and_types_locations[col] for col in cols_and_types_locations])

create_table_sql = """
CREATE TABLE {table_name}
 (
 {columns_and_data_types}
 );
""".format(table_name=table_name, columns_and_data_types=columns_and_data_types_locations)

res = sql_api(carto_url, create_table_sql, carto_api_token)
print(res.text)

{"rows":[],"time":0.008,"fields":{},"total_rows":0}


Check OpenAQ API 'locations' endpoint for acknowledged sensor locations

In [377]:
url = "https://api.openaq.org/v1/locations"

# There are a total of 8055 locations in the database so far, according to this query
# so this shouldn't miss any... but it may at some point

### FRANCIS ###
# The 10000 limit on requests is a hard limit in their API... what to do if there are more than 
# 10000 observations in the desired endpoint?
params = {
    "limit":10000
}

res = req.get(url, params=params)
data = res.json()["results"]
locations = pd.io.json.json_normalize(data, errors='ignore')
locations.columns = ["city", "latitude", "longitude", "count", "country", "firstUpdated", "lastUpdated",
             "location", "parameters", "sourceName", "sourceNames"]
locations["iso3"] = iso2s.loc[locations["country"], "iso3"].values
# Note - not storing parameters because these could change over time and it would be a pain to update
locations = locations[["city", "latitude", "longitude", "iso3", "firstUpdated", "location", "sourceName"]]

pre_clean = locations.shape[0]

# View columns that have null values
# https://stackoverflow.com/questions/14016247/python-find-integer-index-of-rows-with-nan-in-pandas
print(pd.isnull(locations).any())
print("number with no latitude",sum(pd.isnull(locations["latitude"])))
print("number with no longitude",sum(pd.isnull(locations["longitude"])))
print("number with no iso3",sum(pd.isnull(locations["iso3"])))

keep_geotagged = pd.notnull(locations["latitude"]) & pd.notnull(locations["longitude"]) 

# Remove all points that don't have a lat-lon
locations = locations.loc[keep_geotagged]

# Convert any remaining nan into empty string
# http://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DataFrame.fillna.html
locations = locations.fillna(value="")

post_clean = locations.shape[0]
print("Number of rows in OpenAQ locations database removed due to not having geo-coordinates:", pre_clean - post_clean)

## Having issues with non-standard characters... how to deal with this?

city            False
latitude         True
longitude        True
iso3             True
firstUpdated    False
location        False
sourceName      False
dtype: bool
number with no latitude 180
number with no longitude 180
number with no iso3 1
Number of rows in OpenAQ locations database removed due to not having geo-coordinates: 180


In [378]:
locations.head()

Unnamed: 0,city,latitude,longitude,iso3,firstUpdated,location,sourceName
0,Ulaanbaatar,47.932907,106.92139,MNG,2015-09-01T00:00:00.000Z,100 ail,Agaar.mn
1,Omaha-Council Bluffs,41.32247,-95.93799,USA,2016-03-06T19:00:00.000Z,16th and Whitmore,AirNow
2,Farmington,36.8097,-107.6517,USA,2016-03-06T19:00:00.000Z,1NL Navajo Lake,AirNow
3,21 de mayo,-37.471184,-72.361465,CHL,2015-09-23T14:00:00.000Z,21 de mayo,Chile - SINCA
4,Tucson,32.205,-110.8772,USA,2016-03-06T19:00:00.000Z,22nd Street & Craycr,AirNow


Insert these observed sensor locations into open_aq_locations Carto table

In [381]:
table_name = "open_aq_locations"

## URI too large to insert more than about 20 rows at once, have to do in small sets
kwargs = {
    "data_df":locations,
    "batch_size":20,
    "target_table_name":table_name,
    "cols_and_types":cols_and_types_locations,
    "cols_with_apostrophes":["city", "location", "sourceName"]
}

update_in_batches(**kwargs)

Completed up until index: 40
Completed up until index: 60
Completed up until index: 80
Completed up until index: 100
Completed up until index: 120
Completed up until index: 140
Completed up until index: 160
Completed up until index: 180
Completed up until index: 200
Completed up until index: 220
Completed up until index: 240
Completed up until index: 260
Completed up until index: 280
Completed up until index: 300
Completed up until index: 320
Completed up until index: 340
Completed up until index: 360
Completed up until index: 380
Completed up until index: 400
Completed up until index: 420
Completed up until index: 440
Completed up until index: 460
Completed up until index: 480
Completed up until index: 500
Completed up until index: 520
Completed up until index: 540
Completed up until index: 560
Completed up until index: 580
Completed up until index: 600
Completed up until index: 620
Completed up until index: 640
Completed up until index: 660
Completed up until index: 680
Completed up 

List OpenAQ locations that we've previously acknowledged

In [353]:
table_name = "open_aq_locations"
select_all_sql = """
SELECT * FROM {table_name}
""".format(table_name=table_name)

res = sql_api(carto_url, select_all_sql, carto_api_token)
locations = pd.DataFrame(res.json()["rows"])
locations = locations.set_index("location")
locations.head()

Unnamed: 0_level_0,city,firstupdated,iso3,latitude,longitude,sourcename
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
40RL01 - ROESELARE,Flanders,2016-11-17T00:00:00Z,BEL,50.95318,3.121155,EEA Belgium
40SZ01 - STEENOKKERZ,Flanders,2016-11-17T00:00:00Z,BEL,50.914577,4.504183,EEA Belgium
40SZ02 - STEENOKKERZ,Flanders,2016-11-17T00:00:00Z,BEL,50.91302,4.512184,EEA Belgium
40TS21 - TESSENDERLO,Flanders,2016-11-17T00:00:00Z,BEL,51.06571,5.107536,EEA Belgium
40WZ02 - MOL,Flanders,2016-11-17T00:00:00Z,BEL,51.1928,5.221534,EEA Belgium


Retrieve latest data

In [310]:
url = "https://api.openaq.org/v1/latest"

params = {
    "limit":10000
}

res = req.get(url, params=params)
data = res.json()["results"]

latest_data = pd.io.json.json_normalize(data, ['measurements'],[['coordinates', 'latitude'], ['coordinates', 'longitude'],'location', 'city', 'country'],  
                                          errors='ignore')

##
## Potential error - if no observed points have an averagingPeriod during an update, this can fail
##

latest_data.columns = ["averagingPeriod", "lastUpdated", "parameter", "sourceName", "unit", "value", "latitude", "longitude", "location","city", "country"]
latest_data["iso3"] = iso2s.loc[latest_data["country"], "iso3"].values
latest_data = latest_data.set_index("location")

## May need to develop function for adding iso3 that is more flexible for a range of spellings...
# Have a check whether anything was not successfully coded. Determine whether to add this new spelling
# to running list.

Check to see that new data all has a corresponding location

In [367]:
unique_places_in_latest_data = latest_data.index.unique()
new_places_ix = [place not in locations.index for place in unique_places_in_latest_data ]
new_places = unique_places_in_latest_data[new_places_ix]
print(new_places)
print("\nTotal number of unique places in latest data:", len(unique_places_in_latest_data))
print("Previously unseen places in latest data:", len(new_places))

Index(['100 ail', '16th and Whitmore', '1NL Navajo Lake', '21 de mayo',
       '22nd Street & Craycr', '24th & O', '2912 Coffey', '2LL Los Lunas',
       '40AB01 - ANTWERPEN', '40AB02 - BERENDRECHT',
       ...
       'תחנה:איינשטין', 'תחנה:אריאל', 'תחנה:גבעת המורה', 'תחנה:גליל מערבי',
       'תחנה:גן שמואל', 'תחנה:חיפה', 'תחנה:כביש 1 מוצא', 'תחנה:ניידת1',
       'תחנה:קיסריה', 'ฺBan-Tai, Kanchanaburi'],
      dtype='object', name='location', length=240)

Total number of unique places in latest data: 8055
Previously unseen places in latest data: 240


Add any "new places" to the open_aq_locations table
* Note: firstUpdated will be set to the earliest "lastUpdated" field for that sensor in the new data
* This will likely not be correct... will need to verify this with OpenAQ partners

In [374]:
location_table_columns =["location","firstUpdated","sourceName","city","iso3","latitude","longitude"]
new_places_df = pd.DataFrame(columns=location_table_columns).set_index("location")
new_places_df

Unnamed: 0_level_0,firstUpdated,sourceName,city,iso3,latitude,longitude
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [369]:
latest_data.loc[new_places[0], ]



locations.reset_index().set_index("city").loc["Ulaanbaatar"]
latest_data.reset_index().set_index("city").loc["Ulaanbaatar"]



Unnamed: 0_level_0,averagingPeriod,lastUpdated,parameter,sourceName,unit,value,latitude,longitude,city,country,iso3
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100 ail,,2017-12-08T10:45:00.000Z,o3,Agaar.mn,µg/m³,0.0,47.932907,106.92139,Ulaanbaatar,MN,MNG
100 ail,,2017-12-08T10:45:00.000Z,pm10,Agaar.mn,µg/m³,203.0,47.932907,106.92139,Ulaanbaatar,MN,MNG
100 ail,,2017-12-08T10:45:00.000Z,so2,Agaar.mn,µg/m³,41.0,47.932907,106.92139,Ulaanbaatar,MN,MNG
100 ail,,2017-12-08T10:45:00.000Z,co,Agaar.mn,µg/m³,3771.0,47.932907,106.92139,Ulaanbaatar,MN,MNG
100 ail,,2017-12-08T10:45:00.000Z,no2,Agaar.mn,µg/m³,86.0,47.932907,106.92139,Ulaanbaatar,MN,MNG


In [157]:
# Load files to experiment with de-duping

folder = "/Users/nathansuberi/Desktop/RW_Data/open_aq/"
sample1_file = "open_aq_latest_2017-12-07_09-19-10.csv"
sample2_file = "open_aq_latest_2017-12-08_09-41-36.csv"

df1 = pd.read_csv(folder+sample1_file, index_col=[0])
df2 = pd.read_csv(folder+sample2_file, index_col=[0])

df1.set_index(["latitude", "longitude"], inplace=True)
df2.set_index(["latitude", "longitude"], inplace=True)

In [167]:
# Determine if there are any overlaps
df1.index.levels[0]
df1.index.levels[1]


Float64Index([-158.088592529,  -157.96913147,  -157.87109375, -157.858093262,
              -156.492416382, -156.446105957, -156.370346069, -155.913299561,
              -155.778137207, -155.468902588,
              ...
               153.028106689,  153.029998779,  153.032104492,  153.035003662,
               153.087203979,  153.103805542,  153.135894775,  153.149505615,
               153.152694702,  153.158096313],
             dtype='float64', name='longitude', length=6183)

In [None]:
# Insert original data into Carto table
# Insert value sql

df = pd.read_csv("open_aq_latest_2017-12-08 09/21/31.219395.csv")

table_name = "open_aq_history"

columns = str(tuple(df.columns)).replace("'","")
values = ", ".join(list(df.apply(lambda row: dump_row_contents(row, cols_and_types_history), axis=1)))

insert_value_sql = """
INSERT INTO {table_name} {columns} VALUES {values}
""".format(table_name=table_name, columns=columns, values=values)

print(insert_value_sql)

res = sql_api(carto_url, insert_value_sql, carto_api_token)
print(res.text)


# Extract data from Carto table to run the de-duping method

# Select all from a table in a certain time range

## TO DO: Format this for the UTC format the Carto table will use
look_back = "1 day"
select_all_in_time_range_sql = """
SELECT * FROM {table_name} WHERE lastUpdated 
""".format(table_name=table_name)

res = sql_api(carto_url, select_all_in_time_range_sql, carto_api_token)
print(res.text)

# Add in the de-duped data as an extension to the original Carto table

In [139]:
# Save latest data (should be after de-duping)
# Deposit file in appropriate location in cloud... where is this?

cur_datetime = str(datetime.now())
cur_datetime = cur_datetime.split(".")[0]
cur_datetime = cur_datetime.replace(":", "-")
cur_datetime = cur_datetime.replace(" ", "_")

folder = "/Users/nathansuberi/Desktop/RW_Data/open_aq/"
file_name = "open_aq_latest_{datetime}.csv".format(datetime=cur_datetime)
current_file_name = folder + file_name\
print(current_file_name)

#df.to_csv(current_file_name)

/Users/nathansuberi/Desktop/RW_Data/open_aq/open_aq_latest_2017-12-08_09-48-38.csv


Experimentation

In [62]:
## All of this made unnecessary by magic of json_normalize, but list flattening is a nice trick

# # Extract measurements

# # All possible measurements:
# # Flattening nested lists: https://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
# parameters = [obs["parameter"] for msr in df['measurements'] for obs in msr]
# parameters = np.unique(parameters, return_counts=True)
# #(array(['bc', 'co', 'no2', 'o3', 'pm10', 'pm25', 'so2'],
# #       dtype='<U4'), array([   17,  3453, 24056,  6837,  4493,  2953,  4665]))

# # Sometimes has an averaging period, other times not
# fields = ["averagingPeriod", "lastUpdated", "parameter", "sourceName", "unit", "value"]

# parameters

(array(['bc', 'co', 'no2', 'o3', 'pm10', 'pm25', 'so2'],
       dtype='<U4'), array([   17,  3453, 24056,  6837,  4493,  2953,  4665]))

In [103]:
# Exploring Python's datetime library
# Docs: https://docs.python.org/3/library/datetime.html

# This is UTC time, from Greenwich mean time
print(datetime.utcnow())
# This takes my current timezone
print(datetime.now())
# This makes a 1 day timedelta
print(timedelta(days=1))
print(datetime.now() - timedelta(days=1))

2017-12-08 14:19:57.581782
2017-12-08 09:19:57.582269
1 day, 0:00:00
2017-12-07 09:19:57.582608


In [249]:
# Experiment with breaking apart data structure
x = np.random.rand(107)
x = pd.DataFrame(x)
#print(x)

pieces = int(len(x) / piece_len)
rg = range(pieces+1)
y = pd.DataFrame([])

for r in rg:
    #print(r*piece_len)
    #print(r*piece_len+piece_len)
    y = y.append(x.iloc[r*piece_len:r*piece_len+piece_len], ignore_index=True)
#y = np.append(y,x[pieces*piece_len:])

print("These two dfs are equal:",x.equals(y))

These two dfs are equal: True


OpenAQ API Documentation: https://docs.openaq.org

In [None]:
# Cities
url = "https://api.openaq.org/v1/cities"
# Countries
url = "https://api.openaq.org/v1/countries"
# Fetches
url = "https://api.openaq.org/v1/fetches"
# Latest
url = "https://api.openaq.org/v1/latest"
# Locations
url = "https://api.openaq.org/v1/locations"
# Measurements
url = "https://api.openaq.org/v1/measurements"
# Parameters
url = "https://api.openaq.org/v1/parameters"
# Sources
url = "https://api.openaq.org/v1/sources"