# Quality Control Considerations

* Datetime format == UTC compliant: True
* Long form: True
* Duplicate records: Under development
* Locations and History tables in sync: JUST USE RECENT DATA ENDPOINT Under development

# Priority
* Insert into Carto
* Design a widget to show updating data


Import required libraries

In [159]:
# Access internet resources
import requests as req
# Parse json
import json
# Data handling
import pandas as pd
import numpy as np
# Using datetime objects to ensure UTC time in incoming data
from datetime import datetime, timedelta
# https://dateutil.readthedocs.io/en/stable/
from dateutil.parser import parse
# Retrieve credentials from a local .env file
from configparser import ConfigParser
config = ConfigParser()
config.read("/Users/nathansuberi/Desktop/Code Portfolio/ResourceWatchCode/.env")

# S3 connection libraries
# Need to have environmental variables set with "aws configure", 
# or update code to explicitly supplied AWS Access Id and AWS Secret Key
import boto3
import sys
import threading

s3_client = boto3.client("s3")
s3_resource = boto3.resource("s3")

class ProgressPercentage(object):
        def __init__(self, filename):
            self._filename = filename
            self._size = float(os.path.getsize(filename))
            self._seen_so_far = 0
            self._lock = threading.Lock()

        def __call__(self, bytes_amount):
            # To simplify we'll assume this is hooked up
            # to a single filename.
            with self._lock:
                self._seen_so_far += bytes_amount
                percentage = (self._seen_so_far / self._size) * 100
                sys.stdout.write("\r%s  %s / %s  (%.2f%%)"%(
                        self._filename, self._seen_so_far, self._size,
                        percentage))
                sys.stdout.flush()

CONSTANTS:

In [37]:
# Base URL for all SQL calls
CARTO_URL = "https://wri-rw.carto.com/api/v2/sql"

# FROM: https://resourcewatch.carto.com/u/wri-rw/your_apps
CARTO_API_TOKEN = config.get("auth", "carto_api_token")

OPENAQ_DATA_FOLDER = "/Users/nathansuberi/Desktop/RW_Data/OpenAQ/"

Interacting with Carto Tables
* Table creation (locations, history)
* Table destruction (locations, history... in case need to start over due to dev mistakes)
* Adding new rows to history (after querying an API to get new data)
* Adding new locations (if newly observed data fall in previously unlisted sensor locations)

Helper Functions

In [202]:
###
# Functions for updating tables after de-duplicating
###


### when passing around table, only pass the unique ids/datetime/the barebones necessary
## Pass in 2 lists of ids... it sends back what are the duplicates, what are the new ones
#### ^ this is instead of 



def update_table_without_duplicates(data_df, target_table_name,
                                    cols_and_types, float_cols_to_round=["latitude", "longitude"], precision=8, 
                                    cols_with_apostrophes=None, 
                                    datetime_column=None, datetime_cutoff=None, want_data_since_cutoff=True,
                                    dedupe_with_target=True, update_batch_size=20):
    """ 
    Determines whether there are new locations to add to the table.
    Sends an SQL statement and returns the result of that operation to stdout.
    
    look_back_length parameter allows for de-duping with a table while only considering a limited
    history of the recent record
    """
    
    # If target table doesn't exist, create it
    if not check_table_exists(target_table_name):
        create_table(target_table_name, cols_and_types)
    
    # column names to consider from both the observations and the target_table
    column_names = list(cols_and_types.keys())
    
    # Select data from the target_table to use in de-duping procedure
    res = select_from_table(target_table_name, datetime_column, datetime_cutoff, want_data_since_cutoff)  
    target_table = pd.DataFrame(res["rows"], columns=column_names)
    
    # Only attempt these if rows are returned
    if target_table.shape[0] > 0:
        # Fix precision on float columns in target_table
        target_table = fix_precision_of_floats(target_table, float_cols_to_round, precision)

        # Fix apostrophe change (back to a ' from &#8217) in target_table
        target_table = toggle_apostrophes(target_table, cols_with_apostrophes, remove=False)
    else:    
        # Override dedupe_with_target, as there is no target to dedupe with at this time
        dedupe_with_target = False

    # Remove duplicates in the new observations
    observations = data_df[column_names]
    new_obs = deduplicate(observations, column_names, 
                          float_cols_to_round, precision, 
                          target_table, dedupe_with_target)
    number_new_observations = new_obs.shape[0]
    print("Number of new observations added to", target_table_name + ":", number_new_observations)
    
    if number_new_observations > 0:
        # Add genuinely new observations to the existing table
        update_in_batches(new_obs, update_batch_size, target_table_name, cols_and_types, cols_with_apostrophes)
    
    # Return number of new observations for logging
    return(number_new_observations)



### Use POST instead of GET, with a json payload
# This will put the data into the body of the request, instead of into the URL
# Might still have to chunk it, can try uploading the whole thing
###

def update_in_batches(data_df, batch_size, 
                      target_table_name, cols_and_types, 
                      cols_with_apostrophes=None):
    """ 
    Send new rows for Carto in smaller batch sizes.
    A batch_size of 20 seems to work for the location data. 
    """
    
    # Calculate column names once
    columns = str(tuple(data_df.columns)).replace("'","")
    
    # Determine number of batches in which to send data
    num_batches = int(data_df.shape[0] / batch_size)

    for batch in range(num_batches+1):
        # Select sub-dataframe
        sub_df = data_df.iloc[batch*batch_size:batch*batch_size+batch_size]
        
        # Replace apostrophes from varchar columns with &#8217
        sub_df = toggle_apostrophes(sub_df, cols_with_apostrophes, remove=True)
        
        # Create Insert SQL statement
        res = insert_into_table(sub_df, target_table_name, columns, cols_and_types)

        # Help with trouble shooting
        # Display response error, the data that created the error, and break the cycle
        if "error" in res.text:
            print(res.text)
            print(sub_df)
            break

        print("Completed up until index:", batch*batch_size+batch_size)    
    
def dump_row_contents(row, cols_and_types):
    """ Format data from a dataframe for insert statements into a Carto table """
    
    dump = "("
    for ix in row.index:
        if cols_and_types[ix] == "varchar":
            dump += "'" + str(row[ix]) + "',"
        elif cols_and_types[ix] == "timestamp":
            # This is particular to the OpenAQ application
            dt_obj = parse(str(row[ix])[:-5])
            dt_str = dt_obj.strftime("%Y-%m-%dT%H:%M:%S")
            dump += "'" + dt_str + "',"
        else:
            dump += str(row[ix]) + ","
    dump = dump[:-1]+")"
    return(dump)    
 
    
### To dedupe:
# Create a new column that is unique 
# Location name + timestamp + parameter
    
###### Edit this to slim down
# Pass ids from observations and ids from target table
# Use the results of this to select the right rows from your table

def generate_unique_ids(data_df, columns):
    """
    Input:
    * Data
    * Columns in that data
    Output:
    * Single column that is the unique id's for every row in the DF
    """

    # Can can do this first, and then dedupe
    
    return(None)
    
def deduplicate(observations, column_names, 
                float_cols_to_round, precision, 
                target_table, dedupe_with_target=True):
    """Determine unique observations in data_df (de-dupe in the new observations)"""
    
    print("Deduping with target in mind:", dedupe_with_target)
    print("Target table shape:", target_table.shape)
    
    # http://pandas.pydata.org/pandas-docs/version/0.17/generated/pandas.DataFrame.drop_duplicates.html
    obs = observations.drop_duplicates(keep="first")
    obs = keep_geolocated(obs)
    obs = fix_precision_of_floats(obs, float_cols_to_round, precision)
    
    ##
    #obs = fix_date
    
    if dedupe_with_target:
        # De-dupe between existing table and new observations
        # https://stackoverflow.com/questions/29464234/compare-python-pandas-dataframes-for-matching-rows
        shared = pd.merge(target_table, obs, on=column_names, how="inner")
        
        print("obs shape:", obs.shape)
        print("shared shape:", shared.shape)
        shared["key"] = "x"
        temp_df = pd.merge(obs, shared, on=column_names, how="left")
        print("temp_df shape:", temp_df.shape)
        new_obs = temp_df[temp_df["key"].isnull()].drop("key", axis=1)
    else:
        new_obs = obs
        
    return(new_obs)


    
    
###
# Procedure for looping over OpenAQ history - specific to OpenAQ
### 
    
def run_over_single_set_new_data(data, target_table_name, 
                                 cols_and_types, 
                                 datetime_cutoff):
    """
    Update table for a single set of new data
    Specific to the OpenAQ history because there is no option to change several columns
    """
    
    kwargs = {
                "data_df":data,
                "target_table_name":target_table_name,
                "cols_and_types":cols_and_types,
                "cols_with_apostrophes":["location", "city","country"],
                "float_cols_to_round":["latitude", "longitude"],
                "precision":6,
                "datetime_column":"lastUpdated",
                "datetime_cutoff":datetime_cutoff,
                "want_data_since_cutoff":True
             }

    number_new_observations = update_table_without_duplicates(**kwargs)
    
    return(number_new_observations)

def run_over_history_of_files(list_of_files, target_table_name, 
                              cols_and_types, datetime_cutoff,
                              start_ix="", end_ix="", 
                              output_json=None):    
    """Used to loop over history of openaq-data
    
    Inputs:
    * List of files to feed in
    * Target table
    * Columns and types to add in
    Outputs:
    * A json file with the number of new observations 
    * Data added into the appropriate table
    """
    
    # If no output_json given, create a new dictionary to serve as this
    if not output_json:
        output_json = {}
    
    for file in list_of_files[start_ix:end_ix]:
        print("Now handling date:", file)
        url = "https://openaq-data.s3.amazonaws.com/"+file
        data = pd.read_csv(url)
        
        number_new_observations = run_over_single_set_new_data(data, target_table_name, 
                                                               cols_and_types,
                                                              datetime_cutoff)
        
        # Keep a history of the number of new observations per day in OpenAQ history
        output_json[file] = number_new_observations
        #print(output_json)
    
    # Output 
    with open(OPENAQ_DATA_FOLDER + target_table_name + "additions.json", "w") as f:
        json.dump(output_json, f)
    
    
    
    
###
# Carto SQL API interacting functions
###

# def sql_api(sql, get_or_post, url=CARTO_URL, key=CARTO_API_TOKEN):
#     """ Execute sql request over Carto SQL API """
# #Consider adding in account as a parameter, would make this more extensible
    
#     if get_or_post == "get":
#         params = {
#             'api_key' : key,
#             'q'       : sql
#         }
#         r = req.get(url, params=params)
#         return(r)
#     elif get_or_post == "post":
#         data = {
#             'query':sql.replace("\n", "").strip(),
#             #'api_key' : key
#         }
#         print(data)
#         r = req.post(url + "?api_key="+key, data=json.dumps(data))
#         return(r)
    
    
def sql_api(sql, get_or_post='get', url=CARTO_URL, key=CARTO_API_TOKEN):
    """ Execute sql request over Carto SQL API """

    payload = {
            'api_key' : key,
            'q'       : sql
        }
    
    if get_or_post == "get":
        
        r = req.get(url, params=payload)
        return(r.json())
    elif get_or_post == "post":
        
        r = req.post(url, data=payload)
        return(r) 

def check_table_exists(target_table_name):
    """Check to see if table already in our Carto account"""
    
    check_exists_sql = """
    SELECT cdb_usertables FROM CDB_UserTables()
    """.format(table_name=target_table_name)
    
    res = sql_api(check_exists_sql, "get")
    
    if target_table_name in res.text:
        return(True)
    else:
        return(False)

def create_table(target_table_name, cols_and_types):
    """SQL statement to create a table"""
    
    cols_and_types = ", ".join([col + " " + cols_and_types[col] for col in cols_and_types])

    create_table_sql = """
    CREATE TABLE {table_name}
     (
     {column_names_and_data_types}
     );
    """.format(table_name=target_table_name, column_names_and_data_types=cols_and_types)

    res = sql_api(create_table_sql, "post")
    print(res.text)
    
def delete_table(target_table_name):
    """SQL statement to drop a table"""
    
    delete_table_sql = """
    DROP TABLE {table_name}
    """.format(table_name=target_table_name)

    res = sql_api(delete_table_sql, "post")
    print(res.text)
    
def select_from_table(target_table_name, datetime_column=None, datetime_cutoff=None, want_data_since_cutoff=True):
    """
    Standard SQL statement to select from a table, 
    with support for a single WHERE clause 
    to select before or after a certain time cutoff
    """
    
    # If a datetime_cutoff are provided, a datetime_column must also be
    # The datetime_column and datetime_cutoff should both be single strings
    if(datetime_cutoff):
        assert(datetime_column and datetime_cutoff)
        assert(type(datetime_cutoff)==str)
        if(datetime_column):
            assert(type(datetime_column)==str) 
    
    # If no datetime_cutoff, select all from the target_table
    if not datetime_cutoff:
        select_all_sql = """
        SELECT * FROM {table_name}
        """.format(table_name=target_table_name)
        res = sql_api(select_all_sql, "get")
        return(res)
    
    # If there is a datetime_cutoff, 
    # either take the data since that date or previous to that date
    else:
        # want_data_since_cutoff should be a boolean
        assert(type(want_data_since_cutoff)==bool)
        if want_data_since_cutoff:
            comparison_operator = "<="
        else:
            comparison_operator = ">="
            
        ## TO DO    
        ### SELECT JUST DATETIME COLUMN AND THE UNIQUE ID COLUMN    
        ## 
        
        select_all_sql = """
        SELECT * FROM {table_name} WHERE {datetime_column} {comparison_operator} {datetime_cutoff}
        """.format(table_name=target_table_name, 
                   datetime_column=datetime_column,
                   comparison_operator=comparison_operator,
                   datetime_cutoff=datetime_cutoff)
        
        res = sql_api(select_all_sql, "get")
        return(res)
        
def insert_into_table(data_df, target_table_name, column_names, cols_and_types):
    """Craft insert statement for Carto table"""
    
    values = ", ".join(list(data_df.apply(lambda row: dump_row_contents(row, cols_and_types), axis=1)))
    
    insert_value_sql = """
    INSERT INTO {table_name} {columns} VALUES {values}
    """.format(table_name=target_table_name, columns=column_names, values=values)

    res = sql_api(insert_value_sql, "post")
    return(res)



def get_table_count(target_table_name, id_col):
    """
    Return a count of the number of entries in the target table
    This will be used to assess whether old rows need to be deleted to accomodate new content
    """
    
    assess_count_sql = """
    SELECT count(id_col) FROM {table_name}
    """.format(table_name=target_table_name, id_col=id_col)

    res = sql_api(insert_value_sql, "get")
    return(res)



###
# DataFrame cleaning functions
###

def clean_datetime(datetime):
    """Ensures that datetimes are all in the same format"""
    clean_datetime = datetime
    
    return(clean_datetime)

def keep_geolocated(df):
    """
    Throw away points that do not have a latitude and longitude defined
    COLUMNS MUST BE NAMED latitude and longitude
    """
    
    keep_geotagged = pd.notnull(df["latitude"]) & pd.notnull(df["longitude"]) 
    df = df.loc[keep_geotagged]
    return(df)

def fix_precision_of_floats(df, float_columns, precision):
    """Use this to address problem of comparing numpy floats with rounding errors"""
    df = df.copy()
    for col in float_columns:
        df[col] = np.around(df[col],precision)
    return(df)



###
### TO verify where it is breaking...
# print out the payload before upload
# when writing the payload to json... apostrophe not escaped breaks the json

def toggle_apostrophes(df, cols_with_apostrophes, remove=True):
    """
    Will switch between &#8217 and ' representation of an apostrophe
    Provides a reversible function to accomplish this
    
    TO DO: Address how this affects our data storage... &#8217 will be in Carto table
    
    """
    # Copy df
    df = df.copy()
    
    # Initialize array to avoid "NoneType not iterable" error
    if not cols_with_apostrophes:
        cols_with_apostrophes = []
    
    # Loop over all columns and either remove or replace apostrophes
    for col in cols_with_apostrophes:
        if remove:
            # 50% chance this works:
            # df[col] = df[col].apply(lambda row: str(row).replace("'", "\\'"))
            df[col] = df[col].apply(lambda row: str(row).replace("'", "&#8217"))
        else:
            df[col] = df[col].apply(lambda row: str(row).replace("&#8217", "'"))
    return(df)

Delete table with history of OpenAQ data

In [163]:
# ### CAREFUL ###
# # Leave commented out majority of time, unless sure you want to delete history

delete_table("open_aq_history")

{"rows":[],"time":0.008,"fields":{},"total_rows":0}


Create table to store history of OpenAQ data

In [164]:
# Define the column names and types
cols_and_types_history = {
    #col name: Carto col type
    "utc":"timestamp",
    "value":"float",
    "parameter":"varchar",
    #"sourceName":"varchar",
    "location":"varchar",
    "city":"varchar",
    "country":"varchar",
    "unit":"varchar",
    "latitude":"float",
    "longitude":"float"
}

# Create table sql
create_table("open_aq_history", cols_and_types_history)

{"rows":[],"time":0.009,"fields":{},"total_rows":0}


In [None]:
# Consider creating an index on the date column
# https://carto.com/docs/carto-engine/sql-api/query-optimizations/#creating-indexes

Delete table with previously observed locations of OpenAQ data

In [328]:
# ### CAREFUL ###
# # Leave commented out majority of time, unless sure you want to delete table of observed locations

# delete_table("open_aq_locations")

{"rows":[],"time":0.011,"fields":{},"total_rows":0}


Create table for observed locations in OpenAQ data

In [5]:
# Define the column names and types
cols_and_types_locations = {
    #col name: Carto col type
    "location":"varchar",
    "city":"varchar",
    "country":"varchar",
    "latitude":"float",
    "longitude":"float"
}

# Create table sql
create_table("open_aq_locations", cols_and_types_locations)

{"error":["relation \"open_aq_locations\" already exists"]}


In [None]:
# Consider creating an index on the location, city, and country columns
# For guidance on creating an index with multiple columsn:
# https://www.postgresql.org/docs/9.1/static/sql-createindex.html

# Keep in mind that we would need to re-run the index after every addition to the table
# The CONCURRENT option can be used to create the index without blocking write operations while doing so

Use Pandas to read a CSV from a public s3 account

In [88]:
# https://stackoverflow.com/questions/32400867/pandas-read-csv-from-url/41880513#41880513
# http://www.ritchieng.com/pandas-removing-duplicate-rows/
url = "https://openaq-data.s3.amazonaws.com/2017-12-09.csv"
dec9 = pd.read_csv(url)
url = "https://openaq-data.s3.amazonaws.com/2017-12-08.csv"
dec8 = pd.read_csv(url)
twodays = dec9.append(dec8)
dupes = twodays.duplicated()
print("Number of duplicate records between two days:", dupes.sum())
print("Dec 8 times:", dec8.utc.unique())
print("Dec 9 times:", dec9.utc.unique())

Number of duplicate records between two days: 0
Dec 8 times: ['2017-12-08T00:00:00.000Z' '2017-12-08T01:00:00.000Z'
 '2017-12-08T02:00:00.000Z' ..., '2017-12-08T22:24:09.000Z'
 '2017-12-08T22:22:26.000Z' '2017-12-08T22:24:26.000Z']
Dec 9 times: ['2017-12-09T00:00:00.000Z' '2017-12-09T00:02:17.000Z'
 '2017-12-09T00:02:18.000Z' ..., '2017-12-09T23:44:24.000Z'
 '2017-12-09T23:45:00.000Z' '2017-12-09T23:55:00.000Z']


In [160]:
dec8.head()

Unnamed: 0,location,city,country,utc,local,parameter,value,unit,latitude,longitude,attribution
0,Escuela E-10,Tocopilla,CL,2017-12-08T00:00:00.000Z,2017-12-07T21:00:00-03:00,pm25,4.09000,µg/m³,-22.085519,-70.188683,"[{""name"":""SINCA"",""url"":""http://sinca.mma.gob.c..."
1,Escuela E-10,Tocopilla,CL,2017-12-08T01:00:00.000Z,2017-12-07T22:00:00-03:00,pm25,5.31000,µg/m³,-22.085519,-70.188683,"[{""name"":""SINCA"",""url"":""http://sinca.mma.gob.c..."
2,Escuela E-10,Tocopilla,CL,2017-12-08T00:00:00.000Z,2017-12-07T21:00:00-03:00,so2,1.30000,µg/m³,-22.085519,-70.188683,"[{""name"":""SINCA"",""url"":""http://sinca.mma.gob.c..."
3,Escuela E-10,Tocopilla,CL,2017-12-08T00:00:00.000Z,2017-12-07T21:00:00-03:00,o3,40.16000,µg/m³,-22.085519,-70.188683,"[{""name"":""SINCA"",""url"":""http://sinca.mma.gob.c..."
4,Escuela E-10,Tocopilla,CL,2017-12-08T00:00:00.000Z,2017-12-07T21:00:00-03:00,no2,1.53000,µg/m³,-22.085519,-70.188683,"[{""name"":""SINCA"",""url"":""http://sinca.mma.gob.c..."
5,Escuela E-10,Tocopilla,CL,2017-12-08T00:00:00.000Z,2017-12-07T21:00:00-03:00,pm10,23.43000,µg/m³,-22.085519,-70.188683,"[{""name"":""SINCA"",""url"":""http://sinca.mma.gob.c..."
6,Escuela E-10,Tocopilla,CL,2017-12-08T00:00:00.000Z,2017-12-07T21:00:00-03:00,co,556.86000,µg/m³,-22.085519,-70.188683,"[{""name"":""SINCA"",""url"":""http://sinca.mma.gob.c..."
7,Chiu Chiu,Calama,CL,2017-12-08T00:00:00.000Z,2017-12-07T21:00:00-03:00,so2,2.08000,µg/m³,-22.342264,-68.650897,"[{""name"":""SINCA"",""url"":""http://sinca.mma.gob.c..."
8,Santa Margarita,Catemu,CL,2017-12-08T00:00:00.000Z,2017-12-07T21:00:00-03:00,so2,57.00000,µg/m³,-32.776573,-70.938144,"[{""name"":""SINCA"",""url"":""http://sinca.mma.gob.c..."
9,Nueva Libertad,Talcahuano,CL,2017-12-08T00:00:00.000Z,2017-12-07T21:00:00-03:00,pm25,38.65000,µg/m³,-36.735998,-73.118693,"[{""name"":""SINCA"",""url"":""http://sinca.mma.gob.c..."


Add locations from these two sample days, make sure that further adds are successfully de-duped

In [203]:
delete_table("open_aq_locations2")

{"error":["table \"open_aq_locations2\" does not exist"]}


In [182]:
delete_table("open_aq_locations2")

kwargs = {
    "data_df":dec8,
    "target_table_name":"open_aq_locations2",
    "cols_and_types":cols_and_types_locations,
    "cols_with_apostrophes":["location", "city","country"],
    "float_cols_to_round":["latitude", "longitude"],
    "precision":6,
    "datetime_column":None,
    "datetime_cutoff":None,
    "want_data_since_cutoff":None,
    "dedupe_with_target":True
}

update_table_without_duplicates(**kwargs)
update_table_without_duplicates(**kwargs)

kwargs = {
    "data_df":dec9,
    "target_table_name":"open_aq_locations2",
    "cols_and_types":cols_and_types_locations,
    "cols_with_apostrophes":["location", "city","country"],
    "float_cols_to_round":["latitude", "longitude"],
    "precision":6,
    "datetime_column":None,
    "datetime_cutoff":None,
    "want_data_since_cutoff":None,
    "dedupe_with_target":True
}

update_table_without_duplicates(**kwargs)

{"error":["You must indicate a sql query"]}
Deduping with target in mind: True
Target table shape: (9356, 5)
obs shape: (7271, 5)
shared shape: (7271, 5)
temp_df shape: (7271, 6)
Number of new observations added to open_aq_locations2: 0
Deduping with target in mind: True
Target table shape: (9356, 5)
obs shape: (7271, 5)
shared shape: (7271, 5)
temp_df shape: (7271, 6)
Number of new observations added to open_aq_locations2: 0
Deduping with target in mind: True
Target table shape: (9356, 5)
obs shape: (6864, 5)
shared shape: (6864, 5)
temp_df shape: (6864, 6)
Number of new observations added to open_aq_locations2: 0


0

Add history of observations from these two sample days, make sure that further adds are successfully de-duped

In [176]:
kwargs = {
    "data_df":dec8,
    "target_table_name":"open_aq_history",
    "cols_and_types":cols_and_types_history,
    "cols_with_apostrophes":["location", "city", "country"],
    "float_cols_to_round":["latitude", "longitude", "value"],
    "precision":6,
    "datetime_column":"utc",
    "datetime_cutoff":None,
    "want_data_since_cutoff":None,
    "dedupe_with_target":True
}

update_table_without_duplicates(**kwargs)
update_table_without_duplicates(**kwargs)

kwargs = {
    "data_df":dec9,
    "target_table_name":"open_aq_history",
    "cols_and_types":cols_and_types_history,
    "cols_with_apostrophes":["location", "city", "country"],
    "float_cols_to_round":["latitude", "longitude", "value"],
    "precision":6,
    "datetime_column":"utc",
    "datetime_cutoff":None,
    "want_data_since_cutoff":None,
    "dedupe_with_target":True
}

update_table_without_duplicates(**kwargs)

Deduping with target in mind: True
Target table shape: (1840, 9)
obs shape: (489096, 9)
shared shape: (0, 9)
temp_df shape: (489096, 10)
Number of new observations added to open_aq_history: 489096
Completed up until index: 20
Completed up until index: 40
Completed up until index: 60
Completed up until index: 80
Completed up until index: 100
Completed up until index: 120
Completed up until index: 140
Completed up until index: 160
Completed up until index: 180
Completed up until index: 200
Completed up until index: 220
Completed up until index: 240
Completed up until index: 260
Completed up until index: 280
Completed up until index: 300
Completed up until index: 320
Completed up until index: 340
Completed up until index: 360
Completed up until index: 380
Completed up until index: 400
Completed up until index: 420
Completed up until index: 440
Completed up until index: 460
Completed up until index: 480
Completed up until index: 500
Completed up until index: 520
Completed up until index: 5

KeyboardInterrupt: 

In [171]:
dec8.sort_values(by=["location", "city", "country", "parameter"]).head(20)


Unnamed: 0,location,city,country,utc,local,parameter,value,unit,latitude,longitude,attribution
3060,100 ail,Ulaanbaatar,MN,2017-12-08T01:00:00.000Z,2017-12-08T09:00:00+08:00,co,1591.0,µg/m³,47.932906,106.921383,"[{""name"":""Agaar.mn"",""url"":""http://agaar.mn/""},..."
5626,100 ail,Ulaanbaatar,MN,2017-12-08T01:45:00.000Z,2017-12-08T09:45:00+08:00,co,1717.0,µg/m³,47.932906,106.921383,"[{""name"":""Agaar.mn"",""url"":""http://agaar.mn/""},..."
7573,100 ail,Ulaanbaatar,MN,2017-12-08T02:15:00.000Z,2017-12-08T10:15:00+08:00,co,1888.0,µg/m³,47.932906,106.921383,"[{""name"":""Agaar.mn"",""url"":""http://agaar.mn/""},..."
10561,100 ail,Ulaanbaatar,MN,2017-12-08T02:45:00.000Z,2017-12-08T10:45:00+08:00,co,1854.0,µg/m³,47.932906,106.921383,"[{""name"":""Agaar.mn"",""url"":""http://agaar.mn/""},..."
15337,100 ail,Ulaanbaatar,MN,2017-12-08T03:45:00.000Z,2017-12-08T11:45:00+08:00,co,1854.0,µg/m³,47.932906,106.921383,"[{""name"":""Agaar.mn"",""url"":""http://agaar.mn/""},..."
17724,100 ail,Ulaanbaatar,MN,2017-12-08T04:15:00.000Z,2017-12-08T12:15:00+08:00,co,2688.0,µg/m³,47.932906,106.921383,"[{""name"":""Agaar.mn"",""url"":""http://agaar.mn/""},..."
21442,100 ail,Ulaanbaatar,MN,2017-12-08T05:00:00.000Z,2017-12-08T13:00:00+08:00,co,2950.0,µg/m³,47.932906,106.921383,"[{""name"":""Agaar.mn"",""url"":""http://agaar.mn/""},..."
26759,100 ail,Ulaanbaatar,MN,2017-12-08T05:45:00.000Z,2017-12-08T13:45:00+08:00,co,1421.0,µg/m³,47.932906,106.921383,"[{""name"":""Agaar.mn"",""url"":""http://agaar.mn/""},..."
29583,100 ail,Ulaanbaatar,MN,2017-12-08T06:30:00.000Z,2017-12-08T14:30:00+08:00,co,981.0,µg/m³,47.932906,106.921383,"[{""name"":""Agaar.mn"",""url"":""http://agaar.mn/""},..."
29751,100 ail,Ulaanbaatar,MN,2017-12-08T06:45:00.000Z,2017-12-08T14:45:00+08:00,co,925.0,µg/m³,47.932906,106.921383,"[{""name"":""Agaar.mn"",""url"":""http://agaar.mn/""},..."


In [172]:
history = pd.DataFrame(select_from_table("open_aq_history").json()["rows"])
#duped = history.duplicated(keep=False)
#print("num duped:", duped.sum())
#history.loc[duped].sort_values(by=["location", "city", "country", "parameter"])
history.sort_values(by=["location", "city", "country", "parameter"])

Unnamed: 0,city,country,latitude,location,longitude,parameter,unit,utc,value
152,Kocaeli,TR,40.771010,Alikahya-MTHM,30.007700,no2,µg/m³,2017-12-08T00:00:00Z,-1.0000
812,Kocaeli,TR,40.771010,Alikahya-MTHM,30.007700,no2,µg/m³,2017-12-08T00:00:00Z,-1.0000
151,Kocaeli,TR,40.771010,Alikahya-MTHM,30.007700,pm10,µg/m³,2017-12-08T00:00:00Z,49.0000
811,Kocaeli,TR,40.771010,Alikahya-MTHM,30.007700,pm10,µg/m³,2017-12-08T00:00:00Z,49.0000
157,Kocaeli,TR,40.771010,Alikahya-MTHM,30.007700,so2,µg/m³,2017-12-08T00:00:00Z,6.0000
817,Kocaeli,TR,40.771010,Alikahya-MTHM,30.007700,so2,µg/m³,2017-12-08T00:00:00Z,6.0000
160,Yalova,TR,40.700580,Altınova-MTHM,29.507850,no2,µg/m³,2017-12-08T00:00:00Z,15.0000
820,Yalova,TR,40.700580,Altınova-MTHM,29.507850,no2,µg/m³,2017-12-08T00:00:00Z,15.0000
147,Yalova,TR,40.700580,Altınova-MTHM,29.507850,o3,µg/m³,2017-12-08T00:00:00Z,52.0000
807,Yalova,TR,40.700580,Altınova-MTHM,29.507850,o3,µg/m³,2017-12-08T00:00:00Z,52.0000


In [174]:
history["utc"].unique()

array(['2017-12-08T00:00:00Z', '2017-12-08T01:00:00Z',
       '2017-12-08T02:00:00Z'], dtype=object)

Loop over all CSVs in the OpenAQ record and capture the unique locations.

Locations are included as long as they have the same "location", "city", "country" fields, and their "latitude" and "longitude" fields are equal to 6 significant figures.

In [41]:
bucket = "openaq-data"
openaq_data = s3_resource.Bucket(bucket)
keys = []
for file in openaq_data.objects.filter():
    if file.key[-3:] == "csv":
        keys.append(file.key)
keys 

['2015-06-29.csv',
 '2015-06-30.csv',
 '2015-07-01.csv',
 '2015-07-02.csv',
 '2015-07-03.csv',
 '2015-07-04.csv',
 '2015-07-06.csv',
 '2015-07-07.csv',
 '2015-07-08.csv',
 '2015-07-09.csv',
 '2015-07-10.csv',
 '2015-07-11.csv',
 '2015-07-12.csv',
 '2015-07-13.csv',
 '2015-07-14.csv',
 '2015-07-15.csv',
 '2015-07-16.csv',
 '2015-07-17.csv',
 '2015-07-18.csv',
 '2015-07-20.csv',
 '2015-07-21.csv',
 '2015-07-22.csv',
 '2015-07-23.csv',
 '2015-07-24.csv',
 '2015-08-02.csv',
 '2015-08-03.csv',
 '2015-08-04.csv',
 '2015-08-05.csv',
 '2015-08-06.csv',
 '2015-08-07.csv',
 '2015-08-08.csv',
 '2015-08-09.csv',
 '2015-08-10.csv',
 '2015-08-11.csv',
 '2015-08-12.csv',
 '2015-08-13.csv',
 '2015-08-14.csv',
 '2015-08-15.csv',
 '2015-08-16.csv',
 '2015-08-17.csv',
 '2015-08-18.csv',
 '2015-08-19.csv',
 '2015-08-20.csv',
 '2015-08-21.csv',
 '2015-08-22.csv',
 '2015-08-23.csv',
 '2015-08-24.csv',
 '2015-08-25.csv',
 '2015-08-26.csv',
 '2015-08-27.csv',
 '2015-08-28.csv',
 '2015-08-29.csv',
 '2015-08-30

Updating open_aq_locations

In [56]:

output_json = {}

kwargs = {
    "list_of_files":keys, 
    "target_table_name":"open_aq_locations", 
    "cols_and_types":cols_and_types_locations, 
    "start_ix":0, 
    "end_ix":len(keys),
    "output_json":output_json
}

# Implicit inputs:
# kwargs = {
#             "cols_with_apostrophes":["location", "city","country"],
#             "float_cols_to_round":["latitude", "longitude"],
#             "precision":6,
#             "datetime_column":"lastUpdated",
#             "datetime_cutoff":None
#             "want_data_since_cutoff":True
#          }

run_over_history_of_files(**kwargs)


### TO DO:
# delete_table("open_aq_locations")
# run_over_history_of_files(**kwargs)

Now handling date: 2015-06-29.csv
Number of new observations added to open_aq_locations: 0
{'2015-06-29.csv': 0}
Now handling date: 2015-06-30.csv
Number of new observations added to open_aq_locations: 0
{'2015-06-29.csv': 0, '2015-06-30.csv': 0}
Now handling date: 2015-07-01.csv
Number of new observations added to open_aq_locations: 0
{'2015-06-29.csv': 0, '2015-06-30.csv': 0, '2015-07-01.csv': 0}
Now handling date: 2015-07-02.csv
Number of new observations added to open_aq_locations: 0
{'2015-06-29.csv': 0, '2015-06-30.csv': 0, '2015-07-01.csv': 0, '2015-07-02.csv': 0}
Now handling date: 2015-07-03.csv
Number of new observations added to open_aq_locations: 0
{'2015-06-29.csv': 0, '2015-06-30.csv': 0, '2015-07-01.csv': 0, '2015-07-02.csv': 0, '2015-07-03.csv': 0}
Now handling date: 2015-07-04.csv
Number of new observations added to open_aq_locations: 0
{'2015-06-29.csv': 0, '2015-06-30.csv': 0, '2015-07-01.csv': 0, '2015-07-02.csv': 0, '2015-07-03.csv': 0, '2015-07-04.csv': 0}
Now han

KeyboardInterrupt: 

Updating open_aq_history

In [81]:
res = select_from_table("open_aq_history", 
                        #datetime_column="lastUpdated", 
                        #datetime_cutoff="2 days", 
                        #want_data_since_cutoff=True
                       )

#column_order = ["location", "city", "country", "latitude", "longitude"]
all_open_aq_history = pd.DataFrame(res["rows"]) #[column_order]

print("\nNumber of duplicates in the table:", all_open_aq_history.duplicated().sum())

print("\nA list of all locations ever reporting to the OpenAQ network:")
all_open_aq_history


A list of all locations ever reporting to the OpenAQ network:


In [84]:
res.text

'{"rows":[],"time":0.002,"fields":{"lastupdated":{"type":"date"},"value":{"type":"number"},"parameter":{"type":"string"},"sourcename":{"type":"string"},"location":{"type":"string"},"city":{"type":"string"},"iso3":{"type":"string"},"unit":{"type":"string"},"latitude":{"type":"number"},"longitude":{"type":"number"}},"total_rows":0}'

In [None]:
output_json = {}

# Updates for last 5 days of observations
kwargs = {
    "list_of_files":keys, 
    "target_table_name":"open_aq_history", 
    "cols_and_types":cols_and_types_history, 
    "start_ix":len(keys)-5, 
    "end_ix":len(keys),
    "output_json":output_json,
    "datetime_cutoff":"2 days"
}

# Implicit inputs:
# kwargs = {
#             "cols_with_apostrophes":["location", "city","country"],
#             "float_cols_to_round":["latitude", "longitude"],
#             "precision":6,
#             "datetime_column":"lastUpdated",
#             "want_data_since_cutoff":True
#          }

run_over_history_of_files(**kwargs)

In [350]:
# 135:310 comes from the code snippet below, to restart code and add in date to output of each loop
for csv in keys[135:310]:
    if csv[-3:] == "csv":
        print("Now handling date:", csv)
        url = "https://openaq-data.s3.amazonaws.com/"+csv
        data = pd.read_csv(url)
        
        kwargs = {
            "data_df":data,
            "target_table_name":"open_aq_locations",
            "cols_and_types":cols_and_types_locations,
            "cols_with_apostrophes":["location", "city","country"],
            "float_cols_to_round":["latitude", "longitude"],
            "precision":6
        }

        update_table_without_duplicates(**kwargs)

Now handling date: 2015-11-21.csv
Number of new observations added to open_aq_locations: 0
{"error":["syntax error at end of input"]}
Empty DataFrame
Columns: [location, city, country, latitude, longitude]
Index: []
Now handling date: 2015-11-22.csv
Number of new observations added to open_aq_locations: 0
{"error":["syntax error at end of input"]}
Empty DataFrame
Columns: [location, city, country, latitude, longitude]
Index: []
Now handling date: 2015-11-23.csv
Number of new observations added to open_aq_locations: 0
{"error":["syntax error at end of input"]}
Empty DataFrame
Columns: [location, city, country, latitude, longitude]
Index: []
Now handling date: 2015-11-24.csv
Number of new observations added to open_aq_locations: 0
{"error":["syntax error at end of input"]}
Empty DataFrame
Columns: [location, city, country, latitude, longitude]
Index: []
Now handling date: 2015-11-25.csv
Number of new observations added to open_aq_locations: 0
{"error":["syntax error at end of input"]}
Em

KeyboardInterrupt: 

In [359]:
# 310 comes from the code snippet below, to restart code and add in date to output of each loop
for csv in keys[309:598]:
    if csv[-3:] == "csv":
        print("Now handling date:", csv)
        url = "https://openaq-data.s3.amazonaws.com/"+csv
        data = pd.read_csv(url)
        
        kwargs = {
            "data_df":data,
            "target_table_name":"open_aq_locations",
            "cols_and_types":cols_and_types_locations,
            "cols_with_apostrophes":["location", "city","country"],
            "float_cols_to_round":["latitude", "longitude"],
            "precision":6
        }

        update_table_without_duplicates(**kwargs)

Now handling date: 2016-05-16.csv
Number of new observations added to open_aq_locations: 1
Completed up until index: 20
Now handling date: 2016-05-17.csv
Number of new observations added to open_aq_locations: 1
Completed up until index: 20
Now handling date: 2016-05-18.csv
Number of new observations added to open_aq_locations: 1
Completed up until index: 20
Now handling date: 2016-05-19.csv
Number of new observations added to open_aq_locations: 1
Completed up until index: 20
Now handling date: 2016-05-20.csv
Number of new observations added to open_aq_locations: 2
Completed up until index: 20
Now handling date: 2016-05-21.csv
Number of new observations added to open_aq_locations: 2
Completed up until index: 20
Now handling date: 2016-05-22.csv
Number of new observations added to open_aq_locations: 2
Completed up until index: 20
Now handling date: 2016-05-23.csv
Number of new observations added to open_aq_locations: 2
Completed up until index: 20
Now handling date: 2016-05-24.csv
Number

KeyboardInterrupt: 

In [366]:
# 310 comes from the code snippet below, to restart code and add in date to output of each loop
for csv in keys[597:]:
    if csv[-3:] == "csv":
        print("Now handling date:", csv)
        url = "https://openaq-data.s3.amazonaws.com/"+csv
        data = pd.read_csv(url)
        
        kwargs = {
            "data_df":data,
            "target_table_name":"open_aq_locations",
            "cols_and_types":cols_and_types_locations,
            "cols_with_apostrophes":["location", "city","country"],
            "float_cols_to_round":["latitude", "longitude"],
            "precision":6
        }

        update_table_without_duplicates(**kwargs)

Now handling date: 2017-02-28.csv
Number of new observations added to open_aq_locations: 1
Completed up until index: 20
Now handling date: 2017-03-01.csv
Number of new observations added to open_aq_locations: 2
Completed up until index: 20
Now handling date: 2017-03-02.csv
Number of new observations added to open_aq_locations: 5
Completed up until index: 20
Now handling date: 2017-03-03.csv
Number of new observations added to open_aq_locations: 1
Completed up until index: 20
Now handling date: 2017-03-04.csv
Number of new observations added to open_aq_locations: 1
Completed up until index: 20
Now handling date: 2017-03-05.csv
Number of new observations added to open_aq_locations: 1
Completed up until index: 20
Now handling date: 2017-03-06.csv
Number of new observations added to open_aq_locations: 1
Completed up until index: 20
Now handling date: 2017-03-07.csv
Number of new observations added to open_aq_locations: 4
Completed up until index: 20
Now handling date: 2017-03-08.csv
Number

In [365]:
# I wanted to add in a date display to the loop above
# Used the code below to figure where I had stopped previously
np.where(np.array(keys)==csv)

(array([598]),)

Read open_aq_locations table from Carto

In [77]:
res = select_from_table("open_aq_locations")
column_order = ["location", "city", "country", "latitude", "longitude"]
all_open_aq_locations = pd.DataFrame(res["rows"])[column_order]

print("\nNumber of duplicates in the table:", all_open_aq_locations.duplicated().sum())

print("\nA list of all locations ever reporting to the OpenAQ network:")
all_open_aq_locations


Number of duplicates in the table: 657

A list of all locations ever reporting to the OpenAQ network:


Unnamed: 0,location,city,country,latitude,longitude
0,Escuela E-10,Tocopilla,CL,-22.085519,-70.188683
1,Chiu Chiu,Calama,CL,-22.342264,-68.650897
2,Santa Margarita,Catemu,CL,-32.776573,-70.938144
3,Nueva Libertad,Talcahuano,CL,-36.735998,-73.118693
4,Lo Campo,Panquehue,CL,-32.797715,-70.898037
5,Coronel Sur,Coronel,CL,-37.031702,-73.138689
6,Catemu,Catemu,CL,-32.779208,-70.959114
7,Calabozo,Coronel,CL,-36.996431,-73.115805
8,Romeral,Hijuelas,CL,-32.823956,-71.006441
9,Lilla Essingen (E4/E20),Stockholm,SE,59.325519,18.003961


In [73]:
duped_locations = all_open_aq_locations[all_open_aq_locations.duplicated()]
duped_locations

Unnamed: 0,location,city,country,latitude,longitude
7810,,Miami-Fort Lauderdale-Miami Beach,US,26.593808,-80.058917
7902,,Miami-Fort Lauderdale-Miami Beach,US,26.593808,-80.058917
7905,,Miami-Fort Lauderdale-Miami Beach,US,26.593808,-80.058917
7911,,Miami-Fort Lauderdale-Miami Beach,US,26.593808,-80.058917
7929,,Miami-Fort Lauderdale-Miami Beach,US,26.593808,-80.058917
7931,,Miami-Fort Lauderdale-Miami Beach,US,26.593808,-80.058917
7934,,Miami-Fort Lauderdale-Miami Beach,US,26.593808,-80.058917
7942,,Miami-Fort Lauderdale-Miami Beach,US,26.593808,-80.058917
7945,,Miami-Fort Lauderdale-Miami Beach,US,26.593808,-80.058917
7947,,Miami-Fort Lauderdale-Miami Beach,US,26.593808,-80.058917


In [None]:
duped_locations = duped_locations.drop_duplicates(keep="first")
print(duped_locations)

shared = pd.merge(duped_locations, duped_locations, on=list(duped_locations.columns), how="inner")
shared["key"] = "x"
print(shared)
temp_df = pd.merge(duped_locations, shared, on=list(duped_locations.columns), how="left")
new_obs = temp_df[temp_df["key"].isnull()].drop("key", axis=1)
new_obs

Read open_aq_history table from Carto

In [None]:
res = select_from_table("open_aq_history", 
                        datetime_column="lastUpdated", 
                        datetime_cutoff="2 days", 
                        want_data_since_cutoff=True)

#column_order = ["location", "city", "country", "latitude", "longitude"]
all_open_aq_history = pd.DataFrame(res["rows"]) #[column_order]

print("\nNumber of duplicates in the table:", all_open_aq_history.duplicated().sum())

print("\nA list of all locations ever reporting to the OpenAQ network:")
all_open_aq_history

OpenAQ API Documentation: https://docs.openaq.org

In [None]:
# Cities
url = "https://api.openaq.org/v1/cities"
# Countries
url = "https://api.openaq.org/v1/countries"
# Fetches
url = "https://api.openaq.org/v1/fetches"
# Latest
url = "https://api.openaq.org/v1/latest"
# Locations
url = "https://api.openaq.org/v1/locations"
# Measurements
url = "https://api.openaq.org/v1/measurements"
# Parameters
url = "https://api.openaq.org/v1/parameters"
# Sources
url = "https://api.openaq.org/v1/sources"

Read from the OpenAQ API for most recent updates - check

In [None]:
## Pull in newest observations from OpenAQ API
url = "https://api.openaq.org/v1/latest"
# There is a limit from OpenAQ of 10,000 rows per API call
params = {
    "limit":10000
}

res = req.get(url, params=params)
data = res.json()["results"]

latest_data = pd.io.json.json_normalize(data, ['measurements'],[['coordinates', 'latitude'], ['coordinates', 'longitude'],'location', 'city', 'country'],  
                                          errors='ignore')

## Check existing record for possible duplicates from past readings

## TO DO: Format this for the UTC format the Carto table will use
look_back = "1 day"
table_name = "open_aq_history"
select_all_in_time_range_sql = """
SELECT * FROM {table_name} WHERE lastUpdated < {check_length}
""".format(table_name=table_name, check_length=look_back)

res = sql_api(select_all_in_time_range_sql)
data = pd.DataFrame(res["rows"])




## TO DO: kwargs and update_table_without_duplicates(**kwargs)
# Goal: update the long-running history of all observations

# Update list of observed locations in case new ones were introduced since last update

latest_locations = latest_data[["location", "city", "country", "coordinates.latitude", "coordinates.longitude"]]

kwargs = {
            "data_df":latest_locations,
            "target_table_name":"open_aq_locations",
            "cols_and_types":cols_and_types_locations,
            "cols_with_apostrophes":["location", "city","country"],
            "float_cols_to_roundfloat_cols":["latitude", "longitude"],
            "precision":6
        }

update_table_without_duplicates(**kwargs)



SQL statement that will join the location Unique ID to the newly observed data

In [None]:
## TO DO

## Write this! 

Experimentation

In [364]:
look_back_length = None
#look_back_length = ["a", "b"]
if look_back:
    print('hello')

hello


In [310]:
column_names = list(cols_and_types_locations.keys())
target_table_name = "open_aq_locations"
select_all_sql = """
SELECT * FROM {table_name}
""".format(table_name=target_table_name)
res = sql_api(select_all_sql)
target_table = pd.DataFrame(res["rows"], columns=column_names)
target_table = fix_precision_of_floats(target_table, ["latitude", "longitude"], 6)
target_table.shape

(7271, 5)

In [311]:
data_df = dec8
obs = data_df[column_names]
obs = obs.drop_duplicates(keep="first")
obs = keep_geolocated(obs)
obs = fix_precision_of_floats(obs, ["latitude", "longitude"], 6)
obs.shape

(7271, 5)

In [305]:
obs.head(10)

Unnamed: 0,location,city,country,latitude,longitude
0,Escuela E-10,Tocopilla,CL,-22.085519,-70.188683
7,Chiu Chiu,Calama,CL,-22.342264,-68.650897
8,Santa Margarita,Catemu,CL,-32.776573,-70.938144
9,Nueva Libertad,Talcahuano,CL,-36.735998,-73.118693
10,Lo Campo,Panquehue,CL,-32.797715,-70.898037
11,Coronel Sur,Coronel,CL,-37.031702,-73.138689
12,Catemu,Catemu,CL,-32.779208,-70.959114
13,Calabozo,Coronel,CL,-36.996431,-73.115805
14,Romeral,Hijuelas,CL,-32.823956,-71.006441
15,Lilla Essingen (E4/E20),Stockholm,SE,59.325519,18.003961


In [267]:
target_table.loc[0]

location     Escuela E-10
city            Tocopilla
country                CL
latitude         -22.0855
longitude        -70.1887
Name: 0, dtype: object

In [268]:
obs.loc[0]

location     Escuela E-10
city            Tocopilla
country                CL
latitude         -22.0855
longitude        -70.1887
Name: 0, dtype: object

In [273]:
target_table.loc[0].equals(obs.loc[0])

False

In [312]:
shared = target_table.merge(obs, on=column_names, how="inner")
shared["key"] = "x"
print(shared.shape)
shared.head(10)

(7250, 6)


Unnamed: 0,location,city,country,latitude,longitude,key
0,Escuela E-10,Tocopilla,CL,-22.085519,-70.188683,x
1,Chiu Chiu,Calama,CL,-22.342264,-68.650897,x
2,Santa Margarita,Catemu,CL,-32.776573,-70.938144,x
3,Nueva Libertad,Talcahuano,CL,-36.735998,-73.118693,x
4,Lo Campo,Panquehue,CL,-32.797715,-70.898037,x
5,Coronel Sur,Coronel,CL,-37.031702,-73.138689,x
6,Catemu,Catemu,CL,-32.779208,-70.959114,x
7,Calabozo,Coronel,CL,-36.996431,-73.115805,x
8,Romeral,Hijuelas,CL,-32.823956,-71.006441,x
9,Lilla Essingen (E4/E20),Stockholm,SE,59.325519,18.003961,x


In [314]:
temp_df = pd.merge(obs, shared, on=column_names, how="left")
new_obs = temp_df[temp_df["key"].isnull()].drop("key", axis=1)
#new_obs = new_obs.reset_index()[column_names]
new_obs

Unnamed: 0,location,city,country,latitude,longitude
76,MT00004,Zejtun (Citta' Beland),MT,35.852291,14.538986
549,Parque O'Higgins,Santiago,CL,-33.464142,-70.660797
638,Bristol St Paul's,Bristol,GB,51.462839,-2.584482
689,Belfast Stockman's Lane,Belfast,GB,54.572586,-5.974944
700,Derby St Alkmund's Way,Derby,GB,52.922983,-1.469507
1067,Fisherman's Landing,Gladstone,AU,-23.7937,151.1601
1238,St. John's,NEWFOUNDLAND,CA,47.6528,-52.8167
1381,Lancaster,Coeur d'Alene,US,47.7889,-116.8044
1561,Children's Park Site,Tucson,US,32.2953,-110.9822
2788,FR04158,Val-d'Oise,FR,49.063086,1.866381


In [232]:
print("Number of new observations added to", target_table_name + ":", new_obs.shape[0])
cols_and_types =cols_and_types_locations
cols_with_apostrophes = ["location", "city","country"]
update_in_batches(new_obs, 20, target_table_name, cols_and_types, cols_with_apostrophes)

Number of new observations added to open_aq_locations: 7271
Completed up until index: 20
Completed up until index: 40
Completed up until index: 60
Completed up until index: 80
Completed up until index: 100
Completed up until index: 120
Completed up until index: 140
Completed up until index: 160
Completed up until index: 180
Completed up until index: 200
Completed up until index: 220
Completed up until index: 240
Completed up until index: 260
Completed up until index: 280
Completed up until index: 300
Completed up until index: 320
Completed up until index: 340
Completed up until index: 360
Completed up until index: 380
Completed up until index: 400
Completed up until index: 420
Completed up until index: 440
Completed up until index: 460
Completed up until index: 480
Completed up until index: 500
Completed up until index: 520
Completed up until index: 540
Completed up until index: 560
Completed up until index: 580
Completed up until index: 600
Completed up until index: 620
Completed up u

Miscellaneous helper tables

In [122]:
# Table for converting from two letter ISO to three letter ISO
isos = pd.read_csv("/Users/nathansuberi/Desktop/Code Portfolio/ResourceWatchCode/Conversion_Standards/iso_conversions.csv", sep="\t", header=None)
isos.columns = ["country", "iso2", "iso3", "num"]
iso2s = isos.set_index("iso2")

Check OpenAQ API 'locations' endpoint for acknowledged sensor locations

In [377]:
# url = "https://api.openaq.org/v1/locations"

# # There are a total of 8055 locations in the database so far, according to this query
# # so this shouldn't miss any... but it may at some point

# ### FRANCIS ###
# # The 10000 limit on requests is a hard limit in their API... what to do if there are more than 
# # 10000 observations in the desired endpoint?
# params = {
#     "limit":10000
# }

# res = req.get(url, params=params)
# data = res.json()["results"]
# locations = pd.io.json.json_normalize(data, errors='ignore')
# locations.columns = ["city", "latitude", "longitude", "count", "country", "firstUpdated", "lastUpdated",
#              "location", "parameters", "sourceName", "sourceNames"]
# locations["iso3"] = iso2s.loc[locations["country"], "iso3"].values
# # Note - not storing parameters because these could change over time and it would be a pain to update
# locations = locations[["city", "latitude", "longitude", "iso3", "firstUpdated", "location", "sourceName"]]

# pre_clean = locations.shape[0]

# # View columns that have null values
# # https://stackoverflow.com/questions/14016247/python-find-integer-index-of-rows-with-nan-in-pandas
# print(pd.isnull(locations).any())
# print("number with no latitude",sum(pd.isnull(locations["latitude"])))
# print("number with no longitude",sum(pd.isnull(locations["longitude"])))
# print("number with no iso3",sum(pd.isnull(locations["iso3"])))

# keep_geotagged = pd.notnull(locations["latitude"]) & pd.notnull(locations["longitude"]) 

# # Remove all points that don't have a lat-lon
# locations = locations.loc[keep_geotagged]

# # Convert any remaining nan into empty string
# # http://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DataFrame.fillna.html
# locations = locations.fillna(value="")

# post_clean = locations.shape[0]
# print("Number of rows in OpenAQ locations database removed due to not having geo-coordinates:", pre_clean - post_clean)

# ## Having issues with non-standard characters... how to deal with this?

city            False
latitude         True
longitude        True
iso3             True
firstUpdated    False
location        False
sourceName      False
dtype: bool
number with no latitude 180
number with no longitude 180
number with no iso3 1
Number of rows in OpenAQ locations database removed due to not having geo-coordinates: 180


In [378]:
# # Use coordinates instead of location
# # OR use location, city, country
# # Check documentation for unique id

# locations.head()

Unnamed: 0,city,latitude,longitude,iso3,firstUpdated,location,sourceName
0,Ulaanbaatar,47.932907,106.92139,MNG,2015-09-01T00:00:00.000Z,100 ail,Agaar.mn
1,Omaha-Council Bluffs,41.32247,-95.93799,USA,2016-03-06T19:00:00.000Z,16th and Whitmore,AirNow
2,Farmington,36.8097,-107.6517,USA,2016-03-06T19:00:00.000Z,1NL Navajo Lake,AirNow
3,21 de mayo,-37.471184,-72.361465,CHL,2015-09-23T14:00:00.000Z,21 de mayo,Chile - SINCA
4,Tucson,32.205,-110.8772,USA,2016-03-06T19:00:00.000Z,22nd Street & Craycr,AirNow


Insert these observed sensor locations into open_aq_locations Carto table

In [381]:
table_name = "open_aq_locations"

## URI too large to insert more than about 20 rows at once, have to do in small sets
kwargs = {
    "data_df":locations,
    "batch_size":20,
    "target_table_name":table_name,
    "cols_and_types":cols_and_types_locations,
    "cols_with_apostrophes":["city", "location", "sourceName"]
}

update_in_batches(**kwargs)

Completed up until index: 40
Completed up until index: 60
Completed up until index: 80
Completed up until index: 100
Completed up until index: 120
Completed up until index: 140
Completed up until index: 160
Completed up until index: 180
Completed up until index: 200
Completed up until index: 220
Completed up until index: 240
Completed up until index: 260
Completed up until index: 280
Completed up until index: 300
Completed up until index: 320
Completed up until index: 340
Completed up until index: 360
Completed up until index: 380
Completed up until index: 400
Completed up until index: 420
Completed up until index: 440
Completed up until index: 460
Completed up until index: 480
Completed up until index: 500
Completed up until index: 520
Completed up until index: 540
Completed up until index: 560
Completed up until index: 580
Completed up until index: 600
Completed up until index: 620
Completed up until index: 640
Completed up until index: 660
Completed up until index: 680
Completed up 

List OpenAQ locations that we've previously acknowledged

In [353]:
table_name = "open_aq_locations"
select_all_sql = """
SELECT * FROM {table_name}
""".format(table_name=table_name)

res = sql_api(select_all_sql)
locations = pd.DataFrame(res["rows"])
locations = locations.set_index("location")
locations.head()

Unnamed: 0_level_0,city,firstupdated,iso3,latitude,longitude,sourcename
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
40RL01 - ROESELARE,Flanders,2016-11-17T00:00:00Z,BEL,50.95318,3.121155,EEA Belgium
40SZ01 - STEENOKKERZ,Flanders,2016-11-17T00:00:00Z,BEL,50.914577,4.504183,EEA Belgium
40SZ02 - STEENOKKERZ,Flanders,2016-11-17T00:00:00Z,BEL,50.91302,4.512184,EEA Belgium
40TS21 - TESSENDERLO,Flanders,2016-11-17T00:00:00Z,BEL,51.06571,5.107536,EEA Belgium
40WZ02 - MOL,Flanders,2016-11-17T00:00:00Z,BEL,51.1928,5.221534,EEA Belgium


Retrieve latest data

In [93]:
url = "https://api.openaq.org/v1/latest"

params = {
    "limit":10000
}

res = req.get(url, params=params)
data = res.json()["results"]

latest_data = pd.io.json.json_normalize(data, ['measurements'],[['coordinates', 'latitude'], ['coordinates', 'longitude'],'location', 'city', 'country'],  
                                          errors='ignore')

##
## Potential error - if no observed points have an averagingPeriod during an update, this can fail
##

#latest_data.columns = ["averagingPeriod", "lastUpdated", "parameter", "sourceName", "unit", "value", "latitude", "longitude", "location","city", "country"]
#latest_data["iso3"] = iso2s.loc[latest_data["country"], "iso3"].values
latest_data = latest_data.set_index("location")

## May need to develop function for adding iso3 that is more flexible for a range of spellings...
# Have a check whether anything was not successfully coded. Determine whether to add this new spelling
# to running list.

In [94]:
latest_data.head()

Unnamed: 0_level_0,averagingPeriod,lastUpdated,parameter,sourceName,unit,value,coordinates.latitude,coordinates.longitude,city,country
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100 ail,,2017-12-12T19:00:00.000Z,no2,Agaar.mn,µg/m³,26.0,47.932907,106.92139,Ulaanbaatar,MN
100 ail,,2017-12-12T19:00:00.000Z,o3,Agaar.mn,µg/m³,20.0,47.932907,106.92139,Ulaanbaatar,MN
100 ail,,2017-12-12T19:00:00.000Z,so2,Agaar.mn,µg/m³,28.0,47.932907,106.92139,Ulaanbaatar,MN
100 ail,,2017-12-12T19:00:00.000Z,pm10,Agaar.mn,µg/m³,177.0,47.932907,106.92139,Ulaanbaatar,MN
100 ail,,2017-12-12T19:00:00.000Z,co,Agaar.mn,µg/m³,1158.0,47.932907,106.92139,Ulaanbaatar,MN


Check to see that new data all has a corresponding location

In [367]:
unique_places_in_latest_data = latest_data.index.unique()
new_places_ix = [place not in locations.index for place in unique_places_in_latest_data]
new_places = unique_places_in_latest_data[new_places_ix]
print(new_places)

### Replaced apostrophes, this could be a problem
# Also dropped non-georeferenced
# Follow up on both of these and see if there are still any unaccounted for

print("\nTotal number of unique places in latest data:", len(unique_places_in_latest_data))
print("Previously unseen places in latest data:", len(new_places))

Index(['100 ail', '16th and Whitmore', '1NL Navajo Lake', '21 de mayo',
       '22nd Street & Craycr', '24th & O', '2912 Coffey', '2LL Los Lunas',
       '40AB01 - ANTWERPEN', '40AB02 - BERENDRECHT',
       ...
       'תחנה:איינשטין', 'תחנה:אריאל', 'תחנה:גבעת המורה', 'תחנה:גליל מערבי',
       'תחנה:גן שמואל', 'תחנה:חיפה', 'תחנה:כביש 1 מוצא', 'תחנה:ניידת1',
       'תחנה:קיסריה', 'ฺBan-Tai, Kanchanaburi'],
      dtype='object', name='location', length=240)

Total number of unique places in latest data: 8055
Previously unseen places in latest data: 240


Add any "new places" to the open_aq_locations table
* Note: firstUpdated will be set to the earliest "lastUpdated" field for that sensor in the new data
* This will likely not be correct... will need to verify this with OpenAQ partners

In [374]:
location_table_columns =["location","firstUpdated","sourceName","city","iso3","latitude","longitude"]
new_places_df = pd.DataFrame(columns=location_table_columns).set_index("location")
new_places_df

Unnamed: 0_level_0,firstUpdated,sourceName,city,iso3,latitude,longitude
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [369]:
latest_data.loc[new_places[0], ]

locations.reset_index().set_index("city").loc["Ulaanbaatar"]
latest_data.reset_index().set_index("city").loc["Ulaanbaatar"]

Unnamed: 0_level_0,averagingPeriod,lastUpdated,parameter,sourceName,unit,value,latitude,longitude,city,country,iso3
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100 ail,,2017-12-08T10:45:00.000Z,o3,Agaar.mn,µg/m³,0.0,47.932907,106.92139,Ulaanbaatar,MN,MNG
100 ail,,2017-12-08T10:45:00.000Z,pm10,Agaar.mn,µg/m³,203.0,47.932907,106.92139,Ulaanbaatar,MN,MNG
100 ail,,2017-12-08T10:45:00.000Z,so2,Agaar.mn,µg/m³,41.0,47.932907,106.92139,Ulaanbaatar,MN,MNG
100 ail,,2017-12-08T10:45:00.000Z,co,Agaar.mn,µg/m³,3771.0,47.932907,106.92139,Ulaanbaatar,MN,MNG
100 ail,,2017-12-08T10:45:00.000Z,no2,Agaar.mn,µg/m³,86.0,47.932907,106.92139,Ulaanbaatar,MN,MNG


In [157]:
# Load files to experiment with de-duping

# Set above: OPENAQ_DATA_FOLDER = "/Users/nathansuberi/Desktop/RW_Data/OpenAQ/"
sample1_file = "open_aq_latest_2017-12-07_09-19-10.csv"
sample2_file = "open_aq_latest_2017-12-08_09-41-36.csv"

df1 = pd.read_csv(OPENAQ_DATA_FOLDER+sample1_file, index_col=[0])
df2 = pd.read_csv(OPENAQ_DATA_FOLDER+sample2_file, index_col=[0])

df1.set_index(["latitude", "longitude"], inplace=True)
df2.set_index(["latitude", "longitude"], inplace=True)

In [167]:
# Determine if there are any overlaps
df1.index.levels[0]
df1.index.levels[1]

Float64Index([-158.088592529,  -157.96913147,  -157.87109375, -157.858093262,
              -156.492416382, -156.446105957, -156.370346069, -155.913299561,
              -155.778137207, -155.468902588,
              ...
               153.028106689,  153.029998779,  153.032104492,  153.035003662,
               153.087203979,  153.103805542,  153.135894775,  153.149505615,
               153.152694702,  153.158096313],
             dtype='float64', name='longitude', length=6183)

In [None]:
# Insert original data into Carto table
# Insert value sql

df = pd.read_csv("open_aq_latest_2017-12-08 09/21/31.219395.csv")

table_name = "open_aq_history"

columns = str(tuple(df.columns)).replace("'","")
values = ", ".join(list(df.apply(lambda row: dump_row_contents(row, cols_and_types_history), axis=1)))

insert_value_sql = """
INSERT INTO {table_name} {columns} VALUES {values}
""".format(table_name=table_name, columns=columns, values=values)

print(insert_value_sql)

res = sql_api(insert_value_sql)
print(res.text)


# Extract data from Carto table to run the de-duping method

# Select all from a table in a certain time range

## TO DO: Format this for the UTC format the Carto table will use
look_back = "1 day"
select_all_in_time_range_sql = """
SELECT * FROM {table_name} WHERE lastUpdated 
""".format(table_name=table_name)

res = sql_api(select_all_in_time_range_sql)
print(res.text)

# Add in the de-duped data as an extension to the original Carto table

Save latest data 
* Put on S3
* Add into open_aq_history table

In [139]:
# Current datetime, in desired format for naming convention
cur_datetime = str(datetime.now())
cur_datetime = cur_datetime.split(".")[0]
cur_datetime = cur_datetime.replace(":", "-")
cur_datetime = cur_datetime.replace(" ", "_")

folder = "/Users/nathansuberi/Desktop/RW_Data/open_aq/"
file_name = "open_aq_latest_{datetime}.csv".format(datetime=cur_datetime)
current_file_name = folder + file_name\
print(current_file_name)

#df.to_csv(current_file_name)

/Users/nathansuberi/Desktop/RW_Data/open_aq/open_aq_latest_2017-12-08_09-48-38.csv


In [62]:
## All of this made unnecessary by magic of json_normalize, but list flattening is a nice trick

# # Extract measurements

# # All possible measurements:
# # Flattening nested lists: https://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
# parameters = [obs["parameter"] for msr in df['measurements'] for obs in msr]
# parameters = np.unique(parameters, return_counts=True)
# #(array(['bc', 'co', 'no2', 'o3', 'pm10', 'pm25', 'so2'],
# #       dtype='<U4'), array([   17,  3453, 24056,  6837,  4493,  2953,  4665]))

# # Sometimes has an averaging period, other times not
# fields = ["averagingPeriod", "lastUpdated", "parameter", "sourceName", "unit", "value"]

# parameters

(array(['bc', 'co', 'no2', 'o3', 'pm10', 'pm25', 'so2'],
       dtype='<U4'), array([   17,  3453, 24056,  6837,  4493,  2953,  4665]))

In [103]:
# Exploring Python's datetime library
# Docs: https://docs.python.org/3/library/datetime.html

# This is UTC time, from Greenwich mean time
print(datetime.utcnow())
# This takes my current timezone
print(datetime.now())
# This makes a 1 day timedelta
print(timedelta(days=1))
print(datetime.now() - timedelta(days=1))

2017-12-08 14:19:57.581782
2017-12-08 09:19:57.582269
1 day, 0:00:00
2017-12-07 09:19:57.582608


In [249]:
# Experiment with breaking apart data structure
x = np.random.rand(107)
x = pd.DataFrame(x)
#print(x)

pieces = int(len(x) / piece_len)
rg = range(pieces+1)
y = pd.DataFrame([])

for r in rg:
    #print(r*piece_len)
    #print(r*piece_len+piece_len)
    y = y.append(x.iloc[r*piece_len:r*piece_len+piece_len], ignore_index=True)
#y = np.append(y,x[pieces*piece_len:])

print("These two dfs are equal:",x.equals(y))

These two dfs are equal: True
