# Import Libraries

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from copy import copy
from functools import reduce
import cartoframes
import json

# Authenticate to Carto

In [None]:
# Carto account name and API key. Both are hard-coded here; the trailing
# comments show an environment-variable lookup that is currently disabled.
# NOTE(review): `os` is not among the imports visible in this notebook, so
# switching to the os.environ.get(...) form also needs `import os`.
CARTO_USER = 'rw-nrt'#os.environ.get('CARTO_USER')
# Empty key as written -- presumably filled in locally before running; confirm.
CARTO_KEY = ''#os.environ.get('CARTO_KEY')

# Context for the account's Carto endpoint. In the visible cells it is only
# referenced by commented-out cc.read / cc.write calls.
cc = cartoframes.CartoContext(base_url='https://{}.carto.com/'.format(CARTO_USER),
                              api_key=CARTO_KEY)

# Constants

In [None]:
# A reading is flagged as an outlier when it lies more than SD_CUTOFF
# standard deviations from the mean of a time window it belongs to.
SD_CUTOFF = 3
# Windows extend WINDOW_TIME before and after each observation.
WINDOW_TIME = timedelta(hours = 12)
# Format of the 'utc' timestamp strings parsed with datetime.strptime.
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# Decide whether to keep a day's data from a station:
# the station must report at least MIN_COVERAGE times that day, and the first
# and last report must be at least MIN_SPAN apart.
# NOTE(review): the original comment said "covering 75% of day", but 10 hours
# is about 42% of 24 hours -- confirm which one is intended.
MIN_COVERAGE = 3
MIN_SPAN = timedelta(hours = 10)

# Minimum number of accepted days for a location to count as well represented.
# NOTE(review): the original comment said "75% of days in month"; 15 days is
# roughly half a month -- confirm which one is intended.
MIN_LOC_DAYS = 15

# Pull in legit data

In [None]:
# Not reading in the whole table from Carto (call kept below for reference);
# the data comes from a local CSV export instead.
#pm25_data = cc.read('cit_003a_air_quality_pm25')
# NOTE(review): absolute path on the author's machine -- adjust before running elsewhere.
pm25_data = pd.read_csv('/Users/nathansuberi/Desktop/RW_Data/pm25_data_for_openaq_blog.csv')
# Display (rows, columns) as a quick sanity check of the load.
pm25_data.shape

In [None]:
# Show the first 'utc' value to check how timestamps are represented
# (return_datetime parses them as strings in DATE_FORMAT).
pm25_data['utc'].iloc[0]

# Helper Functions

In [None]:
def window_by_time(history, ix, window_time):
    """Return the observations within ``window_time`` of observation ``ix``.

    Args:
        history: list of ``(datetime, value, id)`` tuples sorted by datetime.
        ix: position in ``history`` of the observation the window is centred on.
        window_time: ``timedelta`` half-width of the window.

    Returns:
        The contiguous slice of ``history`` whose timestamps ``t`` satisfy
        ``dt - window_time < t < dt + window_time`` where ``dt`` is the
        timestamp of ``history[ix]``. The slice always contains
        ``history[ix]`` itself.
    """
    dt = history[ix][0]
    earliest = dt - window_time
    latest = dt + window_time

    # Walk left only while the *previous* entry is still inside the window,
    # so lower_bd ends on the first in-window entry rather than on the first
    # entry that fell outside it.
    lower_bd = ix
    while lower_bd > 0 and history[lower_bd - 1][0] > earliest:
        lower_bd -= 1

    # upper_bd is an exclusive slice end. Starting at ix + 1 guarantees the
    # centre observation is kept, and stopping at len(history) keeps the
    # final entry when it is still inside the window.
    upper_bd = ix + 1
    n_obs = len(history)
    while upper_bd < n_obs and history[upper_bd][0] < latest:
        upper_bd += 1

    return history[lower_bd:upper_bd]

def return_datetime(timestamp):
    """Parse a timestamp string laid out as DATE_FORMAT into a datetime."""
    parsed = datetime.strptime(timestamp, DATE_FORMAT)
    return parsed

def extract_vals(tuples):
    """Return the middle field of every 3-tuple in ``tuples`` as one tuple."""
    columns = zip(*tuples)
    _first, vals, _last = columns
    return vals

def extract_vals_ids(tuples):
    """Split 3-tuples into ``(values, ids)``: the 2nd and 3rd fields, each as a tuple."""
    columns = zip(*tuples)
    _first, vals, ids = columns
    pair = (vals, ids)
    return pair

def id_outliers(info):
    """Return the ids of readings in a window that lie beyond the cutoff.

    Args:
        info: ``(window, mean, sd)`` where ``window`` is a sequence of
            ``(datetime, value, id)`` tuples and ``mean`` / ``sd`` are the
            mean and standard deviation to compare the values against.

    Returns:
        List of ids whose value is more than ``SD_CUTOFF`` standard
        deviations away from ``mean``. An empty window yields ``[]``.
    """
    window, mean, sd = info
    # Compare |val - mean| against SD_CUTOFF * sd instead of dividing by sd:
    # the test is the same for sd > 0, but a constant or single-reading window
    # (sd == 0) no longer triggers a 0/0 warning or ZeroDivisionError.
    threshold = SD_CUTOFF * sd
    return [_id for _, val, _id in window if np.abs(val - mean) > threshold]

def flatten(lst, elems):
    """Append to ``lst`` (in place) each item of ``elems`` not already in it; return ``lst``."""
    for candidate in elems:
        if candidate in lst:
            continue
        lst.append(candidate)
    return lst

def extract_day(utc):
    """Return the zero-padded day-of-month ('01'..'31') of a DATE_FORMAT timestamp string.

    Only the day number is returned, so the same day number in different
    months or years produces the same label.
    """
    moment = datetime.strptime(utc, DATE_FORMAT)
    day_of_month = moment.strftime('%d')
    return day_of_month


###
# Iterate over locations, set outlier column vals for each as we go
###

def mark_flags(df, info, num_locations):
    """Flag outliers and poorly covered days/stations for one location.

    Written as the step function of a ``reduce`` over
    ``enumerate(locations)``: it modifies ``df`` in place and returns it so
    the next step receives the same frame.

    Args:
        df: DataFrame of all observations. Reads the 'location', 'utc'
            (strings in DATE_FORMAT) and 'value' columns.
        info: ``(ix, loc_name)`` pair; ``ix`` is only used for progress output.
        num_locations: total number of locations, only used for progress output.

    Returns:
        ``df``, with ``True`` written to these columns on the affected rows
        of this location:
          * 'outlier' -- value beyond SD_CUTOFF standard deviations in some window,
          * 'poor_day_at_station' -- day with fewer than MIN_COVERAGE reports
            or spanning less than MIN_SPAN,
          * 'poor_station' -- location with fewer than MIN_LOC_DAYS accepted days.
    """
    # Track progress through the reduce function
    ix, loc_name = info
    print('Loc #{}/{}'.format(ix, num_locations))
    print('loc_name: {}'.format(loc_name))

    # Extract this location's rows and sort them by time ONCE, so timestamps,
    # values and positional ids below all refer to the same row order.
    # (Sorting only the timestamps would pair each time with another row's value.)
    loc_data = df[df['location'] == loc_name].sort_values(by='utc')
    print('Number of observations: {}'.format(loc_data.shape[0]))

    ###
    # Mark outliers
    ###

    # History of (datetime, value, position-in-loc_data) tuples
    dts = [return_datetime(utc) for utc in loc_data['utc']]
    history = list(zip(dts, loc_data['value'], range(len(dts))))

    # One time window per observation, with its mean and standard deviation
    windows = [window_by_time(history, pos, WINDOW_TIME) for pos in range(len(history))]
    eval_package = [
        (window, np.mean(extract_vals(window)), np.std(extract_vals(window)))
        for window in windows
    ]

    # Union of the outlier positions found in any window
    outlier_ids = reduce(flatten, map(id_outliers, eval_package), [])

    # Positions are relative to the sorted loc_data; translate to df labels
    outlier_ixs = loc_data.iloc[outlier_ids].index
    df.loc[outlier_ixs, 'outlier'] = True

    ###
    # Mark poorly represented days and locations
    ###

    # Kept as a separate Series (aligned with loc_data) instead of a new
    # column, which avoids assigning into a slice of df.
    day_labels = loc_data['utc'].map(extract_day)

    rejected_days = []
    accepted_days = []
    for day in sorted(set(day_labels)):
        day_utcs = loc_data.loc[day_labels == day, 'utc']

        # Rows are time-sorted, so first/last are the earliest/latest report
        span = return_datetime(day_utcs.iloc[-1]) - return_datetime(day_utcs.iloc[0])

        enough_reports = day_utcs.shape[0] >= MIN_COVERAGE
        enough_span = span >= MIN_SPAN
        if enough_reports and enough_span:
            accepted_days.append(day)
        else:
            rejected_days.append(day)

    # Label days for which we had insufficient coverage
    if rejected_days:
        insuff_data_ix = day_labels[day_labels.isin(rejected_days)].index
        df.loc[insuff_data_ix, 'poor_day_at_station'] = True

    # Mark whether location has adequate coverage overall. Only this
    # location's rows are flagged, not the whole frame.
    if len(accepted_days) < MIN_LOC_DAYS:
        df.loc[loc_data.index, 'poor_station'] = True

    return df

# Mark observations as outliers

In [None]:
# Distinct location names present in the data.
locations = pm25_data['location'].unique()
# Process every location; the trailing comment shows how to restrict the run
# to a single location while debugging.
process_locations = locations#['ES1535A']
# Fold mark_flags over (index, location) pairs, starting from pm25_data.
# mark_flags writes into the frame it is given and returns it, so
# labelled_data is expected to be the same object as pm25_data.
labelled_data = reduce(lambda df, info: mark_flags(df, info, len(process_locations)), enumerate(process_locations), pm25_data)

# Export to Carto

In [None]:
#cc.write(labelled_data, 'cit_003a_air_quality_pm25_flagged', overwrite=True)

In [None]:
# Save the flagged observations to a CSV file in the current working directory.
labelled_data.to_csv('cit_003a_air_quality_pm25_flagged_2.csv')

# View Example, previously troublesome location

In [None]:
# Select the flagged rows whose 'city' contains 'Albacete' and display them.
# NOTE(review): str.contains yields NaN for missing city values, which .loc
# rejects as a mask -- confirm 'city' has no missing values.
albacete_example = labelled_data.loc[labelled_data['city'].str.contains('Albacete')]
albacete_example

In [None]:
# Save the Albacete subset to a CSV file in the current working directory.
albacete_example.to_csv('albacete_cleaned_example.csv')

# Experimentation below

# Generate sample data

In [None]:
# Size of the synthetic sample and the half-width used by window_by_size.
LEN_SAMPLE = 100
WINDOW_SIZE = 4


# LEN_SAMPLE normally distributed values, scaled by LEN_SAMPLE.
# The sample length now follows LEN_SAMPLE instead of a hard-coded 100.
obs = np.random.randn(LEN_SAMPLE)*LEN_SAMPLE
# LEN_SAMPLE timestamps scattered around "now" (offset in seconds drawn from
# a normal distribution scaled by 10000). randn() with no argument returns a
# scalar, which avoids converting a 1-element array to float (deprecated in
# recent NumPy releases).
dates = [datetime.now() - timedelta(seconds=float(10000*np.random.randn())) for _ in range(LEN_SAMPLE)]

In [None]:
# (datetime, value, position) tuples in chronological order. The dates are
# sorted on their own, so each value is paired with the i-th earliest date.
history = list(zip(sorted(dates), obs, range(len(obs))))

In [None]:
def window_by_size(index, window_size, lst):
    """Return the entries of ``lst`` around position ``index``.

    The window runs from ``index - window_size`` (inclusive) to
    ``index + window_size`` (exclusive), clipped to the bounds of ``lst``.
    When the window covers the whole sequence, ``lst`` itself is returned
    rather than a slice.
    """
    total = len(lst)
    stop = index + window_size
    runs_past_end = total < stop

    # Sequence shorter than one half-window: everything is in the window
    if total < window_size:
        return lst

    # Window would start before the first entry
    if index < window_size:
        return lst if runs_past_end else lst[:stop]

    # Window starts inside the sequence; clip at the end if needed
    start = index - window_size
    return lst[start:] if runs_past_end else lst[start:stop]

def window_by_time(index, window_time, lst):
    """Return the entries of ``lst`` within ``window_time`` of entry ``index``.

    Note the argument order ``(index, window_time, lst)``: it differs from
    the helper of the same name defined earlier in this file, which this
    definition replaces once the cell has run.

    Args:
        index: position in ``lst`` of the observation the window is centred on.
        window_time: ``timedelta`` half-width of the window.
        lst: list of tuples sorted by their first field (a datetime).

    Returns:
        The contiguous slice of ``lst`` whose timestamps ``t`` satisfy
        ``obs_time - window_time < t < obs_time + window_time``; it always
        contains ``lst[index]``.
    """
    obs_time = lst[index][0]
    earliest = obs_time - window_time
    latest = obs_time + window_time

    # Move left only while the previous entry is still inside the window, so
    # the slice does not start on an out-of-window entry.
    lower_bd = index
    while lower_bd > 0 and lst[lower_bd - 1][0] > earliest:
        lower_bd -= 1

    # Exclusive end: starts past the centre entry and may reach len(lst), so
    # neither the centre nor an in-window final entry is dropped.
    upper_bd = index + 1
    n_obs = len(lst)
    while upper_bd < n_obs and lst[upper_bd][0] < latest:
        upper_bd += 1

    return lst[lower_bd:upper_bd]

In [None]:
# One window per observation: by neighbour count over the raw values, and by
# elapsed time over the (datetime, value, position) history. The time version
# is called with the (index, window_time, lst) argument order.
windowed_by_size_history = list(map(lambda ix: window_by_size(ix, WINDOW_SIZE, obs), range(len(obs))))
windowed_by_time_history = list(map(lambda ix: window_by_time(ix, WINDOW_TIME, history), range(len(history))))

In [None]:
# Display the time-based windows for inspection.
windowed_by_time_history

In [None]:
def extract_vals(tuples):
    """Pull the second field out of each 3-tuple and return them as a tuple."""
    transposed = zip(*tuples)
    _times, vals, _ids = transposed
    return vals

def extract_vals_ids(tuples):
    """Return ``(values, ids)``: the second and third fields of each 3-tuple."""
    transposed = zip(*tuples)
    _times, vals, ids = transposed
    return vals, ids

# Mean and standard deviation (np.std) of the values in each time window.
mean_windows = list(map(lambda tuples: np.mean(extract_vals(tuples)), windowed_by_time_history))
sd_windows = list(map(lambda tuples: np.std(extract_vals(tuples)), windowed_by_time_history))

# One (window, mean, sd) triple per observation, the input shape id_outliers expects.
eval_package = list(zip(windowed_by_time_history, mean_windows, sd_windows))

In [None]:
def id_outliers(info):
    """Return the ids of readings in a window that lie beyond the cutoff.

    Args:
        info: ``(window, mean, sd)`` where ``window`` is a sequence of
            ``(datetime, value, id)`` tuples.

    Returns:
        List of ids whose value is more than ``SD_CUTOFF`` standard
        deviations away from ``mean``.
    """
    window, mean, sd = info
    # The threshold constant in this notebook is SD_CUTOFF; the bare name
    # CUTOFF used previously is not defined anywhere and raised NameError.
    # Multiplying instead of dividing also keeps sd == 0 windows quiet.
    threshold = SD_CUTOFF * sd
    outlier_ids = []
    for _, val, _id in window:
        if np.abs(val - mean) > threshold:
            outlier_ids.append(_id)
    return outlier_ids

In [None]:
def flatten(lst, elems):
    """Extend ``lst`` in place with the items of ``elems`` it lacks, then return it."""
    for item in elems:
        already_present = item in lst
        if not already_present:
            lst.append(item)
    return lst

# Union (first-seen order, no duplicates) of the outlier ids found in any window.
outlier_ids = reduce(flatten, list(map(id_outliers, eval_package)), [])

In [None]:
# Display the outlier positions in ascending order.
sorted(outlier_ids)

# Pull in legit data

In [None]:
# Display the column names of the loaded observations.
pm25_data.columns

In [None]:
# Distinct location names, in order of first appearance; displayed for inspection.
locations = pm25_data['location'].unique()
locations

In [None]:
# Work on the first location only as a trial run.
loc_zero = locations[0]
data_zero = pm25_data[pm25_data['location'] == loc_zero]

# Timestamp and reading columns for that location (row order unchanged, not sorted).
zero_date_vals = data_zero[['utc', 'value']]

def return_datetime(timestamp):
    """Convert a timestamp to a plain ``datetime``.

    Accepts either a string laid out as DATE_FORMAT (what the CSV load
    produces) or an object exposing ``to_pydatetime()`` such as a pandas
    Timestamp. Handling both keeps the string-based callers working after
    this redefinition replaces the earlier string-only helper.
    """
    if isinstance(timestamp, str):
        return datetime.strptime(timestamp, DATE_FORMAT)
    return timestamp.to_pydatetime()
# Convert each 'utc' entry of the first location to a datetime.
dts = list(map(return_datetime, zero_date_vals['utc']))

# Pair each datetime with the reading from the same row.
new_vals = list(zip(dts, data_zero['value']))

In [None]:
# Append each pair's position within data_zero as a third field:
# (datetime, value, position).
history = list(zip(*zip(*new_vals), range(len(new_vals))))

In [None]:
# Display the (datetime, value, position) tuples for inspection.
history

In [None]:
# One time window per observation, using the (index, window_time, lst) argument order.
# NOTE(review): the windowing assumes history is in chronological order, and
# data_zero is not sorted here -- confirm the source rows are already ordered.
windowed_by_time_history = list(map(lambda ix: window_by_time(ix, WINDOW_TIME, history), range(len(history))))

In [None]:
def extract_vals(tuples):
    """Return the value column (second field) of a sequence of 3-tuples."""
    _stamps, vals, _positions = zip(*tuples)
    result = vals
    return result

def extract_vals_ids(tuples):
    """Return the value and id columns (second and third fields) as a pair of tuples."""
    _stamps, vals, ids = zip(*tuples)
    result = (vals, ids)
    return result

# Mean and standard deviation (np.std) of the readings in each window.
mean_windows = list(map(lambda tuples: np.mean(extract_vals(tuples)), windowed_by_time_history))
sd_windows = list(map(lambda tuples: np.std(extract_vals(tuples)), windowed_by_time_history))

# One (window, mean, sd) triple per observation, the input shape id_outliers expects.
eval_package = list(zip(windowed_by_time_history, mean_windows, sd_windows))

In [None]:
def id_outliers(info):
    """Return the ids of readings in a window that lie beyond the cutoff.

    Args:
        info: ``(window, mean, sd)`` where ``window`` is a sequence of
            ``(datetime, value, id)`` tuples.

    Returns:
        List of ids whose value is more than ``SD_CUTOFF`` standard
        deviations away from ``mean``.
    """
    window, mean, sd = info
    # SD_CUTOFF is the threshold constant defined in this notebook; the
    # previously used name CUTOFF does not exist and raised NameError.
    # Comparing against SD_CUTOFF * sd avoids dividing by a zero sd.
    limit = SD_CUTOFF * sd
    return [_id for _, val, _id in window if np.abs(val - mean) > limit]

In [None]:
def flatten(lst, elems):
    """Merge ``elems`` into ``lst`` in place, skipping items already there; return ``lst``."""
    for entry in iter(elems):
        if entry in lst:
            continue
        lst.append(entry)
    return lst

# Union (first-seen order, no duplicates) of outlier positions across all
# windows; positions refer to rows of data_zero.
outlier_ids = reduce(flatten, list(map(id_outliers, eval_package)), [])

In [None]:
# Show which positions were flagged and how many there are.
print(sorted(outlier_ids))
print(len(outlier_ids))

In [None]:
# outlier_ids are row positions within data_zero (the first location's rows),
# not within the full table, so translate them to index labels through
# data_zero before writing the flag into pm25_data.
pm25_data.loc[data_zero.iloc[outlier_ids].index, 'outlier'] = True

In [None]:
# Display the table to check the newly written 'outlier' flags.
pm25_data

In [None]:
# With dataframes the way I tried it is slower

# With dataframe -- untested
def window_by_time_df(history, ix, window_time):
    """DataFrame variant of the time window: rows within ``window_time`` of row ``ix``.

    Args:
        history: DataFrame with a 'dt' column of timestamps, sorted by 'dt'.
        ix: row position (0-based) of the observation the window is centred on.
        window_time: half-width of the window (a timedelta).

    Returns:
        The contiguous block of rows whose 'dt' lies strictly between
        ``dt - window_time`` and ``dt + window_time``; row ``ix`` is always
        included.
    """
    times = history['dt']
    dt = times.iloc[ix]
    earliest = dt - window_time
    latest = dt + window_time

    # Step left only while the previous row is still inside the window, so
    # the block does not begin on an out-of-window row.
    lower_bd = ix
    while lower_bd > 0 and times.iloc[lower_bd - 1] > earliest:
        lower_bd -= 1

    # Exclusive end: begins past the centre row and may reach the row count,
    # so an in-window final row is kept.
    upper_bd = ix + 1
    n_rows = history.shape[0]
    while upper_bd < n_rows and times.iloc[upper_bd] < latest:
        upper_bd += 1

    # Positional row slice
    return history.iloc[lower_bd:upper_bd]

def id_outliers_df(info):
    """DataFrame variant of id_outliers.

    Args:
        info: ``(window, mean, sd)`` where ``window`` is a DataFrame with
            'ppm' (reading) and 'ix' (row position) columns.

    Returns:
        List of 'ix' entries whose 'ppm' is more than ``SD_CUTOFF`` standard
        deviations away from ``mean``.
    """
    window, mean, sd = info
    # SD_CUTOFF is the notebook's threshold constant; CUTOFF was undefined.
    # Comparing against SD_CUTOFF * sd avoids dividing by a zero sd.
    limit = SD_CUTOFF * sd
    outlier_ids = []
    # Iterate the two columns separately: taking .values of both columns at
    # once upcasts the integer positions to float, which cannot be used for
    # positional indexing afterwards.
    for ppm, ix in zip(window['ppm'], window['ix']):
        if np.abs(ppm - mean) > limit:
            outlier_ids.append(ix)
    return outlier_ids

def mark_outliers_df(df, info, num_locations):
    """Flag outlier readings for one location; step function for ``reduce``.

    Args:
        df: DataFrame of all observations with 'location', 'utc' and 'value' columns.
        info: ``(ix, loc_name)`` pair; ``ix`` is only used for progress output.
        num_locations: total number of locations, only used for progress output.

    Returns:
        ``df`` itself, with ``True`` written to 'outlier' on the flagged rows
        of this location.
    """
    # Track progress through the reduce function
    ix, loc_name = info
    print('Loc #{}/{}'.format(ix, num_locations))
    print('loc_name: {}'.format(loc_name))

    # Extract this location's rows in chronological order; the windowing
    # below assumes a time-sorted history.
    loc_data = df[df['location'] == loc_name].sort_values(by='utc')
    print('Number of observations: {}'.format(loc_data.shape[0]))

    # History of (datetime, value, position-in-loc_data) tuples
    dts = [return_datetime(utc) for utc in loc_data['utc']]
    history = list(zip(dts, loc_data['value'], range(len(dts))))

    # Create windows over history.
    # NOTE(review): this file defines window_by_time twice with different
    # argument orders; this call uses (history, ix, window_time) -- make sure
    # that definition is the active one when this function runs.
    windowed_by_time_history = [
        window_by_time(history, pos, WINDOW_TIME) for pos in range(len(history))
    ]

    # Package each window with its mean and standard deviation
    eval_package = [
        (window, np.mean(extract_vals(window)), np.std(extract_vals(window)))
        for window in windowed_by_time_history
    ]

    # Identify outliers
    outlier_ids = reduce(flatten, map(id_outliers, eval_package), [])
    print('Num outliers: {}'.format(len(outlier_ids)))

    # outlier_ids are positions within loc_data, so resolve them to index
    # labels through loc_data (not df) before marking the rows in df.
    df.loc[loc_data.iloc[outlier_ids].index, 'outlier'] = True
    return df

In [None]:
# Repeat the windowing steps by hand for one specific location.
# NOTE(review): index 1892 is hard-coded and needs at least 1893 distinct locations.
loc_data = pm25_data[pm25_data['location'] == locations[1892]]

utc_val = loc_data[['utc', 'value']]
    
# Convert timestamps and pair each with its reading (row order is not sorted here).
dts = list(map(return_datetime, utc_val['utc']))

new_vals = list(zip(dts, loc_data['value']))

# (datetime, value, position) tuples
history = list(zip(*zip(*new_vals), range(len(new_vals))))

# One time window per observation, using the (index, window_time, lst) argument order.
windowed_by_time_history = list(map(lambda ix: window_by_time(ix, WINDOW_TIME, history), range(len(history))))

# Per-window mean and standard deviation
mean_windows = list(map(lambda tuples: np.mean(extract_vals(tuples)), windowed_by_time_history))
sd_windows = list(map(lambda tuples: np.std(extract_vals(tuples)), windowed_by_time_history))

# (window, mean, sd) triples, the input shape id_outliers expects.
eval_package = list(zip(windowed_by_time_history, mean_windows, sd_windows))
