# Functions to clean the data

In [8]:
import requests
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import warnings

In [2]:
from ipynb.fs.full.Load_Data import load_data

In [4]:
# Fetch the example data set; one keyword argument per line keeps the query readable.
df = load_data(
    lat_start=51,
    lat_end=52,
    long_start=9,
    long_end=10,
    start_year=2018,
    start_month=4,
    start_day=30,
    delta_hours=1,
)
df

Unnamed: 0,measurement_PM10,measurement_PM2.5,time,lat,lon,sensor_id,measurement_id
0,18.43,11.90,2018-04-30T00:00:03Z,51.732,9.032,9946,9946_2018-04-30T00:00:03Z
1,9.17,8.27,2018-04-30T00:00:11Z,51.340,9.429,4784,4784_2018-04-30T00:00:11Z
2,,,2018-04-30T00:00:12Z,51.340,9.429,4785,4785_2018-04-30T00:00:12Z
3,12.87,8.23,2018-04-30T00:00:14Z,51.520,9.954,11998,11998_2018-04-30T00:00:14Z
4,,,2018-04-30T00:00:14Z,51.520,9.954,11999,11999_2018-04-30T00:00:14Z
...,...,...,...,...,...,...,...
1082,11.50,9.00,2018-04-30T00:59:50Z,51.290,9.641,4973,4973_2018-04-30T00:59:50Z
1083,,,2018-04-30T00:59:51Z,51.290,9.641,4974,4974_2018-04-30T00:59:51Z
1084,12.47,11.17,2018-04-30T00:59:54Z,51.854,9.672,9870,9870_2018-04-30T00:59:54Z
1085,13.43,12.00,2018-04-30T00:59:57Z,51.561,9.987,12334,12334_2018-04-30T00:59:57Z


In [16]:
# Drop observations with a missing PM reading and report how many were dropped
def remove_missing(df):
    """Drop every row that lacks a PM10 or a PM2.5 reading.

    A row is removed as a whole as soon as one of the two measurement
    columns is missing. The number of removed rows is printed.

    INPUT:
        df: A pandas data frame with the columns 'measurement_PM10' and
            'measurement_PM2.5'.

    OUTPUT:
        A pandas data frame holding only the rows in which both
        measurements are present. The input frame itself is not modified.
    """
    pm_columns = ['measurement_PM10', 'measurement_PM2.5']
    # Row-wise flag: True where at least one of the two readings is NaN.
    has_gap = df[pm_columns].isna().any(axis=1).to_numpy()
    dropped = has_gap.sum()
    print(dropped,"observations were removed from the data frame")
    return df[~has_gap]
# Apply the filter. This rebinds `df`, so the unfiltered frame is no longer reachable under that name.
df = remove_missing(df)

0 observations were removed from the data frame


Unnamed: 0,measurement_PM10,measurement_PM2.5,time,lat,lon,sensor_id,measurement_id
0,18.43,11.90,2018-04-30T00:00:03Z,51.732,9.032,9946,9946_2018-04-30T00:00:03Z
1,9.17,8.27,2018-04-30T00:00:11Z,51.340,9.429,4784,4784_2018-04-30T00:00:11Z
3,12.87,8.23,2018-04-30T00:00:14Z,51.520,9.954,11998,11998_2018-04-30T00:00:14Z
5,8.00,6.87,2018-04-30T00:00:18Z,51.355,9.529,3819,3819_2018-04-30T00:00:18Z
6,8.37,7.17,2018-04-30T00:00:21Z,51.854,9.672,9870,9870_2018-04-30T00:00:21Z
...,...,...,...,...,...,...,...
1079,13.60,9.40,2018-04-30T00:59:46Z,51.520,9.954,11998,11998_2018-04-30T00:59:46Z
1080,10.97,9.80,2018-04-30T00:59:47Z,51.537,9.947,1781,1781_2018-04-30T00:59:47Z
1082,11.50,9.00,2018-04-30T00:59:50Z,51.290,9.641,4973,4973_2018-04-30T00:59:50Z
1084,12.47,11.17,2018-04-30T00:59:54Z,51.854,9.672,9870,9870_2018-04-30T00:59:54Z


In [66]:
# check for outliers and filter those using different methods
def remove_outliers(df,method = "Z-score", z_val = 2.58, crit_val = [0,100], quantile = [0,0.99]):
    """Remove rows in which the PM10 or the PM2.5 measurement is an outlier.

    The entire row is dropped as soon as one of the two measurements is
    flagged. Missing measurements are never flagged, because every comparison
    with NaN is False. The number of dropped rows is printed.

    INPUTS:
        df:        Pandas data frame with the columns 'measurement_PM10' and
                   'measurement_PM2.5'.
        method:    String selecting the filter. One of "Z-score",
                   "critical_value" or "quantile". Default: "Z-score".
        z_val:     Number, only used by "Z-score". Each column is standardised
                   with its own mean and sample standard deviation; values with
                   a z-score below -z_val or above z_val are outliers.
                   Default: 2.58.
        crit_val:  List [lower, upper], only used by "critical_value". Values
                   below lower or above upper are outliers; the same bounds
                   apply to both columns. Default: [0,100].
        quantile:  List [lower, upper] of quantiles, only used by "quantile".
                   Values below the lower or above the upper quantile of their
                   own column are outliers. Default: [0,0.99].

        The list defaults are only read, never modified.

    OUTPUTS:
        Pandas data frame without the flagged rows (the input frame is not
        modified). Returns None after printing an error message if crit_val
        or quantile holds fewer than two elements.

    RAISES:
        ValueError: if method is not one of the three supported strings.
    """
    pm_columns = ['measurement_PM10', 'measurement_PM2.5']
    measurement = df[pm_columns]

    if method == "Z-score":
        z_score = (measurement - measurement.mean()) / measurement.std()
        outlier = (z_score < -z_val) | (z_score > z_val)

    elif method == "critical_value":
        if len(crit_val) < 2:
            print("ERROR: please provide a lower and an upper bound")
            return None
        if len(crit_val) > 2:
            warnings.warn("Only first two elements of crit_val will be used!")
        outlier = (measurement < crit_val[0]) | (measurement > crit_val[1])

    elif method == "quantile":
        if len(quantile) < 2:
            print("ERROR: please provide a lower and an upper quantile")
            return None
        if len(quantile) > 2:
            warnings.warn("Only first two elements of quantile will be used!")
        lower = measurement.quantile(quantile[0])
        upper = measurement.quantile(quantile[1])
        # The quantile Series are indexed by column name, so compare by label.
        # Positional access such as lower[0] on a label index is deprecated in
        # pandas and would silently depend on column order.
        outlier = measurement.lt(lower, axis='columns') | measurement.gt(upper, axis='columns')

    else:
        # Without this branch an unknown method would fail further down with
        # an unhelpful UnboundLocalError.
        raise ValueError(
            'Unknown method "{}": choose "Z-score", "critical_value" or "quantile"'.format(method)
        )

    # A row is excluded if either of the two measurements is an outlier.
    exclude = outlier.any(axis=1).to_numpy()
    n_excluded = exclude.sum()
    print(n_excluded,"observations were deleted")
    return df[~exclude]

# Example call using the quantile method with the function's default bounds.
# The result is only displayed, not assigned, so `df` keeps all of its rows.
remove_outliers(df,method = "quantile")

13 observations were deleted


Unnamed: 0,measurement_PM10,measurement_PM2.5,time,lat,lon,sensor_id,measurement_id
0,18.43,11.90,2018-04-30T00:00:03Z,51.732,9.032,9946,9946_2018-04-30T00:00:03Z
1,9.17,8.27,2018-04-30T00:00:11Z,51.340,9.429,4784,4784_2018-04-30T00:00:11Z
3,12.87,8.23,2018-04-30T00:00:14Z,51.520,9.954,11998,11998_2018-04-30T00:00:14Z
5,8.00,6.87,2018-04-30T00:00:18Z,51.355,9.529,3819,3819_2018-04-30T00:00:18Z
6,8.37,7.17,2018-04-30T00:00:21Z,51.854,9.672,9870,9870_2018-04-30T00:00:21Z
...,...,...,...,...,...,...,...
1079,13.60,9.40,2018-04-30T00:59:46Z,51.520,9.954,11998,11998_2018-04-30T00:59:46Z
1080,10.97,9.80,2018-04-30T00:59:47Z,51.537,9.947,1781,1781_2018-04-30T00:59:47Z
1082,11.50,9.00,2018-04-30T00:59:50Z,51.290,9.641,4973,4973_2018-04-30T00:59:50Z
1084,12.47,11.17,2018-04-30T00:59:54Z,51.854,9.672,9870,9870_2018-04-30T00:59:54Z
