In [2]:
import os
import numpy as np
import pandas as pd

## Functions

In [2]:
# Functions for calculating outlier ranges using the IQR and MAD (Median Absolute Deviation) methods

def calc_outlier_ranges_IQR(data, axis=None, decrease_lower_range_by=None, increase_upper_range_by=None):
    """
    Calculate lower and upper range of outlier detection using the IQR method.

    The fences are (q1 - 1.5 * IQR) and (q3 + 1.5 * IQR). The returned ranges are the smallest and
    largest non-NaN data values that lie inside those fences, optionally widened by the
    user-defined offsets.

    :param data: An array or list (anything np.asarray() accepts, e.g. a pandas Series). NaN values are ignored.
    :param axis: Axis or axes along which the percentiles are computed. Default set to None, which uses
                 all values of data together.
    :param decrease_lower_range_by: A user-defined value to decrease lower range of outlier detection.
                                    Default set to None (no adjustment).
    :param increase_upper_range_by: A user-defined value to increase upper range of outlier detection.
                                    Default set to None (no adjustment).

    :return: lower_range, upper_range, median. Scalars when axis is None, otherwise arrays with the
             reduced axis/axes removed. lower_range and upper_range are computed in float64.

    :raises ValueError: if data is empty, or (for axis=None) no non-NaN value lies inside the fences,
                        e.g. when data is all NaN.
    """
    raw = np.asarray(data)
    if raw.size == 0:
        raise ValueError('data must contain at least one value')

    # keepdims=True keeps the reduced axes as length-1 so the fences broadcast against the data
    q1 = np.nanpercentile(raw, 25, axis=axis, keepdims=True)
    q3 = np.nanpercentile(raw, 75, axis=axis, keepdims=True)
    median = np.nanpercentile(raw, 50, axis=axis)

    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    # compare in float64; comparisons involving NaN are False, so NaN values never pass the masks
    values = raw.astype(np.float64)
    with np.errstate(invalid='ignore'):
        above_lower_fence = values >= lower_fence
        below_upper_fence = values <= upper_fence

    if axis is None and not (above_lower_fence.any() and below_upper_fence.any()):
        raise ValueError('no non-NaN value of data lies inside the IQR fences')

    # values outside the fences are masked with NaN so nanmin/nanmax skip them.
    # With an explicit axis, a slice with no value inside the fences yields NaN (numpy emits a RuntimeWarning).
    lower_range = np.nanmin(np.where(above_lower_fence, values, np.nan), axis=axis)
    upper_range = np.nanmax(np.where(below_upper_fence, values, np.nan), axis=axis)

    # adjusts lower and upper values by an author-defined range; each offset is applied independently
    if decrease_lower_range_by is not None:
        lower_range = lower_range - decrease_lower_range_by

    if increase_upper_range_by is not None:
        upper_range = upper_range + increase_upper_range_by

    return lower_range, upper_range, median


def calc_outlier_ranges_MAD(data, axis=None, threshold=3):
    """
    Calculate lower and upper range of outlier detection using Median Absolute Deviation (MAD) method.

    The ranges are median -/+ threshold * MAD, where MAD is the median of the absolute deviations
    from the median. The raw MAD is used as computed here; no additional scale factor is applied.

    A good paper on MAD-based outlier detection:
    https://www.sciencedirect.com/science/article/pii/S0022103113000668

    :param data: An array or list (anything np.asarray() accepts, e.g. a pandas Series). NaN values are ignored.
    :param axis: Axis or axes along which the medians are computed. Default set to None, which uses
                 all values of data together.
    :param threshold: Value of threshold to use in MAD method. Default set to 3.

    :return: lower_range, upper_range, median. Scalars when axis is None, otherwise arrays with the
             reduced axis/axes removed.
    """
    values = np.asarray(data)

    # Calculate the median along the specified axis
    median = np.nanmedian(values, axis=axis)

    # Re-insert the reduced axis/axes so the median broadcasts against the data when axis is given
    center = median if axis is None else np.expand_dims(median, axis)

    # Calculate the absolute deviations from the median
    abs_deviation = np.abs(values - center)

    # Calculate the median of the absolute deviations
    MAD = np.nanmedian(abs_deviation, axis=axis)

    lower_range = median - threshold * MAD
    upper_range = median + threshold * MAD

    return lower_range, upper_range, median

## Outlier analysis for WestUS

In [3]:
# Read the train/test parquet file (path is relative to this notebook's working directory)
# into a DataFrame and preview the first rows.
data = '../../Eff_Precip_Model_Run/Model_csv/train_test.parquet'
df = pd.read_parquet(data)
df.head()

Unnamed: 0,Effective_precip_train,year,month,MODIS_Day_LST,MODIS_LAI,MODIS_NDVI,PRISM_Precip,PRISM_Tmax,PRISM_Tmin,Ssebop_ETa,...,DAYMET_sun_hr,Bulk_density,Clay_content,Field_capacity,Sand_content,DEM,Slope,AWC,Latitude,Longitude
0,5.695302,2016,1,268.320007,0.228571,0.004818,7.65,0.095,-11.095,6.0,...,8.117245,151.5,26.833334,24.666666,38.666668,1043.0,0.823705,180.0,48.725521,-112.085495
1,3.39569,2016,1,268.579987,0.228571,-0.001483,7.65,0.095,-11.095,6.0,...,8.117245,151.0,26.166666,24.166666,37.166668,1031.0,0.611923,170.0,48.725521,-112.065727
2,4.910922,2016,1,268.404999,0.257143,-0.001483,7.933,0.114,-11.404,7.0,...,8.117245,152.166672,26.5,25.666666,38.833332,1022.0,0.286236,100.0,48.725521,-112.045967
3,8.388258,2016,1,268.415009,0.271429,-0.008232,7.933,0.114,-11.404,7.0,...,8.117245,150.5,27.333334,25.5,38.5,1023.0,0.258009,160.0,48.725521,-112.026207
4,5.456954,2016,1,268.415009,0.228571,-0.015721,8.136,0.179,-11.498,0.0,...,8.117245,153.5,28.833334,25.166666,35.333332,1024.0,0.488622,110.0,48.725521,-112.006439


In [4]:
# IQR-based outlier range of the Effective_precip_train column (default axis, no range adjustments)
lower_range, upper_range, median = calc_outlier_ranges_IQR(data=df['Effective_precip_train'])

lower_range, upper_range, median

(0.0, 101.67916870117188, 23.550756454467773)

In [5]:
# MAD-based outlier range of the Effective_precip_train column (default axis and threshold)
lower_range, upper_range, median = calc_outlier_ranges_MAD(data=df['Effective_precip_train'])

lower_range, upper_range, median

(-19.145645141601562, 66.24715805053711, 23.550756)

In [6]:
# Largest value in the Effective_precip_train column, for comparison with the upper ranges computed above
df.Effective_precip_train.max()

195.69199

In [5]:
# 0.99 quantile (99th percentile) of each column of df
df.quantile(0.99)

Effective_precip_train     139.238254
year                      2020.000000
month                       12.000000
MODIS_Day_LST              324.290009
MODIS_LAI                    2.050000
MODIS_NDVI                   0.690436
PRISM_Precip               196.911763
PRISM_Tmax                  36.507999
PRISM_Tmin                  20.659000
Ssebop_ETa                 155.000000
GRIDMET_Precip             197.100006
GRIDMET_RET                220.065002
GRIDMET_vap_pres_def         3.120968
GRIDMET_max_RH              97.407143
GRIDMET_min_RH              61.639286
GRIDMET_wind_vel             6.038710
GRIDMET_short_rad          346.706451
DAYMET_sun_hr               15.453017
Bulk_density               173.000000
Clay_content                34.500000
Field_capacity              30.000000
Sand_content                73.333336
DEM                       2545.000000
Slope                       10.963465
AWC                        220.000000
Latitude                    48.468601
Longitude   