In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Functions for calculating outlier detection ranges using the IQR and MAD methods

def calc_outlier_ranges_IQR(data, axis=None, decrease_lower_range_by=None, increase_upper_range_by=None):
    """
    Calculate the lower and upper range of outlier detection using the IQR method.

    The ranges are the smallest and largest *observed* values that lie inside the
    Tukey fences [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. NaN values are ignored.

    :param data: An array, list, or pandas Series. Multi-dimensional input is allowed;
                 with axis=None it is treated as flattened.
    :param axis: Axis or axes along which the percentiles and ranges are computed.
                 Default set to None, which uses all values of data together.
                 (A tuple of axes relies on np.expand_dims accepting tuples, NumPy >= 1.18.)
    :param decrease_lower_range_by: A user-defined value subtracted from the lower range.
                                    Default set to None (no adjustment).
    :param increase_upper_range_by: A user-defined value added to the upper range.
                                    Default set to None (no adjustment).

    :return: lower_range, upper_range, median. Scalars when axis is None, otherwise arrays
             with the reduced axis removed. With an axis given, a slice that has no value
             inside its fences (e.g. an all-NaN slice) yields NaN for that slice.

    :raises ValueError: If data is empty, or if axis is None and no value lies inside the fences
                        (e.g. all values are NaN).
    """
    values = np.asarray(data)
    if values.size == 0:
        raise ValueError('data must contain at least one value to compute IQR outlier ranges.')

    # one call computes all three percentiles in a single pass over the data
    q1, median, q3 = np.nanpercentile(values, [25, 50, 75], axis=axis)

    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    if axis is None:
        # comparisons with NaN are False, so NaN values drop out of both selections
        above_lower_fence = values[values >= lower_fence]
        below_upper_fence = values[values <= upper_fence]

        if above_lower_fence.size == 0 or below_upper_fence.size == 0:
            raise ValueError('no values of data fall inside the IQR fences (is data all NaN?).')

        lower_range = np.nanmin(above_lower_fence)
        upper_range = np.nanmax(below_upper_fence)
    else:
        # re-insert the reduced axis so the fences broadcast against the data
        lower_fence = np.expand_dims(lower_fence, axis)
        upper_fence = np.expand_dims(upper_fence, axis)

        # values outside the fences are masked to NaN and skipped by nanmin/nanmax
        lower_range = np.nanmin(np.where(values >= lower_fence, values, np.nan), axis=axis)
        upper_range = np.nanmax(np.where(values <= upper_fence, values, np.nan), axis=axis)

    # adjusts lower and upper values by an author-defined range; the two are independent
    if decrease_lower_range_by is not None:
        lower_range = lower_range - decrease_lower_range_by

    if increase_upper_range_by is not None:
        upper_range = upper_range + increase_upper_range_by

    return lower_range, upper_range, median


def calc_outlier_ranges_MAD(data, axis=None, threshold=3):
    """
    Calculate the lower and upper range of outlier detection using the Median Absolute Deviation (MAD) method.

    The ranges are median -/+ threshold * MAD, where MAD is the median of the absolute
    deviations from the median. NaN values are ignored. The MAD is used unscaled
    (no consistency constant is applied).

    A good paper on MAD-based outlier detection:
    https://www.sciencedirect.com/science/article/pii/S0022103113000668

    :param data: An array, list, or pandas Series. Multi-dimensional input is allowed;
                 with axis=None it is treated as flattened.
    :param axis: Axis or axes along which the medians are computed.
                 Default set to None, which uses all values of data together.
                 (A tuple of axes relies on np.expand_dims accepting tuples, NumPy >= 1.18.)
    :param threshold: Number of MADs away from the median that defines the ranges. Default set to 3.

    :return: lower_range, upper_range, median. Scalars when axis is None, otherwise arrays
             with the reduced axis removed.
    """
    values = np.asarray(data)

    # Calculate the median along the specified axis
    median = np.nanmedian(values, axis=axis)

    # Re-insert the reduced axis so the median lines up with the data it was computed from;
    # subtracting the reduced median directly only broadcasts correctly for axis=None/axis=0.
    center = median if axis is None else np.expand_dims(median, axis)

    # Calculate the absolute deviations from the median
    abs_deviation = np.abs(values - center)

    # Calculate the median of the absolute deviations
    MAD = np.nanmedian(abs_deviation, axis=axis)

    lower_range = median - threshold * MAD
    upper_range = median + threshold * MAD

    return lower_range, upper_range, median

## Outlier analysis for left zone

In [3]:
# Read the left-zone train/test table from parquet and preview its first rows.
left_zone_data = '../../Eff_Precip_Model_Run/left_zone/Model_csv/left_zone_train_test_v2.parquet'
left_zone_df = pd.read_parquet(path=left_zone_data)
left_zone_df.head(n=5)

Unnamed: 0,Effective_precip_train,year,month,MODIS_Day_LST,MODIS_LAI,MODIS_NDVI,MODIS_NDWI,PRISM_Precip,PRISM_Tmax,PRISM_Tmin,...,GRIDMET_short_rad,DAYMET_sun_hr,Bulk_density,Clay_content,Field_capacity,Sand_content,DEM,Slope,Latitude,Longitude
0,1.210076,2016,1,289.670013,0.1125,0.243367,-0.051062,47.159,1.713,-4.602,...,66.880646,9.332632,148.5,20.666666,24.0,41.166668,815.0,7.437994,44.318386,-117.125038
1,1.628079,2016,1,289.670013,0.125,0.160589,0.033128,47.159,1.713,-4.602,...,65.903229,9.332632,148.666672,21.5,24.333334,41.833332,749.0,4.882878,44.318386,-117.105278
2,2.192818,2016,1,291.054993,0.125,0.160589,0.033128,43.838001,2.622,-5.876,...,65.903229,9.332632,148.166672,20.833334,24.5,41.333332,720.0,3.616556,44.318386,-117.085518
3,2.242957,2016,1,291.524994,0.1375,0.109289,0.021101,43.838001,2.622,-5.876,...,64.932259,9.332632,150.666672,23.0,24.666666,38.0,702.0,4.889362,44.318386,-117.06575
4,1.876381,2016,1,292.140015,0.0625,0.158664,-0.001627,45.709999,2.2,-5.52,...,64.932259,9.332632,151.833328,25.666666,24.333334,37.666668,698.0,6.122572,44.318386,-117.04599


In [4]:
# IQR-based ranges and median of the left-zone target column, with no manual widening.
lower_range, upper_range, median = calc_outlier_ranges_IQR(
    data=left_zone_df['Effective_precip_train'],
    axis=None,
    decrease_lower_range_by=None,
    increase_upper_range_by=None,
)

(lower_range, upper_range, median)

(0.0, 63.482608795166016, 15.313152313232422)

In [6]:
# MAD-based ranges and median of the left-zone target column (default threshold).
lower_range, upper_range, median = calc_outlier_ranges_MAD(
    data=left_zone_df['Effective_precip_train'],
    axis=None,
)

(lower_range, upper_range, median)

(-11.41236686706543, 42.03867149353027, 15.313152)

## Outlier analysis for 11 states

In [7]:
# Read the 11-state train/test table from parquet and preview its first rows.
data = '../../Eff_Precip_Model_Run/Model_csv/train_test_v2.parquet'
df = pd.read_parquet(path=data)
df.head(n=5)

Unnamed: 0,Effective_precip_train,year,month,MODIS_Day_LST,MODIS_LAI,MODIS_NDVI,MODIS_NDWI,PRISM_Precip,PRISM_Tmax,PRISM_Tmin,...,GRIDMET_short_rad,DAYMET_sun_hr,Bulk_density,Clay_content,Field_capacity,Sand_content,DEM,Slope,Latitude,Longitude
0,2.026721,2016,1,280.242737,0.571429,0.166787,0.090796,90.094002,0.89,-3.451,...,40.403225,10.416129,139.5,19.5,29.833334,37.5,619.0,11.438514,48.705757,-118.073662
1,3.328477,2016,1,263.649628,0.3,0.015139,0.616729,35.312,0.353,-5.647,...,48.587097,8.243287,147.833328,23.0,26.333334,39.833332,1110.0,6.684582,47.875713,-114.61515
2,3.404121,2016,1,267.554932,0.242857,0.015139,0.56755,35.761002,0.467,-5.784,...,48.587097,8.243287,150.166672,23.0,25.833334,40.166668,1081.0,5.887539,47.875713,-114.595383
3,4.490189,2016,1,267.554932,0.185714,0.005284,0.561565,35.761002,0.467,-5.784,...,48.470966,8.243287,147.0,21.833334,24.5,41.5,983.0,5.829579,47.875713,-114.575623
4,5.257507,2016,1,267.143799,0.185714,0.045994,0.561565,36.703999,0.484,-5.861,...,48.470966,8.243287,148.0,21.0,24.666666,42.5,1023.0,4.581026,47.875713,-114.555862


In [10]:
# IQR-based ranges and median of the 11-state target column, with no manual widening.
lower_range, upper_range, median = calc_outlier_ranges_IQR(
    data=df['Effective_precip_train'],
    axis=None,
    decrease_lower_range_by=None,
    increase_upper_range_by=None,
)

(lower_range, upper_range, median)

(0.0, 74.79463195800781, 18.899900436401367)

In [11]:
# MAD-based ranges and median of the 11-state target column (default threshold).
lower_range, upper_range, median = calc_outlier_ranges_MAD(
    data=df['Effective_precip_train'],
    axis=None,
)

(lower_range, upper_range, median)

(-13.734984397888184, 51.53478527069092, 18.8999)