In [4]:
import pandas as pd
import numpy as np
# from warnings import warn

In [5]:
use_cols = [
    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',
    'Survived'
]


data = pd.read_csv('./data/titanic.csv', usecols=use_cols)
data.head(3)
print(data.shape)

(891, 6)


In [6]:
pd.Series(data.Fare.unique()).sort_values()
#data.Fare.unique()

104      0.0000
163      4.0125
245      5.0000
152      6.2375
240      6.4375
         ...   
164    227.5250
75     247.5208
148    262.3750
23     263.0000
127    512.3292
Length: 248, dtype: float64

### Detect by arbitrary boundary
#### identify outliers based on arbitrary boundaries

In [7]:
def outlier_detect_arbitrary(data,col,upper_fence,lower_fence):
    '''
    identify outliers based on arbitrary boundaries passed to the function.
    '''
    para = (upper_fence,lower_fence)
    tmp = pd.concat([data[col] > upper_fence, data[col] < lower_fence],axis = 1)
    outlier_index = tmp.any(axis = 1)
    print('Num of outliers detected: ',outlier_index.value_counts()[1])
    print('Proportion of outlier detected :',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [8]:
index,para = outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outliers detected:  19
Proportion of outlier detected : 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [9]:
# check the 19 found outliers
data.loc[index, 'Fare'].sort_values()

179      0.0000
806      0.0000
732      0.0000
674      0.0000
633      0.0000
597      0.0000
815      0.0000
466      0.0000
481      0.0000
302      0.0000
277      0.0000
271      0.0000
263      0.0000
413      0.0000
822      0.0000
378      4.0125
679    512.3292
737    512.3292
258    512.3292
Name: Fare, dtype: float64

### IQR method
#### outlier detection by Interquartile Ranges Rule

In [10]:
def outlier_detect_IQR(data,col,threshold = 3):
    '''
    outlier detection by Interquartile Ranges Rule, also known as Tukey's test. 
    calculate the IQR ( 75th quantile - 25th quantile) 
    and the 25th 75th quantile. 
    Any value beyond:
        upper bound = 75th quantile + （IQR * threshold）
        lower bound = 25th quantile - （IQR * threshold）   
    are regarded as outliers. Default threshold is 3.
    '''
    IQR = data[col].quantile(0.75) - data[col].quantile(0.25)
    lower_fence = data[col].quantile(0.25) - (IQR * threshold)
    upper_fence = data[col].quantile(0.75) + (IQR * threshold)
    para = (lower_fence, upper_fence)
    tmp = pd.concat([data[col] > upper_fence, data[col] < lower_fence],axis = 1)
    outlier_index = tmp.any(axis = 1)
    print('Num of outlier detected:',outlier_index.value_counts()[1])
    print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [11]:
index,para = outlier_detect_IQR(data=data,col='Fare',threshold=5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 31
Proportion of outlier detected 0.03479236812570146
Upper bound: -107.53760000000001 
Lower bound: 146.448


In [12]:
# check the 31 found outliers
data.loc[index,'Fare'].sort_values()

31     146.5208
195    146.5208
305    151.5500
708    151.5500
297    151.5500
498    151.5500
609    153.4625
332    153.4625
268    153.4625
318    164.8667
856    164.8667
730    211.3375
779    211.3375
689    211.3375
377    211.5000
527    221.7792
700    227.5250
716    227.5250
557    227.5250
380    227.5250
299    247.5208
118    247.5208
311    262.3750
742    262.3750
341    263.0000
88     263.0000
438    263.0000
27     263.0000
679    512.3292
258    512.3292
737    512.3292
Name: Fare, dtype: float64

### Mean and Standard Deviation Method
#### outlier detection by Mean and Standard Deviation Method.

In [15]:
def outlier_detect_mean_std(data,col,threshold = 3):
    '''
    outlier detection by Mean and Standard Deviation Method.
    If a value is a certain number(called threshold) of standard deviations away 
    from the mean, that data point is identified as an outlier. 
    Default threshold is 3.
    This method can fail to detect outliers because the outliers increase the standard deviation. 
    The more extreme the outlier, the more the standard deviation is affected.
    '''
    
    upper_fence = data[col].mean() + threshold * data[col].std()
    lower_fence = data[col].mean() - threshold * data[col].std()
    
    para = (upper_fence, lower_fence)
    tmp = pd.concat([data[col] > upper_fence, data[col] < lower_fence],axis = 1)
    outlier_index = tmp.any(axis = 1)
    print('Num of outlier detected:',outlier_index.value_counts()[1])
    print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [16]:
index,para = outlier_detect_mean_std(data=data,col='Fare',threshold=3)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 20
Proportion of outlier detected 0.02244668911335578
Upper bound: 181.2844937601173 
Lower bound: -116.87607782296804


In [17]:
# check the 20 found outliers
data.loc[index,'Fare'].sort_values()

779    211.3375
730    211.3375
689    211.3375
377    211.5000
527    221.7792
716    227.5250
700    227.5250
380    227.5250
557    227.5250
118    247.5208
299    247.5208
311    262.3750
742    262.3750
27     263.0000
341    263.0000
88     263.0000
438    263.0000
258    512.3292
737    512.3292
679    512.3292
Name: Fare, dtype: float64

### MAD method
#### outlier detection by Median and Median Absolute Deviation Method (MAD)

In [None]:
def outlier_detect_MAD(data,col,threshold=3.5):
    """
    outlier detection by Median and Median Absolute Deviation Method (MAD)
    The median of the residuals is calculated. Then, the difference is calculated between each historical value and this median. 
    These differences are expressed as their absolute values, and a new median is calculated and multiplied by 
    an empirically derived constant to yield the median absolute deviation (MAD). 
    If a value is a certain number of MAD away from the median of the residuals, 
    that value is classified as an outlier. The default threshold is 3 MAD.
    
    This method is generally more effective than the mean and standard deviation method for detecting outliers, 
    but it can be too aggressive in classifying values that are not really extremely different. 
    Also, if more than 50% of the data points have the same value, MAD is computed to be 0, 
    so any value different from the residual median is classified as an outlier.
    """
    

In [None]:
# too aggressive for our dataset, about 18% of cases are detected as outliers.
index = outlier_detect_MAD(data=data,col='Fare',threshold=3.5)