In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## What are the criteria to identify an outlier?
- A data point that lies more than 1.5 times the interquartile range (IQR) above the 3rd quartile or below the 1st quartile.
- A data point that lies more than 3 standard deviations from the mean, i.e. whose z-score satisfies $|z| > 3$. A lower cut-off of 2 standard deviations is sometimes used instead.

## Various ways of finding the outlier.
- Using scatter plots
- Box plot
- Using the z-score
- Using the IQR (interquartile range)

In [2]:
# Small sample of 34 integers; three of them (102, 107, 108) are far larger
# than the rest and serve as the outliers to detect.
dataset = [
    11, 10, 12, 14, 12, 15, 14, 13, 15, 102,
    12, 14, 17, 19, 107, 10, 13, 12, 14, 12,
    108, 12, 11, 14, 13, 15, 10, 15, 12, 10,
    14, 13, 15, 10,
]

In [3]:
# Echo the raw sample so the unusually large values can be seen by eye.
dataset

[11,
 10,
 12,
 14,
 12,
 15,
 14,
 13,
 15,
 102,
 12,
 14,
 17,
 19,
 107,
 10,
 13,
 12,
 14,
 12,
 108,
 12,
 11,
 14,
 13,
 15,
 10,
 15,
 12,
 10,
 14,
 13,
 15,
 10]

## Detecting Outliers using Z score

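$$z = \frac{x - \mu}{\sigma}$$

where $x$ is a data point, $\mu$ is the mean of the dataset and $\sigma$ is its standard deviation.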

In [4]:
def detect_outliers(data, threshold=3):
    """Return the values in ``data`` whose absolute z-score exceeds ``threshold``.

    The z-score of a value is ``(value - mean) / sd``, where ``mean`` and
    ``sd`` are obtained from ``np.mean`` and ``np.std`` over ``data``.

    Parameters
    ----------
    data : iterable of numbers
        The sample to inspect.
    threshold : float, default 3
        A value is flagged when ``abs(z_score) > threshold``.

    Returns
    -------
    list
        The flagged values, in their original order. A fresh list is built on
        every call, so repeated calls (or re-running a notebook cell) never
        accumulate results from earlier calls.
    """
    values = list(data)
    if not values:
        # Nothing to inspect; avoids taking the mean of an empty sample.
        return []

    mean = np.mean(values)
    sd = np.std(values)
    if sd == 0:
        # Every value equals the mean, so no point can be an outlier and the
        # z-score division would be 0 / 0.
        return []

    return [v for v in values if np.abs((v - mean) / sd) > threshold]


In [5]:
# Run the z-score detector on the sample and keep the flagged values.
outliers_pt = detect_outliers(dataset)

In [6]:
# Values flagged by the z-score rule.
outliers_pt

[102, 107, 108]

## Interquartile Range (IQR)

The IQR is the difference between the 75th percentile (Q3) and the 25th percentile (Q1) of a dataset: $IQR = Q3 - Q1$.

Steps
1. Arrange the data in increasing order
2. Calculate first(q1) and third quartile(q3)
3. Find interquartile range (q3-q1)
4. Find the lower bound: q1 - 1.5 * IQR
5. Find the upper bound: q3 + 1.5 * IQR

Anything that lies outside of lower and upper bound is an outlier


In [13]:
# Step 1 of the procedure: arrange the values in increasing order.
# The name `dataset` is rebound to the new sorted list.
dataset = sorted(dataset)

In [14]:
# Step 2: first (25th percentile) and third (75th percentile) quartiles.
quantile1 = np.percentile(dataset, 25)
quantile3 = np.percentile(dataset, 75)

print(quantile1, quantile3)

12.0 15.0


In [29]:
# Step 3: the interquartile range is the spread between the two quartiles.
iqr = quantile3 - quantile1

# Steps 4-5: the fences sit 1.5 * IQR below Q1 and above Q3.
low_bound = quantile1 - (1.5 * iqr)
upp_bound = quantile3 + (1.5 * iqr)

print(upp_bound, low_bound)

19.5 7.5


In [30]:
# Keep every value that falls below the lower fence or above the upper fence.
outlier_iqr = [x for x in dataset if x < low_bound or x > upp_bound]

print(outlier_iqr)

[102, 107, 108]
