In [1]:
import numpy as np

def detect_outliers_zscore(data, threshold=3):
    """Locate values whose absolute z-score is greater than ``threshold``.

    The z-score of a value is ``(value - mean) / std`` where ``mean`` and
    ``std`` (population standard deviation, ``np.std`` default) are computed
    over all of ``data``.

    Parameters
    ----------
    data : array-like
        Numeric sample. Lists, tuples and NumPy arrays are accepted.
    threshold : float, default 3
        A value is reported when ``abs(z) > threshold``.

    Returns
    -------
    tuple of numpy.ndarray
        The ``np.where`` result: one index array per dimension of ``data``
        (a 1-tuple for one-dimensional input).

    Notes
    -----
    Empty input and input with zero spread (all values equal) yield no
    outliers; the division by zero that would otherwise produce NaN z-scores
    and RuntimeWarnings is skipped.
    """
    values = np.asarray(data)

    if values.size == 0:
        flagged = np.zeros(values.shape, dtype=bool)
    else:
        center = np.mean(values)
        spread = np.std(values)
        if spread == 0:
            # Every value equals the mean, so nothing can be an outlier.
            flagged = np.zeros(values.shape, dtype=bool)
        else:
            # Vectorized: one array expression instead of a Python-level loop.
            flagged = np.abs((values - center) / spread) > threshold

    print('ran')
    return np.where(flagged)


In [2]:
def detect_outliers_iqr(data):
    """Locate values lying outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``data`` and
    ``IQR = Q3 - Q1``. Values strictly below the lower fence or strictly
    above the upper fence are reported.

    Parameters
    ----------
    data : array-like
        Numeric sample. Lists, tuples and NumPy arrays are accepted.

    Returns
    -------
    tuple of numpy.ndarray
        The ``np.where`` result: one index array per dimension of ``data``
        (a 1-tuple for one-dimensional input). Empty input yields empty
        index arrays.
    """
    # Coerce first so that plain Python sequences support the element-wise
    # comparisons below.
    values = np.asarray(data)

    if values.size == 0:
        outlier_indices = np.where(np.zeros(values.shape, dtype=bool))
    else:
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        outlier_indices = np.where((values < lower_fence) | (values > upper_fence))

    print('ran')
    return outlier_indices


In [3]:
from sklearn.ensemble import IsolationForest
import numpy as np

def detect_outliers_isolation_forest(data, contamination='auto', random_state=42):
    """Locate samples that an Isolation Forest labels as outliers.

    Parameters
    ----------
    data : array-like
        Samples to score. One-dimensional input is treated as a single
        feature and reshaped to a column. Lists, tuples and pandas objects
        are converted to NumPy arrays first.
    contamination : 'auto' or float, default 'auto'
        Forwarded to ``IsolationForest``.
    random_state : int or None, default 42
        Forwarded to ``IsolationForest``; the default keeps runs repeatable.

    Returns
    -------
    tuple of numpy.ndarray
        ``np.where`` result holding the positions whose prediction is ``-1``.
    """
    # Objects without an ndarray-style ``reshape`` (lists, tuples, current
    # pandas Series/DataFrame) would fail below, so convert them up front.
    # Anything that already supports ``reshape`` is passed through untouched.
    if not hasattr(data, 'reshape'):
        data = np.asarray(data)

    # A flat vector is a single feature: make it an (n_samples, 1) column.
    if data.ndim == 1:
        data = data.reshape(-1, 1)

    clf = IsolationForest(random_state=random_state, contamination=contamination)
    preds = clf.fit_predict(data)

    # The code treats a prediction of -1 as "outlier".
    outlier_indices = np.where(preds == -1)
    print('ran')
    return outlier_indices


In [4]:
from sklearn.neighbors import LocalOutlierFactor

def detect_outliers_lof(data, n_neighbors=20, contamination='auto'):
    """Locate samples that Local Outlier Factor labels as outliers.

    Parameters
    ----------
    data : array-like
        Samples to score. One-dimensional input is treated as a single
        feature and reshaped to a column. Lists, tuples and pandas objects
        are converted to NumPy arrays first.
    n_neighbors : int, default 20
        Forwarded to ``LocalOutlierFactor``.
    contamination : 'auto' or float, default 'auto'
        Forwarded to ``LocalOutlierFactor``.

    Returns
    -------
    tuple of numpy.ndarray
        ``np.where`` result holding the positions whose prediction is ``-1``.
    """
    # Objects without an ndarray-style ``reshape`` (lists, tuples, current
    # pandas Series/DataFrame) would fail below, so convert them up front.
    # Anything that already supports ``reshape`` is passed through untouched.
    if not hasattr(data, 'reshape'):
        data = np.asarray(data)

    # A flat vector is a single feature: make it an (n_samples, 1) column.
    if data.ndim == 1:
        data = data.reshape(-1, 1)

    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    preds = lof.fit_predict(data)

    # The code treats a prediction of -1 as "outlier".
    outlier_indices = np.where(preds == -1)
    print('ran')
    return outlier_indices


In [5]:
from sklearn.cluster import DBSCAN

def detect_outliers_dbscan(data, eps=0.5, min_samples=5):
    """Locate samples that DBSCAN leaves unclustered (label ``-1``).

    Parameters
    ----------
    data : array-like
        Samples to cluster. One-dimensional input is treated as a single
        feature and reshaped to a column. Lists, tuples and pandas objects
        are converted to NumPy arrays first.
    eps : float, default 0.5
        Forwarded to ``DBSCAN``.
    min_samples : int, default 5
        Forwarded to ``DBSCAN``.

    Returns
    -------
    tuple of numpy.ndarray
        ``np.where`` result holding the positions whose label is ``-1``.
    """
    # Objects without an ndarray-style ``reshape`` (lists, tuples, current
    # pandas Series/DataFrame) would fail below, so convert them up front.
    # Anything that already supports ``reshape`` is passed through untouched.
    if not hasattr(data, 'reshape'):
        data = np.asarray(data)

    # A flat vector is a single feature: make it an (n_samples, 1) column.
    if data.ndim == 1:
        data = data.reshape(-1, 1)

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    preds = dbscan.fit_predict(data)

    # The code treats a label of -1 as "outlier".
    outlier_indices = np.where(preds == -1)
    print('ran')
    return outlier_indices


In [6]:
from sklearn.svm import OneClassSVM

def detect_outliers_one_class_svm(data, nu=0.05, gamma='scale'):
    """Locate samples that an RBF One-Class SVM labels as outliers.

    Parameters
    ----------
    data : array-like
        Samples to score. One-dimensional input is treated as a single
        feature and reshaped to a column. Lists, tuples and pandas objects
        are converted to NumPy arrays first.
    nu : float, default 0.05
        Forwarded to ``OneClassSVM``.
    gamma : 'scale', 'auto' or float, default 'scale'
        Forwarded to ``OneClassSVM``.

    Returns
    -------
    tuple of numpy.ndarray
        ``np.where`` result holding the positions whose prediction is ``-1``.
    """
    # Objects without an ndarray-style ``reshape`` (lists, tuples, current
    # pandas Series/DataFrame) would fail below, so convert them up front.
    # Anything that already supports ``reshape`` is passed through untouched.
    if not hasattr(data, 'reshape'):
        data = np.asarray(data)

    # A flat vector is a single feature: make it an (n_samples, 1) column.
    if data.ndim == 1:
        data = data.reshape(-1, 1)

    oc_svm = OneClassSVM(nu=nu, kernel="rbf", gamma=gamma)
    preds = oc_svm.fit_predict(data)

    # The code treats a prediction of -1 as "outlier".
    outlier_indices = np.where(preds == -1)
    print('ran')
    return outlier_indices


In [7]:
import pandas as pd
import os
from pync import Notifier

# Load the merged dataset from a path relative to the notebook's working directory.
df = pd.read_csv('../csv/merged_data.csv')

# Run each enabled detector on the 'aqi' column. `.values` hands the detectors
# a NumPy array, which is what their `ndim` / `reshape` handling expects.
target_feature = df['aqi'].values
outliers_zscore = detect_outliers_zscore(target_feature)
outliers_iqr = detect_outliers_iqr(target_feature)
outliers_isolation_forest = detect_outliers_isolation_forest(target_feature)
outliers_lof = detect_outliers_lof(target_feature)
# DBSCAN and One-Class SVM are currently disabled (reason not recorded here).
# outliers_dbscan = detect_outliers_dbscan(target_feature)
# outliers_oneclass_svm =detect_outliers_one_class_svm(target_feature)
# Each result is the tuple returned by np.where, so it prints as `(array([...]),)`.
print("Outliers detected using Z-Score:", outliers_zscore)
print("Outliers detected using IQR:", outliers_iqr)
print("Outliers detected using Isolation Forest:", outliers_isolation_forest)
print("Outliers detected using Local Outlier Factor:", outliers_lof)
# print("Outliers detected using DBScan:", outliers_dbscan)
# print("Outliers detected using One Class SVM:", outliers_oneclass_svm)


# Signal completion with a desktop notification and a spoken message.
# NOTE(review): pync and the `say` command look macOS-specific — confirm this
# cell is only expected to run on macOS.
Notifier.notify('Your notebook has finished running.', title='Notification')
os.system('say "Your notebook has finished running."')


  df = pd.read_csv('../csv/merged_data.csv')


ran
ran
ran
ran
Outliers detected using Z-Score: (array([    279,     284,     478, ..., 1028593, 1028594, 1028595]),)
Outliers detected using IQR: (array([    265,     279,     284, ..., 1028593, 1028594, 1028595]),)
Outliers detected using Isolation Forest: (array([      6,       8,       9, ..., 1028655, 1028656, 1028657]),)
Outliers detected using Local Outlier Factor: (array([   1500,    7363,    7366,    8644,   25629,   43467,   48730,
         58678,   65105,   89628,  103066,  111360,  111368,  111376,
        111408,  151073,  151355,  164550,  165315,  171252,  193863,
        222778,  225726,  225730,  225747,  264820,  295326,  366827,
        390001,  435521,  435531,  474857,  474899,  494973,  494974,
        524459,  606726,  972803,  972809,  972815, 1023843, 1023848,
       1024283, 1024287, 1024348, 1024384]),)


0