In [366]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
from matplotlib import pyplot as plt
from datetime import date
from typing import Tuple, List

In [367]:
# Load seaborn's example "diamonds" dataset.
df = sns.load_dataset('diamonds')
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [368]:
# Keep only the float64/int64 columns; the categorical columns
# (cut, color, clarity) are dropped, as the output below shows.
# NOTE(review): this rebinds `df`, so the unfiltered frame is no longer
# reachable from later cells without reloading the dataset.
df = df.select_dtypes(include=['float64', 'int64'])
# Count missing values in each remaining column.
df.isnull().sum()

carat    0
depth    0
table    0
price    0
x        0
y        0
z        0
dtype: int64

In [369]:
# Preview the frame again to confirm which columns survived the dtype filter.
df.head()

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.2,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75


In [370]:
# Size of the frame as (rows, columns).
df.shape

(53940, 7)

### Implementation of the *Local Outlier Factor* algorithm

#### Distance metric definitions

In [371]:
def euclidean_distance(x, y, axis=0):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    The element-wise difference is squared, summed along ``axis`` and the
    square root of that sum is returned.

    Parameters
    ----------
    x, y : operands supporting ``-`` and ``**`` (e.g. numpy arrays)
    axis : axis passed to ``np.sum``; defaults to 0
    """
    delta = x - y
    squared = delta ** 2
    total = np.sum(squared, axis=axis)
    return np.sqrt(total)

def manhattan_distance(x, y, axis=0):
    """Return the Manhattan (L1) distance between ``x`` and ``y``.

    The absolute element-wise differences are summed along ``axis``.

    Parameters
    ----------
    x, y : operands supporting ``-`` (e.g. numpy arrays)
    axis : axis passed to ``np.sum``; defaults to 0
    """
    gaps = np.abs(x - y)
    total = np.sum(gaps, axis=axis)
    return total

#### K-Distance & K-Neighbors

In [373]:
def k_neighbors(point: np.ndarray, neighbors: np.ndarray, k: int, distance=euclidean_distance):
    """Find the k-distance of ``point`` and the candidates within that distance.

    Every element of ``neighbors`` is scored with ``distance(point, element)``
    and the candidates are ranked by ascending distance. The k-distance is the
    distance to the k-th closest candidate.

    Parameters
    ----------
    point : the reference point.
    neighbors : iterable of candidate points. Candidates are not filtered, so
        if ``point`` itself is among them it is ranked like any other
        candidate (at distance 0 for a metric where d(p, p) == 0).
    k : rank of the neighbor that defines the k-distance; must be >= 1.
    distance : callable ``distance(point, candidate)`` returning a sortable
        value; defaults to ``euclidean_distance``.

    Returns
    -------
    (k_distance, within)
        ``k_distance`` is the distance to the k-th closest candidate and
        ``within`` is an array of all candidates whose distance is
        <= ``k_distance``, ordered by ascending distance. Ties at the
        k-distance are all kept, so ``within`` may hold more than ``k`` items.
        ``(None, None)`` is returned when there are fewer than ``k`` candidates.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # Without this guard, k <= 0 would turn ``ranked[k - 1]`` into a negative
    # index and silently pick a distance from the far end of the ranking.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # sorted() is stable, so equally distant candidates keep their input order.
    ranked = sorted(
        ((candidate, distance(point, candidate)) for candidate in neighbors),
        key=lambda pair: pair[1],
    )
    if len(ranked) < k:
        return None, None

    k_distance = ranked[k - 1][1]
    within = np.array([candidate for candidate, dist in ranked if dist <= k_distance])
    return k_distance, within