In [1]:
import pandas as pd
import numpy as np 


# Machine Learning Tutorial: 
import sklearn
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

In [2]:
# Twelve quarterly observations, 2019Q1 through 2021Q4. The last value (16.9)
# is roughly ten times larger than the others.
q = pd.Series(
    data=[1.65, 1.45, 1.55, 1.65, 1.75, 1.80, 1.47, 1.32, 1.76, 1.85, 1.68, 16.9],
    index=pd.period_range(start='2019-01-01', periods=12, freq='Q'),
)
# Wrapping an unnamed Series yields a one-column frame whose column label is 0.
df = pd.DataFrame(data=q)

In [3]:
# Replace the frame's column labels in place; the list must have exactly one
# entry per existing column.
df.columns = ['AUM (Millions)']

In [4]:
df  # bare last expression: the notebook renders the frame as this cell's output

Unnamed: 0,AUM (Millions)
2019Q1,1.65
2019Q2,1.45
2019Q3,1.55
2019Q4,1.65
2020Q1,1.75
2020Q2,1.8
2020Q3,1.47
2020Q4,1.32
2021Q1,1.76
2021Q2,1.85


In [5]:
def anomaly_detection (df):
    """Print the row positions of outliers in every column of ``df``.

    Two detectors are run per column and each prints one line:

    1. Z-score: rows whose value lies more than 3 population standard
       deviations away from the column mean (either side).
    2. IQR: rows outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    None
        Results are printed as the tuple returned by ``np.where``; nothing is
        returned and ``df`` is not modified.

    Notes
    -----
    A later cell in this notebook defines another function with the same
    name, which replaces this one once that cell runs.
    """
    for column in df:
        values = df[column]

        # Z-Score. np.std defaults to ddof=0 (population standard deviation).
        stdev = np.std(values)
        mean = values.mean()
        z_score = (values - mean) / stdev
        # Two-sided test so unusually low values are reported as well; a
        # constant column gives NaN z-scores, which never compare as outliers.
        print(f'{np.where(z_score.abs() > 3)}')

        # IQR
        percentile25 = values.quantile(0.25)
        percentile75 = values.quantile(0.75)
        iqr = percentile75 - percentile25
        upper_limit = percentile75 + 1.5 * iqr
        lower_limit = percentile25 - 1.5 * iqr
        print(f'{np.where((values > upper_limit) | (values < lower_limit))}')

In [16]:
def anomaly_detection (df, threshold=3.0):
    """Return ``df`` without the rows flagged as z-score outliers.

    A row is dropped when, in at least one numeric column, its value lies more
    than ``threshold`` population standard deviations (ddof=0) from that
    column's mean, in either direction.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    threshold : float, default 3.0
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not outliers in any numeric column, with
        all original columns retained.

    Notes
    -----
    * Every numeric column is checked. Returning from inside the column loop
      would stop after the first column.
    * Non-numeric columns are ignored rather than raising.
    * A constant column has zero standard deviation, so its z-scores are NaN;
      NaN never exceeds the threshold, so such a column flags nothing.
    * With n observations and ddof=0 the largest possible |z| is sqrt(n - 1),
      so fewer than 11 rows can never exceed the default threshold of 3.
    """
    numeric = df.select_dtypes(include='number')
    z_scores = (numeric - numeric.mean()) / numeric.std(ddof=0)
    # True for each row that is an outlier in at least one numeric column.
    is_outlier = (z_scores.abs() > threshold).any(axis=1)
    return df[~is_outlier]

In [17]:
anomaly_detection(df)


Unnamed: 0,AUM (Millions)
2019Q1,1.65
2019Q2,1.45
2019Q3,1.55
2019Q4,1.65
2020Q1,1.75
2020Q2,1.8
2020Q3,1.47
2020Q4,1.32
2021Q1,1.76
2021Q2,1.85


In [59]:
# Z-score method: standardise the AUM column and flag values whose score is
# above 3. np.std defaults to ddof=0, i.e. the population standard deviation.
stdev = np.std(df['AUM (Millions)'])
mean = df['AUM (Millions)'].mean()
# Note: this adds a 'Z-Score' column to df in place.
df['Z-Score'] = df['AUM (Millions)'].sub(mean).div(stdev)
# Boolean mask aligned with df's index: True where the row is an outlier.
outliers_z = df['Z-Score'].gt(3)
# Integer positions of the outliers in the original dataframe.
print(np.where(outliers_z))

2019Q1    False
2019Q2    False
2019Q3    False
2019Q4    False
2020Q1    False
2020Q2    False
2020Q3    False
2020Q4    False
2021Q1    False
2021Q2    False
2021Q3    False
2021Q4     True
Freq: Q-DEC, Name: Z-Score, dtype: bool

In [61]:
# IQR method: a value is an outlier when it falls outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
percentile25, percentile75 = df['AUM (Millions)'].quantile([0.25, 0.75])
iqr = percentile75 - percentile25
lower_limit = percentile25 - 1.5 * iqr
upper_limit = percentile75 + 1.5 * iqr
# NOTE(review): this is a one-element list holding the boolean mask, not the
# mask itself - confirm whether the list wrapper is intentional.
outliers_iqr = [df['AUM (Millions)'].gt(upper_limit) | df['AUM (Millions)'].lt(lower_limit)]
# Integer positions of the outliers in the original dataframe.
print(np.where(outliers_iqr[0]))


(array([11], dtype=int64),)


In [73]:
# Scale and normalize.
# X was never assigned in the notebook (its only assignment was commented
# out), so this cell raised NameError on a fresh kernel. Presumably X is the
# AUM column as a 1-D array - later cells index it by position and compare its
# elements against df['AUM (Millions)'].
X = df['AUM (Millions)'].values
scaler = StandardScaler()
# StandardScaler expects a 2-D (n_samples, n_features) array.
X_s = scaler.fit_transform(X.reshape(-1, 1))
# NOTE(review): normalize() rescales each ROW to unit L2 norm by default. With
# a single feature every row collapses to +1 or -1 (0 stays 0), so only the
# sign of the standardised value reaches the later steps - confirm this is
# the intended preprocessing.
X_norm = pd.DataFrame(normalize(X_s))

In [82]:
# Project the normalized data onto a single principal component and label the
# resulting columns P1, P2, ... in order.
pca = PCA(n_components=1)
X_reduce = pd.DataFrame(pca.fit_transform(X_norm))
X_reduce.columns = [f'P{position + 1}' for position in range(X_reduce.shape[1])]

In [83]:
# Density-based clustering of the reduced data.
db_model = DBSCAN(eps=0.05, min_samples=10)
db_model.fit(X_reduce)
# One cluster label per sample.
labels = db_model.labels_

In [85]:
# DBSCAN labels noise points -1. Only subtract the noise label from the number
# of distinct labels when it is actually present; subtracting 1
# unconditionally undercounts the clusters whenever there is no noise.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
# Number of samples labelled as noise.
anomaly = list(labels).count(-1)
print(f'Clusters: {n_clusters}')
print(f'Abnormal points: {anomaly}')

Clusters: 1
Abnormal points: 1


In [112]:
# Pick the original values at the positions whose cluster label is -1.
X_anomaly = X[np.where(labels == -1)[0]]
print(X_anomaly)
# Check for the anomaly in the original dataframe. Only the first anomalous
# value is looked up; indexing [0] raises IndexError if none were found.
print(np.where(df['AUM (Millions)'].eq(X_anomaly[0])))

[16.9]
(array([11], dtype=int64),)


In [32]:
import requests
import numpy as np
import pandas as pd

# Fetch the challenge payload. requests has no default timeout, so one is set
# explicitly to stop the cell hanging indefinitely on an unresponsive server.
r = requests.get('https://coderbyte.com/api/challenges/json/list-numbers', timeout=10)
# Raise on 4xx/5xx here instead of failing later while parsing an error body.
r.raise_for_status()
data = r.json()['data']

type(data)


list

In [38]:
# Edge list: each entry is a "start-end" pair of node labels.
# strArr is already a list, so it is iterated directly; there is no
# comma-separated string to split.
strArr = [
    "b-e",
    "b-c",
    "c-d",
    "a-b",
    "e-f",
]

In [44]:
for edge in strArr:
    # Splitting on ',' wraps the edge in a one-element list unless it contains a comma.
    paths = edge.split(',')
    print(paths)
    # Each edge is "start-end"; unpacking fails unless there is exactly one '-'.
    start, end = edge.split("-")
    print(f"{start} {end}")


['b-e']
b e
['b-c']
b c
['c-d']
c d
['a-b']
a b
['e-f']
e f


('e', 'f', ['e-f'])

In [None]:
# An edge connects to another when it ends where the other begins, or begins where the other ends.
