In [1]:
import os
import random
import pandas as pd
import numpy as np
from numpy import quantile, where, random
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.cluster import DBSCAN, OPTICS
from sklearn.ensemble import IsolationForest
import hdbscan
pd.set_option('display.max_rows', None)

In [2]:
## SET RANDOM STATE
# NOTE: in this notebook the bare name `random` refers to numpy.random, because
# `from numpy import quantile, where, random` runs after `import random` and
# shadows the standard-library module. Seed numpy's global generator through
# `np.random` so it is obvious which RNG is being seeded.
SEED = 39
np.random.seed(SEED)

# Metric used when a model is fitted directly on an abundance (OTU) table.
distance_metric = 'braycurtis'
# Fraction used by the detectors below, either as `contamination` or as the
# upper-quantile cut-off (1 - outlier_value) on their scores.
outlier_value = 0.15

In [3]:
def set_initial_results_dataframe(csv_file_path):
    """Create the empty results table for one input file.

    The CSV at ``csv_file_path`` is read with its first column as the index.
    The returned DataFrame carries that same index and a single ``'id'``
    column holding the index values; no other column of the CSV is copied.
    """
    sample_index = pd.read_csv(csv_file_path, index_col=[0]).index
    return pd.DataFrame({'id': sample_index}, index=sample_index)

def set_anomaly_by_id(row, outlier_list):
    """Label a row according to whether its ``id`` is in ``outlier_list``.

    Returns ``'anomaly'`` when ``row.id`` is contained in ``outlier_list``
    and ``'notanomaly'`` otherwise.
    """
    return 'anomaly' if row.id in outlier_list else 'notanomaly'

def set_anomaly_by_score(row):
    """Translate a row's ``'anomaly_score'`` entry into a text label.

    A score equal to ``-1`` yields ``'anomaly'``; every other value yields
    ``'notanomaly'``.
    """
    is_outlier = row['anomaly_score'] == -1
    return 'anomaly' if is_outlier else 'notanomaly'

In [4]:
# n_neighbors = number of rows in the input file (NOT the scikit-learn default)
# default distance metric = braycurtis
# contamination = outlier_value
def run_LOF(csv_file_path, results_df, distance=distance_metric):
    """Flag outliers with Local Outlier Factor and record them in ``results_df``.

    Parameters
    ----------
    csv_file_path : str
        CSV file read with its first column as the index. Its values are
        handed to the model as-is: with ``metric='braycurtis'`` when
        ``distance`` is ``'braycurtis'``, and with ``metric='precomputed'``
        for any other value of ``distance``.
    results_df : pandas.DataFrame
        Results table; one column is added to it (the frame is modified in
        place and also returned).
    distance : str
        ``'braycurtis'`` or a label for the precomputed matrix. It is also
        the suffix of the new column's name.

    Returns
    -------
    pandas.DataFrame
        ``results_df`` with an extra ``LOF_<tokens>_<distance>`` column
        holding ``'anomaly'`` / ``'notanomaly'``.
    """
    temp_df = pd.read_csv(csv_file_path, index_col=[0])
    diststr = distance if distance == 'braycurtis' else 'precomputed'

    # NOTE(review): n_neighbors is set to the row count; scikit-learn documents
    # that all samples are used when n_neighbors exceeds what is available --
    # confirm this "use every other sample" neighbourhood is intended.
    model = LocalOutlierFactor(metric=diststr, contamination=outlier_value, n_neighbors=len(temp_df))
    predictions = model.fit_predict(temp_df.values)

    # -1 marks an outlier (same rule as set_anomaly_by_score), applied to the
    # whole prediction vector at once instead of row by row over the table.
    labels = pd.Series(np.where(predictions == -1, 'anomaly', 'notanomaly'), index=temp_df.index)

    # Column name: 2nd and 3rd '_'-separated tokens of the file name + distance label.
    column_name = 'LOF_' + '_'.join(csv_file_path.split('/')[-1].split('_')[1:3]) + f'_{distance}'
    # Series assignment aligns the labels with results_df on the index.
    results_df[column_name] = labels

    return results_df

In [5]:
# n_neighbors = default
# default distance metric = braycurtis
# outlier_value = 0.15
def run_KNN(csv_file_path, results_df, distance=distance_metric):
    """Flag outliers by mean nearest-neighbour distance and record them.

    The CSV is read with its first column as the index and fitted with
    ``NearestNeighbors`` (default neighbour count), using
    ``metric='braycurtis'`` when ``distance`` is ``'braycurtis'`` and
    ``metric='precomputed'`` otherwise. Each row's score is the mean of the
    distances returned by ``kneighbors`` for that row; rows whose score is
    strictly above the ``1 - outlier_value`` quantile are labelled
    ``'anomaly'`` in a new ``KNN_<tokens>_<distance>`` column of
    ``results_df``, all others ``'notanomaly'``. Returns ``results_df``.
    """
    samples = pd.read_csv(csv_file_path, index_col=[0])
    metric_name = 'braycurtis' if distance == 'braycurtis' else 'precomputed'

    knn = NearestNeighbors(metric=metric_name)
    knn.fit(samples.values)
    # The fitted rows themselves are used as the query points.
    neighbor_distances, _ = knn.kneighbors(samples.values)

    mean_distance = neighbor_distances.mean(axis=1)
    cutoff = pd.Series(mean_distance).quantile(1 - outlier_value)
    # Strict '>' : a score exactly on the cut-off is not flagged.
    flagged_ids = samples.index.values[mean_distance > cutoff].tolist()

    name_tokens = csv_file_path.split('/')[-1].split('_')[1:3]
    column_name = 'KNN_' + '_'.join(name_tokens) + f'_{distance}'
    results_df[column_name] = results_df.apply(set_anomaly_by_id, axis=1, outlier_list=flagged_ids)

    return results_df

In [232]:
# default distance metric = braycurtis
def run_OPTICS(csv_file_path, results_df, distance=distance_metric):
    """Flag outliers from OPTICS core distances and record them.

    The CSV is read with its first column as the index and fitted with
    ``OPTICS`` using ``metric='braycurtis'`` when ``distance`` is
    ``'braycurtis'`` and ``metric='precomputed'`` otherwise. Rows whose
    ``core_distances_`` value is greater than or equal to the
    ``1 - outlier_value`` quantile are labelled ``'anomaly'`` in a new
    ``OPTICS_<tokens>_<distance>`` column of ``results_df``, all others
    ``'notanomaly'``. Returns ``results_df``.
    """
    samples = pd.read_csv(csv_file_path, index_col=[0])
    metric_name = 'braycurtis' if distance == 'braycurtis' else 'precomputed'

    optics = OPTICS(metric=metric_name)
    optics.fit(samples.values)

    core_distance = optics.core_distances_
    cutoff = np.quantile(core_distance, 1 - outlier_value)
    # Inclusive '>=' : a score exactly on the cut-off IS flagged here.
    flagged_ids = samples.index.values[core_distance >= cutoff].tolist()

    name_tokens = csv_file_path.split('/')[-1].split('_')[1:3]
    column_name = 'OPTICS_' + '_'.join(name_tokens) + f'_{distance}'
    results_df[column_name] = results_df.apply(set_anomaly_by_id, axis=1, outlier_list=flagged_ids)

    return results_df

In [233]:
# cluster_selection_method = leaf
# default distance metric = braycurtis
def run_HDBScan(csv_file_path, results_df, distance=distance_metric):
    """Flag outliers from HDBSCAN outlier scores and record them.

    The CSV is read with its first column as the index and fitted with
    ``hdbscan.HDBSCAN`` (``cluster_selection_method='leaf'``,
    ``allow_single_cluster=True``) using ``metric='braycurtis'`` when
    ``distance`` is ``'braycurtis'`` and ``metric='precomputed'`` otherwise.
    Rows whose ``outlier_scores_`` value is strictly above the
    ``1 - outlier_value`` quantile are labelled ``'anomaly'`` in a new
    ``HDBSCAN_<tokens>_<distance>`` column of ``results_df``, all others
    ``'notanomaly'``. Returns ``results_df``.
    """
    samples = pd.read_csv(csv_file_path, index_col=[0])
    metric_name = 'braycurtis' if distance == 'braycurtis' else 'precomputed'

    clusterer = hdbscan.HDBSCAN(
        metric=metric_name,
        cluster_selection_method='leaf',
        allow_single_cluster=True,
    )
    clusterer.fit(samples.values)

    cutoff = pd.Series(clusterer.outlier_scores_).quantile(1 - outlier_value)
    # Strict '>' : a score exactly on the cut-off is not flagged.
    flagged_positions = np.where(clusterer.outlier_scores_ > cutoff)[0]
    flagged_ids = samples.index.values[flagged_positions].tolist()

    name_tokens = csv_file_path.split('/')[-1].split('_')[1:3]
    column_name = 'HDBSCAN_' + '_'.join(name_tokens) + f'_{distance}'
    results_df[column_name] = results_df.apply(set_anomaly_by_id, axis=1, outlier_list=flagged_ids)

    return results_df

In [151]:
# number estimators = 200
# max samples = auto
# contamination = outlier_value
def run_IsolationForest(csv_file_path, results_df, random_state=None):
    """Flag outliers with an Isolation Forest and record them in ``results_df``.

    Parameters
    ----------
    csv_file_path : str
        CSV file read with its first column as the index; its values are
        the model's input.
    results_df : pandas.DataFrame
        Results table; one column is added to it (the frame is modified in
        place and also returned).
    random_state : int, RandomState instance or None, optional
        Forwarded to ``IsolationForest``. The default ``None`` keeps the
        previous behaviour (the estimator draws from numpy's global random
        state, so results depend on when that state was last seeded); pass
        an int to make a single call reproducible on its own.

    Returns
    -------
    pandas.DataFrame
        ``results_df`` with an extra ``ISOLATIONF_<tokens>`` column holding
        ``'anomaly'`` / ``'notanomaly'``.
    """
    temp_df = pd.read_csv(csv_file_path, index_col=[0])
    model = IsolationForest(n_estimators=200, contamination=outlier_value, random_state=random_state)
    predictions = model.fit_predict(temp_df.values)

    # -1 marks an outlier (same rule as set_anomaly_by_score), applied to the
    # whole prediction vector at once instead of row by row over the table.
    labels = pd.Series(np.where(predictions == -1, 'anomaly', 'notanomaly'), index=temp_df.index)

    # Column name: 2nd and 3rd '_'-separated tokens of the file name.
    column_name = 'ISOLATIONF_' + '_'.join(csv_file_path.split('/')[-1].split('_')[1:3])
    # Series assignment aligns the labels with results_df on the index.
    results_df[column_name] = labels

    return results_df

In [234]:
study_name = "turnbaugh"
taxa_rank = "genus"
transformation = "clr"
distance_measure = "gunifrac"
# The file-matching loop below filters on `unifrac_measure`, while the model
# cells pass `distance_measure`. Define both in this cell so it neither fails
# with a NameError on a fresh kernel nor silently reuses the value left behind
# by another dataset's cell.
unifrac_measure = distance_measure

tempfiles_path = '/home/pedro/Desktop/Masters/4th Semester/Thesis/experiments/results/turnbaugh/genus_clr_gunifrac/tempfiles'
tempfiles = os.listdir(tempfiles_path)

# Substrings every file of this dataset/configuration must contain.
dataset_tokens = [study_name, taxa_rank, transformation, unifrac_measure]

for file in tempfiles:
    if not all(x in file for x in dataset_tokens):
        continue
    file_path = os.path.join(tempfiles_path, file)
    # Same precedence as before: class1 otus, class1 distances, class2 otus, class2 distances.
    if 'class1' in file and 'otus' in file:
        class1_otu = file_path
    elif 'class1' in file and 'distances' in file:
        class1_distances = file_path
    elif 'class2' in file and 'otus' in file:
        class2_otu = file_path
    elif 'class2' in file and 'distances' in file:
        class2_distances = file_path

In [6]:
study_name = "claesson"
taxa_rank = "genus"
transformation = "Log10"
unifrac_measure = "gunifrac"
# The model cells further down pass `distance_measure` to the run_* helpers,
# which this cell previously left undefined (or stale from another dataset's
# cell). Keep both names bound to the same value.
distance_measure = unifrac_measure

tempfiles_path = '/home/pedro/Desktop/Masters/4th Semester/Thesis/experiments/results/claesson/genus_Log10_gunifrac/tempfiles'
tempfiles = os.listdir(tempfiles_path)

# Substrings every file of this dataset/configuration must contain.
dataset_tokens = [study_name, taxa_rank, transformation, unifrac_measure]

for file in tempfiles:
    if not all(x in file for x in dataset_tokens):
        continue
    file_path = os.path.join(tempfiles_path, file)
    # Same precedence as before: class1 otus, class1 distances, class2 otus, class2 distances.
    if 'class1' in file and 'otus' in file:
        class1_otu = file_path
    elif 'class1' in file and 'distances' in file:
        class1_distances = file_path
    elif 'class2' in file and 'otus' in file:
        class2_otu = file_path
    elif 'class2' in file and 'distances' in file:
        class2_distances = file_path

In [178]:
study_name = "sokol"
taxa_rank = "genus"
transformation = "clr"
distance_measure = "va-wunifrac"
# The file-matching loop below filters on `unifrac_measure`, while the model
# cells pass `distance_measure`. Define both in this cell: a value left over
# from another dataset's cell (e.g. "gunifrac") would not match the
# va-wunifrac files in this directory.
unifrac_measure = distance_measure

tempfiles_path = '/home/pedro/Desktop/Masters/4th Semester/Thesis/experiments/results/sokol/genus_clr_va-wunifrac/tempfiles'
tempfiles = os.listdir(tempfiles_path)

# Substrings every file of this dataset/configuration must contain.
dataset_tokens = [study_name, taxa_rank, transformation, unifrac_measure]

for file in tempfiles:
    if not all(x in file for x in dataset_tokens):
        continue
    file_path = os.path.join(tempfiles_path, file)
    # Same precedence as before: class1 otus, class1 distances, class2 otus, class2 distances.
    if 'class1' in file and 'otus' in file:
        class1_otu = file_path
    elif 'class1' in file and 'distances' in file:
        class1_distances = file_path
    elif 'class2' in file and 'otus' in file:
        class2_otu = file_path
    elif 'class2' in file and 'distances' in file:
        class2_distances = file_path

In [7]:
# One fresh results table per class, each indexed by the first column of that class's OTU file.
class1_df_results, class2_df_results = (
    set_initial_results_dataframe(otu_path) for otu_path in (class1_otu, class2_otu)
)

In [241]:
# LOF
print('RUNNING LOCAL OUTLIER FACTOR MODEL ON CLASSES 1 AND 2')
# Per class: the OTU table is scored first (default metric), then the
# distance-matrix file, so the result columns are added in that order.
class1_df_results = run_LOF(
    class1_distances,
    run_LOF(class1_otu, class1_df_results),
    distance=distance_measure,
)
class2_df_results = run_LOF(
    class2_distances,
    run_LOF(class2_otu, class2_df_results),
    distance=distance_measure,
)

RUNNING LOCAL OUTLIER FACTOR MODEL ON CLASSES 1 AND 2


In [242]:
# KNN
print('RUNNING K-NEAREST NEIGHBORS MODEL ON CLASSES 1 AND 2')
# Per class: the OTU table is scored first (default metric), then the
# distance-matrix file, so the result columns are added in that order.
class1_df_results = run_KNN(
    class1_distances,
    run_KNN(class1_otu, class1_df_results),
    distance=distance_measure,
)
class2_df_results = run_KNN(
    class2_distances,
    run_KNN(class2_otu, class2_df_results),
    distance=distance_measure,
)

RUNNING K-NEAREST NEIGHBORS MODEL ON CLASSES 1 AND 2


In [243]:
# OPTICS
print('RUNNING OPTICS MODEL ON CLASSES 1 AND 2')
# Per class: the OTU table is scored first (default metric), then the
# distance-matrix file, so the result columns are added in that order.
class1_df_results = run_OPTICS(
    class1_distances,
    run_OPTICS(class1_otu, class1_df_results),
    distance=distance_measure,
)
class2_df_results = run_OPTICS(
    class2_distances,
    run_OPTICS(class2_otu, class2_df_results),
    distance=distance_measure,
)

RUNNING OPTICS MODEL ON CLASSES 1 AND 2


In [244]:
# HDBSCAN
print('RUNNING HDBSCAN MODEL ON CLASS 1 AND 2')
# Per class: the OTU table is scored first (default metric), then the
# distance-matrix file, so the result columns are added in that order.
class1_df_results = run_HDBScan(
    class1_distances,
    run_HDBScan(class1_otu, class1_df_results),
    distance=distance_measure,
)
class2_df_results = run_HDBScan(
    class2_distances,
    run_HDBScan(class2_otu, class2_df_results),
    distance=distance_measure,
)

RUNNING HDBSCAN MODEL ON CLASS 1 AND 2


In [245]:
# Isolation Forest
print('RUNNING ISOLATION FOREST MODEL ON CLASS 1 AND 2')
# Only the OTU tables are scored; run_IsolationForest takes no distance argument.
class1_df_results, class2_df_results = (
    run_IsolationForest(class1_otu, class1_df_results),
    run_IsolationForest(class2_otu, class2_df_results),
)

RUNNING ISOLATION FOREST MODEL ON CLASS 1 AND 2


In [247]:
# Display the class-1 results table: one row per sample id, one column per
# detector / metric combination run above.
class1_df_results

Unnamed: 0,id,LOF_genus_relative_braycurtis,LOF_genus_relative_gunifrac,KNN_genus_relative_braycurtis,KNN_genus_relative_gunifrac,OPTICS_genus_relative_braycurtis,OPTICS_genus_relative_gunifrac,HDBSCAN_genus_relative_braycurtis,HDBSCAN_genus_relative_gunifrac,ISOLATIONF_genus_relative
EMC04.441704,EMC04.441704,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly
EMC01.441661,EMC01.441661,notanomaly,anomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly
EMC09.441635,EMC09.441635,anomaly,anomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly
EMC05.441628,EMC05.441628,anomaly,notanomaly,notanomaly,anomaly,notanomaly,anomaly,notanomaly,anomaly,notanomaly
EMC07.441636,EMC07.441636,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,anomaly
EMC06.441610,EMC06.441610,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly
EMC08.441752,EMC08.441752,notanomaly,notanomaly,anomaly,notanomaly,anomaly,notanomaly,anomaly,notanomaly,notanomaly
EMC03.441713,EMC03.441713,notanomaly,notanomaly,anomaly,anomaly,anomaly,anomaly,anomaly,anomaly,anomaly
EMC02.441750,EMC02.441750,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly,notanomaly


In [12]:
# Scratch check: fit OPTICS directly on the class-2 distance file (treated as a
# precomputed distance matrix) and display the cluster label given to each row.
# NOTE(review): this binds the notebook-level names `temp_df`, `diststr` and
# `model` (the run_* helpers use the same names only as locals).
temp_df = pd.read_csv(class2_distances, index_col=[0])
diststr = 'precomputed'

model = OPTICS(metric=diststr)
model.fit_predict(temp_df.values)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])