# (연구&서연) GODE – Ex2 kappa에 따른 퍼포먼스 실험

최규빈  
2024-08-14

ref:
https://seoyeonc.github.io/GODE_blog/posts/Result/3_table/2024-06-22-final_GODE_code_JKSS_review.html

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import random
import pickle

import warnings
warnings.simplefilter("ignore", np.ComplexWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
from haversine import haversine
from IPython.display import HTML
import plotly.graph_objects as go
import copy 

import tqdm
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector 

from pygsp import graphs, filters, plotting, utils

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_curve, auc

In [9]:
class Conf_matrx:
    def __init__(self,original,compare):
        self.original = original
        self.compare = compare
    def conf(self,name):
        self.name = name
        self.conf_matrix = confusion_matrix(self.original, self.compare)
        
        # fig, ax = plt.subplots(figsize=(5, 5))
        # ax.matshow(self.conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
        # for i in range(self.conf_matrix.shape[0]):
        #     for j in range(self.conf_matrix.shape[1]):
        #         ax.text(x=j, y=i,s=self.conf_matrix[i, j], va='center', ha='center', size='xx-large')
        # plt.xlabel('Predictions', fontsize=18)
        # plt.ylabel('Actuals', fontsize=18)
        # plt.title('Confusion Matrix of ' + str(name), fontsize=18)
        # plt.show()
        
        self.acc = accuracy_score(self.original, self.compare)
        self.pre = precision_score(self.original, self.compare)
        self.rec = recall_score(self.original, self.compare)
        self.f1 = f1_score(self.original, self.compare)
        
        # print('Accuracy: %.3f' % self.acc)
        # print('Precision: %.3f' % self.pre)
        # print('Recall: %.3f' % self.rec)
        # print('F1 Score: %.3f' % self.f1)

In [10]:
class Orbit:
    def __init__(self,df):
        self.df = df 
        self.f = df.f.to_numpy()
        self.x = df.x.to_numpy()
        self.y = df.y.to_numpy()
        self.n = len(self.f)
        self.theta= None
    def get_distance(self):
        self.D = np.zeros([self.n,self.n])
        locations = np.stack([self.x, self.y],axis=1)
        for i in tqdm.tqdm(range(self.n)):
            for j in range(i,self.n):
                self.D[i,j]=np.linalg.norm(locations[i]-locations[j])
        self.D = self.D + self.D.T
    def get_weightmatrix(self,theta=1,beta=0.5,kappa=4000):
        self.theta = theta
        dist = np.where(self.D < kappa,self.D,0)
        self.W = np.exp(-(dist/self.theta)**2)
    def _eigen(self):
        d= self.W.sum(axis=1)
        D= np.diag(d)
        self.L = np.diag(1/np.sqrt(d)) @ (D-self.W) @ np.diag(1/np.sqrt(d))
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)       
    def fit(self,sd=5): # fit with ebayesthresh
        self._eigen()
        self.fbar = self.Psi.T @ self.f # fbar := graph fourier transform of f
        self.power = self.fbar**2 
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed=np.array(ebayesthresh(FloatVector(self.power),sd=sd))
        self.fbar_threshed = np.where(self.power_threshed>0,self.fbar,0)
        self.fhat = self.Psi@self.fbar_threshed
        self.df = self.df.assign(fHat = self.fhat)
        self.df = self.df.assign(Residual = self.df.f- self.df.fHat)

In [11]:
class fortable:
    def __init__(self, df, clf, tab, outlier_true, conf_name = "Method"):
        self.df = df
        self.clf = clf
        self.conf_name = conf_name
        self.tabb = tab
        self.outlier_true = outlier_true
        
    def _forfit(self):
        if 'fnoise' in self.df.columns:
            self.clf.fit(self.df[['x', 'y','fnoise']])
        elif 'f' in self.df.columns:
            self.clf.fit(self.df[['x', 'y', 'f']])
        if 'f' not in self.df.columns:
            self.clf.fit(self.df[['x', 'y']])

    def _forlabels(self):
        self.labels = list(self.clf.labels_)

    def _forpredict(self):
        self.predict = self.clf.fit_predict(self.df)
        
    def comparison(self, compare_outlier = None, conf_outlier = None, gode = False):
        if gode == False:
            self._forfit()
            self._forlabels()
            if 'fnoise' in self.df.columns:
                compare_outlier = self.clf.decision_function(np.array(self.df[['x', 'y','fnoise']]))
            else:
                compare_outlier = self.clf.decision_function(self.df)
            if self.conf_name == "LOF":
                self._forpredict()
                conf_outlier = self.predict
            elif self.conf_name != "LOF":
                conf_outlier = self.labels
        elif gode == True:
            compare_outlier = compare_outlier
            conf_outlier = conf_outlier

        fpr, tpr, thresh = roc_curve(self.outlier_true, compare_outlier)       
        
        _conf = Conf_matrx(self.outlier_true, conf_outlier)
        _conf.conf(self.conf_name)
        
        _table = pd.concat([self.tabb,
                   pd.DataFrame({"Accuracy":[_conf.acc],"Precision":[_conf.pre],"Recall":[_conf.rec],"F1":[_conf.f1],"AUC":[auc(fpr, tpr)],"N":n, "Contamination": eta_sparsity,"kappa":kappa},index = [_conf.name])])
        
        return _table

# Orbit

In [12]:
n = 1000
eta_sparsity = 0.05
tab_orbit = pd.DataFrame(columns=["Accuracy","Precision","Recall","F1","AUC","N","Contamination"])

In [13]:
import numpy as np

warnings.filterwarnings('ignore')
kappas = np.linspace(0.01, 12.00, 1200)
accuracy = []
precision = [] 
recall = [] 
f1score = [] 
auc_curve = [] 
iteration = []

# Using tqdm for both the outer and inner loops
for i in tqdm.tqdm(range(30), desc="Outer Loop (Iterations)"):
    #np.random.seed(777)
    epsilon = np.around(np.random.normal(size=n),15)
    signal = np.random.choice(np.concatenate((np.random.uniform(-4, -1, round(n * eta_sparsity / 2)).round(15), np.random.uniform(1, 4, round(n * eta_sparsity / 2)).round(15), np.repeat(0, n - round(n * eta_sparsity)))), n)
    eta = signal + epsilon
    pi=np.pi
    ang=np.linspace(-pi,pi-2*pi/n,n)
    r=5+np.cos(np.linspace(0,12*pi,n))
    vx=r*np.cos(ang)
    vy=r*np.sin(ang)
    f1=10*np.sin(np.linspace(0,6*pi,n))
    f = f1 + eta
    data = pd.DataFrame({'x' : vx, 'y' : vy, 'f' : f})
    outlier_true_orbit = signal.copy()
    outlier_true_orbit = list(map(lambda x: 1 if x!=0 else 0,outlier_true_orbit))    
    orbit = Orbit(data)
    orbit.get_distance()
    for kappa in kappas:
        orbit.get_weightmatrix(theta=(orbit.D[orbit.D>0].mean()), kappa=kappa) 
        orbit.fit(sd=15)
        outlier_GODE_orbit_old = (orbit.df['Residual']**2).tolist()
        sorted_data = sorted(outlier_GODE_orbit_old, reverse=True)
        index = int(len(sorted_data) * eta_sparsity)
        five_percent = sorted_data[index]
        outlier_GODE_orbit = list(map(lambda x: 1 if x > five_percent else 0, outlier_GODE_orbit_old))
        tab_orb = fortable(data, clf=None, tab=tab_orbit, outlier_true=outlier_true_orbit, conf_name="GODE")
        result = tab_orb.comparison(compare_outlier=outlier_GODE_orbit_old, conf_outlier=outlier_GODE_orbit, gode=True)
        accuracy.append(result['Accuracy'].item())
        precision.append(result['Precision'].item())
        recall.append(result['Recall'].item())
        f1score.append(result['F1'].item())
        auc_curve.append(result['AUC'].item())
        iteration.append(i)

Outer Loop (Iterations):   0%|                                                                                             | 0/30 [00:00<?, ?it/s]
  0%|                                                                                                                    | 0/1000 [00:00<?, ?it/s]
  4%|███▊                                                                                                      | 36/1000 [00:00<00:02, 357.18it/s]
  7%|███████▋                                                                                                  | 72/1000 [00:00<00:02, 357.02it/s]
 11%|███████████▌                                                                                             | 110/1000 [00:00<00:02, 363.14it/s]
 15%|███████████████▋                                                                                         | 149/1000 [00:00<00:02, 372.99it/s]
 19%|███████████████████▉                                                                                     | 190/10

In [22]:
df_result = pd.DataFrame({'Kappa':list(kappas)*30,'Accuracy':accuracy, 'Precision':precision, 'Recall':recall, 'F1':f1score, 'AUC':auc_curve, 'Iteration':iteration})
df_result.to_csv("./ex2_kappa_results.csv",index=False)
#df_result.to_csv("./Dropbox/03_Yechan3/연구/서연이랑/GODE/ex2_kappa_results.csv",index=False)
#pd.read_csv("./Dropbox/03_Yechan3/연구/서연이랑/GODE/ex2_kappa_results.csv")

In [23]:
df_result

df