In [1]:
import sys, os
from joblib import Parallel, delayed
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy
from scipy.integrate import quad, nquad
from sklearn.preprocessing import normalize
from sktree.tree import DecisionTreeClassifier
from sktree.ensemble import HonestForestClassifier
from sktree.stats import FeatureImportanceForestClassifier
from sklearn.metrics import roc_auc_score
from scipy.stats import entropy

import numpy as np
from scipy.stats import norm
from scipy.stats import multivariate_normal
from scipy.integrate import simps
from scipy.stats import entropy
from scipy.stats import gamma
from sklearn import metrics
from sktree.datasets import make_trunk_classification, make_trunk_mixture_classification

## Mean-Shift

In [16]:
def true_s98_mean_shift(mu0,mu1,sig, n=5000000, random_state=None):
    """Monte Carlo estimate of the true S@98 for two Gaussian classes sharing a covariance.

    ``n`` points are drawn from each class-conditional Gaussian, every point is
    scored with its exact posterior probability of class 1 (both class priors
    are 0.5), and the ROC curve of those scores is read at a false-positive
    rate of 0.02, i.e. the sensitivity at 98% specificity.

    Parameters
    ----------
    mu0, mu1 : scalar or array-like of shape (p,)
        Mean of class 0 and of class 1.
    sig : ndarray of shape (p, p)
        Covariance used for both classes; ``sig.shape[0]`` fixes the dimension p.
    n : int, default=5000000
        Number of points sampled per class.
    random_state : None, int or numpy.random.Generator, default=None
        Seed (or generator) for the draws. ``None`` leaves the choice of
        random state to SciPy, so results then differ between calls.

    Returns
    -------
    float
        Largest true-positive rate among ROC points whose false-positive rate
        is <= 0.02. When every point receives the same score (the classes are
        indistinguishable) the ROC has only its two end points and the chance
        level 0.02 is returned, matching ``true_s98_multi_modal``.
    """
    p = sig.shape[0]
    p_class0 = p_class1 = 0.5
    # One shared generator for both classes so their draws are independent.
    rng = None if random_state is None else np.random.default_rng(random_state)
    pdf_class0 = multivariate_normal(mean = mu0, cov = sig,allow_singular=True)
    pdf_class1 = multivariate_normal(mean = mu1, cov = sig,allow_singular=True)

    # rvs() squeezes its output (shape (n,) when p == 1), hence the reshape.
    x_0 = pdf_class0.rvs(size = n, random_state = rng).reshape((n,p))
    x_1 = pdf_class1.rvs(size = n, random_state = rng).reshape((n,p))
    x = np.vstack((x_0,x_1))
    y = np.repeat([0, 1], n)

    p_x_given_class0 = np.nan_to_num(pdf_class0.pdf(x))
    p_x_given_class1 = np.nan_to_num(pdf_class1.pdf(x))

    p_x = p_x_given_class0*p_class0 + p_x_given_class1*p_class1

    # Bayes' rule for all points at once. Where the total density is exactly
    # zero the class-0 posterior is set to 0 (so the class-1 score becomes 1).
    pos_class0 = np.divide(
        p_x_given_class0*p_class0, p_x, out=np.zeros_like(p_x), where=p_x != 0
    )
    pos_class1 = 1.0 - pos_class0

    fpr, tpr, _ = metrics.roc_curve(y, pos_class1, pos_label=1,drop_intermediate = False)
    if len(tpr) == len(fpr) == 2:
        # Single distinct score: ROC is the diagonal, so TPR at FPR 0.02 is 0.02.
        return 0.02
    return np.max(tpr[fpr<=0.02])



In [5]:
def _moving_avg_cov(n_dim, rho):
    # Create a meshgrid of indices
    i, j = np.meshgrid(np.arange(1, n_dim + 1), np.arange(1, n_dim + 1), indexing="ij")

    # Calculate the covariance matrix using the corrected formula
    cov_matrix = rho ** np.abs(i - j)

    # Apply the banding condition
    cov_matrix[abs(i - j) > 1] = 0
    return cov_matrix

In [23]:
# Generate a two-dimensional trunk data set and also request the class means
# and covariances the generator used (return_params=True), then print them.
# NOTE(review): no seed is passed, so the sample presumably differs between runs.
X, y, means, cov = make_trunk_classification(
    n_samples=100000,
    n_dim=2,
    n_informative=2,
    rho=0.5,
    mu_0=0,
    mu_1=1,
    return_params=True,
)
print(means)
print(cov)


[array([0., 0.]), array([1.        , 0.70710678])]
[array([[1. , 0.5],
       [0.5, 1. ]]), array([[1. , 0.5],
       [0.5, 1. ]])]


In [24]:
# Class means: class 1 has mean 1/sqrt(i) in coordinate i (i = 1, 2), class 0 is centred at 0.
mu_1 = 1 / np.sqrt(np.arange(1, 3))
mu_0 = np.zeros(2)
sig = _moving_avg_cov(n_dim=2, rho=0.5)
print(sig)

# Both coordinates together, with the banded covariance printed above.
print("View 1 + View 2: " + str(true_s98_mean_shift(mu0=mu_0, mu1=mu_1, sig=sig)))

# View 1 is the second coordinate (index 1) on its own, with unit variance.
print("View 1: " + str(true_s98_mean_shift(mu0=mu_0[1], mu1=mu_1[1], sig=np.eye(1))))

# View 2 is the first coordinate (index 0) on its own, with unit variance.
print("View 2: " + str(true_s98_mean_shift(mu0=mu_0[0], mu1=mu_1[0], sig=np.eye(1))))

[[1.  0.5]
 [0.5 1. ]]
View 1 + View 2: 0.153308
View 1: 0.0890762
View 2: 0.1459994


## Multi_Modal

In [2]:
def true_s98_multi_modal(mu0,mu1,sig0,sig1,p0= [0.5,0.5],p1=[0.5,0.5], n=5000000, random_state=None):
    """Monte Carlo estimate of the true S@98 when each class is a Gaussian mixture.

    Each class is sampled component by component (``int(n * weight)`` points
    per component), every point is scored with its exact posterior probability
    of class 1 (both class priors are 0.5), and the ROC curve of those scores
    is read at a false-positive rate of 0.02.

    Parameters
    ----------
    mu0, mu1 : sequence of array-like of shape (p,)
        Component means of class 0 and class 1; ``len(mu0[0])`` fixes p.
    sig0, sig1 : sequence of ndarray of shape (p, p)
        Component covariances, aligned with ``mu0`` / ``mu1``.
    p0, p1 : sequence of float, default=[0.5, 0.5]
        Mixture weights of the components of class 0 and class 1. The default
        lists are only read, never modified.
    n : int, default=5000000
        Nominal number of points per class (before ``int`` truncation of the
        per-component counts).
    random_state : None, int or numpy.random.Generator, default=None
        Seed (or generator) for the draws. ``None`` leaves the choice of
        random state to SciPy, so results then differ between calls.

    Returns
    -------
    float
        Largest true-positive rate among ROC points whose false-positive rate
        is <= 0.02, or 0.02 when all points share one score and the ROC
        consists of its two end points only.
    """
    p_class0 = p_class1 = 0.5
    p = len(mu0[0])
    # One shared generator so that all components draw independent samples.
    rng = None if random_state is None else np.random.default_rng(random_state)

    comps_class0 = [multivariate_normal(mean = m, cov = s,allow_singular=True) for m, s in zip(mu0, sig0)]
    comps_class1 = [multivariate_normal(mean = m, cov = s,allow_singular=True) for m, s in zip(mu1, sig1)]

    def _sample(components, weights):
        # Draw int(n * w) points from each component, stacked in component order.
        draws = []
        for comp, w in zip(components, weights):
            k = int(n * w)
            # rvs() squeezes its output, hence the reshape to (k, p).
            draws.append(comp.rvs(size = k, random_state = rng).reshape((k, p)))
        return np.vstack(draws)

    def _mixture_pdf(components, weights, points):
        # Weighted sum of the component densities evaluated at ``points``.
        return sum(w * np.nan_to_num(comp.pdf(points)) for comp, w in zip(components, weights))

    x_0 = _sample(comps_class0, p0)
    x_1 = _sample(comps_class1, p1)
    x = np.vstack((x_0, x_1))
    # Labels follow the number of points actually drawn: the truncated
    # per-component counts need not add up to n (e.g. weights 1/3 and 2/3).
    y = np.repeat([0, 1], [len(x_0), len(x_1)])

    p_x_given_class0 = _mixture_pdf(comps_class0, p0, x)
    p_x_given_class1 = _mixture_pdf(comps_class1, p1, x)

    p_x = p_x_given_class0*p_class0 + p_x_given_class1*p_class1

    # Bayes' rule for all points at once. Where the total density is exactly
    # zero the class-0 posterior is set to 0 (so the class-1 score becomes 1).
    pos_class0 = np.divide(
        p_x_given_class0*p_class0, p_x, out=np.zeros_like(p_x), where=p_x != 0
    )
    pos_class1 = 1.0 - pos_class0

    fpr, tpr, _ = metrics.roc_curve(y, pos_class1, pos_label=1,drop_intermediate = False)
    if len(tpr) == len(fpr) == 2:
        # Single distinct score: ROC is the diagonal, so TPR at FPR 0.02 is 0.02.
        return 0.02
    return np.max(tpr[fpr<=0.02])



In [3]:
# Generate a two-dimensional trunk mixture data set and also request the means
# and covariances the generator used (return_params=True), then print them.
# NOTE(review): no seed is passed, so the sample presumably differs between runs.
X, y, means, covs, X_mixture = make_trunk_mixture_classification(
    n_samples=100000,
    n_dim=2,
    n_informative=2,
    rho=0.5,
    mix=0.75,
    mu_0=0,
    mu_1=5,
    return_params=True,
)
print(means)
print(covs)

(array([0., 0.]), array([5.        , 3.53553391]))
(array([[1. , 0.5],
       [0.5, 1. ]]), array([[1. , 0.5],
       [0.5, 1. ]]))


In [6]:
# --- View 1 + View 2: both coordinates, banded covariance for every component.
# Class 0 has both components at the origin; class 1 has one component at the
# origin and one at (5, 5/sqrt(2)). Component weights are 0.75 / 0.25.
sig = _moving_avg_cov(n_dim=2, rho=0.5)
sig_0 = [sig, sig]
sig_1 = [sig, sig]
mu_0 = [[0, 0], [0, 0]]
mu_1 = [[0, 0], [5, 5 / np.sqrt(2)]]
print(sig_0)
print("View 1 + View 2: " + str(true_s98_multi_modal(
    mu0=mu_0, mu1=mu_1, sig0=sig_0, sig1=sig_1, p0=[0.75, 0.25], p1=[0.75, 0.25])))

# --- View 1: second coordinate only (shifted mean 5/sqrt(2)), unit variance.
sig_0 = [np.eye(1), np.eye(1)]
sig_1 = [np.eye(1), np.eye(1)]
mu_0 = [[0], [0]]
mu_1 = [[0], [5 / np.sqrt(2)]]
print("View 1: " + str(true_s98_multi_modal(
    mu0=mu_0, mu1=mu_1, sig0=sig_0, sig1=sig_1, p0=[0.75, 0.25], p1=[0.75, 0.25])))

# --- View 2: first coordinate only (shifted mean 5), unit variance.
sig_0 = [np.eye(1), np.eye(1)]
sig_1 = [np.eye(1), np.eye(1)]
mu_0 = [[0], [0]]
mu_1 = [[0], [5]]
print("View 2: " + str(true_s98_multi_modal(
    mu0=mu_0, mu1=mu_1, sig0=sig_0, sig1=sig_1, p0=[0.75, 0.25], p1=[0.75, 0.25])))

[array([[1. , 0.5],
       [0.5, 1. ]]), array([[1. , 0.5],
       [0.5, 1. ]])]
View 1 + View 2: 0.264652
View 1: 0.2475764
View 2: 0.264663


## Multi_Equal

In [7]:
# --- View 1 + View 2: both classes are given the same two-component mixture
# (origin and (5, 5/sqrt(2)), weights 0.75 / 0.25) with the banded covariance.
sig = _moving_avg_cov(n_dim=2, rho=0.5)
sig_0 = [sig, sig]
sig_1 = [sig, sig]
mu_0 = [[0, 0], [5, 5 / np.sqrt(2)]]
mu_1 = [[0, 0], [5, 5 / np.sqrt(2)]]
print("View 1 + View 2: " + str(true_s98_multi_modal(
    mu0=mu_0, mu1=mu_1, sig0=sig_0, sig1=sig_1, p0=[0.75, 0.25], p1=[0.75, 0.25])))

# --- View 1: second coordinate only (component means 0 and 5/sqrt(2)), unit variance.
sig_0 = [np.eye(1), np.eye(1)]
sig_1 = [np.eye(1), np.eye(1)]
mu_0 = [[0], [5 / np.sqrt(2)]]
mu_1 = [[0], [5 / np.sqrt(2)]]
print("View 1: " + str(true_s98_multi_modal(
    mu0=mu_0, mu1=mu_1, sig0=sig_0, sig1=sig_1, p0=[0.75, 0.25], p1=[0.75, 0.25])))

# --- View 2: first coordinate only (component means 0 and 5), unit variance.
sig_0 = [np.eye(1), np.eye(1)]
sig_1 = [np.eye(1), np.eye(1)]
mu_0 = [[0], [5]]
mu_1 = [[0], [5]]
print("View 2: " + str(true_s98_multi_modal(
    mu0=mu_0, mu1=mu_1, sig0=sig_0, sig1=sig_1, p0=[0.75, 0.25], p1=[0.75, 0.25])))

View 1 + View 2: 0.02
View 1: 0.02
View 2: 0.02
