### Test KDE
The following code verifies the formula used by scikit-learn's `KernelDensity`: it estimates the density at a single point manually and checks that the result matches the density computed by scikit-learn.

In [None]:
import numpy as np
# KernelDensity is imported from the public package path; the private
# `sklearn.neighbors.kde` module is not importable in current scikit-learn.
from sklearn.neighbors import KernelDensity

# X = np.random.multivariate_normal(mean=[0.5,0.5], cov=[[0.1, 0.1],[0.1,0.1]], size=100)

# Draw 100 one-dimensional samples from a normal distribution N(0.5, 0.1^2).
X = np.random.normal(0.5, 0.1, size=100)
# Column-vector view of the samples (one row per sample, one feature), which
# is the layout passed to `fit`; -1 lets numpy infer the row count.
X_r = X.reshape(-1, 1)

bandwidth = 0.1
kde = KernelDensity(bandwidth=bandwidth, metric='euclidean', kernel='gaussian')
kde.fit(X_r)

# Evaluate the density at one of the training samples.
test_sample = X[1]

# The score is exponentiated below, i.e. it is treated as a log-density.
k_score = kde.score_samples([[test_sample]])
print("Density assigned by the Sklearn KDE " + str(np.exp(k_score[0])))


def gaus_kernel(u):
    """Evaluate the standard Gaussian kernel at ``u``.

    Computes ``(1 / sqrt(2*pi)) * exp(-u**2 / 2)``. The arithmetic goes
    through numpy, so ``u`` may be a scalar or a numpy array (evaluated
    elementwise).
    """
    norm_const = 1 / np.sqrt(2 * np.pi)
    exponent = (-1 / 2) * (u ** 2)
    return norm_const * np.exp(exponent)

def get_k(cur_x, X, h):
    """Manually compute the kernel density estimate at ``cur_x``.

    Implements ``(1 / (N*h)) * sum_i K((cur_x - x_i) / h)`` where ``K`` is
    ``gaus_kernel``, ``N`` is ``X.shape[0]`` and ``h`` is the bandwidth.

    Parameters:
        cur_x: point at which the density is evaluated.
        X: samples to iterate over; must expose ``shape`` (e.g. numpy array).
        h: kernel bandwidth.

    Returns:
        The density estimate at ``cur_x``.

    Side effect: prints the un-normalised kernel sum before returning.
    """
    n_samples = X.shape[0]
    kernel_sum = 0
    for sample in X:
        # Distance to the sample, scaled by the bandwidth.
        kernel_sum += gaus_kernel((cur_x - sample) / h)
    print(kernel_sum)
    return kernel_sum * (1 / (n_samples * h))

# Manual estimate at the same point and bandwidth that were given to the
# scikit-learn estimator; the two printed densities should agree.
a = get_k(cur_x=test_sample, X=X, h=bandwidth)
print("Density assigned manually: ", a, sep="")

## Test random baselines

In [None]:
from sklearn.datasets import load_iris, load_diabetes, load_wine
# Load the wine dataset. The cells below read its `target_names`, `target`
# and `data` attributes. (load_iris / load_diabetes are imported but not
# used in the visible cells.)
data = load_wine()


In [None]:
# Inspect the class names of the loaded dataset (bare expression, so a
# notebook shows it as the cell output).
data.target_names

In [None]:
import pandas as pd

# True class labels as a one-column DataFrame named 'label'.
y = pd.DataFrame({'label': data.target})

In [None]:
# Feature matrix as a DataFrame (default integer column labels).
# NOTE: this rebinds `X`, which held the 1-D normal samples in the KDE section above.
X=pd.DataFrame(data.data)

In [None]:
import random
# random.randint(a, b) returns a random integer N such that a <= N <= b
# (both ends inclusive); it is an alias for randrange(a, b+1).
# Scratch check: draw a single class index from {0, 1, 2}.
rand_class = random.randint(0,2)

In [None]:
def get_preds(y, class_percentages=(33, 40, 27)):
    """Draw one random class prediction per row of ``y``.

    For every row an integer is drawn uniformly from 1..100 (inclusive) and
    mapped to a class index by cumulative percentage: with the default
    ``(33, 40, 27)`` draws 1-33 give class 0, 34-73 give class 1 and 74-100
    give class 2. The true labels in ``y`` are never read; only the number
    of rows is used.

    Parameters:
        y: any object exposing ``shape`` whose first entry is the number of
            predictions to generate (e.g. a DataFrame or numpy array).
        class_percentages: non-negative whole-number percentages, one per
            class, that must sum to 100. The default is the split that was
            previously hard-coded here (presumably the rounded class
            frequencies of the dataset in use -- confirm when reusing).

    Returns:
        A list of ``y.shape[0]`` integer class indices.

    Raises:
        ValueError: if ``class_percentages`` is empty, contains a negative
            value, or does not sum to 100.
    """
    from bisect import bisect_left
    from itertools import accumulate

    if any(pct < 0 for pct in class_percentages):
        raise ValueError("class_percentages must be non-negative")
    # Cumulative upper bounds per class, e.g. (33, 40, 27) -> [33, 73, 100].
    upper_bounds = list(accumulate(class_percentages))
    if not upper_bounds or upper_bounds[-1] != 100:
        raise ValueError("class_percentages must sum to 100")

    n_samples = y.shape[0]
    # bisect_left returns the first class whose upper bound is >= the draw,
    # so a draw equal to a bound still belongs to that class.
    # (For uniform guessing instead, use random.randint(0, n_classes - 1).)
    return [
        bisect_left(upper_bounds, random.randint(1, 100))
        for _ in range(n_samples)
    ]

In [None]:
from sklearn.metrics import precision_score, recall_score

# Estimate the per-class precision and recall of random guessing by
# averaging the scores of many independently drawn prediction vectors.
n_iterations = 1000
# Running per-class sums (three classes).
avg_r = np.zeros(3)
avg_p = np.zeros(3)
for i in range(n_iterations):
    preds = get_preds(y)
    # average=None yields one score per class instead of a single aggregate.
    p = precision_score(y['label'], preds, average=None)
    r = recall_score(y['label'], preds, average=None)
    avg_r += r
    avg_p += p
a_r = avg_r / n_iterations
a_p = avg_p / n_iterations

print("Average recall: " + str(np.round(a_r,2)))
print("Average precision: " + str(np.round(a_p,2)))

In [None]:
# Fraction of rows belonging to each of the classes 0, 1 and 2.
c0, c1, c2 = (len(y[y.label == cls]) / len(y) for cls in (0, 1, 2))

In [None]:
# Show the three class proportions rounded to two decimals.
print(*(str(round(prop, 2)) for prop in (c0, c1, c2)))

In [None]:
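# Baselines for random guessing (predictions independent of the true label):
# Purity (precision) baseline for a class = proportion of that class in the data
# Recall baseline for a class = probability of guessing that class,
#   i.e. 1/number of classes when guessing uniformly at random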

In [None]:
# Scratch cell: draw a single integer from 1..100 (both ends inclusive),
# the same kind of draw get_preds makes for each sample.
ri = random.randint(1,100)
