## Python Class

In [1]:
class Staff:

    age = 20
    gender = "Male"

    def __init__(self, income):
        
        self.income = income

    def birthday(self):

        self.age = self.age + 1

    def promotion(self):

        self.income = self.income + (self.income*0.2)
    


In [2]:
class Teacher(Staff):

    subject = "Mathematics"


In [3]:
a = Teacher(4213)

In [4]:
a.income

4213

In [5]:
a.promotion()

In [6]:
a.income

5055.6

In [7]:
a.subject

'Mathematics'

## Wrapper around KernelDensity

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KernelDensity
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.utils.estimator_checks import check_estimator
import numpy as np
import pandas as pd

In [3]:
from sklearn.base import BaseEstimator
from sklearn.neighbors import KernelDensity

class KernelDensityEstimator(BaseEstimator):

    _estimator_type = "density_estimator"

    def __init__(self, bandwidth=1.0, kernel="gaussian", n_samples=5, random_state=1):
        
        self.bandwidth = bandwidth
        self.kernel = kernel
        self.kde = KernelDensity(bandwidth=self.bandwidth, kernel=self.kernel)
        self.n_samples = n_samples
        self.random_state = random_state

    def fit(self, X, y=None):
        
        self.kde.set_params(bandwidth=self.bandwidth, kernel=self.kernel)
        self.kde.fit(X)
        return self
    
    def transform(self, X):
        # Perform density estimation and return the log-likelihood scores
        return self.kde.score_samples(X)
    
    def fit_predict(self, X, y=None):
        
        return self.fit(X).predict(X)
    
    def predict(self, X):
        # Perform density estimation and return the log-likelihood scores
        return self.kde.score_samples(X)
    
    # def get_params(self, deep=True):
    #     return {
    #         "bandwidth": self.bandwidth,
    #         "kernel": self.kernel,
    #         "n_samples": self.n_samples,
    #         "random_state": self.random_state,
    #     }

    # def set_params(self, **parameters):
        
    #     for parameter, value in parameters.items():
    #         setattr(self, parameter, value)
        
    #     return self


In [4]:
kde = KernelDensityEstimator()

In [5]:
df = pd.read_csv("data\sample_data.csv", index_col=0)
df.head()

Unnamed: 0,X
0,1.491043
1,1.239346
2,0.629002
3,0.997488
4,1.204905


In [6]:
kde.fit(df)

KernelDensityEstimator()

In [7]:
kde.predict(df)

array([-1.66767631, -1.67082804, -1.79637841, ..., -3.42189407,
       -3.42526018, -3.42863547])

## GridSearchCV

In [8]:
from sklearn.metrics import make_scorer

def my_custom_scorer(estimator, X, y=None):
    
    log_likelihood = estimator.fit_transform(X)

    log_likelihood = log_likelihood[(log_likelihood != float("-inf")) & (log_likelihood != float("inf"))]

    return (np.mean(np.exp(log_likelihood)))

In [9]:
estimator = KernelDensityEstimator()

In [10]:
a = GridSearchCV(estimator, {"bandwidth": [0.01, 0.2, 0.003], "kernel": ["gaussian", "cosine", "linear", "tophat"]}, scoring=my_custom_scorer)

In [11]:
a.fit(df)

GridSearchCV(estimator=KernelDensityEstimator(),
             param_grid={'bandwidth': [0.01, 0.2, 0.003],
                         'kernel': ['gaussian', 'cosine', 'linear', 'tophat']},
             scoring=<function my_custom_scorer at 0x0000023932C53A68>)

In [12]:
a.best_estimator_

KernelDensityEstimator(bandwidth=0.003, kernel='linear')

In [13]:
a.best_score_

0.6666864712609291

In [14]:
a.best_params_

{'bandwidth': 0.003, 'kernel': 'linear'}

In [22]:
a.cv_results_["mean_test_score"]

array([0.44376025, 0.46839466, 0.48448306, 0.45040625, 0.38816815,
       0.41460514, 0.41623827, 0.4064625 , 0.49632887, 0.5781873 ,
       0.66668647, 0.4640625 ])

In [21]:
a.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__bandwidth': 1.0,
 'estimator__kernel': 'gaussian',
 'estimator__n_samples': 5,
 'estimator__random_state': 1,
 'estimator': KernelDensityEstimator(),
 'n_jobs': None,
 'param_grid': {'bandwidth': [0.01, 0.2, 0.003],
  'kernel': ['gaussian', 'cosine', 'linear', 'tophat']},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': <function __main__.my_custom_scorer(estimator, X, y=None)>,
 'verbose': 0}

## Unit testing

In [34]:
def my_custom_scorer(y_actual, y_pred):
        """Custom scoring method for boats.

        This method calculates the boats of the observations of the given dataset.

        Args:
            y_actual (numpy.ndarray): Dataset to be scored. Not required for scoring.
            y_pred (numpy.ndarray): Scored log density of the observations given the dataset.
        """

        y_pred = y_pred[(y_pred != float("-inf")) & (y_pred != float("inf"))]

        return float((np.mean(np.exp(y_pred))))

In [None]:
"boats"

In [2]:
import numpy as np
from numpy import random

df = random.rand(5)

In [45]:
pred = df.copy()

In [46]:
df

array([0.28912792, 0.84078642, 0.41546704, 0.31160921, 0.06220692])

In [53]:
pred = np.array([float("inf"),1,2,3])

In [54]:
pred

array([inf,  1.,  2.,  3.])

In [67]:
my_custom_scorer(y_actual=None, y_pred=pred)

10.064291616859121

In [56]:
pred

array([inf,  1.,  2.,  3.])

In [66]:
pred = np.append(pred, [float("inf"), float("-inf")])
pred

array([ inf,   1.,   2.,   3.,  inf, -inf,  inf, -inf,  inf, -inf,  inf,
       -inf])

In [68]:
np.random.seed(42)

In [69]:
df = np.random.rand(100)

In [70]:
df.shape

(100,)

In [71]:
df

array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,
       0.15599452, 0.05808361, 0.86617615, 0.60111501, 0.70807258,
       0.02058449, 0.96990985, 0.83244264, 0.21233911, 0.18182497,
       0.18340451, 0.30424224, 0.52475643, 0.43194502, 0.29122914,
       0.61185289, 0.13949386, 0.29214465, 0.36636184, 0.45606998,
       0.78517596, 0.19967378, 0.51423444, 0.59241457, 0.04645041,
       0.60754485, 0.17052412, 0.06505159, 0.94888554, 0.96563203,
       0.80839735, 0.30461377, 0.09767211, 0.68423303, 0.44015249,
       0.12203823, 0.49517691, 0.03438852, 0.9093204 , 0.25877998,
       0.66252228, 0.31171108, 0.52006802, 0.54671028, 0.18485446,
       0.96958463, 0.77513282, 0.93949894, 0.89482735, 0.59789998,
       0.92187424, 0.0884925 , 0.19598286, 0.04522729, 0.32533033,
       0.38867729, 0.27134903, 0.82873751, 0.35675333, 0.28093451,
       0.54269608, 0.14092422, 0.80219698, 0.07455064, 0.98688694,
       0.77224477, 0.19871568, 0.00552212, 0.81546143, 0.70685

In [72]:
from sklearn.neighbors import KernelDensity


In [81]:
kde = KernelDensity(bandwidth=1, kernel="gaussian")

In [82]:
kde.fit(df.reshape(-1,1))

KernelDensity(bandwidth=1)

In [83]:
kde.score_samples(df.reshape(-1,1))

array([-0.96611354, -1.06834464, -0.99377682, -0.96978789, -1.00680573,
       -1.00681263, -1.03924111, -1.03433524, -0.9700828 , -0.98827595,
       -1.0539889 , -1.07697771, -1.02258912, -0.99213409, -0.99972233,
       -0.99930859, -0.97443268, -0.96349715, -0.96267566, -0.97646901,
       -0.97143671, -1.01166202, -0.97632068, -0.96684881, -0.96213099,
       -1.00788171, -0.99518013, -0.96300786, -0.96906315, -1.04367827,
       -0.97088085, -1.00274911, -1.0366428 , -1.06753975, -1.0750246 ,
       -1.01485187, -0.97437682, -1.0250709 , -0.98331464, -0.96243058,
       -1.0170637 , -0.96237956, -1.04841   , -1.05087519, -0.98222224,
       -0.97924866, -0.97333398, -0.96326662, -0.96484421, -0.99893081,
       -1.07682864, -1.00501993, -1.06345662, -1.04512916, -0.96969795,
       -1.05600768, -1.02822865, -0.99609547, -1.04415201, -0.97146202,
       -0.96498694, -0.97987933, -1.02136243, -0.96779089, -0.97818981,
       -0.96456496, -1.01123177, -1.01294249, -1.03317239, -1.08

In [84]:
np.random.seed(10)
df_2 = np.random.rand(10)

In [85]:
df_2

array([0.77132064, 0.02075195, 0.63364823, 0.74880388, 0.49850701,
       0.22479665, 0.19806286, 0.76053071, 0.16911084, 0.08833981])

In [86]:
kde.score(df_2.reshape(-1,1))

-10.010108910491624

In [93]:
kde.sample(5, random_state=1).reshape(-1, )

array([-0.61075817, -1.64899334, -1.98892849,  1.7428902 , -0.83518917])