# Density Estimation in Pattern Recognition: Non-Parametric Methods

- Synthetic data (1D and 2D)
- Histogram baseline
- KDE/Parzen with multiple kernels
- Bandwidth selection: Silverman & CV
- kNN density estimation (1D & 2D)
- Plug-in classifier demo (class-wise KDE)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity, NearestNeighbors
from sklearn.model_selection import GridSearchCV, KFold
# Seed NumPy's global random generator so the np.random draws in this
# notebook are reproducible from run to run.
np.random.seed(42)
print('Libraries imported.')

## 1. Synthetic Data (1D)
A bimodal 1D mixture of two Gaussians, used to demonstrate the bias/variance trade-off of the density estimators.

In [None]:
# Draw n samples from an equal-weight mixture of two Gaussians.
n = 600
# Boolean mask: True -> sample comes from the left component,
# False -> sample comes from the right component.
g2mix = np.random.rand(n) < 0.5
x = np.empty(n)
# Left mode: mean -2.0, std 0.6.  Right mode: mean 1.8, std 0.9.
x[g2mix] = np.random.normal(-2.0, 0.6, g2mix.sum())
x[~g2mix] = np.random.normal(1.8, 0.9, (~g2mix).sum())
# Column vector of shape (n, 1), the layout handed to the estimators later on.
x = x.reshape(-1, 1)
# Uniform vertical jitter in [-0.1, 0.1) so overlapping points stay visible.
y = np.random.rand(len(x)) * 0.2 - 0.1
print('1D data shape:', x.shape)

# Scatter the samples along the x-axis with the jitter as the y coordinate.
fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.3)
ax.set_ylim(bottom=-0.7, top=0.7)
ax.set_xlim(left=-10, right=10)
ax.set_xlabel('x')
ax.set_title('1D point spread on x-axis')
plt.show()

## 3. Histogram Density (Baseline)

In [None]:
# Baseline estimate: 30-bin histogram of the samples, normalised to a density.
fig, ax = plt.subplots()
ax.hist(x.ravel(), bins=30, density=True, edgecolor='black')
ax.set_xlabel('x')
ax.set_ylabel('density')
ax.set_title('Histogram (bins=30)')
plt.show()

## 4. Parzen / KDE (1D): Kernels and Bandwidth

In [None]:
kernels = ['gaussian', 'tophat', 'epanechnikov']
bandwidths = [0.1, 0.3, 0.6, 1.0]
# Evaluation grid: 400 points, padded 3 units beyond the data range on each side.
X_plot = np.linspace(x.min() - 3, x.max() + 3, 400).reshape(-1, 1)

# One figure per (kernel, bandwidth) pair; the kernel varies slowest.
for ker, bw in [(kernel, h) for kernel in kernels for h in bandwidths]:
    kde = KernelDensity(kernel=ker, bandwidth=bw)
    kde.fit(x)
    # score_samples returns log-densities; exponentiate to get densities.
    dens = np.exp(kde.score_samples(X_plot))

    # Overlay the KDE curve on the histogram for visual comparison.
    fig, ax = plt.subplots()
    ax.hist(x.ravel(), bins=30, density=True, alpha=0.3, edgecolor='black')
    ax.plot(X_plot, dens)
    ax.set_title(f'KDE kernel={ker}, h={bw}')
    ax.set_xlabel('x')
    ax.set_ylabel('density')
    plt.show()

## 5. Bandwidth Selection (1D): Silverman

In [None]:
def silverman_bandwidth(x1d):
    """Return the rule-of-thumb (normal-reference) bandwidth for a 1D sample.

    Computes ``1.06 * sigma * n ** (-1/5)``, where ``sigma`` is the sample
    standard deviation (``ddof=1``) and ``n`` is the number of observations.

    Parameters
    ----------
    x1d : array_like
        The observations. The values are flattened before use, so a column
        vector ``(n, 1)``, a row vector ``(1, n)`` or a plain sequence all
        give the same result as the equivalent 1D array.

    Returns
    -------
    float
        The bandwidth ``h`` (a NumPy floating-point scalar).

    Raises
    ------
    ValueError
        If fewer than two observations are supplied, because the sample
        standard deviation with ``ddof=1`` is undefined in that case.
    """
    # Flatten first so n counts observations, not the length of the first axis
    # (len() of a (1, n) row vector would report a single sample).
    values = np.asarray(x1d, dtype=float).ravel()
    n = values.size
    if n < 2:
        raise ValueError(
            f'silverman_bandwidth needs at least 2 observations, got {n}'
        )
    sigma = np.std(values, ddof=1)
    return 1.06 * sigma * n ** (-1 / 5)

h_silv = silverman_bandwidth(x.ravel())
# Candidate bandwidths: 15 values from max(0.05, 0.3*h_silv) up to 2.5*h_silv.
# Only the disabled cross-validated search below would consume this grid.
params = {'bandwidth': np.linspace(max(0.05, 0.3 * h_silv), 2.5 * h_silv, 15)}
# Cross-validated bandwidth search, currently disabled:
# grid = GridSearchCV(KernelDensity(kernel='gaussian'), params,
#                     cv=KFold(n_splits=5, shuffle=True, random_state=0))
# grid.fit(x)
# best_bw = grid.best_params_['bandwidth']
print('Silverman h:', h_silv)

# Fit a Gaussian KDE with the Silverman bandwidth and overlay it on the histogram.
bw, label = h_silv, 'Silverman'
kde = KernelDensity(kernel='gaussian', bandwidth=bw)
kde.fit(x)
# score_samples returns log-densities; exponentiate to get densities.
dens = np.exp(kde.score_samples(X_plot))

fig, ax = plt.subplots()
ax.hist(x.ravel(), bins=30, density=True, alpha=0.3, edgecolor='black')
ax.plot(X_plot, dens)
ax.set_title(f'Gaussian KDE with {label} h={bw:.3f}')
ax.set_xlabel('x')
ax.set_ylabel('density')
plt.show()

## 6. k-Nearest Neighbor Density (1D)

In [None]:
from sklearn.neighbors import NearestNeighbors
def knn_density_1d(x_train, x_eval, k=15):
    """Estimate a 1D density at ``x_eval`` with the k-nearest-neighbour rule.

    For every evaluation point the estimate is ``k / (n * V_k)``, where
    ``n = len(x_train)``, ``r_k`` is the distance to the last of the ``k``
    neighbours returned for that point, and ``V_k = 2 * r_k`` is the length
    of the interval of radius ``r_k`` around it.

    Parameters
    ----------
    x_train : array_like
        Training samples, passed to ``NearestNeighbors.fit``.
    x_eval : array_like
        Points at which to evaluate the density, passed to ``kneighbors``.
    k : int, default 15
        Number of neighbours used per evaluation point.

    Returns
    -------
    ndarray
        One density value per row of ``x_eval``.

    Notes
    -----
    When ``r_k`` is 0 (for example ``k=1`` and an evaluation point equal to
    a training sample) the division yields ``inf``.
    """
    finder = NearestNeighbors(n_neighbors=k)
    finder.fit(x_train)
    distances, _indices = finder.kneighbors(x_eval)
    # Last column holds the distance to the farthest of the k neighbours.
    radius = distances[:, -1]
    # In 1D the "volume" of a ball of radius r is the interval length 2r.
    interval_length = 2 * radius
    n_train = len(x_train)
    return k / (n_train * interval_length)

# Compare the kNN density estimate for several neighbourhood sizes,
# one figure per k, each overlaid on the histogram.
for k in (3, 10, 30, 100):
    dens_knn = knn_density_1d(x, X_plot, k=k)

    fig, ax = plt.subplots()
    ax.hist(x.ravel(), bins=30, density=True, alpha=0.3, edgecolor='black')
    ax.plot(X_plot, dens_knn)
    ax.set_title(f'kNN density (k={k})')
    ax.set_xlabel('x')
    ax.set_ylabel('density')
    plt.show()