In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.manifold import MDS

In [None]:
# Seed NumPy's global random generator so the np.random draws made later in
# this notebook are repeatable when the cells are run top to bottom.
np.random.seed(0)

In this notebook, we test a simple implementation of a Naive Bayes classifier on a categorical dataset. Naive Bayes applies Bayes' theorem and assumes that the input variables are conditionally independent given the class.

## Generate toy_categorical_dataset

In [None]:
# Conditional probability table for feature a: one row per class, one column
# per feature value. Each row is the distribution sampled for that class.
cond_probs_feat_a = pd.DataFrame.from_dict(
    {
        'class_a': [0.2, 0.1, 0.7],
        'class_b': [0.5, 0.3, 0.2],
    },
    orient='index',
    columns=['value_1', 'value_2', 'value_3'],
)
cond_probs_feat_a

In [None]:
# Conditional probability table for feature b: one row per class, one column
# per feature value. Each row is the distribution sampled for that class.
cond_probs_feat_b = pd.DataFrame.from_dict(
    {
        'class_a': [0.8, 0.2],
        'class_b': [0.4, 0.6],
    },
    orient='index',
    columns=['value_1', 'value_2'],
)
cond_probs_feat_b

In [None]:
# Conditional probability table for feature c: one row per class, one column
# per feature value. Each row is the distribution sampled for that class.
cond_probs_feat_c = pd.DataFrame.from_dict(
    {
        'class_a': [0.1, 0.2, 0.3, 0.4],
        'class_b': [0.9, 0.02, 0.05, 0.03],
    },
    orient='index',
    columns=['value_1', 'value_2', 'value_3', 'value_4'],
)
cond_probs_feat_c

In [None]:
# Prior probability of each class label.
prob_class = pd.Series({'class_a': 0.4, 'class_b': 0.6})
prob_class

In [None]:
# Number of synthetic observations to draw.
N_SAMPLES = 1000

# Conditional probability tables, listed in the column order of the dataset.
feature_tables = [cond_probs_feat_a, cond_probs_feat_b, cond_probs_feat_c]

data = []
classes = []
for sample_idx in range(N_SAMPLES):
    # Sampling class label: a single uniform draw; a draw above P(class_a)
    # selects class_b, anything else selects class_a.
    label = 'class_b' if np.random.rand() > prob_class.loc['class_a'] else 'class_a'
    classes.append(label)

    # Sampling features by inverse-CDF lookup: pick the first value whose
    # cumulative probability exceeds a uniform draw. One draw per feature, in
    # table order, so the sequence of random numbers consumed is unchanged.
    observed = []
    for cond_probs in feature_tables:
        cumulative = cond_probs.loc[label].cumsum()
        draw = np.random.rand()
        position = int(np.searchsorted(cumulative.values, draw, side='right'))
        # Floating-point rounding can leave the last cumulative value a hair
        # below 1.0; clamp so a draw in that gap maps to the last value
        # instead of raising IndexError.
        position = min(position, len(cumulative) - 1)
        observed.append(cumulative.index[position])

    data.append(observed)

In [None]:
# Turn the sampled rows into a DataFrame and attach the sampled class labels
# as a 'class' column.
feature_names = ['feature_a', 'feature_b', 'feature_c']
data = pd.DataFrame(data, columns=feature_names).assign(**{'class': classes})
data.head()

## Visualise the data

In [None]:
# Bar chart of the number of observations per class.
class_counts = data.groupby('class')['class'].count()
_ = class_counts.plot.bar()

In [None]:
def plot_feature_distribution(frame, feature):
    """Draw a stacked bar chart of a feature's value proportions per class.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing a 'class' column and the column named by ``feature``.
    feature : str
        Name of the categorical column to summarise.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on.
    """
    # Count observations per (class, value) pair, one column per value.
    counts = frame.groupby(['class', feature])[feature].count().unstack(feature)
    # Normalise each class row so its bars sum to 1.
    proportions = counts.div(counts.sum(axis=1), axis=0)
    ax = proportions.plot.bar(stacked=True)
    ax.set_ylabel('proportion within class')
    ax.set_title(feature)
    return ax


# One chart per feature, replacing three copy-pasted cells.
for feature in ['feature_a', 'feature_b', 'feature_c']:
    _ = plot_feature_distribution(data, feature)

In [None]:
# One-hot encode every categorical feature on a copy of `data`: each observed
# value gets its own 0/1 indicator column named '<feature>_<value>', and the
# original categorical column is dropped afterwards.
categorical_features = ['feature_a', 'feature_b', 'feature_c']
df = data.copy(deep=True)
for feature in categorical_features:
    # unique() yields values in order of first appearance, which fixes the
    # order of the indicator columns.
    for level in df[feature].unique():
        df[feature + '_' + level] = df[feature].eq(level).astype(int)
    df = df.drop(columns=feature)

In [None]:
# Build the feature matrix from every column except the label. Selecting by
# name avoids relying on 'class' happening to sit in column 0 of `df`.
X = df.drop(columns='class').values
# Project the one-hot rows onto two dimensions for plotting.
embedding = MDS(n_components=2)
X_transformed = embedding.fit_transform(X)

In [None]:
# Width of the uniform jitter added to each coordinate.
JITTER_SCALE = 0.5
# Add jitter in [0, JITTER_SCALE) to the fitted coordinates so they can be
# told apart in the scatter plot. Starting from `embedding.embedding_` (the
# coordinates stored by the fitted MDS object) rather than from
# `X_transformed` itself keeps re-runs of this cell from stacking jitter on
# top of earlier jitter.
X_transformed = embedding.embedding_ + np.random.rand(*embedding.embedding_.shape) * JITTER_SCALE

In [None]:
# Positional row numbers of each class. Computed from a boolean mask so they
# stay valid row positions for X_transformed whatever index `df` carries.
c_a = np.flatnonzero((df['class'] == 'class_a').values)
c_b = np.flatnonzero((df['class'] == 'class_b').values)

fig, ax = plt.subplots()
# Label each scatter explicitly so the legend does not depend on call order.
for rows, label, color in [(c_a, 'class_a', 'C1'), (c_b, 'class_b', 'C2')]:
    ax.scatter(X_transformed[rows, 0], X_transformed[rows, 1],
               alpha=0.5, color=color, label=label)
ax.set_xlabel('MDS dimension 1')
ax.set_ylabel('MDS dimension 2')
ax.set_title('MDS embedding of the one-hot encoded features')
_ = ax.legend()

## Without Laplace smoothing

### Fit the model

We fit the model on 80% of the data.

### Predict test data

We predict on the remaining 20% of the data.

## Laplace smoothing

### Fit the model

### Predict test data