In [None]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm

In [None]:
# Seed NumPy's global random generator so that the synthetic samples drawn
# with np.random.multivariate_normal are reproducible when the notebook is
# run from top to bottom.
np.random.seed(0)

In [None]:
# Parameters of the three 2-D Gaussian components used to synthesise the data.
# Component A supplies the samples labelled 0; components B and C are pooled
# and both labelled 1 when the feature arrays are built.

# Component means (feature_1, feature_2).
MEAN_A = [0, 0]
MEAN_B = [2, 2]
MEAN_C = [-4, 0]
# 2x2 covariance matrices, one per component.
COV_A = [[1, 1], [1, 3]]
COV_B = [[4, 1], [1, 2]]
COV_C = [[2, 1], [1, 2]]
# Number of samples drawn from each component.
N_A = 120
N_B = 60
N_C = 20

In [None]:
# Class 0: N_A draws from component A with a trailing column of zeros as label.
X_A = np.hstack([
    np.random.multivariate_normal(MEAN_A, COV_A, N_A),
    np.zeros((N_A, 1)),
])

# Class 1: draws from component B stacked on top of draws from component C,
# with a trailing column of ones as label. The draw order (A, then B, then C)
# matters because all three consume the same global random stream.
X_B = np.hstack([
    np.vstack([
        np.random.multivariate_normal(MEAN_B, COV_B, N_B),
        np.random.multivariate_normal(MEAN_C, COV_C, N_C),
    ]),
    np.ones((N_B + N_C, 1)),
])

In [None]:
# Stack both classes row-wise into one table: two feature columns plus the label.
df = pd.DataFrame(
    np.vstack((X_A, X_B)),
    columns=['feature_1', 'feature_2', 'label'],
)

In [None]:
# Display the assembled feature/label table for a quick visual check.
df

In [None]:
fig, ax = plt.subplots()
# One translucent scatter layer per class: label 0 in 'C0', label 1 in 'C1'.
for class_value, colour in ((0, 'C0'), (1, 'C1')):
    subset = df[df.label == class_value]
    _ = ax.scatter(subset.feature_1, subset.feature_2, color=colour, alpha=0.5)

In [None]:
# Shuffle every row and renumber the index 0..n-1 so the split below is a
# plain positional cut.
df = df.sample(frac=1).reset_index(drop=True)
# First 80% of the shuffled rows (rounded up) train the model; the rest test it.
df_training = df.iloc[:math.ceil(0.8 * len(df))].copy()
df_test = df.iloc[len(df_training):].copy()

In [None]:
# Class priors: the share of training rows carrying each label.
prior = (
    df_training.groupby('label').size()
    .pipe(lambda counts: counts / counts.sum())
    .rename('prior')
)
prior

In [None]:
# Estimate first feature's likelihood parameters from the training data.
# The string aliases select pandas' own group-wise mean and sample standard
# deviation. Passing np.mean / np.std here is deprecated in recent pandas and
# was being redirected to these same pandas methods, so the fitted values are
# unchanged while the aggregation is now explicit and warning-free.
likelihood_1 = df_training.groupby('label')['feature_1'].agg(mean_1='mean', std_1='std')
likelihood_1

In [None]:
# Estimate second feature's likelihood parameters from the training data.
# The string aliases select pandas' own group-wise mean and sample standard
# deviation. Passing np.mean / np.std here is deprecated in recent pandas and
# was being redirected to these same pandas methods, so the fitted values are
# unchanged while the aggregation is now explicit and warning-free.
likelihood_2 = df_training.groupby('label')['feature_2'].agg(mean_2='mean', std_2='std')
likelihood_2

In [None]:
# Gaussian naive Bayes scoring of the test rows.
# For each class the score is prior * N(feature_1) * N(feature_2), i.e. the
# two features are treated as independent given the class. The values are
# unnormalised posteriors: they are only compared with each other, never
# interpreted as probabilities.
# norm.pdf is evaluated on whole columns at once instead of building frozen
# distributions row by row, and the per-class parameters are looked up by
# label with .loc rather than with ambiguous integer [] indexing.
for class_label in (0, 1):
    df_test['posterior_{}'.format(class_label)] = (
        prior.loc[class_label]
        * norm.pdf(df_test['feature_1'].to_numpy(),
                   loc=likelihood_1.loc[class_label, 'mean_1'],
                   scale=likelihood_1.loc[class_label, 'std_1'])
        * norm.pdf(df_test['feature_2'].to_numpy(),
                   loc=likelihood_2.loc[class_label, 'mean_2'],
                   scale=likelihood_2.loc[class_label, 'std_2'])
    )

# Predict class 1 only when its score is strictly larger; ties go to class 0.
df_test['predicted'] = (df_test['posterior_1'] > df_test['posterior_0']).astype(int)

In [None]:
# Fraction of test rows whose predicted class equals the true label.
accuracy = df_test['predicted'].eq(df_test['label']).mean()
print('Prediction accuracy: ' + str(accuracy))

In [None]:
# 2x2 confusion matrix: rows are the true class, columns the predicted class.
confusion = [
    [((df_test['label'] == actual) & (df_test['predicted'] == guessed)).sum()
     for guessed in (0, 1)]
    for actual in (0, 1)
]
print('Confusion matrix:')
pd.DataFrame(confusion,
             index=['class 0', 'class 1'],
             columns=['predicted 0', 'predicted 1'])

In [None]:
# Regular RESOLUTION x RESOLUTION grid over the plotted region, flattened in
# row-major order into one (feature_1, feature_2) row per grid point.
RESOLUTION = 100
x = np.linspace(-7, 7, num=RESOLUTION)
y = np.linspace(-5, 7, num=RESOLUTION)
xx, yy = np.meshgrid(x, y)
boundaries = pd.DataFrame({'feature_1': xx.ravel(), 'feature_2': yy.ravel()})

In [None]:
# Gaussian naive Bayes scoring of every grid point, using the same rule as
# for the test rows: prior * N(feature_1) * N(feature_2) per class, with the
# features treated as independent given the class. The values are
# unnormalised posteriors that are only compared with each other.
# norm.pdf is evaluated on whole columns at once; a row-wise apply would build
# four frozen distributions for each of the RESOLUTION ** 2 grid points.
# Per-class parameters are looked up by label with .loc rather than with
# ambiguous integer [] indexing.
for class_label in (0, 1):
    boundaries['posterior_{}'.format(class_label)] = (
        prior.loc[class_label]
        * norm.pdf(boundaries['feature_1'].to_numpy(),
                   loc=likelihood_1.loc[class_label, 'mean_1'],
                   scale=likelihood_1.loc[class_label, 'std_1'])
        * norm.pdf(boundaries['feature_2'].to_numpy(),
                   loc=likelihood_2.loc[class_label, 'mean_2'],
                   scale=likelihood_2.loc[class_label, 'std_2'])
    )

# Predict class 1 only when its score is strictly larger; ties go to class 0.
boundaries['predicted'] = (boundaries['posterior_1'] > boundaries['posterior_0']).astype(int)

In [None]:
# Decision regions (background image) overlaid with the training samples.
fig, ax = plt.subplots()
ext = [-7, 7, -5, 7]
colormap = plt.cm.RdYlBu
# vmin/vmax pin prediction 0 to the low end of the colormap and prediction 1
# to the high end. Without them imshow rescales to the data range, so a grid
# that happens to contain a single predicted class would be drawn in the
# wrong colour.
ax.imshow(boundaries['predicted'].to_numpy().reshape(RESOLUTION, RESOLUTION),
          zorder=0,
          extent=ext,
          alpha=0.5,
          cmap=colormap,
          origin='lower',
          vmin=0,
          vmax=1)
# Sample colours come from the same colormap ends as the background, using
# normalised floats (0.0 / 1.0) rather than raw integer indices, so the lookup
# never leaves the colormap's valid range.
for class_label in (0, 1):
    members = df_training[df_training['label'] == class_label]
    ax.scatter(members['feature_1'],
               members['feature_2'],
               color=colormap(float(class_label)),
               label='class {}'.format(class_label))
ax.set_xlabel('feature_1')
ax.set_ylabel('feature_2')
ax.set_title('Decision regions and training data')
_ = ax.legend()

In [None]:
# Decision regions (background image) overlaid with the held-out test samples.
fig, ax = plt.subplots()
ext = [-7, 7, -5, 7]
colormap = plt.cm.RdYlBu
# vmin/vmax pin prediction 0 to the low end of the colormap and prediction 1
# to the high end. Without them imshow rescales to the data range, so a grid
# that happens to contain a single predicted class would be drawn in the
# wrong colour.
ax.imshow(boundaries['predicted'].to_numpy().reshape(RESOLUTION, RESOLUTION),
          zorder=0,
          extent=ext,
          alpha=0.5,
          cmap=colormap,
          origin='lower',
          vmin=0,
          vmax=1)
# Sample colours come from the same colormap ends as the background, using
# normalised floats (0.0 / 1.0) rather than raw integer indices, so the lookup
# never leaves the colormap's valid range.
for class_label in (0, 1):
    members = df_test[df_test['label'] == class_label]
    ax.scatter(members['feature_1'],
               members['feature_2'],
               color=colormap(float(class_label)),
               label='class {}'.format(class_label))
ax.set_xlabel('feature_1')
ax.set_ylabel('feature_2')
ax.set_title('Decision regions and test data')
_ = ax.legend()

In [None]:
# TODO: plot the class boundaries for both the training and the test data
# No need for Laplacian smoothing
# Refactor classification into a function
# Refactor plotting of the imshow background and scatter plot into a function