# Homework 02: Naïve Bayes Classifier

In [1]:
import numpy as np
import pandas as pd

## Importing Data

In [2]:
# header=None: no row of either CSV is consumed as a header, so columns are numbered 0, 1, ...
images = pd.read_csv('hw02_images.csv', header=None)
# Give the single label column (position 0) a readable name right after loading.
labels = pd.read_csv('hw02_labels.csv', header=None).rename(columns={0: 'Label'})

## Combining Data

In [3]:
# Place the label column to the right of the pixel columns (column-wise concatenation).
data = pd.concat((images, labels), axis='columns')

In [4]:
# Preview the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,Label
0,255,255,255,255,255,255,255,250,255,255,...,255,255,225,212,255,255,255,255,255,4
1,255,255,255,254,253,255,255,255,255,255,...,255,255,255,255,254,255,255,255,255,1
2,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,2
3,255,255,255,250,251,250,250,252,250,249,...,247,248,251,252,248,250,255,255,255,3
4,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,3


In [5]:
# (number of rows, number of columns) of the combined frame, label column included.
data.shape

(35000, 785)

In [6]:
# Number of distinct values in the label column, i.e. the number of classes.
data['Label'].nunique()

5

In [7]:
# The distinct label values in ascending order.
sorted(data['Label'].unique())

[1, 2, 3, 4, 5]

### Labels/Classes:

- **1**: T-shirt
- **2**: Dress
- **3**: Coat
- **4**: Shirt
- **5**: Bag

## Train-Test Split

In [8]:
# Positional split: the first 30000 rows are the training set, the remaining rows the test set.
train, test = data.iloc[:30000, :], data.iloc[30000:, :]

In [9]:
# Features: every column except the label.
X_train, X_test = (part.drop(columns='Label') for part in (train, test))

# Targets: the label column only.
y_train, y_test = train['Label'], test['Label']

In [10]:
# Report the shape of every split, one line each, in the same order as they were created.
print("\n".join(
    f"{title} shape: {frame.shape}"
    for title, frame in (
        ("Train set", train),
        ("Test set", test),
        ("X_train", X_train),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_test", y_test),
    )
))

Train set shape: (30000, 785)
Test set shape: (5000, 785)
X_train shape: (30000, 784)
X_test shape: (5000, 784)
y_train shape: (30000,)
y_test shape: (5000,)


## Estimating the Parameters

In [11]:
# One feature matrix per class label (1 through 5): keep the training rows of
# that class and drop the label column.
Xs = [train.loc[train['Label'] == label].drop(columns='Label') for label in range(1, 6)]

# Individual names for the per-class matrices; they refer to the same objects as Xs.
X_1, X_2, X_3, X_4, X_5 = Xs

### Sample Mean 

**Sample mean**: $m =\frac{\sum_{t}x^t}{N}$

In [12]:
def estimate_sample_mean(X):
    """Estimate the per-feature sample mean, m = sum_t x^t / N.

    Parameters
    ----------
    X : pandas.DataFrame
        Samples in rows, features in columns.

    Returns
    -------
    list of float
        One mean per column of ``X``, in column order.
    """
    # A single column-wise reduction replaces the Python loop over X.iloc[:, i];
    # the divisor is the row count, exactly as in the formula above.
    return (X.sum(axis=0) / X.shape[0]).tolist()

In [13]:
# Row i holds the per-feature means of the i-th entry of Xs.
sample_means = np.array(list(map(estimate_sample_mean, Xs)))
print(sample_means)

[[254.99866667 254.98416667 254.85616667 ... 254.679      254.87816667
  254.95933333]
 [254.99733333 254.99733333 254.9965     ... 254.96883333 254.99216667
  254.98866667]
 [254.99933333 254.99933333 254.99233333 ... 251.52483333 254.4725
  254.97483333]
 [254.99666667 254.98983333 254.91416667 ... 252.39516667 254.44166667
  254.93666667]
 [254.999      254.98433333 254.93783333 ... 250.673      253.23333333
  254.79083333]]


### Standard Deviation

**Variance**: $\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\; s^2 = \frac{\sum_t{(x^t-m)^2}}{N}$

**Standard Deviation**: $\;\;\; s = \sqrt{\frac{\sum_t{(x^t-m)^2}}{N}} $

In [14]:
def estimate_standard_deviation(X, sample_mean):
    """Estimate the per-feature standard deviation, s = sqrt(sum_t (x^t - m)^2 / N).

    The divisor is N (the number of rows), not N - 1.

    Parameters
    ----------
    X : pandas.DataFrame
        Samples in rows, features in columns.
    sample_mean : sequence of float
        One mean per column of ``X``, in column order.

    Returns
    -------
    list of float
        One standard deviation per column of ``X``, in column order.
    """
    centre = np.asarray(sample_mean, dtype=float)
    # Subtracting a 1-D array of length n_columns broadcasts it across the rows,
    # replacing the Python loop over X.iloc[:, i].
    squared_deviation = (X - centre) ** 2
    return np.sqrt(squared_deviation.sum(axis=0) / X.shape[0]).tolist()

In [15]:
# Row i holds the per-feature standard deviations of the i-th entry of Xs,
# computed around that class's own means.
sample_deviations = np.array([
    estimate_standard_deviation(class_X, class_mean)
    for class_X, class_mean in zip(Xs, sample_means)
])
print(sample_deviations)

[[ 0.09127736  0.25609108  1.31090756 ...  5.29826629  3.9117332
   1.93959091]
 [ 0.2065419   0.2065419   0.2163818  ...  1.04076669  0.47057267
   0.70062226]
 [ 0.05163547  0.04081939  0.16002465 ... 18.43665868  6.7881694
   1.1061344 ]
 [ 0.18436076  0.21617116  1.81046936 ... 15.67799977  6.34549162
   1.79971911]
 [ 0.04471018  0.64582342  3.03248555 ... 23.62576428 13.9167006
   4.4727787 ]]


### Prior Probabilities

**Prior Probability**: $\hat{P}(C_i) = \frac{\sum_t{r_i^t}}{N}$

In [16]:
def prior_probability(X, all_X):
    """Return the fraction of the rows of ``all_X`` that ``X`` accounts for.

    Both arguments only need a ``shape`` attribute; the result is the row
    count of ``X`` divided by the row count of ``all_X``.
    """
    class_rows = X.shape[0]
    total_rows = all_X.shape[0]
    return class_rows / total_rows

In [17]:
# Prior of each class: its share of the training rows, in the same order as Xs.
class_priors = [prior_probability(class_X, train) for class_X in Xs]
print(class_priors)

[0.2, 0.2, 0.2, 0.2, 0.2]


## Naïve Bayes Classifier

### Alpaydin, E., Introduction to Machine Learning, Section 4.2.3: Gaussian (Normal) Density

**Log likelihood of Gaussian (Normal) Density:** $\;\;\;\;\;\; L(\mu, \sigma | X) = -\frac{N}{2}\log{(2\pi)} - N\log{\sigma} - \frac{\sum_t{(x^t-\mu)^2}}{2\sigma^2}$

$\newline\newline$

### Alpaydin, E., Introduction to Machine Learning, Section 4.5: Parametric Classification

**Discriminant Function:**

$\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\; g_i(x) = p(x|C_i)P(C_i)$

$\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;$or

$\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\; g_i(x) = \log{p(x|C_i)} + \log{P(C_i)}$

$\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;$ where $\;\;\; p(\textbf{x}|C_i) = \frac{1}{\sqrt{2\pi\sigma_i^2}}\exp{\left[-\frac{(x-\mu_i)^2}{2\sigma_i^2}\right]} $

$\newline\newline$

**Parametric Classification:** $\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\; g_i(\textbf{x}) = -\frac{1}{2}\log{2\pi} - \log{\sigma_i}-\frac{(x-\mu_i)^2}{2\sigma_i^2} + \log{P(C_i)}$

Plugging the estimates for the **means** $\left(m =\frac{\sum_{t}x^t}{N}\right)$, **variance** $\left(s^2 = \frac{\sum_t{(x^t-m)^2}}{N}\right)$, and **priors** $\left(\hat{P}(C_i) = \frac{\sum_t{r_i^t}}{N}\right)$, we get:

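$\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\; g_i(\textbf{x}) = -\frac{1}{2}\log{2\pi} - \log{s_i}-\frac{(x-m_i)^2}{2s_i^2} + \log{\hat{P}(C_i)}$

The expression above is for a single feature. Under the naïve Bayes assumption the $d$ features are treated as independent given the class, so the per-feature log densities are summed while the log prior is added only once:

$$g_i(\textbf{x}) = \sum_{j=1}^{d}\left[-\frac{1}{2}\log{2\pi} - \log{s_{ij}} - \frac{(x_j-m_{ij})^2}{2s_{ij}^2}\right] + \log{\hat{P}(C_i)}$$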

In [18]:
def discriminant_function(x, sample_mean, sample_deviation, class_prior):
    """Score one sample for one class with the Gaussian naive Bayes discriminant.

    g_i(x) = sum_j [ -1/2 log(2 pi) - log(s_ij) - (x_j - m_ij)^2 / (2 s_ij^2) ]
             + log P(C_i)

    Parameters
    ----------
    x : array-like
        Feature values of a single sample.
    sample_mean : array-like
        Per-feature mean estimates of the class.
    sample_deviation : array-like
        Per-feature standard deviation estimates of the class.
    class_prior : float
        Estimated prior probability of the class.

    Returns
    -------
    float
        The discriminant score; larger means the class is more plausible.
    """
    # NOTE(review): a feature with zero deviation yields non-finite terms
    # (log(0), division by zero); that behaviour is left unchanged here.
    log_likelihood = np.sum(
        -0.5 * np.log(2 * np.pi)
        - np.log(sample_deviation)
        - (x - sample_mean) ** 2 / (2 * sample_deviation ** 2)
    )
    # The log prior is added once, outside the sum over features. Inside the
    # sum it would be broadcast and counted once per feature, over-weighting
    # the prior whenever the class priors are not all equal.
    return log_likelihood + np.log(class_prior)

## Classification Algorithm

For a given sample $x$, calculate its discriminant score for each class, then pick the class with the highest score.

In [19]:
def predict(x):
    """Predict the class label (1-based) of a single sample.

    Scores ``x`` against every class using the module-level estimates
    ``sample_means``, ``sample_deviations`` and ``class_priors`` and returns
    the label of the highest-scoring class. Ties go to the lowest label.

    Parameters
    ----------
    x : array-like
        Feature values of a single sample.

    Returns
    -------
    int
        Position of the best class in the parameter arrays, plus one.
    """
    # Iterate over the estimated parameters themselves so the number of
    # classes is not hard-coded.
    scores = pd.Series([
        discriminant_function(x, class_mean, class_deviation, class_prior)
        for class_mean, class_deviation, class_prior
        in zip(sample_means, sample_deviations, class_priors)
    ])

    # idxmax gives the position of the first maximum; labels start at 1.
    return int(scores.idxmax()) + 1

### Predictions

In [20]:
# Classify every sample row by row; each row is passed to predict as a Series.
y_pred_train = np.array([predict(sample) for _, sample in X_train.iterrows()])
y_pred_test = np.array([predict(sample) for _, sample in X_test.iterrows()])

## Calculating Confusion Matrix
### Train Set

In [21]:
# Rows are predicted labels, columns are true labels.
confusion_matrix_train = pd.crosstab(
    index=y_pred_train,
    columns=y_train,
    rownames=["y_pred"],
    colnames=["y_truth"],
)

print("Confusion Matrix - Training Set:")
display(confusion_matrix_train)

Confusion Matrix - Training Set:


y_truth,1,2,3,4,5
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3685,49,4,679,6
2,1430,5667,1140,1380,532
3,508,208,4670,2948,893
4,234,60,123,687,180
5,143,16,63,306,4389


### Test Set

In [22]:
# Rows are predicted labels, columns are true labels.
confusion_matrix_test = pd.crosstab(
    index=y_pred_test,
    columns=y_test,
    rownames=["y_pred"],
    colnames=["y_truth"],
)

print("Confusion Matrix - Test Set:")
display(confusion_matrix_test)

Confusion Matrix - Test Set:


y_truth,1,2,3,4,5
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,597,6,0,114,1
2,237,955,188,267,81
3,92,25,785,462,167
4,34,11,16,109,29
5,40,3,11,48,722
