In [None]:
# reading input data
import numpy as np
import pandas as pd

# load the weather table from the CSV file; the bare name on the last
# line makes the notebook render the frame as the cell output
df = pd.read_csv(filepath_or_buffer='data/data_weather.csv')
df

In [None]:
# displaying input data (and labels)
import matplotlib.pyplot as plt

# 2x2 grid of histograms, flattened so the panels can be walked in order
fig, ax = plt.subplots(nrows=2, ncols=2, sharex='col', sharey='row')
ax = ax.ravel()
fig.suptitle('Features')
# panel number i (1..4) shows the DataFrame column at position i
for i, panel in enumerate(ax, start=1):
    column_name = df.columns[i]
    panel.hist(df.iloc[:, i], bins=20)
    panel.set_title(column_name)

# separate figure: how often each value of the 'weather' column occurs
plt.figure()
ax = df['weather'].hist()
ax.set_title('occuring classes')

In [None]:
# parsing labels into [0,1, ...]; defining X and c
from sklearn.preprocessing import LabelEncoder

# learn the label -> integer mapping, then apply it to the same column
le = LabelEncoder().fit(df['weather'])
c = le.transform(df['weather'])
# feature matrix: the four measurement columns as a plain NumPy array
X = df.loc[:, ['precipitation', 'temp_max', 'temp_min', 'wind']].to_numpy()



Naive Bayes classifier: $\arg \max_{1\leq i\leq n} \left\{ \log \left( \prod_{k=1}^m P(x^{(k)}|c_i) \cdot P(c_i) \right) \right\} = \arg \max_{1\leq i\leq n} \left\{ \sum_{k=1}^m \log \left(  P(x^{(k)}|c_i) \right) + \log \left( P(c_i) \right) \right\}$ 

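with $P(x^{(k)}|c_i)=\frac{1}{\sqrt{2\pi\left(\sigma_{c_i}^{(k)}\right)^2}}\cdot \exp{\left( -\frac{\left(x^{(k)}-\mu_{c_i}^{(k)}\right)^2}{2\left(\sigma_{c_i}^{(k)}\right)^2} \right) }$, where $\mu_{c_i}^{(k)}$ and $\left(\sigma_{c_i}^{(k)}\right)^2$ are the mean and variance of feature $k$ over the training samples of class $c_i$, $n$ is the number of classes and $m$ is the number of features.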



In [None]:
# implement Naive Bayes for continuous variables
class NaiveBayes:
    """Gaussian Naive Bayes classifier for continuous features.

    Each feature is modelled, separately for every class, as a normal
    distribution. A sample is assigned to the class that maximises
    ``log P(c_i) + sum_k log P(x^(k) | c_i)``.

    Parameters
    ----------
    var_smoothing : float, default 1e-09
        Value substituted for any per-class feature variance that is exactly
        zero, so the Gaussian stays well defined. Note that this only
        replaces zero variances; it is not added to every variance.

    Attributes (set by ``fit``)
    ---------------------------
    n_samples, n_features : int
        Shape of the training matrix.
    n_classes : int
        Number of distinct labels.
    _classes : ndarray
        Sorted unique labels seen during training.
    _mean, _var : ndarray of shape (n_classes, n_features)
        Per-class, per-feature mean and (population) variance.
    _priors : ndarray of shape (n_classes,)
        Natural logarithm of the class frequencies, i.e. *log*-priors.
    """

    def __init__(self, var_smoothing=1e-09):
        self.var_smoothing = var_smoothing

    # training
    def fit(self, X, c):
        """Estimate per-class means, variances and log-priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features; converted to a float ndarray.
        c : array-like of shape (n_samples,)
            Class label of every training sample.
        """
        # asarray makes lists / DataFrames behave like the ndarray case
        X = np.asarray(X, dtype=float)
        c = np.asarray(c)
        self.n_samples, self.n_features = X.shape
        self._classes = np.unique(c)
        self.n_classes = len(self._classes)
        self._mean = np.zeros((self.n_classes, self.n_features))
        self._var = np.zeros((self.n_classes, self.n_features))
        self._priors = np.zeros(self.n_classes)
        for idx, c_i in enumerate(self._classes):
            X_c_i = X[c == c_i]
            self._mean[idx, :] = X_c_i.mean(axis=0)
            self._var[idx, :] = X_c_i.var(axis=0)
            # stored in log space because predict() sums log-probabilities
            self._priors[idx] = np.log(X_c_i.shape[0] / float(self.n_samples))
        # a constant feature within a class would give a division by zero
        self._var[self._var == 0] = self.var_smoothing

    # application
    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, taken from the labels seen in ``fit`` (same
            dtype as those labels). Ties go to the first class in sorted
            label order.
        """
        X = np.asarray(X, dtype=float)
        log_posteriors = np.empty((X.shape[0], self.n_classes))
        for idx_c in range(self.n_classes):
            mean = self._mean[idx_c]
            var = self._var[idx_c]
            # Log of the Gaussian density, evaluated directly in log space.
            # Exponentiating first underflows to 0 for samples far from the
            # class mean, which destroys the ranking between classes.
            log_gauss = -0.5 * (np.log(2.0 * np.pi * var) + (X - mean) ** 2 / var)
            log_posteriors[:, idx_c] = log_gauss.sum(axis=1) + self._priors[idx_c]
        return self._classes[np.argmax(log_posteriors, axis=1)]

In [None]:
# split input data in training and testing data
from sklearn.model_selection import train_test_split
# hold out 20 % of the samples for testing; the fixed random_state makes
# the split identical on every run
X_train, X_test, c_train, c_test = train_test_split(
    X,
    c,
    test_size=0.2,
    random_state=123,
)

In [None]:
# training and testing Naive Bayes
# fit the hand-written classifier on the training split ...
nb = NaiveBayes()
nb.fit(X=X_train, c=c_train)
# ... and label the held-out samples with it
predictions = nb.predict(X=X_test)

In [None]:
# implementing the same in scikit learn
from sklearn.naive_bayes import GaussianNB

# same train/predict sequence with scikit-learn's estimator; fit() returns
# the estimator itself, so predict() can be chained onto it
nb = GaussianNB()
predictions_sk = nb.fit(X_train, c_train).predict(X_test)

In [None]:
# comparing accuracy
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must contain as many elements as ``y_true``.

    Returns
    -------
    float
        Value in [0, 1]; ``nan`` when both inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Flatten both sides: comparing a column vector with a flat vector would
    # otherwise broadcast to an n x n matrix, and two plain lists would be
    # compared as whole objects instead of element by element.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            "y_true and y_pred must have the same number of elements "
            f"(got {y_true.size} and {y_pred.size})"
        )
    if y_true.size == 0:
        return np.nan
    return np.mean(y_true == y_pred)

# report the score of the hand-written model first, then scikit-learn's
for label, y_pred in (
    ("accuracy = ", predictions),
    ("accuracy (scikit learn) = ", predictions_sk),
):
    print(label, accuracy(c_test, y_pred))

In [None]:
# comparing confusion matrix
from sklearn.metrics import confusion_matrix

# one heading plus matrix per model: hand-written first, scikit-learn second
for heading, y_pred in (
    ('confusion matrix:', predictions),
    ('confusion matrix (scikit learn):', predictions_sk),
):
    print(heading)
    print(confusion_matrix(c_test, y_pred))