To be done in groups of 2 students.

You have two choices:

1) Choose a data set from the  UCI Machine Learning Repository (http://archive.ics.uci.edu/ml/index.php) and a Machine Learning algorithm. Implement the algorithm and train it with your chosen data set to have the best performance over unseen data.
2) Choose a task appropriate for a machine learning and an algorithm to learn it. Implement the algorithm and train it for the task to have the best performance.

We chose the first option. We chose the data set "Spambase Data Set" from the UCI Machine Learning Repository and the algorithm "Naive Bayes". The data set can be found here: https://archive.ics.uci.edu/dataset/94/spambase

1. Import libraries

In [11]:
import numpy as np

 2. Load data

In [12]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
spambase = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes) 
X = spambase.data.features 
y = spambase.data.targets 



3. Preprocessing

In [13]:
# convert to numpy arrays
X = np.array(X)
y = np.array(y)

# reshape y to 1d array
y = y.ravel()


4. Implement Naive Bayes

In [14]:
class NaiveBayes:
    def fit(self, X, y): # X and y are numpy arrays
        self.soft_value = 1e-40 # small value to avoid taking the log of zero
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # mean, average of each feature by class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        # var, variance of each feature by class
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        # prior, probabilities of each class
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for _class in self._classes:
            X_class = X[_class==y]
            self._mean[_class,:] = X_class.mean(axis=0)
            self._var[_class,:] = X_class.var(axis=0)
            self._priors[_class] = X_class.shape[0] / float(n_samples)
        

    def predict(self, X):
        y_pred = [self._predict(x) for x in X] # for each sample, predict the class
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        for _class in self._classes:
            prior = np.log(self._priors[_class])
            class_conditional = np.sum(np.log(self._pdf(_class, x) + self.soft_value))
            posterior = prior + class_conditional
            posteriors.append(posterior)

        # return the class with the highest probability
        print(posteriors)
        return self._classes[np.argmax(posteriors)]
    
    def _pdf(self, _class, x): # probability density function
        mean = self._mean[_class]
        var = self._var[_class]
        numerator = np.exp(-(x-mean)**2 / (2*var+ self.soft_value))
        denominator = np.sqrt(2*np.pi*var)
        return numerator / (denominator + self.soft_value)

4. Train and test the model

In [15]:
# split data into train and test sets, make sure to have spam and non-spam in both sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y
                                                    , test_size=0.1
                                                    , stratify=y)

# train model
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

# evaluate model
from sklearn.metrics import accuracy_score
print("Naive Bayes classification accuracy", accuracy_score(y_test, predictions))

print("False positives:", np.sum((predictions==1) & (y_test==0)))
print("False negatives:", np.sum((predictions==0) & (y_test==1)))
print("True positives:", np.sum((predictions==1) & (y_test==1)))
print("True negatives:", np.sum((predictions==0) & (y_test==0)))

# compare with sklearn's implementation
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
predictions = gnb.predict(X_test)

print("Sklearn's Naive Bayes classification accuracy", accuracy_score(y_test, predictions))
print("False positives:", np.sum((predictions==1) & (y_test==0)))
print("False negatives:", np.sum((predictions==0) & (y_test==1)))
print("True positives:", np.sum((predictions==1) & (y_test==1)))
print("True negatives:", np.sum((predictions==0) & (y_test==0)))



[0 1]
[-99.32753886978175, -6.191784644962103]
[-23.121377803810613, -67.67070187087171]
[-65.44780114337377, 8.669282432858953]
[-22.732739618748045, -549.0446609009791]
[-30.14949659920124, -90.70960352058333]
[-18.271532847145153, -201.03284435807413]
[-32.75411878450061, -98.2498164973681]
[-39.97163498605843, -139.87454628532421]
[-19.216644785356817, -208.002508843946]
[-29.386118276277035, 2.0786558348264714]
[-21.20081057996314, 10.740184852869085]
[-21.103063839441926, -137.87081409509315]
[-30.600972796322324, -178.80331406962227]
[-18.952110118487663, -93.65468352404932]
[-17.796768640568548, -90.66964709362638]
[-266.7628231685755, -0.746766353958322]
[-20.446310915751777, -14.095469681844058]
[-27.128250909657016, 14.404971136557489]
[-29.703806652856944, -105.6970955464604]
[-224.1272853880897, -1086.8844781869748]
[-33.2324282631931, -198.12561519844095]
[-54.212773320367035, -77.21088351433335]
[-34.19324100637557, -13.780410121034278]
[-17.229615809608756, 16.566283559