To be done in groups of 2 students.

You have two choices:

1) Choose a data set from the  UCI Machine Learning Repository (http://archive.ics.uci.edu/ml/index.php) and a Machine Learning algorithm. Implement the algorithm and train it with your chosen data set to have the best performance over unseen data.
2) Choose a task appropriate for a machine learning and an algorithm to learn it. Implement the algorithm and train it for the task to have the best performance.

We chose the first option. We chose the data set "Spambase Data Set" from the UCI Machine Learning Repository and the algorithm "Naive Bayes". The data set can be found here: https://archive.ics.uci.edu/dataset/94/spambase

1. Import libraries

In [74]:
import numpy as np
import pandas as pd

 2. Load data

In [75]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
spambase = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes) 
X = spambase.data.features 
y = spambase.data.targets 



3. Preprocessing

In [76]:
# convert to numpy arrays
X = np.array(X)
y = np.array(y)

# reshape y to 1d array
y = y.ravel()


4. Implement Naive Bayes

In [77]:
def calculate_prior(df, Y):
    classes = sorted(list(df[Y].unique()))
    prior = []
    for i in classes:
        prior.append(len(df[df[Y]==i])/len(df))
    return prior

def calculate_likelihood_gaussian(df, feat_name, feat_val, Y, label):
    feat = list(df.columns)
    df = df[df[Y]==label]
    mean, std = df[feat_name].mean(), df[feat_name].std()
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) *  np.exp(-((feat_val-mean)**2 / (2 * std**2 )))
    return p_x_given_y

def naive_bayes_gaussian(df, X, Y):
    # get feature names
    features = list(df.columns)[:-1]

    # calculate prior
    prior = calculate_prior(df, Y)

    Y_pred = []
    # loop over every data sample
    for x in X:
        # calculate likelihood
        labels = sorted(list(df[Y].unique()))
        likelihood = [1]*len(labels)
        for j in range(len(labels)):
            for i in range(len(features)):
                likelihood[j] *= calculate_likelihood_gaussian(df, features[i], x[i], Y, labels[j])

        # calculate posterior probability (numerator only)
        post_prob = [1]*len(labels)
        for j in range(len(labels)):
            post_prob[j] = likelihood[j] * prior[j]

        Y_pred.append(np.argmax(post_prob))

    return np.array(Y_pred) 

4. Train and test the model

In [81]:
# split data into train and test sets, make sure to have spam and non-spam in both sets
from sklearn.model_selection import train_test_split
# transform spambaes to pandas dataframe
data = pd.DataFrame(X)
data['spam'] = y
train, test = train_test_split(data, test_size=0.2, random_state=42)

# train model
X_test = test.iloc[:,:-1].values
y_test = test.iloc[:,-1].values
X_train = train.iloc[:,:-1].values
y_train = train.iloc[:,-1].values

print("Starting Naive Bayes classification...")
predictions = naive_bayes_gaussian(train, X_test, 'spam')
print("Done!")

# evaluate model
from sklearn.metrics import accuracy_score
print("Naive Bayes classification accuracy", accuracy_score(y_test, predictions))

print("False positives:", np.sum((predictions==1) & (y_test==0)))
print("False negatives:", np.sum((predictions==0) & (y_test==1)))
print("True positives:", np.sum((predictions==1) & (y_test==1)))
print("True negatives:", np.sum((predictions==0) & (y_test==0)))

# compare with sklearn's implementation
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
predictions = gnb.predict(X_test)

print("Sklearn's Naive Bayes classification accuracy", accuracy_score(y_test, predictions))
print("False positives:", np.sum((predictions==1) & (y_test==0)))
print("False negatives:", np.sum((predictions==0) & (y_test==1)))
print("True positives:", np.sum((predictions==1) & (y_test==1)))
print("True negatives:", np.sum((predictions==0) & (y_test==0)))



Starting Naive Bayes classification...
Done!
Naive Bayes classification accuracy 0.8208469055374593
False positives: 140
False negatives: 25
True positives: 365
True negatives: 391
Sklearn's Naive Bayes classification accuracy 0.8208469055374593
False positives: 144
False negatives: 21
True positives: 369
True negatives: 387
