# Spam Classification Task

### Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### Read in data and take a look at the resultant dataframe

For this dataframe, I manually copied and pasted the column names into a .txt file and passed it into the dataframe as the column headers. I personally found that this was easier for me than writing a script to extract the column names from the original 'spambase.names' file since the format was old and unconventional. However, label the data however is easiest for you. It might even be more trouble than it's worth to label the columns, but I like to keep my data as clean as possible.

In [2]:
# Read the data contained in 'spambase.data' file into a pandas dataframe
# Retrieved column names from 'spambase.names' file and copied into a plain txt file for ease of use
col_names = []
d = open('/Users/madeleine/Desktop/Personal/Data/col_names.txt', 'r')
for row in d:
    col_names.append(row.strip())

with open('/Users/madeleine/Desktop/Personal/Data/spambase.data', 'r') as f:
    df = pd.read_csv(f, names = col_names, index_col = False)
df
# 4601 by 58 dataframe

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


### Identify predictor and outcome variables and separate data into X (predictor data) and y (outcome data)

In [3]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

### Perform k-fold cross validation and pass data through Logistic Regression, Gaussian Naive Bayes, and K-Nearest Neighbors classification models. Print accuracy scores.

In [6]:
kf = KFold(n_splits = 10, shuffle = True)
models = [LogisticRegression(), GaussianNB(), KNeighborsClassifier()]
for model in models:
    scores = cross_val_score(model, X, y, scoring="accuracy", cv=kf, n_jobs = -1)
    print('%s Accuracy: %.3f%% +/- %.3f%%' % (model, scores.mean()*100, scores.std()*100))

LogisticRegression() Accuracy: 92.262% +/- 0.611%
GaussianNB() Accuracy: 82.047% +/- 1.500%
KNeighborsClassifier() Accuracy: 80.809% +/- 1.839%


Logistic regression performed the highest out of these three classification models (approx. 90% accuracy). We will use logistic regression going forward.

In [7]:
val_table = pd.DataFrame(columns = ['false_positive', 'false_negative', 'overall_error'])
for train_index, test_index in kf.split(X):
    # Split the test and training data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Calculate accuracy and populate a confusion matrix
    accuracy = accuracy_score(y_pred, y_test)
    cm = confusion_matrix(y_test, y_pred)
    # Calculate counts of true negative, false negative, true positive, and false positive values in the confusion matrix
    TN = cm[0][0]
    FN = cm[1][0]
    TP = cm[1][1]
    FP = cm[0][1]
    # Calculate the false positive, false negative, and overall error rates and add thme to our dataframe
    false_positive = FP / (TN + FN + TP + FP)
    false_negative = FN / (TN + FN + TP + FP)
    total_error = (FP + FN) / (TN + FN + TP + FP)
    a_series = pd.Series([false_positive, false_negative, total_error], index = val_table.columns)
    val_table = val_table.append(a_series, ignore_index=True)

In [8]:
val_table = val_table.append(val_table.mean(axis=0), ignore_index=True)

### Evaluating results

This dataframe contains the false positive, false negative, and overall error rates for each fold of the logistic regression model. The last row (10) shows the average error rate across all folds for this model. On average, the overall error is 19%.

In [9]:
val_table

Unnamed: 0,false_positive,false_negative,overall_error
0,0.078091,0.138829,0.21692
1,0.095652,0.091304,0.186957
2,0.104348,0.102174,0.206522
3,0.086957,0.115217,0.202174
4,0.097826,0.091304,0.18913
5,0.082609,0.119565,0.202174
6,0.080435,0.108696,0.18913
7,0.076087,0.08913,0.165217
8,0.095652,0.119565,0.215217
9,0.108696,0.086957,0.195652
