# Spam Detection

In [1]:
import pandas as pd

In [2]:
# Read spam data
spams = pd.read_csv('spambase.data.csv', header=None)
spams.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [3]:
X = spams.iloc[:, :-1]
y = spams.iloc[:, -1]

X_pos = X[y == 1] # spam
X_neg = X[y == 0] # not spam
y_pos = y[y == 1]
y_neg = y[y == 0]

# Split data into training and test sets
# 70% of each class is used for training
# 30% of each class is used for testing
from sklearn.model_selection import train_test_split
X_pos_train, X_pos_test = train_test_split(X_pos, test_size=0.3)
y_pos_train, y_pos_test = train_test_split(y_pos, test_size=0.3)
X_neg_train, X_neg_test = train_test_split(X_neg, test_size=0.3)
y_neg_train, y_neg_test = train_test_split(y_neg, test_size=0.3)

X_train = pd.concat([X_pos_train, X_neg_train])
y_train = pd.concat([y_pos_train, y_neg_train])
X_test = pd.concat([X_pos_test, X_neg_test])
y_test = pd.concat([y_pos_test, y_neg_test])

print('Number of training instances:', len(X_train))
print('Number of test instances:', len(X_test))

Number of training instances: 3220
Number of test instances: 1381


In [4]:
# Build a logistic regression classifier to detect spam emails
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
# Evaluate the classifier
from sklearn.metrics import accuracy_score
predictions = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)


Accuracy: 0.9116582186821144


In [7]:
# Inference on latest data in test set
# Read the latest email data
latest_emails = spams.iloc[-5:, :-1]

# Predict whether the latest emails are spam
predictions = classifier.predict(latest_emails)
print('Predictions:', predictions)

Predictions: [0 0 0 0 0]


In [10]:
# New model
from sklearn.naive_bayes import GaussianNB
model2 = GaussianNB()
model2.fit(X_train, y_train)

In [11]:
# Evaluate the new model
predictions = model2.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)

Accuracy: 0.8030412744388125
