### Ham Vs Spam

In [1]:
import nltk
import numpy as np
import pandas as pd
%matplotlib inline

# Load the spam dataset from a CSV file in the notebook's working directory.
# Later cells use its `spamclass` column (compared against 1 and 0) as the target.
spam = pd.read_csv("spambase.csv")

In [2]:
# Dimensions of the loaded frame as a (rows, columns) tuple
print(spam.shape)

(4601, 58)


In [3]:
# Variable types: one dtype per column of the loaded frame
print(spam.dtypes)

word_freq_make                float64
word_freq_address             float64
word_freq_all                 float64
word_freq_3d                  float64
word_freq_our                 float64
word_freq_over                float64
word_freq_remove              float64
word_freq_internet            float64
word_freq_order               float64
word_freq_mail                float64
word_freq_receive             float64
word_freq_will                float64
word_freq_people              float64
word_freq_report              float64
word_freq_addresses           float64
word_freq_free                float64
word_freq_business            float64
word_freq_email               float64
word_freq_you                 float64
word_freq_credit              float64
word_freq_your                float64
word_freq_font                float64
word_freq_000                 float64
word_freq_money               float64
word_freq_hp                  float64
word_freq_hpl                 float64
word_freq_ge

In [4]:
# Summary stats (count, mean, std, min, quartiles, max) for every numeric column
spam.describe()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spamclass
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,0.239413,...,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285,0.394045
std,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,0.644755,...,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851,0.488698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


In [5]:
# Tally the rows in each class: spamclass == 1 is spam, spamclass == 0 is not spam
c1 = int((spam.spamclass == 1).sum())
c0 = int((spam.spamclass == 0).sum())

print("Spam: %d" % c1)
print("Not spam: %d" % c0)

Spam: 1813
Not spam: 2788


In [6]:
# Check for nulls: show the first rows that contain at least one missing value.
# An empty result means no row has a null in any column.
spam[spam.isnull().any(axis=1)].head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spamclass


In [7]:
# Same method as used in group project 3
# Target fractions for the train / validation / test split
percTrain = 0.7
percVal = 0.15
percTest = 0.15  # not used in the visible cells: testNum below is computed as the remainder

# Convert the fractions into absolute row counts
N = len(spam)
trainNum = int(percTrain * N)  # int() truncates toward zero
valNum = int(percVal * N)
testNum = N - trainNum - valNum  # remainder, so the three counts always sum to N

In [8]:
# Check targets: print the planned row count of each split and confirm they add up to the full dataset
print("Training target: %d" %trainNum)
print("Validation target: %d" %valNum)
print("Testing target: %d" %testNum)
print("Total: %d" %(trainNum + valNum + testNum))

Training target: 3220
Validation target: 690
Testing target: 691
Total: 4601


In [9]:
# Make splits
# train_test_split lives in sklearn.model_selection; the old sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20, so try the current
# location first and only fall back for very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# An integer test_size is an absolute number of rows, so each split gets exactly the
# count computed earlier. The test rows are carved out first, then the validation rows
# are taken from what remains; fixed random_state values make both splits repeatable.
trainSet, testSet = train_test_split(spam, test_size=testNum, random_state=8)
trainSet, valSet = train_test_split(trainSet, test_size=valNum, random_state=88)



In [10]:
# Check lengths: the realised split sizes should match the targets printed earlier
print("Training set: %d" %len(trainSet))
print("Validation set: %d" %len(valSet))
print("Testing set: %d" %len(testSet))
print("Total: %d" %(len(trainSet) + len(valSet) + len(testSet)))

Training set: 3220
Validation set: 690
Testing set: 691
Total: 4601


In [11]:
# Decision tree
from sklearn import tree
import sklearn.metrics as sm  # `sm` is what model_summary uses for its metrics

# Train classifier
# Separate the target column from the predictor columns of the training split
trainSetClass = trainSet['spamclass']
trainSetVars = trainSet.drop(labels='spamclass', axis=1)
# Entropy split criterion; random_state is fixed so the fitted tree is repeatable
dt = tree.DecisionTreeClassifier(criterion="entropy", random_state=88)
dt_fit = dt.fit(trainSetVars, trainSetClass)

In [12]:
def model_summary(actual, pred):
    """Print confusion-matrix counts and a classification report for one set of predictions.

    Spam (label 1) is treated as the positive class and not spam (label 0)
    as the negative class.

    Parameters
    ----------
    actual : array-like
        True class labels.
    pred : array-like
        Predicted class labels, in the same order as ``actual``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # With labels=[1, 0] the rows are the actual class and the columns the predicted
    # class, both ordered [spam, not spam], so the matrix reads [[TP, FN], [FP, TN]].
    (tp, fn), (fp, tn) = sm.confusion_matrix(actual, pred, labels=[1, 0])
    print("True positives: %d" % tp)
    print("False positives: %d" % fp)
    print("True negatives: %d" % tn)
    print("False negatives: %d" % fn)
    report = sm.classification_report(actual, pred, labels=[1,0], target_names=["Spam", "Not spam"])
    print(report)

In [13]:
# Training performance: score the decision tree on the same rows it was fitted on
dt_train = dt_fit.predict(trainSetVars)
model_summary(trainSetClass, dt_train)

True positives: 1260
False positives: 0
True negatives: 1958
False negatives: 2
             precision    recall  f1-score   support

       Spam       1.00      1.00      1.00      1262
   Not spam       1.00      1.00      1.00      1958

avg / total       1.00      1.00      1.00      3220



In [14]:
# Decision tree - validation set
# Separate target and predictors for the validation split (reused by later cells)
valSetClass = valSet['spamclass']
valSetVars = valSet.drop(labels='spamclass', axis=1)

# Predict with the already-fitted tree and summarise the results
dt_val = dt_fit.predict(valSetVars)
model_summary(valSetClass, dt_val)

True positives: 252
False positives: 17
True negatives: 401
False negatives: 20
             precision    recall  f1-score   support

       Spam       0.94      0.93      0.93       272
   Not spam       0.95      0.96      0.96       418

avg / total       0.95      0.95      0.95       690



In [15]:
# Decision tree - test set
# Separate target and predictors for the test split (reused by later cells)
testSetClass = testSet['spamclass']
testSetVars = testSet.drop(labels='spamclass', axis=1)

# Predict with the already-fitted tree and summarise the results
dt_test = dt_fit.predict(testSetVars)
model_summary(testSetClass, dt_test)

True positives: 251
False positives: 29
True negatives: 383
False negatives: 28
             precision    recall  f1-score   support

       Spam       0.90      0.90      0.90       279
   Not spam       0.93      0.93      0.93       412

avg / total       0.92      0.92      0.92       691



In [16]:
# Check feature importance
def featImp(modelfit, setVars):
    """Return the ten highest-scoring features of a fitted model.

    Parameters
    ----------
    modelfit : object
        Fitted estimator exposing a ``feature_importances_`` attribute.
    setVars : pandas.DataFrame
        Frame whose column names label the importances; only ``columns`` is read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Var`` (feature name) and ``Imp`` (importance), sorted by
        ``Imp`` in descending order and limited to the first ten rows. The
        index keeps each feature's original column position.
    """
    names = pd.Series(setVars.columns.values)
    scores = pd.Series(modelfit.feature_importances_)
    table = pd.DataFrame({'Var': names, 'Imp': scores}, columns=['Var', 'Imp'])
    ranked = table.sort_values(by='Imp', ascending=False)
    return ranked.iloc[:10]
    

# Top features of the fitted decision tree; testSetVars only supplies the column names here
featImp(dt_fit, testSetVars)

Unnamed: 0,Var,Imp
51,char_freq_!,0.296289
52,char_freq_$,0.13037
6,word_freq_remove,0.125547
54,capital_run_length_average,0.056054
55,capital_run_length_longest,0.035736
26,word_freq_george,0.033155
56,capital_run_length_total,0.026242
24,word_freq_hp,0.02567
4,word_freq_our,0.025391
18,word_freq_you,0.024984


In [17]:
# Random forest - train
from sklearn import ensemble

# Same split criterion and seed as the decision tree; all other settings are left at library defaults
rf = ensemble.RandomForestClassifier(criterion="entropy", random_state=88)
rf_fit = rf.fit(trainSetVars, trainSetClass)

# Training performance: score the forest on the rows it was fitted on
rf_train = rf_fit.predict(trainSetVars)
model_summary(trainSetClass, rf_train)

  from numpy.core.umath_tests import inner1d


True positives: 1248
False positives: 3
True negatives: 1955
False negatives: 14
             precision    recall  f1-score   support

       Spam       1.00      0.99      0.99      1262
   Not spam       0.99      1.00      1.00      1958

avg / total       0.99      0.99      0.99      3220



In [18]:
# Random forest - test set: predict on the held-out test predictors and summarise
rf_test = rf_fit.predict(testSetVars)
model_summary(testSetClass, rf_test)

True positives: 248
False positives: 16
True negatives: 396
False negatives: 31
             precision    recall  f1-score   support

       Spam       0.94      0.89      0.91       279
   Not spam       0.93      0.96      0.94       412

avg / total       0.93      0.93      0.93       691



In [19]:
# Random forest feature importance (top ten); testSetVars only supplies the column names here
featImp(rf_fit, testSetVars)

Unnamed: 0,Var,Imp
51,char_freq_!,0.133634
52,char_freq_$,0.110833
55,capital_run_length_longest,0.078106
15,word_freq_free,0.066308
54,capital_run_length_average,0.065711
6,word_freq_remove,0.053635
24,word_freq_hp,0.041705
56,capital_run_length_total,0.036781
25,word_freq_hpl,0.034489
18,word_freq_you,0.0338


In [20]:
# SVM - train
from sklearn import svm

# Support vector classifier with library-default settings apart from the fixed seed
# NOTE(review): the predictors are passed in unscaled (no scaling step is visible in
# this notebook) -- confirm whether that is intended for this model.
sv = svm.SVC(random_state=88)
sv_fit = sv.fit(trainSetVars, trainSetClass)

# Training performance: score the SVM on the rows it was fitted on
sv_train = sv_fit.predict(trainSetVars)
model_summary(trainSetClass, sv_train)

True positives: 1126
False positives: 52
True negatives: 1906
False negatives: 136
             precision    recall  f1-score   support

       Spam       0.96      0.89      0.92      1262
   Not spam       0.93      0.97      0.95      1958

avg / total       0.94      0.94      0.94      3220



In [21]:
# SVM - test set: predict on the held-out test predictors and summarise
sv_test = sv_fit.predict(testSetVars)
model_summary(testSetClass, sv_test)

True positives: 214
False positives: 61
True negatives: 351
False negatives: 65
             precision    recall  f1-score   support

       Spam       0.78      0.77      0.77       279
   Not spam       0.84      0.85      0.85       412

avg / total       0.82      0.82      0.82       691



In [22]:
# Give more weight to non-spam class to reduce false positives
# class_weight maps label -> weight: class 0 (not spam) is weighted 100x relative to class 1 (spam)
rf2 = ensemble.RandomForestClassifier(criterion="entropy", class_weight={1:1, 0:100}, random_state=88)
rf_fit2 = rf2.fit(trainSetVars, trainSetClass)

# Training performance of the weighted forest
rf_train2 = rf_fit2.predict(trainSetVars)
model_summary(trainSetClass, rf_train2)

True positives: 1242
False positives: 1
True negatives: 1957
False negatives: 20
             precision    recall  f1-score   support

       Spam       1.00      0.98      0.99      1262
   Not spam       0.99      1.00      0.99      1958

avg / total       0.99      0.99      0.99      3220



In [23]:
# Weighted random forest - validation set
rf_val2 = rf_fit2.predict(valSetVars)
model_summary(valSetClass, rf_val2)

True positives: 254
False positives: 12
True negatives: 406
False negatives: 18
             precision    recall  f1-score   support

       Spam       0.95      0.93      0.94       272
   Not spam       0.96      0.97      0.96       418

avg / total       0.96      0.96      0.96       690



In [24]:
# Weighted random forest - test set
rf_test2 = rf_fit2.predict(testSetVars)
model_summary(testSetClass, rf_test2)

True positives: 251
False positives: 14
True negatives: 398
False negatives: 28
             precision    recall  f1-score   support

       Spam       0.95      0.90      0.92       279
   Not spam       0.93      0.97      0.95       412

avg / total       0.94      0.94      0.94       691



In [25]:
def error_analysis(model, dfc, setVars=None):
    """Collect the feature rows of false positives (predicted spam, actually not spam).

    Predictions, true labels and feature rows are matched by position, not by
    index label. The splits keep the shuffled index of the original frame, so
    label-based lookups such as ``dfc[x]`` would hit the wrong row or raise
    ``KeyError``.

    Parameters
    ----------
    model : array-like
        Predicted class labels (1 = spam, 0 = not spam).
    dfc : array-like or pandas.Series
        True class labels, in the same row order as ``model``.
    setVars : pandas.DataFrame, optional
        Feature rows in the same order as ``model`` and ``dfc``. Defaults to
        the notebook-level ``valSetVars`` (validation-set predictors).

    Returns
    -------
    list of pandas.DataFrame
        One single-row frame per false positive, in row order. Each frame
        keeps the row's original index label.

    Raises
    ------
    ValueError
        If ``model``, ``dfc`` and ``setVars`` do not all have the same length.
    """
    if setVars is None:
        # Fall back to the validation predictors defined earlier in the notebook
        setVars = valSetVars

    # Strip any pandas index so the comparison below is purely positional
    preds = np.asarray(model)
    actual = np.asarray(dfc)
    if not (len(preds) == len(actual) == len(setVars)):
        raise ValueError(
            "model, dfc and setVars must have the same length (got %d, %d, %d)"
            % (len(preds), len(actual), len(setVars))
        )

    errors = []
    for x, (p, a) in enumerate(zip(preds, actual)):
        if p == 1 and a == 0:
            # iloc with a list selects row x by position and keeps it as a 1-row frame
            errors.append(setVars.iloc[[x]])
    return errors