In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('data/spamham.csv')
data.head(5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
data.isna().sum()

text    0
spam    0
dtype: int64

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

x_train_count = tfidf_vectorizer.fit_transform(data['text'])
tfidf_vectorizer.vocabulary_

{'subject': 32371,
 'naturally': 23381,
 'irresistible': 18838,
 'your': 37024,
 'corporate': 10045,
 'identity': 17688,
 'lt': 21148,
 'is': 18848,
 'really': 28018,
 'hard': 16655,
 'to': 33798,
 'recollect': 28142,
 'company': 9281,
 'the': 33450,
 'market': 21665,
 'full': 15415,
 'of': 24274,
 'suqgestions': 32635,
 'and': 4836,
 'information': 18234,
 'isoverwhelminq': 18885,
 'but': 7546,
 'good': 16072,
 'catchy': 8043,
 'logo': 20960,
 'stylish': 32352,
 'statlonery': 32001,
 'outstanding': 24871,
 'website': 36080,
 'will': 36373,
 'make': 21440,
 'task': 33068,
 'much': 23052,
 'easier': 12610,
 'we': 36019,
 'do': 12048,
 'not': 23937,
 'promise': 27135,
 'that': 33441,
 'havinq': 16765,
 'ordered': 24632,
 'iogo': 18759,
 'automaticaily': 5772,
 'become': 6296,
 'world': 36633,
 'ieader': 17706,
 'it': 18913,
 'isguite': 18863,
 'ciear': 8651,
 'without': 36499,
 'products': 27033,
 'effective': 12813,
 'business': 7529,
 'organization': 24670,
 'practicable': 26619,
 'aim

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
y = data['spam']

X_train, X_test, y_train, y_test = train_test_split(x_train_count, y, test_size=0.2)
x_train_count.shape

(5728, 37303)

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [13]:
def decision_classifier(x_train, y_train):
    clf = DecisionTreeClassifier(max_depth=10).fit(x_train, y_train)
    
    return clf

In [16]:
def random_forest(x_train, y_train):
    clf = RandomForestClassifier().fit(x_train, y_train)
    
    return clf

In [26]:
def naive_bayes(x_train, y_train):
    clf = GaussianNB().fit(x_train.toarray(), y_train)
    
    return clf

In [19]:
def neighbors(x_train, y_train):
    clf = KNeighborsClassifier(n_neighbors=10).fit(x_train, y_train)
    
    return clf

In [32]:
def logestic(x_train, y_train):
    clf = LogisticRegression().fit(x_train, y_train)
    
    return clf

In [21]:
def build_and_train_classification(x_train, y_train, classification_fn):
    model = classification_fn(x_train, y_train)
    
    y_pred = model.predict(X_test)
    
    train_score = model.score(X_train, y_train)
    test_score = accuracy = accuracy_score(y_test, y_pred)
    
    print(f'Training Score: {train_score}')
    print(f'Test Score: {test_score}')

In [22]:
build_and_train_classification(X_train, y_train, decision_classifier)

Training Score: 0.9591881274552597
Test Score: 0.93717277486911


In [23]:
build_and_train_classification(X_train, y_train, random_forest)

Training Score: 1.0
Test Score: 0.9685863874345549


In [33]:
build_and_train_classification(X_train, y_train, logestic)

Training Score: 0.9951986032300305
Test Score: 0.9842931937172775


In [25]:
build_and_train_classification(X_train, y_train, neighbors)

Training Score: 0.983195111305107
Test Score: 0.9781849912739965
