In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB             #import 3 different models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer    #Preprocessing modules
from nltk.corpus import stopwords
import re

from sklearn.model_selection import train_test_split         #Final tests

texts = pd.read_csv('spam.csv', encoding = 'latin-1')
texts

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


We drop the next 3 columns since they contain only NaN. We make the title columns more informative and denote
ham/spam by 1/0.

Then there is a small text preprocessing. In steps:
1) Put the whole sms into a single text
2) Find all non-alphanumerical characters such as !, !!, ?, ), .., ... etc.
3) Create a regular expression that captures alphanumerical characters, together with the punctuation

In [2]:
texts.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace = True)    
texts.loc[texts['v1'] == 'ham', 'v1'] = 1
texts.loc[texts['v1'] == 'spam', 'v1'] = 0
texts.rename(columns = {'v1': 'ham/spam', 'v2': 'Text'}, inplace = True)

final_text = ''
for text in texts['Text']:
    final_text += ' ' + text     #Make the text a whole, and separate sentences by whitespace

final_text = final_text.lower()

punctuation = list(set((re.findall(r"[^a-zA-Z0-9]", final_text))))   #find all punctuations
punctuation.remove(' ')             #take out ' ', i.e. whitespace
punctuation.remove("'")             #Take out apostrofes, i.e. hold words like it's, jimmy's

punct = ''               
for i in punctuation:
    punct +=  "|" + "\\" + i + "+"

punct = r"[0-9a-zA-Z']+" + punct  

#punct says pick every number of alphanumerical characters or every punctuation character



Perform now a transformation through CountVectorizer. Take as vocabulary all the words together with the different punctuations.

In [3]:
vectorizer = CountVectorizer(token_pattern = punct, stop_words = 'english')

X = vectorizer.fit_transform(texts['Text'])
vocabulary = vectorizer.get_feature_names()
X_data = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names())
X_data

Unnamed: 0,\r,!,!!,!!!,!!!!,!!!!!!!!!,"""","""""",#,$,...,ì,ï,ð,ò,ó,ô,õ,ö,÷,û
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
#Now we are ready to train. Import the y_data
X = X_data
y = texts['ham/spam']

X_train, X_test, y_train, y_test = train_test_split(X, y.astype('int32'), test_size = 0.2)
#Train = Test here, therefore validation is 0.4 of test data

print("Train:", X_train.shape, y_train.shape)
print("Test: ", X_test.shape, y_test.shape)


Train: (4457, 8646) (4457,)
Test:  (1115, 8646) (1115,)


In [5]:
import numpy as np

def train_and_test(models, X_train, y_train, X_test, y_test):
    score = []
    for model in models:
        model.fit(X_train, y_train)
        print(type(model).__name__, "achieved a score", model.score(X_train, y_train), "on the training set")
        print(type(model).__name__, "achieved a score", model.score(X_test, y_test), "on the test set")
        score.append(model.score(X_test, y_test))
    
    best = np.argmax(np.asarray(score))
    return models[best]


models = [MultinomialNB(), LogisticRegression(solver = 'newton-cg'), RandomForestClassifier()]

best_model = train_and_test(models, X_train, y_train, X_test, y_test)


MultinomialNB achieved a score 0.994166479694862 on the training set
MultinomialNB achieved a score 0.9838565022421525 on the test set
LogisticRegression achieved a score 0.9964101413506843 on the training set
LogisticRegression achieved a score 0.9838565022421525 on the test set




RandomForestClassifier achieved a score 0.9995512676688355 on the training set
RandomForestClassifier achieved a score 0.9775784753363229 on the test set


In [6]:
best_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Essentially NB and LogisticRegression have probably the same deciding boundary.