In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## 0. Data Preparation

In [16]:
# Load the tab-separated SMS file; it has no header row, so column names are supplied here.
sms = pd.read_csv(
    'sms.tsv',
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [17]:
# Preview the first five rows of the loaded frame.
sms.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
# Encode the text label as a binary target (1 = spam, 0 = anything else) and
# drop the original 'label' column, leaving 'message' and 'target'.
#
# The guard makes the cell safe to re-run: once 'label' has been removed, a
# second execution is a no-op instead of raising KeyError on sms['label'].
# The frame is rebuilt with assign/drop rather than mutated in place.
if 'label' in sms.columns:
    sms = (
        sms
        .assign(target=(sms['label'] == 'spam').astype(int))
        .drop(columns='label')
    )

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix

## Bag of words for SMS

In [6]:
# Features are the raw message text; the label is the column named by target_name.
target_name = 'target'
X, y = sms['message'], sms[target_name]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [7]:
# Bag-of-words encoding: the vectoriser is fitted on the training messages only,
# and the same fitted vectoriser is then applied to the test messages.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)  # fit on the training split and encode it in one step
X_test_dtm = vect.transform(X_test)  # encode the test split with the already-fitted vectoriser (no refit)

In [8]:
def CMatrix(CM, labels=['ham','spam']):
    """Present a confusion matrix as a labelled table with marginal totals.

    Parameters
    ----------
    CM : 2-D array-like
        Square matrix of counts; rows are treated as the true classes and
        columns as the predicted classes.
    labels : list of str
        Class names used for both the row index and the column headers.
        Its length must match the dimensions of ``CM``.

    Returns
    -------
    pandas.DataFrame
        ``CM`` with the index named ``'TRUE'``, the columns named
        ``'PREDICTION'``, and an extra ``'Total'`` row and ``'Total'`` column
        holding the column sums and row sums respectively.
    """
    table = pd.DataFrame(CM, index=labels, columns=labels)
    table = table.rename_axis(index='TRUE', columns='PREDICTION')

    # Append the column sums as a new row first, so the row sums computed next
    # also cover that row (its row sum is the grand total).
    table.loc['Total'] = table.sum(axis=0)
    table['Total'] = table.sum(axis=1)
    return table

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training document-term matrix
# (fit returns the estimator itself, so it can be chained onto the constructor).
nb = MultinomialNB().fit(X_train_dtm, y_train)

# Predict the held-out messages and score those predictions against the true labels.
y_pred_test = nb.predict(X_test_dtm)
precision = precision_score(y_test, y_pred_test)
accuracy = accuracy_score(y_test, y_pred_test)
CM = confusion_matrix(y_test, y_pred_test)

# Report both metrics as percentages, then display the labelled confusion matrix
# as the cell's output.
print("Precision: {:0.2f}%".format(precision * 100))
print("Accuracy: {:0.2f}%".format(accuracy * 100))
CMatrix(CM)

Precision: 99.29%
Accuracy: 99.10%


PREDICTION,ham,spam,Total
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ham,965,1,966
spam,9,140,149
Total,974,141,1115


In [10]:
# Hand-written sample message; adjacent literals are concatenated into a single string.
sms1 = (
    "Today is your lucky day! claim $100 for free  now! "
    "just text back saying YES."
)

In [11]:
    # Encode the single message with the fitted vectoriser and classify it.
    prediction = nb.predict(vect.transform([sms1]))
    # A truthy prediction prints the spam message, otherwise the genuine one.
    print("Spam email" if prediction else "Genuine email")

Spam email
