# Train an SMS spam detector

## Import dependencies

In [1]:
import numpy as np
import pandas as pd

import joblib, re

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score

import streamlit as st


## Load and read the data

In [2]:
# read data
df = pd.read_csv('data/spam.csv', encoding="ISO-8859-1")

# drop useless columns
df = df[['v1', 'v2']]

In [3]:
# display
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preprocess data

In [4]:
# separate target from input text
x, y = df['v2'], df['v1']

# print example
idx = 2000
x[idx], y[idx]

("But i'll b going 2 sch on mon. My sis need 2 take smth.", 'ham')

In [5]:
# preprocess text data
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # Effectively removes HTML markup tags
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

In [6]:
# train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

## Train a text classification model

In [7]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, max_features=700, preprocessor=preprocessor, ngram_range=(1,1))


neural_net_pipeline = Pipeline([
    ('vectorizer', tfidf),
    ('nn_classifier', MLPClassifier(hidden_layer_sizes=(800, 600)))
], verbose=True)


neural_net_pipeline.fit(x_train, y_train)

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   0.1s
[Pipeline] ..... (step 2 of 2) Processing nn_classifier, total=  35.4s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(lowercase=False, max_features=700,
                                 preprocessor=<function preprocessor at 0x00000224B23DCAF0>)),
                ('nn_classifier',
                 MLPClassifier(hidden_layer_sizes=(800, 600)))],
         verbose=True)

In [8]:
y_pred = neural_net_pipeline.predict(x_test)
print(classification_report(y_test, y_pred))
print('Accuracy: {} %'.format(100 * accuracy_score(y_test, y_pred)))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       982
        spam       0.97      0.88      0.92       133

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 98.29596412556054 %


In [9]:
joblib.dump(neural_net_pipeline, 'spam_classifier.joblib')

['spam_classifier.joblib']

### [DEPREC] Take a look a vocabulary

In [10]:
vect = tfidf.fit(x)
print(vect.transform(x[:3]))

  (0, 676)	0.3828817317746881
  (0, 638)	0.31626839014952657
  (0, 614)	0.398807087387503
  (0, 569)	0.26993054547368095
  (0, 427)	0.2708251709261636
  (0, 298)	0.1858974189812721
  (0, 249)	0.3126805498989119
  (0, 247)	0.26563760071081977
  (0, 240)	0.256865405261823
  (0, 63)	0.4233791106411941
  (1, 660)	0.6604653083972
  (1, 419)	0.4164158171894947
  (1, 315)	0.6248067249943752
  (2, 664)	0.23112807387050585
  (2, 611)	0.19401254746439592
  (2, 587)	0.28350033540703856
  (2, 558)	0.19174312144678154
  (2, 481)	0.2609216402347037
  (2, 475)	0.2634507832385311
  (2, 473)	0.2789682308604613
  (2, 356)	0.2464590438444451
  (2, 298)	0.12609523496268285
  (2, 222)	0.17999614570834122
  (2, 211)	0.28718036391872315
  (2, 195)	0.5617578046486681
  (2, 53)	0.2634507832385311


In [11]:
print('Length of the vocabulary:', len(vect.vocabulary_))
print('---> sample of the vocabulary: ', {key: vect.vocabulary_[key] for key in list(vect.vocabulary_.keys())[:10]})

Length of the vocabulary: 700
---> sample of the vocabulary:  {'go': 240, 'until': 614, 'available': 63, 'only': 427, 'in': 298, 'great': 249, 'world': 676, 'there': 569, 'got': 247, 'wat': 638}


### [DEPREC] Example predictions

In [12]:
index = 2123
# x_sample = x_test.loc[x_test.index[811]]
x_sample = x_test.loc[index]
y_sample = y_test[index]
print(y_sample, '-----', x_sample)

spam ----- +123 Congratulations - in this week's competition draw u have won the å£1450 prize to claim just call 09050002311 b4280703. T&Cs/stop SMS 08718727868. Over 18 only 150ppm


In [13]:
neural_net_pipeline.predict([x_sample])

array(['ham'], dtype='<U4')

In [14]:
neural_net_pipeline.predict_proba([x_sample])

array([[9.99999066e-01, 9.34261442e-07]])