# Spam Detection (NLTK and scikit-learn) - Model creation, training, logging in experimentation phase

In [6]:
try:
    import verta
except ImportError:
    !pip install verta


- word similarity detection using [WordNet](https://github.com/nltk/wordnet) from **NLTK**
- [tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) vectorization using **scikit-learn**
- **verta**'s Python client logging a `class` as a model to be instantiated at deployment time
- predictions against a deployed model

In [7]:
HOST = "http://10.117.149.9:31764/"
PROJECT_NAME = "DSW_VerificationSpam-detection-nk"
EXPERIMENT_NAME = "LogReg"

## Imports

In [8]:
from __future__ import print_function

import json
import os
import re
import time

import cloudpickle

import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_curve, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
try:
    import wget
except ImportError:
    !pip install wget  # you may need pip3
    import wget

---

# Run Workflow

## Prepare Data

In [10]:
train_data_url = "http://s3.amazonaws.com/verta-starter/spam.csv"
train_data_filename = wget.detect_filename(train_data_url)
if not os.path.isfile(train_data_filename):
    wget.download(train_data_url)

In [11]:
raw_data = pd.read_csv(train_data_filename, delimiter=',', encoding='latin-1')

raw_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [12]:
# turn spam/ham to 0/1, and remove unnecessary columns
raw_data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)
raw_data.v1 = LabelEncoder().fit_transform(raw_data.v1)

raw_data.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# lemmatize text
total_stopwords = set([word.replace("'",'') for word in stopwords.words('english')])
lemma = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = text.replace("'",'')
    text = re.sub('[^a-zA-Z]',' ',text)
    words = text.split()
    words = [lemma.lemmatize(word) for word in words if (word not in total_stopwords) and (len(word)>1)] # Remove stop words
    text = " ".join(words)
    return text

raw_data.v2 = raw_data.v2.apply(preprocess_text)

raw_data.head()

Unnamed: 0,v1,v2
0,0,go jurong point crazy available bugis great wo...
1,0,ok lar joking wif oni
2,1,free entry wkly comp win fa cup final tkts st ...
3,0,dun say early hor already say
4,0,nah think go usf life around though


In [14]:
x_train, x_test, y_train, y_test = train_test_split(raw_data.v2, raw_data.v1, test_size=0.15, stratify=raw_data.v1)    

## Instantiate Client

In [15]:
from verta import Client
from verta.utils import ModelAPI

client = Client(HOST)
proj = client.set_project(PROJECT_NAME)
expt = client.set_experiment(EXPERIMENT_NAME)
run = client.set_experiment_run()

connection successfully established
created new Project: DSW_VerificationSpam-detection-nk in personal workspace
created new Experiment: LogReg
created new ExperimentRun: Run 29516112273536180167


## Fit Model - Experimentation Phase

In [16]:
vectorizer = TfidfVectorizer()
vectorizer.fit(x_train)

x_train_vec = vectorizer.transform(x_train).toarray()

model = linear_model.LogisticRegression()
model.fit(x_train_vec, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Model Predict - Experimentation Phase

In [17]:
x_test_vec = vectorizer.transform(x_test).toarray()
y_pred = model.predict(x_test_vec)

m_confusion_test = confusion_matrix(y_test, y_pred)
display(pd.DataFrame(data=m_confusion_test,
                     columns=['Predicted 0', 'Predicted 1'],
                     index=['Actual 0', 'Actual 1']))

print("This model misclassifies {} genuine SMS as spam"
      " and misses only {} SPAM.".format(m_confusion_test[0,1], m_confusion_test[1,0]))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,722,2
Actual 1,28,84


This model misclassifies 2 genuine SMS as spam and misses only 28 SPAM.


## Model explainability - Experimentation Phase

In [18]:
try:
    import eli5
except ImportError:
    !pip install eli5


In [19]:
eli5.show_weights(model)

Weight?,Feature
+4.807,x5806
+3.757,x3478
+3.540,x2075
+3.527,x791
+3.495,x994
+3.434,x5277
+3.363,x5827
+3.314,x6293
+3.293,x5531
+3.179,x4843


### Logging metrics and weights

In [21]:
accuracy = accuracy_score(y_test, y_pred)

run.log_metric("accuracy", accuracy)

accuracy

0.9641148325358851

In [22]:
# save and upload weights
model_param = {}
model_param['coef'] = model.coef_.reshape(-1).tolist()
model_param['intercept'] = model.intercept_.tolist()

json.dump(model_param, open("weights.json", "w"))

run.log_artifact("weights", open("weights.json", "rb"))

upload complete (weights)


In [23]:
# serialize and upload vectorizer
run.log_artifact("vectorizer", vectorizer)

upload complete (vectorizer)


## Define Model Class

Our model—with its pre-trained weights and serialized vectorizer—will require some setup at deployment time.

To support this, the Verta platform allows a model to be defined as a `class` that will be instantiated when it's deployed.  
This class should have provide the following interface:

- `__init__(self, artifacts)` where `artifacts` is a mapping of artifact keys to filepaths. This will be explained below, but Verta will provide this so you can open these artifact files and set up your model. Other initialization steps would be in this method, as well.
- `predict(self, data)` where `data`—like in other custom Verta models—is a list of input values for the model.

In [17]:
class SpamModel():    
    def __init__(self, artifacts):
        from nltk.corpus import stopwords  # needs to be re-imported to remove local file link
        
        # get artifact filepaths from `artifacts` mapping
        weights_filepath = artifacts['weights']
        vectorizer_filepath = artifacts['vectorizer']

        # load artifacts
        self.weights = json.load(open(weights_filepath, "r"))
        self.vectorizer = cloudpickle.load(open(vectorizer_filepath, "rb"))
        
        # reconstitute logistic regression
        self.coef_ = np.array(self.weights["coef"])
        self.intercept_ = self.weights["intercept"]
        
        # configure text preprocessing
        self.total_stopwords = set([word.replace("'",'') for word in stopwords.words('english')])
        self.lemma = WordNetLemmatizer()

    def preprocess_text(self, text):
        text = text.lower()
        text = text.replace("'",'')
        text = re.sub('[^a-zA-Z]',' ',text)
        words = text.split()
        words = [self.lemma.lemmatize(word) for word in words if (word not in self.total_stopwords) and (len(word)>1)] # Remove stop words
        text = " ".join(words)
        return text     
        
    def predict(self, data):
        predictions = []
        for inp in data:
            # preprocess input
            processed_text = self.preprocess_text(inp)
            inp_vec = self.vectorizer.transform([inp]).toarray()
            
            # make prediction
            prediction = (np.dot(inp_vec.reshape(-1), self.coef_.reshape(-1)) + self.intercept_)[0]
            predictions.append(prediction)
            
        return predictions

Earlier we logged artifacts with the keys `"weights"` and `"vectorizer"`.  
You can obtain an `artifacts` mapping mentioned above using `run.fetch_artifacts(keys)` to work with locally.  
A similar mapping—that works identically—will be passed into `__init__()` when the model is deployed.

In [18]:
artifacts = run.fetch_artifacts(["weights", "vectorizer"])

spam_model = SpamModel(artifacts=artifacts)

In [19]:
spam_model.predict(["FREE FREE FREE"])

[0.7391828703466365]

## Log Model

In [20]:
run.log_model(
    model=SpamModel,
    artifacts=['weights', 'vectorizer'],
)

upload complete (custom_modules)
upload complete (model.pkl)
upload complete (model_api.json)


We also have to make sure we provide every package involved in the model.

In [21]:
run.log_requirements([
    "cloudpickle",
    "nltk",
    "numpy",
    "sklearn",
])

upload complete (requirements.txt)


In [22]:
run.log_setup_script("""
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
""")

upload complete (setup_script)
