# ML Pipeline

Importing the dataset

In [1]:
import pandas as pd
from nltk.corpus import stopwords
import gensim
import numpy as np


dataset=pd.read_csv("sms_spam.csv")

print(dataset.head())
print ("Shape:", dataset.shape, '\n')

   type                                               text
0   ham  Hope you are having a good week. Just checking in
1   ham                            K..give back my thanks.
2   ham        Am also doing in cbe only. But have to pay.
3  spam  complimentary 4 STAR Ibiza Holiday or £10,000 ...
4  spam  okmail: Dear Dave this is your final notice to...
Shape: (5559, 2) 



Preprocessing function

In [2]:
def transformText(text):
    stops = set(stopwords.words("english"))
    # Convert text to lowercase
    text = text.lower()
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation(text)
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # Removing all the words with < 3 characters
    text = gensim.parsing.preprocessing.strip_short(text, minsize=3)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Stemming
    return gensim.parsing.preprocessing.stem_text(text)

Preprocessing

In [3]:
#applies transformText to all rows of text
dataset['text'] = dataset['text'].map(transformText)
print(dataset['text'].head())

0                                 hope good week check
1                                      give back thank
2                                    also cbe onli pai
3    complimentari star ibiza holidai cash need urg...
4    okmail dear dave final notic collect tenerif h...
Name: text, dtype: object


Creating training and test set

In [4]:
## Split the data
from sklearn.model_selection import train_test_split

#separate the test set
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['type'],
                                                    test_size=0.33, random_state=10)
print ("Training Sample Size:", len(X_train), ' ', "Test Sample Size:" ,len(X_test))

Training Sample Size: 3724   Test Sample Size: 1835


Creating the pipeline

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import svm
from time import time

#creates a model instance with no parameters
svc=svm.SVC()

# defines the steps of the pipeline, each with
# a name and the model object
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("selector",SelectPercentile()),
        ("clf", svc),
    ]
)

Setting pipeline possible parameters

In [6]:
# create a dictionary with possible values for some parameters
# each parameter name is composed as
# pipelineStepName__componentParameter
parameters = {
    "vect__ngram_range": ((1, 1), (1, 2)),
    "vect__min_df": (20,30,40),
    'tfidf__use_idf': (True, False),
    'selector__score_func': [chi2], #selector function needs a list
    'selector__percentile': (20,30,40),
    'clf__C': [1, 10, 100, 1000],
    'clf__gamma': [0.001, 0.0001],
    'clf__kernel': ['rbf','linear']
}


Performing the grid search

In [7]:

#instantiates the grid search
# using the svc model and the parameters above defined
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=10)

print("Performing grid search...")
print("parameters:")
print(parameters)
t0 = time()
# Starts the grid search
grid_search.fit(X_train, y_train)
# Prints the required time
print("done in %0.3fs" % (time() - t0))
print()

# Prints the best score
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
parameters:
{'vect__ngram_range': ((1, 1), (1, 2)), 'vect__min_df': (20, 30, 40), 'tfidf__use_idf': (True, False), 'selector__score_func': [<function chi2 at 0x131864f40>], 'selector__percentile': (20, 30, 40), 'clf__C': [1, 10, 100, 1000], 'clf__gamma': [0.001, 0.0001], 'clf__kernel': ['rbf', 'linear']}
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
[CV 3/5; 2/576] START clf__C=1, clf__gamma=0.001, clf__kernel=rbf, selector__percentile=20, selector__score_func=<function chi2 at 0x1170a8540>, tfidf__use_idf=True, vect__min_df=20, vect__ngram_range=(1, 2)
[CV 2/5; 1/576] START clf__C=1, clf__gamma=0.001, clf__kernel=rbf, selector__percentile=20, selector__score_func=<function chi2 at 0x11d870540>, tfidf__use_idf=True, vect__min_df=20, vect__ngram_range=(1, 1)
[CV 5/5; 1/576] START clf__C=1, clf__gamma=0.001, clf__kernel=rbf, selector__percentile=20, selector__score_func=<function chi2 at 0x118c04540>, tfidf__use_idf=True, vect__min_df=20, vect_

Instatiating and using the best model

In [8]:
#Creating the model:

#instantiating the model using the grid search best parameters
clf=best_pipe = grid_search.best_estimator_
clf.fit(X_train, y_train)

#performing the actual prediction
predicted = clf.predict(X_test)

from sklearn import metrics
print(pd.crosstab(y_test,predicted))
print(metrics.classification_report(y_test, predicted))

col_0   ham  spam
type             
ham    1569    14
spam     29   223
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99      1583
        spam       0.94      0.88      0.91       252

    accuracy                           0.98      1835
   macro avg       0.96      0.94      0.95      1835
weighted avg       0.98      0.98      0.98      1835

