# ML Pipeline with a Custom Preprocessing Transformer

Importing the dataset

In [1]:
import pandas as pd
from nltk.corpus import stopwords
import gensim
import numpy as np


# Read the SMS corpus from the CSV file located in the working directory.
dataset = pd.read_csv("sms_spam.csv")

# Preview the first rows, then report the (rows, columns) shape.
print(dataset.head())
print(f"Shape: {dataset.shape} \n")

   type                                               text
0   ham  Hope you are having a good week. Just checking in
1   ham                            K..give back my thanks.
2   ham        Am also doing in cbe only. But have to pay.
3  spam  complimentary 4 STAR Ibiza Holiday or £10,000 ...
4  spam  okmail: Dear Dave this is your final notice to...
Shape: (5559, 2) 



Defining the class PreprocessTransformer

In [2]:
from sklearn.base import BaseEstimator,TransformerMixin

class PreprocessTransformer(BaseEstimator,TransformerMixin):
    """Pipeline step that cleans raw text documents before vectorization.

    Each document is lower-cased and whitespace-normalized, then optionally
    stripped of English stop words, digits and short words, and optionally
    stemmed. Punctuation is always removed.

    Parameters
    ----------
    stop : bool, default=True
        Remove NLTK English stop words.
    stripNum : bool, default=True
        Remove numeric characters.
    minSize : int, default=3
        Drop words shorter than this many characters; values <= 0 disable
        the filter.
    stemming : bool, default=True
        Stem the remaining words with gensim's ``stem_text``.
    """

    def __init__(self,stop=True,stripNum=True,minSize=3,stemming=True):
        # Only store the arguments as given: derived state (such as the
        # stop-word set) is built lazily in transform so the constructor
        # stays a plain parameter holder.
        self.stop=stop
        self.stripNum=stripNum
        self.minSize=minSize
        self.stemming=stemming

    def fit(self,x, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self,x, y=None):
        """Return a new pandas Series holding the cleaned version of every text.

        ``x`` may be a pandas Series (its index is preserved) or any other
        sequence of strings, e.g. a plain list passed to ``predict``.
        The input is never modified.
        """
        texts = x if isinstance(x, pd.Series) else pd.Series(x, dtype=object)
        # Load the stop-word list once per call instead of once per document,
        # and only when stop-word removal is actually enabled.
        stops = set(stopwords.words("english")) if self.stop else None
        return texts.map(lambda text: self.transformText(text, stops))

    def transformText(self,text, stops=None):
        """Clean a single document and return the resulting string.

        ``stops`` is an optional pre-built collection of stop words; when it
        is omitted and stop-word removal is enabled, the NLTK English list is
        loaded here, so direct calls with a single argument keep working.
        """
        prep = gensim.parsing.preprocessing
        # Convert text to lowercase
        text = text.lower()
        # Strip multiple whitespaces
        text = prep.strip_multiple_whitespaces(text)
        if self.stop:
            if stops is None:
                stops = set(stopwords.words("english"))
            # Stop words are matched before punctuation is stripped, so
            # entries containing apostrophes are compared against intact tokens.
            text = " ".join(word for word in text.split() if word not in stops)
        # Remove the punctuation
        text = prep.strip_punctuation(text)
        if self.stripNum:
            # Strip all the numerics
            text = prep.strip_numeric(text)
        if self.minSize>0:
            # Remove all the words shorter than minSize characters
            text = prep.strip_short(text, minsize=self.minSize)
        # Strip multiple whitespaces
        text = prep.strip_multiple_whitespaces(text)
        # Stemming
        if self.stemming:
            text = prep.stem_text(text)
        return text

Creating training and test set

In [3]:
## Split the data
from sklearn.model_selection import train_test_split

# Hold out 33% of the messages as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'],
    dataset['type'],
    test_size=0.33,
    random_state=10,
)
print(f"Training Sample Size: {len(X_train)}   Test Sample Size: {len(X_test)}")

Training Sample Size: 3724   Test Sample Size: 1835


Creating the pipeline

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn import svm

# Assemble the pipeline as an ordered list of (step name, estimator) pairs:
# text cleaning -> token counts -> TF-IDF weighting -> chi-squared feature
# selection (top 30% of features) -> RBF-kernel SVM classifier.
clf = Pipeline(steps=[
    ("prep", PreprocessTransformer()),
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("selector", SelectPercentile(score_func=chi2, percentile=30)),
    ("clf", svm.SVC(kernel="rbf", C=1000, gamma=0.0001)),
])

Fitting and predicting

In [5]:
# Fit every pipeline step on the training split, then label the held-out messages.
predicted = clf.fit(X_train, y_train).predict(X_test)

from sklearn import metrics

# Cross-tabulate true labels (rows) against predictions (columns),
# followed by the per-class precision / recall / F1 report.
print(pd.crosstab(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

col_0   ham  spam
type             
ham    1577     6
spam     63   189
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1583
        spam       0.97      0.75      0.85       252

    accuracy                           0.96      1835
   macro avg       0.97      0.87      0.91      1835
weighted avg       0.96      0.96      0.96      1835

