<h1 align='center'><u>Spam Detection</u></h1>



In [None]:
!pip install -U spacy

In [None]:
!pip install pyspellchecker

In [None]:
!pip install bs4

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from pathlib import Path
import textwrap as tw
import matplotlib.pyplot as plt

# learning Curves
from sklearn.model_selection import learning_curve

# save and load models
import joblib

import re
from bs4 import BeautifulSoup

from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token


import nltk
from nltk.stem.porter import PorterStemmer


from sklearn.model_selection import train_test_split
from collections import Counter
from xgboost import XGBClassifier
from sklearn import metrics
from spellchecker import SpellChecker

In [None]:
# Silence all warnings for the rest of the session to keep cell output compact.
# NOTE(review): this is a blanket filter, so genuinely useful warnings are hidden too —
# consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Directory that contains spam.csv, given relative to the notebook's working directory.
# NOTE(review): the '../input/...' layout looks like a Kaggle kernel — adjust when running elsewhere.
data_folder = Path('../input/sms-spam-collection-dataset')

In [None]:
# Download spaCy's small English pipeline so that spacy.load('en_core_web_sm') succeeds.
# NOTE(review): `!python` may resolve to a different interpreter than the kernel's — confirm on your setup.
!python -m spacy download 'en_core_web_sm'

In [None]:
# Load the small English pipeline once at notebook level.
# NOTE(review): the transformer classes visible in this notebook call spacy.load themselves
# and do not use this object.
nlp = spacy.load('en_core_web_sm')

# Load the dataset  





In [None]:
# location of data file
spam_file = data_folder / 'spam.csv'

# creating Pandas Dataframe
# index_col=0 turns the first CSV column into the index (a later cell restores it as a
# regular column via reset_index); the file is decoded as ISO-8859-1.
df = pd.read_csv(spam_file, index_col=0,encoding = 'ISO-8859-1')

In [None]:
# print shape of the dataset as (rows, columns); at this point the first CSV column
# is the index and is therefore not counted as a column
print(f'Shape of data set is : {df.shape}')

In [None]:
# Discard the three unnamed columns, move the index (the first CSV column) back into a
# regular column, and give the two remaining columns descriptive names.
df = (
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
      .reset_index()
      .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Printing basic info
df.info()

In [None]:
# Checking the relative frequency of each class label in the full dataset
# (the train/test split has not been made yet)
df['label'].value_counts(normalize=True)

## Metric for evaluating model


In [None]:
# Creating a scorer for the F0.5 score (beta=0.5). A beta below 1 weights precision more
# than recall, so false positives (ham flagged as spam) are penalised more heavily.
from sklearn.metrics import fbeta_score, make_scorer
fscore = make_scorer(fbeta_score, beta=0.5)
fscore

* As seen in the previous step, we have a heavily imbalanced dataset. Hence, accuracy is a misleading evaluation metric: a model that always predicts the majority class (ham) would already achieve a high accuracy.

* **Precision** and **recall** account for false positives and false negatives, respectively. The F-score (F1) is the harmonic mean of precision and recall and gives equal weight to both.

* In this case, I will be using a generalization of the F-score, the **Fbeta-measure**. It is used when both precision and recall are to be considered, but with a higher weight for one of them.

* In our dataset, I am more concerned about marking a non-spam (ham) message as spam, which is a **False Positive**. So more weight is to be given to **Precision**.

* I will be using **beta=0.5** in this exercise. So it will be F0.5 measure.

# Classification Pipelines

    1. Featurization (TF-IDF) + Feature Engineering + ML Model pipeline

**Requirements:** 

1. Use an XGBoost model for the classification and tune **XGBoost for the imbalanced dataset** (if you have never used XGBoost before, here is a tutorial on XGBoost for imbalanced data: https://machinelearningmastery.com/xgboost-for-imbalanced-classification/).

2. For feature engineering, I will count the following: nouns, proper nouns, auxiliaries (AUX), verbs, adjectives, named entities and spelling mistakes (see https://pypi.org/project/pyspellchecker/ for how to detect spelling mistakes).

3. For sparse embeddings I will use **TF-IDF vectorization**, choosing appropriate parameters (e.g. min_df, max_df, max_features, n-gram range).

## Sampling and Train-Test Split

In [None]:
# Encode the string labels as integers: spam -> 1 (positive class), ham -> 0.
# NOTE: this cell is not idempotent — on a second run the already-encoded integers are
# mapped to NaN and astype(int) fails.
df['label'] = df['label'].map({'spam':1, 'ham':0}).astype(int)

In [None]:
X = df['text'].values
y = df['label'].values
# Stratify on the label so that the spam/ham ratio of this imbalanced dataset is the
# same in the training and the test split; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)
print(f'X_train: {X_train.shape} y_train: {y_train.shape}')
print(f'X_test: {X_test.shape} y_test: {y_test.shape}')

## Custom Classes 

In [None]:
class SpacyPreprocessor(BaseEstimator, TransformerMixin):
    '''
    Scikit-learn compatible transformer that cleans raw texts with spaCy.

    Every text is stripped of HTML markup and line breaks, tokenized by the spaCy
    pipeline named in ``model``, filtered token by token and re-joined with single
    spaces.

    Parameters
    ----------
    model : str
        Name of the spaCy pipeline to load, e.g. ``'en_core_web_sm'`` for English.
    batch_size : int, default=64
        Batch size handed to ``nlp.pipe``.
    lammetize : bool, default=True
        Replace each kept token by its lemma. (The misspelt name is part of the
        public interface and is kept for backward compatibility.)
    lower : bool, default=True
        Lower-case the cleaned text.
    remove_stop, remove_punct, remove_email, remove_url : bool, default=True
        Drop tokens flagged by spaCy as ``is_stop`` / ``is_punct`` / ``like_email`` /
        ``like_url`` respectively.
    remove_num : bool, default=False
        Drop tokens flagged by spaCy as ``like_num``.
    stemming : bool, default=False
        Apply NLTK's Porter stemmer; only takes effect when ``lammetize`` is False.
    add_user_mention_prefix : bool, default=True
        Treat ``@`` as a tokenizer prefix so it is split off from the following word.
    remove_hashtag_prefix : bool, default=False
        Stop treating ``#`` as a tokenizer prefix so hashtags stay attached to words.
    '''
    # NOTE(review): seeds NumPy's global RNG as a side effect of defining the class.
    # Kept so that existing notebooks behave exactly as before; nothing in this class
    # draws random numbers itself.
    np.random.seed(0)

    def __init__(self, model, batch_size = 64, lammetize=True, lower=True, remove_stop=True,
                 remove_punct=True, remove_email=True, remove_url=True, remove_num=False, stemming = False,
                 add_user_mention_prefix=True, remove_hashtag_prefix=False):
        self.model = model
        self.batch_size = batch_size
        self.remove_stop = remove_stop
        self.remove_punct = remove_punct
        self.remove_num = remove_num
        self.remove_url = remove_url
        self.remove_email = remove_email
        self.lammetize = lammetize
        self.lower = lower
        self.stemming = stemming
        self.add_user_mention_prefix = add_user_mention_prefix
        self.remove_hashtag_prefix = remove_hashtag_prefix

    # helper function for basic cleaning

    def basic_clean(self, text):
        '''
        Remove HTML tags from ``text`` and replace line breaks with spaces.

        The text is only passed through ``get_text`` when the parser actually finds
        a tag; tag-free text is kept verbatim apart from the line-break replacement.
        '''
        # Parse once and reuse the soup for both the tag check and the extraction.
        soup = BeautifulSoup(text, "html.parser")
        if soup.find() is not None:
            text = soup.get_text()
        return re.sub(r'[\n\r]',' ', text)

    # helper function for pre-processing with spacy and Porter Stemmer

    def spacy_preprocessor(self, texts):
        '''
        Tokenize ``texts`` with spaCy, drop the unwanted tokens and return the
        cleaned texts as a list of strings (one per input text, same order).
        '''
        nlp = spacy.load(self.model)

        # Only the lemmatizer (and the components it relies on) is needed; when no
        # lemmas are requested the bare tokenizer is enough.
        if self.lammetize:
            unused_pipes = ['parser', 'ner']
        else:
            unused_pipes = ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
        nlp.select_pipes(disable=unused_pipes)

        prefixes = list(nlp.Defaults.prefixes)

        ## Add @ as a prefix so that we can separate the word from @
        if self.add_user_mention_prefix:
            prefixes += ['@']

        ## Remove # as a prefix so that we can keep hashtags and words together
        if self.remove_hashtag_prefix and r'#' in prefixes:
            prefixes.remove(r'#')

        prefix_regex = spacy.util.compile_prefix_regex(prefixes)
        nlp.tokenizer.prefix_search = prefix_regex.search

        # One single-token pattern per token category that has to be removed.
        matcher = Matcher(nlp.vocab)
        if self.remove_stop:
            matcher.add("stop_words", [[{"is_stop" : True}]])
        if self.remove_punct:
            matcher.add("punctuation",[ [{"is_punct": True}]])
        if self.remove_num:
            matcher.add("numbers", [[{"like_num": True}]])
        if self.remove_url:
            matcher.add("urls", [[{"like_url": True}]])
        if self.remove_email:
            matcher.add("emails", [[{"like_email": True}]])

        # Custom token flag; force=True allows re-registration on repeated calls.
        Token.set_extension('is_remove', default=False, force=True)

        # Build the stemmer once instead of once per token.
        use_stemmer = self.stemming and not self.lammetize
        stemmer = PorterStemmer() if use_stemmer else None

        cleaned_text = []
        for doc in nlp.pipe(texts, batch_size=self.batch_size):
            for _, start, end in matcher(doc):
                for token in doc[start:end]:
                    token._.is_remove = True

            kept_tokens = [token for token in doc if not token._.is_remove]

            # Lemmatization takes precedence over stemming when both are enabled.
            if self.lammetize:
                words = [token.lemma_ for token in kept_tokens]
            elif self.stemming:
                words = [stemmer.stem(token.text) for token in kept_tokens]
            else:
                words = [token.text for token in kept_tokens]

            text = ' '.join(words)
            if self.lower:
                text = text.lower()
            cleaned_text.append(text)
        return cleaned_text

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns ``self``.'''
        return self

    def transform(self, X, y=None):
        '''
        Clean every text in ``X``.

        Parameters
        ----------
        X : list or numpy.ndarray of str
            Raw texts.

        Returns
        -------
        list of str
            Cleaned texts, in the order of ``X``.

        Raises
        ------
        TypeError
            If ``X`` is neither a list nor a numpy array. Errors are raised rather
            than printed, so a failure can no longer return ``None`` silently.
        '''
        if not isinstance(X, (list, np.ndarray)):
            raise TypeError('Expected list or numpy array got {}'.format(type(X)))
        x_clean = [self.basic_clean(text) for text in X]
        return self.spacy_preprocessor(x_clean)

In [None]:
def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    axes=None,
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring=None,
):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : estimator instance
        An estimator instance implementing `fit` and `predict` methods which
        will be cloned for each validation.

    title : str
        Title for the chart.

    X : array-like of shape (n_samples, n_features)
        Training vector, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target relative to ``X`` for classification or regression;
        None for unsupervised learning.

    axes : array-like of shape (3,), default=None
        Axes to use for plotting the curves. A new 1x3 figure is created
        when omitted.

    ylim : tuple of shape (2,), default=None
        Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the ``dtype`` is float, it is regarded
        as a fraction of the maximum size of the training set (that is
        determined by the selected validation method), i.e. it has to be within
        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
        sets. Note that for classification the number of samples usually have
        to be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))

    scoring : str, callable or None, default=None
        Scorer forwarded to :func:`sklearn.model_selection.learning_curve`.
        ``None`` keeps the previous behaviour (the estimator's own ``score``
        method). Pass the scorer used for model selection to plot the learning
        curve in the same metric.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so callers can chain ``.show()``.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    # return_times=True adds the fit and score times to the result; the score
    # times (last element) are not plotted.
    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        scoring=scoring,
        return_times=True,
    )
    # Aggregate over the cross-validation folds (axis 1).
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="r",
    )
    axes[0].fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="g",
    )
    axes[0].plot(
        train_sizes, train_scores_mean, "o-", color="r", label="Training score"
    )
    axes[0].plot(
        train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
    )
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, "o-")
    axes[1].fill_between(
        train_sizes,
        fit_times_mean - fit_times_std,
        fit_times_mean + fit_times_std,
        alpha=0.1,
    )
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score; sort by fit time so the line runs left to right
    fit_time_argsort = fit_times_mean.argsort()
    fit_time_sorted = fit_times_mean[fit_time_argsort]
    test_scores_mean_sorted = test_scores_mean[fit_time_argsort]
    test_scores_std_sorted = test_scores_std[fit_time_argsort]
    axes[2].grid()
    axes[2].plot(fit_time_sorted, test_scores_mean_sorted, "o-")
    axes[2].fill_between(
        fit_time_sorted,
        test_scores_mean_sorted - test_scores_std_sorted,
        test_scores_mean_sorted + test_scores_std_sorted,
        alpha=0.1,
    )
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt

In [None]:
class ManualFeatures(TransformerMixin, BaseEstimator):
    '''
    Hand-crafted numeric features for raw texts.

    Parameters
    ----------
    spacy_model : str
        Name of the spaCy pipeline used for cleaning, POS tagging and NER.
    pos_features : bool, default=True
        Add the columns ``noun_count``, ``aux_count``, ``verb_count``, ``adj_count``.
    ner_features : bool, default=True
        Add the column ``ner`` (number of named entities).
    count_features : bool, default=True
        Add the columns ``count_words``, ``count_characters``,
        ``count_characters_no_space``, ``avg_word_length``, ``count_digits``,
        ``count_numbers``.

    Notes
    -----
    ``transform`` returns a ``(features, feature_names)`` tuple rather than a bare
    array, so this transformer cannot be used directly as a step of a scikit-learn
    ``Pipeline``.
    '''

    def __init__(self, spacy_model, pos_features = True, ner_features = True, count_features = True):

        self.spacy_model = spacy_model
        self.pos_features = pos_features
        self.ner_features = ner_features
        self.count_features = count_features

    # Define some helper functions
    def get_pos_features(self, cleaned_text):
        '''
        Count part-of-speech tags per text.

        Returns an array of shape (n_texts, 4) with the columns
        nouns (NOUN + PROPN), auxiliaries, verbs and adjectives.
        '''
        nlp = spacy.load(self.spacy_model)
        nlp.select_pipes(disable=['lemmatizer', 'ner'])

        noun_count = []
        aux_count = []
        verb_count = []
        adj_count = []
        for doc in nlp.pipe(cleaned_text, batch_size=1000, n_process=-1):
            pos_tags = [token.pos_ for token in doc]
            noun_count.append(sum(tag in ("NOUN", "PROPN") for tag in pos_tags))
            aux_count.append(pos_tags.count("AUX"))
            verb_count.append(pos_tags.count("VERB"))
            adj_count.append(pos_tags.count("ADJ"))
        return np.transpose(np.vstack((noun_count, aux_count, verb_count, adj_count)))

    def get_ner_features(self, cleaned_text):
        '''Count named entities per text; returns an array of shape (n_texts, 1).'''
        nlp = spacy.load(self.spacy_model)
        nlp.select_pipes(disable=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])

        count_ner = [len(doc.ents)
                     for doc in nlp.pipe(cleaned_text, batch_size=1000, n_process=-1)]
        return np.array(count_ner).reshape(-1,1)

    def get_count_features(self, cleaned_text):
        '''
        Compute length and digit statistics per text.

        Returns an array of shape (n_texts, 6) with the columns word count,
        character count, character count without whitespace, average word length,
        digit count and number count.
        '''
        list_count_words = []
        list_count_characters = []
        list_count_characters_no_space = []
        list_avg_word_length = []
        list_count_digits = []
        list_count_numbers = []
        for sent in cleaned_text:
            # Word statistics are computed on the text without digit runs that are
            # followed by whitespace.
            # NOTE(review): a digit run at the very end of the text, or one followed
            # by punctuation, is not removed by this pattern — confirm this is intended.
            words = re.sub(r'\d+\s','',sent)
            numbers = re.findall(r'\d+', sent)

            count_word = len(words.split())
            count_char = len(words)
            count_char_no_space = len(''.join(words.split()))
            # The +1 avoids a division by zero for texts without any word.
            avg_word_length = count_char_no_space/(count_word+1)
            count_numbers = len(numbers)
            count_digits = len(''.join(numbers))

            list_count_words.append(count_word)
            list_count_characters.append(count_char)
            list_count_characters_no_space.append(count_char_no_space)
            list_avg_word_length.append(avg_word_length)
            list_count_digits.append(count_digits)
            list_count_numbers.append(count_numbers)

        count_features = np.vstack((list_count_words, list_count_characters,
                                  list_count_characters_no_space, list_avg_word_length,
                                  list_count_digits,list_count_numbers ))
        return np.transpose(count_features)

    def fit(self, X, y = None):
        '''Nothing is learned from the data; returns ``self``.'''
        return self

    def transform(self, X, y=None):
        '''
        Compute the enabled feature groups for every text in ``X``.

        Parameters
        ----------
        X : list or numpy.ndarray of str
            Raw texts.

        Returns
        -------
        features : numpy.ndarray of shape (n_texts, n_features)
            Columns in the order count features, NER feature, POS features
            (only the enabled groups are present).
        feature_names : list of str
            Column names, in exactly the column order of ``features``.

        Raises
        ------
        TypeError
            If ``X`` is neither a list nor a numpy array.
        '''
        if not isinstance(X, (list, np.ndarray)):
            raise TypeError('Expected list or numpy array got {}'.format(type(X)))

        feature_blocks = []
        feature_names = []

        if self.count_features:
            # Punctuation is removed before counting words and characters.
            count_cleaner = SpacyPreprocessor(model = self.spacy_model, lammetize=False, lower = False,
                                              remove_stop=False)
            cleaned_x_count_features = count_cleaner.fit_transform(X)
            feature_blocks.append(self.get_count_features(cleaned_x_count_features))
            feature_names.extend(['count_words', 'count_characters',
                                  'count_characters_no_space', 'avg_word_length',
                                  'count_digits','count_numbers'])

        if self.ner_features or self.pos_features:
            # Punctuation is kept for the tagger and the entity recognizer.
            tag_cleaner = SpacyPreprocessor(model = self.spacy_model, lammetize=False, lower = False,
                                            remove_stop=False, remove_punct= False)
            cleaned_x_count_ner_pos = tag_cleaner.fit_transform(X)

        # Names are appended in the same order as the columns are stacked, so a
        # name always labels the column it belongs to.
        if self.ner_features:
            feature_blocks.append(self.get_ner_features(cleaned_x_count_ner_pos))
            feature_names.extend(['ner'])

        if self.pos_features:
            feature_blocks.append(self.get_pos_features(cleaned_x_count_ner_pos))
            feature_names.extend(['noun_count', 'aux_count', 'verb_count', 'adj_count'])

        if not feature_blocks:
            # Every group is disabled: one row per text, zero columns.
            return np.empty(shape = (len(X), 0)), feature_names

        return np.hstack(feature_blocks), feature_names


## Final Pipeline 

In [None]:
# count examples in each class
counter = Counter(df['label'])
# estimate scale_pos_weight value
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
# Clean the training texts with the preprocessor's defaults (remove_stop=True is already
# the default and is only spelled out here); the result is a list of strings.
X_train_cleaned = SpacyPreprocessor(model = 'en_core_web_sm', remove_stop=True).transform(X_train)

In [None]:
# Hand-crafted feature extractor with all three feature groups enabled (the defaults).
# This module-level object is also used by final_pipeline further down.
featurizer =  ManualFeatures(spacy_model='en_core_web_sm')

In [None]:
# The featurizer works on the raw (uncleaned) texts and returns the feature matrix
# together with the list of column names.
X_train_features, feature_names  = featurizer.fit_transform(X_train)

In [None]:
# Put the cleaned text (first column) and the hand-crafted features side by side.
# Both frames get a default RangeIndex, so axis=1 concatenation aligns them row by row.
X_train_final = pd.concat((pd.DataFrame(X_train_cleaned, columns =['cleaned_text']), 
                           pd.DataFrame(X_train_features, columns=feature_names)),axis =1)

In [None]:
# Preview the first rows of the combined training frame
X_train_final.head()

In [None]:
# Column dtypes and non-null counts of the combined training frame
X_train_final.info()

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator
from scipy.sparse import csr_matrix
class SparseTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that converts its input to a SciPy CSR sparse matrix."""

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` wrapped in a ``csr_matrix``."""
        as_sparse = csr_matrix(X)
        return as_sparse

In [None]:
# Sub-pipeline for the numeric feature columns: only converts them to a sparse matrix.
sparse_features = Pipeline([('sparse', SparseTransformer()),                     
                        ]) 
# Sub-pipeline for the text column. max_features=5 is only a starting value; the grid
# search below sets this parameter to other values.
vectorizer = Pipeline([('tfidf', TfidfVectorizer(max_features=5)),                     
                        ]) 

In [None]:
# Sanity check: every column except the first (cleaned_text) is converted to a sparse matrix
sparse_features.fit_transform(X_train_final.iloc[:,1:])

In [None]:
# Sanity check: TF-IDF vectorization of the first column (cleaned_text) only
vectorizer.fit_transform(X_train_final.iloc[:,0])

In [None]:
from sklearn.compose import ColumnTransformer, make_column_transformer
# Route the 'cleaned_text' column through the TF-IDF sub-pipeline (a plain string selects
# the column as 1-D text input) and all remaining columns through sparse_features.
combined_features = ColumnTransformer(
 transformers=[
     ('tfidf', vectorizer, 'cleaned_text'),
      ], remainder=sparse_features
     )

In [None]:
# Full model: combined features followed by XGBoost. scale_pos_weight is set to the
# negative/positive class ratio estimated above to counter the class imbalance.
classifier_1 = Pipeline([('combined_features',  combined_features),
                         ('classifier', XGBClassifier(scale_pos_weight=estimate)),
                        ])

In [None]:
# Parameter names follow the nesting: pipeline step 'combined_features' -> transformer
# 'tfidf' of the ColumnTransformer -> step 'tfidf' of the vectorizer sub-pipeline.
param_grid_1 = {'combined_features__tfidf__tfidf__max_features': [500, 1000, 2000],
                'combined_features__tfidf__tfidf__max_df': [0.6, 0.8, 1.0],
                'combined_features__tfidf__tfidf__ngram_range': [(1,1), (1,2), (1,3)]}

# Exhaustive search over the 27 combinations with 2-fold cross-validation, scored with
# the F0.5 scorer defined earlier; n_jobs=-1 uses all processors.
grid_1 = GridSearchCV(estimator=classifier_1, param_grid=param_grid_1, 
                                 cv = 2,scoring = fscore, n_jobs= -1, verbose = 4 )

In [None]:
# Run the grid search on the combined training frame
grid_1.fit(X_train_final, y_train)

In [None]:
# best_score_ is the mean cross-validated value of the scorer passed to the search (fscore)
print("Best cross-validation score: {:.2f}".format(grid_1.best_score_))
print("\nBest parameters: ", grid_1.best_params_)
print("\nBest estimator: ", grid_1.best_estimator_)

In [None]:
# File name for the persisted model (relative to the working directory)
file_model = 'combined_model.pkl'

In [None]:
# Persist the best pipeline found by the grid search
joblib.dump(grid_1.best_estimator_, file_model)

In [None]:
# load the saved model
# NOTE: joblib.load unpickles the file — only load model files from a trusted source.
loaded_model = joblib.load(file_model)

In [None]:
# plot learning curves
# This cell can take up to 30 minutes to run
# NOTE(review): plot_learning_curve is called without a scorer, so the curves show the
# estimator's default score rather than the F0.5 score used for model selection.
plot_learning_curve(loaded_model, 'Learning Curves classifier_1', X_train_final, y_train)

In [None]:
# Accuracy on Train data set
# Despite its name, grid_classifier holds the score returned by the pipeline, not a model.
grid_classifier = loaded_model.score(X_train_final, y_train)
print(f'Accuracy on train set  is {grid_classifier}')

In [None]:
# Clean the test texts with the same preprocessor settings that were used for training
X_test_cleaned = SpacyPreprocessor(model = 'en_core_web_sm', remove_stop=True).transform(X_test)

In [None]:
# Save the cleaned test texts to a file. The file name gets its own variable so that it
# does not overwrite X_test_cleaned — otherwise the file name string itself, not the
# cleaned data, would be pickled.
file_X_test_cleaned = 'x_test_cleaned_sparse_embed.pkl'
joblib.dump(X_test_cleaned, file_X_test_cleaned)

In [None]:
# Final Pipeline
def final_pipeline(text):
    """
    Predict spam/ham labels for raw texts, end to end.

    The texts are cleaned with ``SpacyPreprocessor``, the hand-crafted features are
    computed with the module-level ``featurizer``, both are combined into the frame
    layout used for training, and the module-level ``loaded_model`` makes the
    prediction.

    Parameters
    ----------
    text : list or numpy.ndarray of str
        Raw texts to classify.

    Returns
    -------
    Predictions of ``loaded_model`` for each text (1 = spam, 0 = ham, following the
    label encoding used in this notebook).
    """
    cleaned_text = SpacyPreprocessor(model = 'en_core_web_sm', remove_stop=True).transform(text)
    # Inference only: use transform so that nothing is ever fitted on unseen data.
    X_features, feature_names = featurizer.transform(text)
    X_final = pd.concat(
        (pd.DataFrame(cleaned_text, columns =['cleaned_text']),
         pd.DataFrame(X_features, columns=feature_names)),
        axis =1,
    )
    return loaded_model.predict(X_final)

In [None]:
# predicted values for Test data set (cleaning and feature extraction are re-run on
# the raw test texts inside final_pipeline)
y_test_pred = final_pipeline(X_test)

In [None]:
# Per-class precision, recall and F1 on the held-out test set
print('\nTest set classification report:\n\n',classification_report(y_test, y_test_pred ))

In [None]:
# F0.5 score on the held-out test set — the same metric that was used for model selection
print('\nTest set F0.5 score:\n\n',fbeta_score(y_test, y_test_pred, beta=0.5))