In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings; warnings.simplefilter('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn import preprocessing

# Import the classifiers we will be using
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics.pairwise import cosine_similarity


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# NLTK imports
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

import re
import string

import spacy
#English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. 
#Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities.
nlp = spacy.load('en_core_web_md', disable=['parser', 'tagger'])

In [None]:
# Load the raw SMS spam dataset (the file is latin-1 encoded) and preview it.
df = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin1')
df.head(n=5)

In [None]:
# Drop the three unnamed columns. `errors="ignore"` keeps the cell re-runnable:
# a second execution would otherwise raise KeyError because the columns are
# already gone.
df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], inplace=True, errors="ignore")

In [None]:
# Preview the frame after dropping the unnamed columns.
df.head(n=5)

In [None]:
# Give the two remaining columns descriptive names: v1 -> label, v2 -> message.
df.rename(columns=dict(v1="label", v2="message"), inplace=True)
df.head(n=5)

# Clean your data

Define a transformer that removes irrelevant characters, such as non-alphanumeric characters.

In [None]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    """Clean sentences so they can be vectorized.

    Each sentence goes through, in order: regex substitutions, optional
    lower-casing, tokenization, optional punctuation removal, optional
    stemming, and is finally re-joined with single spaces.

    Parameters
    ----------
    tokenizer : object exposing ``tokenize(str) -> list of str``
    stemmer : object exposing ``stem(str) -> str``, or a falsy value to skip
        stemming
    regex_list : iterable of ``(pattern, replacement)`` pairs, applied in
        order with ``re.sub``
    lower : bool, lower-case the sentence before tokenizing
    remove_punct : bool, drop tokens made up only of punctuation characters
    """

    def __init__(self, tokenizer, stemmer, regex_list,
                 lower=True, remove_punct=True):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct

    def fit(self, *_):
        """Nothing to learn; present so the class can be used as a transformer."""
        return self

    def transform(self, X, *_):
        """Return a list holding the cleaned version of every sentence in ``X``."""
        return [self._clean_sentence(sentence) for sentence in X]

    @staticmethod
    def _is_punctuation(token):
        """Tell whether every character of ``token`` is a punctuation character.

        A plain ``token in string.punctuation`` is a substring test, so
        multi-character tokens such as "!!!" or "?!" would not be recognised.
        """
        return all(char in string.punctuation for char in token)

    def _clean_sentence(self, sentence):
        """Apply the full cleaning sequence to one sentence and return it."""
        # Replace given regexes
        for regex in self.regex_list:
            sentence = re.sub(regex[0], regex[1], sentence)

        # lowercase
        if self.lower:
            sentence = sentence.lower()

        # Split sentence into list of words
        words = self.tokenizer.tokenize(sentence)

        # Remove punctuation-only tokens
        if self.remove_punct:
            words = [word for word in words if not self._is_punctuation(word)]

        # Stem words
        if self.stemmer:
            words = [self.stemmer.stem(word) for word in words]

        # Join list elements into string
        return " ".join(words)
    

# Shared tokenizer and English Snowball stemmer instances for the text cleaner.
tokenizer = WordPunctTokenizer()
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)
# (pattern, replacement) pairs applied in order to every raw message, before
# lower-casing and tokenization.
regex_list = [
    # URLs, then any leftover "http" fragment
    (r"http\S+", ""),
    (r"http", ""),
    # @mentions
    (r"@\S+", ""),
    # Anything outside this whitelist becomes a space (this also turns the
    # dots of "www.example.com" into spaces)
    (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
    (r"@", "at"),
    # Apostrophes that are not inside a word (quotes, leading/trailing ones).
    # Look-arounds are used so the neighbouring character is not consumed and
    # contractions such as "don't" stay intact.
    (r"(?<![A-Za-z])'|'(?![A-Za-z])", " "),
    # Remaining punctuation: everything that is not alphanumeric, whitespace
    # or an apostrophe. Python's `re` has no POSIX classes such as [:alnum:],
    # so the character ranges are spelled out.
    (r"[^A-Za-z0-9\s']", " "),
    # Stand-alone "www" / "com" tokens left over from web addresses. Word
    # boundaries keep words such as "welcome" or "company" untouched.
    (r"\b(?:www|com)\b", " "),
]

In [None]:
cleaner = TextCleanerTransformer(tokenizer, stemmer, regex_list)

# Keep the untouched text in `message_raw` and always clean from it, so that
# re-running this cell does not clean text that was already cleaned and stemmed.
if 'message_raw' not in df.columns:
    df['message_raw'] = df['message']
df['message'] = cleaner.transform(df['message_raw'].values)

## 1.1 - Baseline

Let's get our baseline with the bag-of-words approach. Here we are going to use a RandomForestClassifier, a powerful machine learning classifier that fits this problem very well. Using RandomForestClassifier in scikit-learn is similar to using the other classifiers we have already used.

In [None]:
# Split in train and validation
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Encode the labels. The encoder is fitted on the training labels only and then
# reused for the validation labels.
# `assign` returns new frames instead of writing into the frames produced by the
# split, which avoids pandas' chained-assignment (SettingWithCopy) warning.
le = preprocessing.LabelEncoder()
train_data = train_data.assign(label=le.fit_transform(train_data['label'].values))
test_data = test_data.assign(label=le.transform(test_data['label'].values))

In [None]:
# Build the pipeline: token counts (English stop words removed, unigrams only)
# -> TF-IDF weighting -> random forest.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Materialize the messages as lists of str. A `map` object is a one-shot
# iterator: it would be silently empty if anything iterated over it twice.
train_messages = [str(message) for message in train_data['message'].values]
test_messages = [str(message) for message in test_data['message'].values]

# Train the classifier
text_clf.fit(train_messages, train_data['label'].values)

# Accuracy on the validation split
predicted = text_clf.predict(test_messages)
np.mean(predicted == test_data['label'])

In [None]:
# Number of missing values per column in the training split.
train_data.isna().sum()

In [None]:
# Per-row flag telling whether the message is present (first five rows).
train_data['message'].notnull().head(n=5)

In [None]:
# First training rows whose message is missing, selected with a boolean mask.
train_data[train_data['message'].isna()].head(n=5)

In [None]:
# First training rows whose message is present, selected with a boolean mask.
train_data[train_data['message'].notna()].head(n=5)

In [None]:
# Character count of each message.
df['length'] = df['message'].map(len)
df.head(n=5)

In [None]:
# One histogram of message length per label; both x-axes are limited to 0-300.
ax_list = df.hist(column='length', by='label', bins=50, figsize=(12, 4))
ax_list[0].set_xlim(0, 300)
ax_list[1].set_xlim(0, 300)

## Fitting a classifier
Starting with a logistic regression is a good idea. It is simple, often gets the job done, and is easy to interpret.

## Evaluation
Let's start by looking at some metrics to see if our classifier performed well at all.

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Precision, recall and F1 are computed with ``average='weighted'`` and
    ``pos_label=None``; accuracy is the plain fraction of correct predictions.

    Returns the tuple ``(accuracy, precision, recall, f1)``.
    """
    weighted = dict(pos_label=None, average='weighted')

    # true positives / (true positives + false positives)
    precision = precision_score(y_test, y_predicted, **weighted)
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, **weighted)
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, **weighted)
    # (true positives + true negatives) / total
    accuracy = accuracy_score(y_test, y_predicted)

    return accuracy, precision, recall, f1

#accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_counts)
#print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

## 1.4 - Advanced features

Let's now play with other more complex text features and see if we can maximize our classification score even more.


In [None]:
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Pick one column out of the input by key, returned as ``X[key]``.

    Intended for text columns: given a data frame the result is the
    one-dimensional column itself, ready to be handed to a text vectorizer.
    """
    def __init__(self, key):
        self.key = key

    def transform(self, X):
        selected = X[self.key]
        return selected

    def fit(self, X, y=None):
        # Stateless: there is nothing to learn from the data.
        return self
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pick one column out of the input by key, returned as ``X[[key]]``.

    Intended for numeric columns: given a data frame the result is a
    two-dimensional, single-column frame rather than a one-dimensional column.
    """
    def __init__(self, key):
        self.key = key

    def transform(self, X):
        columns = [self.key]
        return X[columns]

    def fit(self, X, y=None):
        # Stateless: there is nothing to learn from the data.
        return self

In [None]:
# TF-IDF representation of the `message` column.
text = Pipeline(steps=[
    ('selector', TextSelector(key='message')),
    ('tfidf', TfidfVectorizer()),
])

# Standardized `length` column.
length = Pipeline(steps=[
    ('selector', NumberSelector(key='length')),
    ('standard', StandardScaler()),
])

In [None]:
# FeatureUnion lets us combine several distinct feature sets in one classifier.
feats = FeatureUnion(transformer_list=[
    ('text', text),
    ('length', length),
])

In [None]:
# Hold out 20% of the rows for validation (fixed seed).
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Features produced by `feats`, fed to a random forest.
pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

pipeline.fit(train_data, train_data['label'])

# Fraction of validation rows predicted correctly.
preds = pipeline.predict(test_data)
np.mean(preds == test_data['label'])

In [None]:
stop_words = set(stopwords.words('english'))

# Tokens are obtained by splitting on single spaces; the stop-word-free token
# lists are computed once and shared by the features below.
message_tokens = df['message'].str.split(' ')
content_tokens = message_tokens.map(lambda toks: [t for t in toks if t not in stop_words])

df['words'] = message_tokens.map(len)
df['words_not_stopword'] = content_tokens.map(len)
df['commas'] = df['message'].str.count(',')

# NOTE(review): `message` is overwritten by the cleaner (default lower=True)
# earlier in the notebook, so both case-based counts below come out as 0 unless
# they are computed from text that has not been lower-cased.
# Number of upper-case characters in the message.
df['upper'] = df['message'].map(lambda text: sum(map(str.isupper, text)))
# Number of title-cased words. `str.istitle` is applied per word: applied per
# character it is just another upper-case character count, duplicating `upper`.
df['capitalized'] = message_tokens.map(lambda toks: sum(map(str.istitle, toks)))

# Average length of the non-stop-word tokens (0 when there are none).
df['avg_word_length'] = content_tokens.map(
    lambda toks: np.mean([len(t) for t in toks]) if len(toks) > 0 else 0
)

In [None]:
def scaled_numeric_feature(key):
    """Build a pipeline that selects column ``key`` and standardizes it."""
    return Pipeline([
        ('selector', NumberSelector(key=key)),
        ('standard', StandardScaler()),
    ])


# One standardized single-column pipeline per hand-crafted numeric feature.
words = scaled_numeric_feature('words')
words_not_stopword = scaled_numeric_feature('words_not_stopword')
avg_word_length = scaled_numeric_feature('avg_word_length')
commas = scaled_numeric_feature('commas')
upper = scaled_numeric_feature('upper')
capitalized = scaled_numeric_feature('capitalized')

# Concatenate the TF-IDF text features with every numeric feature.
feats = FeatureUnion([
    ('text', text),
    ('length', length),
    ('words', words),
    ('words_not_stopword', words_not_stopword),
    ('avg_word_length', avg_word_length),
    ('commas', commas),
    ('upper', upper),
    ('capitalized', capitalized),
])

feature_processing = Pipeline([('feats', feats)])

In [None]:
# Peek at the first two rows with the engineered feature columns.
df.head(n=2)

In [None]:
# Split in train and validation
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Encode the labels. The encoder is fitted on the training labels only and then
# reused for the validation labels.
# `assign` returns new frames instead of writing into the frames produced by the
# split, which avoids pandas' chained-assignment (SettingWithCopy) warning.
le = preprocessing.LabelEncoder()
train_data = train_data.assign(label=le.fit_transform(train_data['label'].values))
test_data = test_data.assign(label=le.transform(test_data['label'].values))

pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# The label column is removed from the inputs before fitting and predicting.
pipeline.fit(train_data.drop(columns=['label']), train_data['label'])

# Accuracy on the validation split
y_pred = pipeline.predict(test_data.drop(columns=['label']))
np.mean(y_pred == test_data['label'])

In [None]:
# Preview the full frame once more.
df.head(n=5)

## Saving predictions to CSV

In [None]:
# Build the submission table: `ID` is the row index of each validation sample,
# `label` is the predicted class. Written to disk without the frame's own index.
predictions = pd.DataFrame(data={'ID': test_data.index, 'label': y_pred})

predictions.to_csv(path_or_buf='submission.csv', index=False)


# Hyperparameter Tuning Using Grid Search

In [None]:
# Load libraries
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV


In [None]:
# Create Hyperparameter Search Space

# Logistic-regression grid: regularization penalty and regularization strength.
# Stored under its own name so it is not lost when the random-forest grid is
# defined right after it.
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
logistic_hyperparameters = dict(C=C, penalty=penalty)

# Random-forest grid.
hyperparameters = dict(max_depth=[1, 5], max_features=[1, 2, 4])

# Reference random-forest configuration (kept for documentation, not executed):
# random_forest = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=6, max_features=45, max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
#             oob_score=False, random_state=10, verbose=0,
#             warm_start=False)


In [None]:
# Create logistic regression
# NOTE(review): built with default settings and not referenced again in the
# visible cells. If it is meant to be tuned over the 'l1'/'l2' penalty grid,
# confirm that the solver in use supports the 'l1' penalty.
logistic = linear_model.LogisticRegression()

In [None]:
# Create grid search using 5-fold cross validation.
# The forest is seeded like every other forest in this notebook so that the
# search gives the same result on every run.
clf = GridSearchCV(RandomForestClassifier(random_state=42), hyperparameters, cv=5, verbose=0)


In [None]:
#Conduct Grid Search
# Fit grid search
#X, y  =  train_data, test_data
#train_x = pd.get_dummies(X_train.drop("message", axis=1))
#train_y = pd.get_dummies(
#best_model = clf.fit(train_data.drop('label', axis=1), train_data.label)
#View Hyperparameter Values Of Best Model
# View best hyperparameters
#print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
#print('Best C:', best_model.best_estimator_.get_params()['C'])

## Hyperparameter Tuning Using RandomizedSearchCV

In [None]:
from scipy.stats import randint

# A bare random forest cannot consume text, so only the numeric feature columns
# are used here (the label is dropped first, then non-numeric columns).
X_train_numeric = train_data.drop(columns='label').select_dtypes(include='number')
y_train = train_data['label']
independent_variables = X_train_numeric.columns

# Seeded so that repeated runs of the search are comparable.
estimator = RandomForestClassifier(random_state=42)

random_search_parameter_space_dist = {
    "max_depth": randint(1, 100),
    # scipy's randint excludes the upper bound, so `+ 1` is needed for the
    # search to reach "use every feature".
    "max_features": randint(1, X_train_numeric.shape[1] + 1),
    "class_weight": ["balanced", None],
}

randomized_search = RandomizedSearchCV(
    estimator,
    random_search_parameter_space_dist,
    cv=5,
    n_iter=250,
    random_state=42,
    return_train_score=True,
    n_jobs=10,
)

# `%%timeit` is a cell magic and is only valid as the first line of a cell; to
# time the fit, prefix the call below with the `%time` line magic instead.
randomized_search.fit(X_train_numeric, y_train)

## Material for help ##

Regular Expressions
    
    https://chrisalbon.com/#articles
    
Automatic hyperparameter tuning

    http://www.cse.chalmers.se/~richajo/dit865/files/Automatic%20hyperparameter%20tuning.html

text mining (nlp) with python
    
    https://github.com/TiesdeKok/Python_NLP_Tutorial/blob/master/NLP_Notebook.ipynb
    
Natural Language Processing in a Kaggle Competition for Movie Reviews (we can use this page to check how to create the `predict_proba` output)
    
   [Natural Language Processing in a Kaggle Competition for Movie Reviews](https://jessesw.com/NLP-Movie-Reviews/)
   
Natural Language Processing in a Kaggle Competition for Movie Reviews
    https://jessesw.com/NLP-Movie-Reviews/
    
# Errors

[python sklearn pipiline fit: “AttributeError: lower not found”](https://stackoverflow.com/questions/50192763/python-sklearn-pipiline-fit-attributeerror-lower-not-found)