# Spam identifier
Exploration of text message data and creation of models to predict if a message is spam or not. 

In [1]:
import pandas as pd
import numpy as np

spam_data = pd.read_csv('assets/spam.csv')

spam_data['target'] = np.where(spam_data['target']=='spam',1,0)
spam_data.head(10)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [2]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(spam_data['text'], 
                                                    spam_data['target'], 
                                                    random_state=0)

In [2]:
def percentage_of_spam():

    # YOUR CODE HERE
    return 100/2 *spam_data[spam_data['target']==1].size/spam_data['target'].size

percentage_of_spam()

13.406317300789663

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
%precision 17

def model_count():
    vect = CountVectorizer().fit(X_train)
    X_train_vectorized = vect.transform(X_train)
    model = MultinomialNB(alpha=0.1)
    model.fit(X_train_vectorized, y_train)
    y_p=model.predict_proba(vect.transform(X_test))[:,-1]
    return roc_auc_score(y_test, y_p)#Your answer here

model_count()

0.99154542213469599

## Second vectroizer

Fit and transform the training data `X_train` using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than **3**.

Then fit a multinomial Naive Bayes classifier model with smoothing `alpha=0.1` and compute the area under the curve (AUC) score using the transformed test data.

*This function should return the AUC score as a float.*

In [4]:
%precision 16
def model_tfid():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import roc_auc_score
    
    # YOUR CODE HERE
    vect = TfidfVectorizer(min_df=3).fit(X_train)
    X_train_vectorized = vect.transform(X_train)
    model = MultinomialNB(alpha=0.1)
    model.fit(X_train_vectorized, y_train)
    y_p=model.predict_proba(vect.transform(X_test))[:,-1]
    return roc_auc_score(y_test, y_p)

model_tfid()

0.9954968337775665

<br>
<br>
The following function has been provided to help you combine new features into the training data:

In [6]:
def add_feature(X, feature_to_add):
    """
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    """
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

## An aditional feaure

Fit and transform the training data X_train using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than **5**.

Using this document-term matrix and an additional feature, **the length of document (number of characters)**, fit a Support Vector Classification model with regularization `C=10000`. Then compute the area under the curve (AUC) score using the transformed test data.


In [7]:
from sklearn.svm import SVC

def model():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.svm import SVC
    from sklearn.metrics import roc_auc_score
    
    # YOUR CODE HERE
    vect = TfidfVectorizer(min_df=5).fit(X_train)
    X_train_vectorized = vect.transform(X_train)
    X_train_unvectorized = vect.inverse_transform(X_train_vectorized)
    
    spam=np.array(X_train)
    len_feature=np.array([len(w) for w in spam])
    X_train_vectorized = add_feature(X_train_vectorized, len_feature)
    
    model = SVC(C=10000)
    model.fit(X_train_vectorized, y_train)
    x_add=vect.transform(X_test)
    spam2=np.array(X_test)
    len_featuret=np.array([len(w) for w in spam2])
    x_add = add_feature(x_add, len_featuret)
    y_p=model.decision_function(x_add)
    return roc_auc_score(y_test, y_p)
    #return X_train_unvectorized
model()

0.9963202213809143

## Feature: **number of digits per document**


In [9]:
from sklearn.linear_model import LogisticRegression

def answer_nine():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import roc_auc_score
    
    # YOUR CODE HERE
    vect = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train)
    X_train_vectorized = vect.transform(X_train)
    X_train_unvectorized = vect.inverse_transform(X_train_vectorized)
    
    spam=np.array(X_train)
    len_feature=np.array([len(w) for w in spam])
    n_digits=np.array([sum(c.isdigit() for c in s) for s in spam])
    X_train_vectorized = add_feature(X_train_vectorized, len_feature)
    X_train_vectorized = add_feature(X_train_vectorized, n_digits)
    
    model = LogisticRegression(C=100, max_iter=1000)
    model.fit(X_train_vectorized, y_train)
    
    X_test_vectorized=vect.transform(X_test)
    spam_test=np.array(X_test)
    len_feature_test=np.array([len(w) for w in spam_test])
    n_digits_test=np.array([sum(c.isdigit() for c in s) for s in spam_test])
    X_test_vectorized = add_feature(X_test_vectorized, len_feature_test)
    X_test_vectorized = add_feature(X_test_vectorized, n_digits_test)
    y_p=model.predict_proba(X_test_vectorized)[:,-1]
    return roc_auc_score(y_test, y_p)

answer_nine()

0.9972921582941445

## Using n-grams

In [17]:
def answer_eleven():
    from sklearn.linear_model import LogisticRegression
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import roc_auc_score
    import re
    
    # YOUR CODE HERE
    vect = CountVectorizer(min_df=5, ngram_range=(2,5), analyzer='char_wb').fit(X_train[:2000])
    X_train_vectorized = vect.transform(X_train[:2000])
    X_train_unvectorized = vect.inverse_transform(X_train_vectorized)
    
    spam=np.array(X_train[:2000])
    len_feature=np.array([len(w) for w in spam])
    n_digits=np.array([sum(c.isdigit() for c in s) for s in spam])
    non_word=np.array([len(re.sub('[\w]+' ,'', s)) for s in spam])
    X_train_vectorized = add_feature(X_train_vectorized, len_feature)
    X_train_vectorized = add_feature(X_train_vectorized, n_digits)
    X_train_vectorized = add_feature(X_train_vectorized, non_word)
    
    model = LogisticRegression(C=100, max_iter=1000)
    model.fit(X_train_vectorized, y_train[:2000])
    
    X_test_vectorized=vect.transform(X_test)
    spam_test=np.array(X_test)
    len_feature_test=np.array([len(w) for w in spam_test])
    n_digits_test=np.array([sum(c.isdigit() for c in s) for s in spam_test])
    non_word_test=np.array([len(re.sub('[\w]+' ,'', s)) for s in spam_test])
    X_test_vectorized = add_feature(X_test_vectorized, len_feature_test)
    X_test_vectorized = add_feature(X_test_vectorized, n_digits_test)
    X_test_vectorized = add_feature(X_test_vectorized, non_word_test)
    
    y_p=model.predict_proba(X_test_vectorized)[:,-1]
    AUC=roc_auc_score(y_test, y_p)
    
    sorted_coef = model.coef_[0].argsort()
    feature_names = np.array(vect.get_feature_names())
    return (AUC, feature_names[sorted_coef[:10]], feature_names[sorted_coef[:-11:-1]])
answer_eleven()

IndexError: index 9890 is out of bounds for axis 0 with size 9889