# NLP Cup Event #3 - Uncle Steve's Solution

In [1]:
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.23.1.


In [3]:
import os
os.getcwd()

"C:\\Users\\st50\\OneDrive - Queen's University\\Courses\\202008_GMMA865\\Big Data Cup\\Event 3"

In [4]:
df = pd.read_csv("sms-spam/spamraw_train.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5000 non-null   int64 
 1   sms_text  5000 non-null   object
 2   spam      5000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 117.3+ KB


Unnamed: 0,id,sms_text,spam
0,1,Hope you are having a good week. Just checking in,0
1,2,K..give back my thanks.,0
2,3,Am also doing in cbe only. But have to pay.,0
3,4,"complimentary 4 STAR Ibiza Holiday or £10,000 ...",1
4,5,okmail: Dear Dave this is your final notice to...,1


In [5]:
from sklearn.model_selection import train_test_split

X = df['sms_text']
y = df['spam']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.005, random_state=42)

# Custom Functions for Preprocessing and Feature Engineering

In [6]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import unidecode
import textstat
import string  

lemmer = WordNetLemmatizer()

# Simple preprocessor.
# Input is a single document, as a single string.
# Otuput should be a single document, as a single string.
def my_preprocess(doc):
    
    # Lowercase
    doc = doc.lower()
    
    # Replace URL with URL string
    doc = re.sub(r'http\S+', 'URL', doc)
    
    # Replace AT with AT string
    doc = re.sub(r'@', 'AT', doc)
    
    # Replace all numbers/digits with the string NUM
    doc = re.sub(r'\b\d+\b', 'NUM', doc)
    
    # Lemmatize each word.
    doc = ' '.join([lemmer.lemmatize(w) for w in doc.split()])

    return doc

In [7]:
# These functions will calculate additional features on the document.
# They will be put into the Pipeline, called via the FunctionTransformer() function.
# Each one takes an entier corpus (as a list of documents), and should return
# an array of feature values (one for each document in the corpus).
# These functions can do anything they want; I've made most of them quick
# one-liners Hopefully the names of the functions will make them self explanitory.

def doc_length(corpus):
    return np.array([len(doc) for doc in corpus]).reshape(-1, 1)

def lexicon_count(corpus):
    return np.array([textstat.lexicon_count(doc) for doc in corpus]).reshape(-1, 1)

def _get_punc(doc):
    return len([a for a in doc if a in string.punctuation])

def punc_count(corpus):
    return np.array([_get_punc(doc) for doc in corpus]).reshape(-1, 1)

def _get_caps(doc):
    return sum([1 for a in doc if a.isupper()])

def capital_count(corpus):
    return np.array([_get_caps(doc) for doc in corpus]).reshape(-1, 1)

def num_exclamation_marks(corpus):
    return np.array([doc.count('!') for doc in corpus]).reshape(-1, 1)

def num_question_marks(corpus):
    return np.array([doc.count('?') for doc in corpus]).reshape(-1, 1)

def xxx_pics_count(corpus):
    return np.array(["xxx pics" in doc.lower() for doc in corpus]).reshape(-1, 1)

# See if the document ends with someting like "Love Steve XXX"
def has_lovexxx(corpus):
    return np.array([bool(re.search(r"l[ou]+ve?.{0,10}x{2,5}\.? ?$", doc.lower())) for doc in corpus]).reshape(-1, 1)

def has_url(corpus):
    return np.array([bool(re.search("http", doc.lower())) for doc in corpus]).reshape(-1, 1)

def has_pence(corpus):
    return np.array([bool(re.search("\dp\W", doc.lower())) for doc in corpus]).reshape(-1, 1)

def has_money(corpus):
    return np.array([bool(re.search("[\$£]|\bpence\b|\bdollar\b", doc.lower())) for doc in corpus]).reshape(-1, 1)

def has_sexy_phrase(corpus):
    return np.array([bool(re.search("sexy single|\bfree sexy\b|\bsexy pic\b|\blive sex\b", doc.lower())) for doc in corpus]).reshape(-1, 1)

In [8]:
# To help handle class imbalance, calculate the class weights.

import numpy as np
neg, pos = np.bincount(df['spam'])
total = neg + pos
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.58
Weight for class 1: 3.71


# Construct the Pipeline

In [9]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import NMF
from sklearn.neural_network import MLPClassifier

# Need to preprocess the stopwords, because scikit learn's TfidfVectorizer
# removes stopwords _after_ preprocessing
stop_words = [my_preprocess(word) for word in stop_words.ENGLISH_STOP_WORDS]

# This vectorizer will be used to create the BOW features
vectorizer = TfidfVectorizer(preprocessor=my_preprocess, 
                             max_features = 1000, 
                             ngram_range=[1,4],
                             stop_words=None,
                             strip_accents="unicode", 
                             lowercase=False, max_df=0.25, min_df=0.001, use_idf=True)

# This vectorizer will be used to preprocess the text before topic modeling.
# (I _could_ use the same vectorizer as above- but why limit myself?)
vectorizer2 = TfidfVectorizer(preprocessor=my_preprocess, 
                             max_features = 1000, 
                             ngram_range=[1,2],
                             stop_words=None,
                             strip_accents="unicode", 
                             lowercase=False, max_df=0.25, min_df=0.001, use_idf=True)

nmf = NMF(n_components=25, random_state=1, init='nndsvda', solver='mu', alpha=.1, l1_ratio=.5)
rf = RandomForestClassifier(criterion='entropy', random_state=223)
mlp = MLPClassifier(random_state=42, verbose=2, max_iter=200)



feature_processing =  FeatureUnion([ 
    ('bow', Pipeline([('cv', vectorizer), ])),
    ('topics', Pipeline([('cv', vectorizer2), ('nmf', nmf),])),
    ('length', FunctionTransformer(doc_length, validate=False)),
    ('words', FunctionTransformer(lexicon_count, validate=False)),
    ('punc_count', FunctionTransformer(punc_count, validate=False)),
    ('capital_count', FunctionTransformer(capital_count, validate=False)),  
    ('num_exclamation_marks', FunctionTransformer(num_exclamation_marks, validate=False)),  
    ('num_question_marks', FunctionTransformer(num_question_marks, validate=False)),  
    ('xxx_pics_count', FunctionTransformer(xxx_pics_count, validate=False)),  
    ('has_lovexxx', FunctionTransformer(has_lovexxx, validate=False)),  
    ('has_url', FunctionTransformer(has_url, validate=False)),  
    ('has_pence', FunctionTransformer(has_pence, validate=False)),  
    ('has_money', FunctionTransformer(has_money, validate=False)),
    ('has_sexy_phrase', FunctionTransformer(has_sexy_phrase, validate=False)),
])

steps = [('features', feature_processing)]

pipe = Pipeline([('features', feature_processing), ('clf', mlp)])

param_grid = {}

# You - yes you! Manually choose which classifier run you'd like to try.
# In future I'd like to automate this so that both are tried; but for this simple
# Kaggle competition, I'm keeping it simple. You can set this to either:
#
# "RF" - Random Forest
# "MLP" - NN
#
# and then re-run the entire notebook
which_clf = "RF"

if which_clf == "RF":

    steps.append(('clf', rf))

    # I already ran a 4-hour extensive grid; this is not the full set. BTW, the best hyperarms I found are:
    # Best parameter (CV scy_train0.988):
    # {'clf__class_weight': None, 
    # 'clf__n_estimators': 500, 
    # 'features__bow__cv__max_features': 500, 
    # 'features__bow__cv__preprocessor': None, 
    # 'features__bow__cv__use_idf': False, 
    # 'features__topics__cv__stop_words': None, 
    # 'features__topics__nmf__n_components': 300}
    param_grid = {
        'features__bow__cv__preprocessor': [None, my_preprocess],
        'features__bow__cv__max_features': [200, 500, 1000],
        'features__bow__cv__use_idf': [False],
        'features__topics__cv__stop_words': [None],
        'features__topics__nmf__n_components': [25, 75],
        'clf__n_estimators': [100, 500],
        'clf__class_weight': [None],
    }
    
elif which_clf == "MLP":
    
    steps.append(('clf', mlp))

    # I already ran a 4-hour extensive grid; this is not the full set. BTW, the best hyperarms I found are:
    # Best parameter (CV scy_train0.991): 
    # {'clf__hidden_layer_sizes': (25, 25, 25), 
    # 'features__bow__cv__max_features': 3000, 
    # 'features__bow__cv__min_df': 0, 
    # 'features__bow__cv__preprocessor': <function my_preprocess at 0x0000024801E161E0>, 
    # 'features__bow__cv__use_idf': False, 
    # 'features__topics__nmf__n_components': 300}
    param_grid = {
        'features__bow__cv__preprocessor': [my_preprocess],
        'features__bow__cv__max_features': [1000, 3000],
        'features__bow__cv__min_df': [0],
        'features__bow__cv__use_idf': [False],
        'features__topics__nmf__n_components': [300],
        'clf__hidden_layer_sizes': [(100, ), (50, 50), (25, 25, 25)],
    }

pipe = Pipeline(steps)

search = GridSearchCV(pipe, param_grid, cv=3, n_jobs=3, scoring='f1_micro', return_train_score=True, verbose=2)



# Fit Model

It's showtime, baby.

In [10]:
search = search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   57.7s
[Parallel(n_jobs=3)]: Done  72 out of  72 | elapsed:  3.0min finished


In [11]:
print("Best parameter (CV scy_train%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV scy_train0.987):
{'clf__class_weight': None, 'clf__n_estimators': 500, 'features__bow__cv__max_features': 500, 'features__bow__cv__preprocessor': <function my_preprocess at 0x000001E4DC4B8268>, 'features__bow__cv__use_idf': False, 'features__topics__cv__stop_words': None, 'features__topics__nmf__n_components': 75}


In [12]:
# Print out the results of hyperparmater tuning

def cv_results_to_df(cv_results):
    results = pd.DataFrame(list(cv_results['params']))
    results['mean_fit_time'] = cv_results['mean_fit_time']
    results['mean_score_time'] = cv_results['mean_score_time']
    results['mean_train_score'] = cv_results['mean_train_score']
    results['std_train_score'] = cv_results['std_train_score']
    results['mean_test_score'] = cv_results['mean_test_score']
    results['std_test_score'] = cv_results['std_test_score']
    results['rank_test_score'] = cv_results['rank_test_score']

    results = results.sort_values(['mean_test_score'], ascending=False)
    return results

results = cv_results_to_df(search.cv_results_)
results
#results.to_csv('results2.csv', index=False)

Unnamed: 0,clf__class_weight,clf__n_estimators,features__bow__cv__max_features,features__bow__cv__preprocessor,features__bow__cv__use_idf,features__topics__cv__stop_words,features__topics__nmf__n_components,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
19,,500,500,<function my_preprocess at 0x000001E4DC4B8268>,False,,75,8.802069,0.909676,0.999899,0.0001421606,0.986532,0.003632,1
23,,500,1000,<function my_preprocess at 0x000001E4DC4B8268>,False,,75,8.431518,0.804217,0.999899,0.0001421606,0.986531,0.00447,2
3,,100,200,<function my_preprocess at 0x000001E4DC4B8268>,False,,75,4.021864,0.558166,0.999899,0.0001421606,0.986331,0.003355,3
7,,100,500,<function my_preprocess at 0x000001E4DC4B8268>,False,,75,3.970745,0.662673,0.999899,0.0001421606,0.98613,0.003554,4
11,,100,1000,<function my_preprocess at 0x000001E4DC4B8268>,False,,75,3.723979,0.665498,0.999899,0.0001421606,0.985928,0.004416,5
10,,100,1000,<function my_preprocess at 0x000001E4DC4B8268>,False,,25,2.115033,0.555789,0.999497,0.0001421606,0.985728,0.00253,6
15,,500,200,<function my_preprocess at 0x000001E4DC4B8268>,False,,75,10.729021,1.196579,0.999899,0.0001421606,0.985728,0.003012,7
4,,100,500,,False,,25,1.850402,0.423519,0.999095,0.0002461552,0.985728,0.002846,7
14,,500,200,<function my_preprocess at 0x000001E4DC4B8268>,False,,25,6.32186,0.785745,0.999397,0.0002461551,0.985728,0.003763,7
6,,100,500,<function my_preprocess at 0x000001E4DC4B8268>,False,,25,2.030012,0.590679,0.999598,0.0002842783,0.985728,0.003207,7


# Estimate Model Performance on Val Data

In [13]:
# Because we are using a pipeline and a GridSearchCV, things are a bit complicated.
# I want to get references to the objects from the pipeline with the *best* hyperparameter settings,
# so that I can explore those objects (later). 
# The code below is a bit ugly, but after reading throught the docs of Pipeline, 
# I believe this is the only way to do it.

# The pipeline with the best performance
pipeline = search.best_estimator_

# Get the feature processing pipeline, so I can use it later
feature_processing_obj = pipeline.named_steps['features']

# Find the vectorizer objects, the NMF objects, and the classifier objects
pipevect= dict(pipeline.named_steps['features'].transformer_list)
vectorizer_obj = pipevect.get('bow').named_steps['cv']
vectorizer_obj2 = pipevect.get('topics').named_steps['cv']
nmf_obj = pipevect.get('topics').named_steps['nmf']
clf_obj = pipeline.named_steps['clf']

# Sanity check - what was vocabSize set to? Should match the output here.
len(vectorizer_obj.get_feature_names())

500

In [14]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, f1_score

features_val = feature_processing_obj.transform(X_val).todense()

pred_val = search.predict(X_val)

print("Confusion matrix:")
print(confusion_matrix(y_val, pred_val))

print("\nF1 Score = {:.5f}".format(f1_score(y_val, pred_val, average='micro')))

print("\nClassification Report:")
print(classification_report(y_val, pred_val))

Confusion matrix:
[[20  0]
 [ 1  4]]

F1 Score = 0.96000

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.80      0.89         5

    accuracy                           0.96        25
   macro avg       0.98      0.90      0.93        25
weighted avg       0.96      0.96      0.96        25



# Estimate Performance on Test/Kaggle Data

In [15]:
test_df = pd.read_csv('sms-spam/spamraw_test.csv')

features_test = feature_processing_obj.transform(test_df['sms_text']).todense()
pred_test = search.predict(test_df['sms_text'])

# Output the predictions to a file to upload to Kaggle.
# Uncomment to actually create the file
#my_submission = pd.DataFrame({'id': test_df.id, 'predicted': pred_test})
#my_submission.to_csv('steve_submission.csv', index=False)

solutions_df = pd.read_csv('sms-spam/spamraw_test_solutions.csv')
y_test = solutions_df['spam']

print("Confusion matrix:")
print(confusion_matrix(y_test, pred_test))

print("\nF1 Score = {:.5f}".format(f1_score(y_test, pred_test, average="micro")))

print("\nClassification Report:")
print(classification_report(y_test, pred_test))

Confusion matrix:
[[485   0]
 [ 10  64]]

F1 Score = 0.98211

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       485
           1       1.00      0.86      0.93        74

    accuracy                           0.98       559
   macro avg       0.99      0.93      0.96       559
weighted avg       0.98      0.98      0.98       559



# Explore the Model Further

The path to enlightment begins by understanding what our model learned.

## Print Topics

Print the top words for each of the NMF topics

In [16]:
n_top_words = 10
def get_top_words(H, feature_names):
    output = []
    for topic_idx, topic in enumerate(H):
        top_words = [(feature_names[i]) for i in topic.argsort()[:-n_top_words - 1:-1]]
        output.append(top_words)
        
    return pd.DataFrame(output) 

top_words = get_top_words(nmf_obj.components_, vectorizer_obj2.get_feature_names())
top_words

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,NUM,NUM NUM,to NUM,txt,call NUM,NUM to,NUM now,stop,win,claim
1,me,call me,tell,tell me,to me,give me,me to,me NUM,for me,with me
2,ok,ok lor,it ok,thanx,yup,prob,ya,leave,too,whats
3,sorry,later,ll,ll call,call later,sorry ll,call,meeting,call you,aight
4,are,are you,you are,how are,where are,what are,you doing,doing,hope you,having
...,...,...,...,...,...,...,...,...,...,...
70,know,you know,don,let,don know,let me,me know,dont,know what,know when
71,new,happy,year,new year,happy new,family,the new,many,you and,birthday
72,what,what you,about,what are,doing,plan,tell,should,hey,hear
73,wat,doing,wat time,abt,finish,thk,wan,dunno,dat,eat


## Print Feature Importances

Note: this section will only work with models that have `.feature_importances_`, such as RF and DT.

In [17]:
topic_feature_names = ["topic {}".format(i) for i in range(nmf_obj.n_components_)]

stat_feature_names = [t[0] for t in pipeline.named_steps['features'].transformer_list if t[0] not in ['topics', 'bow']]

feature_names = vectorizer_obj.get_feature_names() + topic_feature_names + stat_feature_names
len(feature_names)

feature_importances = None
if hasattr(clf_obj, 'feature_importances_'):
    feature_importances = clf_obj.feature_importances_

587

In [19]:
features_train = feature_processing_obj.transform(X_train).todense()

if feature_importances is None:
    print("No Feature importances! Skipping.")
else:
    N = features_train.shape[1]

    ssum = np.zeros(N)
    avg = np.zeros(N)
    avg_spam = np.zeros(N)
    avg_ham = np.zeros(N)
    for i in range(N):
        ssum[i] = sum(features_train[:, i]).reshape(-1, 1)
        avg[i] = np.mean(features_train[:, i]).reshape(-1, 1)
        avg_spam[i] = np.mean(features_train[y_train==1, i]).reshape(-1, 1)
        avg_ham[i] = np.mean(features_train[y_train==0, i]).reshape(-1, 1)

    rf = search.best_estimator_
    imp = pd.DataFrame(data={'feature': feature_names, 'imp': feature_importances, 'sum': ssum, 'avg': avg, 'avg_ham': avg_ham, 'avg_spam': avg_spam})
    imp = imp.sort_values(by='imp', ascending=False)
    imp.head(20)
    imp.tail(10)
    #imp.to_csv('importances.csv', index=False)

Unnamed: 0,feature,imp,sum,avg,avg_ham,avg_spam
4,NUM,0.091413,550.569586,0.110667,0.049016,0.508171
500,topic 0,0.078473,51.603379,0.010373,0.004175,0.050333
578,capital_count,0.066757,28010.0,5.630151,4.078941,15.631737
538,topic 38,0.053829,23.56477,0.004737,0.001236,0.027305
575,length,0.04833,397790.0,79.957789,70.773624,139.173653
402,to NUM,0.034718,40.448772,0.00813,0.000486,0.057419
62,call NUM,0.029933,36.710821,0.007379,2.6e-05,0.054791
576,words,0.0296,76222.0,15.321005,14.029719,23.646707
585,has_money,0.02954,252.0,0.050653,0.004179,0.350299
61,call,0.022952,129.107862,0.025951,0.016138,0.089224


Unnamed: 0,feature,imp,sum,avg,avg_ham,avg_spam
401,tmr,3.852383e-06,10.22843,0.002056,0.002375,0.0
117,dun,3.660861e-06,16.323417,0.003281,0.00379,0.0
448,watching,3.5471e-06,11.250014,0.002261,0.002612,0.0
229,ll be,3.342423e-06,7.8392,0.001576,0.00182,0.0
213,lar,3.330078e-06,12.928998,0.002599,0.003002,0.0
108,do it,2.808439e-06,6.044677,0.001215,0.001403,0.0
295,on my,2.511915e-06,7.413922,0.00149,0.001721,0.0
231,ll call later,1.421664e-06,10.240883,0.002058,0.002378,0.0
238,love you,1.19292e-06,9.246362,0.001859,0.002147,0.0
364,sorry ll call,1.83743e-07,10.196225,0.002049,0.002367,0.0


# Further explanation on Val Data

This cool package will explain all the predictions of a tree-based model. I'll have it explain all predictions that were incorrect, to see what is going on (and hopefully inform some additional feature engineering or cleaning steps).

Note: this only works on tree-based models, like RF. This cell will crash when using, e.g., MLPClassifier

In [20]:
if feature_importances is None:
    print("No Feature importances! Skipping.")
else:

    from treeinterpreter import treeinterpreter as ti

    prediction, bias, contributions = ti.predict(clf_obj, features_val)

    for i in range(len(features_val)):
        if y_val.iloc[i] == pred_val[i]:
            continue
        print("Instance {}".format(i))
        X_val.iloc[i]
        print("Bias (trainset mean) {}".format(bias[i]))
        print("Truth {}".format(y_val.iloc[i]))
        print("Prediction {}".format(prediction[i, :]))
        print("Feature contributions:")
        con = pd.DataFrame(data={'feature': feature_names, 
                                 'value': features_val[i].A1,
                                 'legit contr': contributions[i][:, 0],
                                 'spam contr': contributions[i][:, 1],
                                 'abs contr': abs(contributions[i][:, 1])})

        con = con.sort_values(by="abs contr", ascending=False)
        con['spam cumulative'] = con['spam contr'].cumsum() + bias[i][1]
        con.head(30)
        print("-"*20) 



Instance 16


'Would you like to see my XXX pics they are so hot they were nearly banned in the uk!'

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.892 0.108]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
429,uk,0.2357023,-0.083493,0.083493,0.083493,0.217289
4,NUM,0.0,0.021193,-0.021193,0.021193,0.196096
579,num_exclamation_marks,1.0,-0.019944,0.019944,0.019944,0.21604
500,topic 0,1.048537e-05,0.014379,-0.014379,0.014379,0.201662
575,length,84.0,0.012303,-0.012303,0.012303,0.189359
581,xxx_pics_count,1.0,-0.011884,0.011884,0.011884,0.201243
578,capital_count,4.0,0.011652,-0.011652,0.011652,0.189591
576,words,19.0,-0.010094,0.010094,0.010094,0.199685
267,my,0.2357023,0.005641,-0.005641,0.005641,0.194044
560,topic 60,7.199698e-14,0.005484,-0.005484,0.005484,0.18856


--------------------


# Further exploration on Test/Kaggle Data

Note: this only works on tree-based models, like RF. This cell will crash when using, e.g., MLPClassifier

In [22]:
if  feature_importances is None:
    print("No Feature importances! Skipping.")
else:

    from treeinterpreter import treeinterpreter as ti

    prediction, bias, contributions = ti.predict(clf_obj, features_test)

    for i in range(len(features_test)):
        if y_test[i] == pred_test[i]:
            continue
        print("Instance {}".format(i))
        test_df.iloc[i, :].sms_text
        print("Bias (trainset mean) {}".format(bias[i]))
        print("Truth {}".format(y_test[i]))
        print("Prediction {}".format(prediction[i, :]))
        print("Feature contributions:")
        con = pd.DataFrame(data={'feature': feature_names,
                                 'value': features_test[i].A1,
                                 'legit contr': contributions[i][:, 0],
                                 'spam contr': contributions[i][:, 1],
                                 'abs contr': abs(contributions[i][:, 1])})
        con = con.sort_values(by="abs contr", ascending=False)
        con['spam cumulative'] = con['spam contr'].cumsum() + bias[i][1]
        con.head(30)
        print("-"*20) 

Instance 5


'SMS. ac Sptv: The New Jersey Devils and the Detroit Red Wings play Ice Hockey. Correct or Incorrect? End? Reply END SPTV'

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.74 0.26]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
578,capital_count,24.0,-0.08286,0.08286,0.08286,0.216656
538,topic 38,0.007559668,-0.077521,0.077521,0.077521,0.294178
335,reply,0.2773501,-0.072371,0.072371,0.072371,0.366549
4,NUM,0.0,0.051583,-0.051583,0.051583,0.314966
500,topic 0,0.0001830725,0.034299,-0.034299,0.034299,0.280667
576,words,22.0,-0.02167,0.02167,0.02167,0.302337
274,new,0.2773501,-0.016618,0.016618,0.016618,0.318955
300,or,0.2773501,-0.014202,0.014202,0.014202,0.333157
548,topic 48,0.04719928,-0.012706,0.012706,0.012706,0.345863
402,to NUM,0.0,0.012622,-0.012622,0.012622,0.333241


--------------------
Instance 102


"RCT' THNQ Adrian for U text. Rgds Vatian"

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.76 0.24]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
538,topic 38,0.042898,-0.153261,0.153261,0.153261,0.287057
578,capital_count,11.0,-0.094365,0.094365,0.094365,0.381422
379,text,0.7071068,-0.03858,0.03858,0.03858,0.420003
4,NUM,0.0,0.035131,-0.035131,0.035131,0.384871
575,length,40.0,0.029333,-0.029333,0.029333,0.355538
576,words,8.0,0.023022,-0.023022,0.023022,0.332516
135,for,0.7071068,-0.018969,0.018969,0.018969,0.351485
523,topic 23,0.1367002,-0.015901,0.015901,0.015901,0.367387
577,punc_count,2.0,0.009531,-0.009531,0.009531,0.357856
402,to NUM,0.0,0.009262,-0.009262,0.009262,0.348594


--------------------
Instance 121


"Sorry I missed your call let's talk when you have the time. I'm on 07090201529"

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.564 0.436]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
4,NUM,0.2672612,-0.071457,0.071457,0.071457,0.205253
61,call,0.2672612,-0.071401,0.071401,0.071401,0.276654
500,topic 0,0.01968069,-0.069114,0.069114,0.069114,0.345768
564,topic 64,0.03943996,-0.064453,0.064453,0.064453,0.41022
294,on NUM,0.2672612,-0.054224,0.054224,0.054224,0.464445
490,you have,0.2672612,-0.047784,0.047784,0.047784,0.512228
578,capital_count,3.0,0.044089,-0.044089,0.044089,0.468139
560,topic 60,0.0007041828,-0.039996,0.039996,0.039996,0.508135
510,topic 10,0.03911389,-0.038735,0.038735,0.038735,0.54687
575,length,78.0,0.038648,-0.038648,0.038648,0.508222


--------------------
Instance 136


'This message is brought to you by GMW Ltd. and is not connected to the'

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.974 0.026]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
4,NUM,0.0,0.020352,-0.020352,0.020352,0.113443
500,topic 0,8.261309000000001e-23,0.016404,-0.016404,0.016404,0.097039
253,message,0.2886751,-0.013278,0.013278,0.013278,0.110317
575,length,70.0,0.013092,-0.013092,0.013092,0.097225
578,capital_count,5.0,-0.01081,0.01081,0.01081,0.108035
549,topic 49,0.06836016,-0.00877,0.00877,0.00877,0.116805
538,topic 38,3.455682e-06,0.007963,-0.007963,0.007963,0.108842
576,words,15.0,0.006993,-0.006993,0.006993,0.101849
564,topic 64,4.522876e-11,0.005713,-0.005713,0.005713,0.096136
61,call,0.0,0.004748,-0.004748,0.004748,0.091388


--------------------
Instance 180


'In The Simpsons Movie released in July 2007 name the band that died at the start of the film? A-Green Day, B-Blue Day, C-Red Day. (Send A, B or C)'

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.884 0.116]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
578,capital_count,18.0,-0.090356,0.090356,0.090356,0.224152
575,length,146.0,-0.046769,0.046769,0.046769,0.270921
4,NUM,0.1581139,0.042718,-0.042718,0.042718,0.228203
500,topic 0,0.01473512,-0.041384,0.041384,0.041384,0.269588
538,topic 38,1.5116029999999998e-19,0.03632,-0.03632,0.03632,0.233267
576,words,30.0,-0.022512,0.022512,0.022512,0.255779
548,topic 48,0.03669924,-0.017695,0.017695,0.017695,0.273474
300,or,0.1581139,-0.016597,0.016597,0.016597,0.290071
402,to NUM,0.0,0.015857,-0.015857,0.015857,0.274214
564,topic 64,7.768123e-36,0.013826,-0.013826,0.013826,0.260387


--------------------
Instance 197


'Goal! Arsenal 4 (Henry, 7 v Liverpool 2 Henry scores with a simple shot from 6 yards from a pass by Bergkamp to give Arsenal a 2 goal margin after 78 mins.'

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.566 0.434]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
4,NUM,0.904534,-0.143388,0.143388,0.143388,0.277184
500,topic 0,0.1133496,-0.128541,0.128541,0.128541,0.405725
578,capital_count,7.0,-0.074007,0.074007,0.074007,0.479733
575,length,155.0,-0.061647,0.061647,0.061647,0.54138
538,topic 38,5.783457e-11,0.057083,-0.057083,0.057083,0.484297
579,num_exclamation_marks,1.0,-0.038857,0.038857,0.038857,0.523154
143,from,0.3015113,-0.02885,0.02885,0.02885,0.552005
61,call,0.0,0.025672,-0.025672,0.025672,0.526332
402,to NUM,0.0,0.023716,-0.023716,0.023716,0.502616
560,topic 60,6.166053e-05,-0.02343,0.02343,0.02343,0.526046


--------------------
Instance 256


'Missed call alert. These numbers called but left no message. 07008009200'

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.784 0.216]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
4,NUM,0.3535534,-0.079802,0.079802,0.079802,0.213598
61,call,0.3535534,-0.071299,0.071299,0.071299,0.284898
564,topic 64,0.05418788,-0.066905,0.066905,0.066905,0.351803
500,topic 0,0.02287467,-0.06541,0.06541,0.06541,0.417213
578,capital_count,2.0,0.044215,-0.044215,0.044215,0.372998
253,message,0.3535534,-0.031424,0.031424,0.031424,0.404423
576,words,11.0,0.025378,-0.025378,0.025378,0.379044
575,length,72.0,0.024455,-0.024455,0.024455,0.35459
62,call NUM,0.0,0.023366,-0.023366,0.023366,0.331224
518,topic 18,0.02339705,-0.019174,0.019174,0.019174,0.350398


--------------------
Instance 432


"Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg £1.50"

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.626 0.374]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
585,has_money,1.0,-0.132718,0.132718,0.132718,0.266514
5,NUM NUM,0.2357023,-0.086578,0.086578,0.086578,0.353092
4,NUM,0.4714045,-0.080637,0.080637,0.080637,0.433729
500,topic 0,0.05214772,-0.061522,0.061522,0.061522,0.495251
578,capital_count,3.0,0.05755,-0.05755,0.05755,0.437701
379,text,0.2357023,-0.036669,0.036669,0.036669,0.474371
508,topic 8,0.03729255,0.031914,-0.031914,0.031914,0.442457
496,your,0.2357023,-0.029082,0.029082,0.029082,0.471539
510,topic 10,0.04285359,-0.028468,0.028468,0.028468,0.500007
527,topic 27,0.0466229,0.025581,-0.025581,0.025581,0.474426


--------------------
Instance 512


'dating:i have had two of these. Only started after i sent a text to talk sport radio last week. Any connection do you think or coincidence?'

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.842 0.158]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
538,topic 38,0.01599823,-0.099579,0.099579,0.099579,0.233374
500,topic 0,1.398982e-05,0.050204,-0.050204,0.050204,0.18317
578,capital_count,2.0,0.049655,-0.049655,0.049655,0.133515
4,NUM,0.0,0.046275,-0.046275,0.046275,0.08724
575,length,139.0,-0.043974,0.043974,0.043974,0.131214
379,text,0.25,-0.040055,0.040055,0.040055,0.171269
560,topic 60,0.003083597,-0.029424,0.029424,0.029424,0.200694
576,words,26.0,-0.025143,0.025143,0.025143,0.225836
548,topic 48,0.0400888,-0.015303,0.015303,0.015303,0.24114
402,to NUM,0.0,0.010108,-0.010108,0.010108,0.231032


--------------------
Instance 557


'SMS. ac JSco: Energy is high, but u may not know where 2channel it. 2day ur leadership skills r strong. Psychic? Reply ANS w/question. End? Reply END JSCO'

Bias (trainset mean) [0.86620422 0.13379578]
Truth 1
Prediction [0.802 0.198]
Feature contributions:


Unnamed: 0,feature,value,legit contr,spam contr,abs contr,spam cumulative
538,topic 38,0.01435894,-0.130638,0.130638,0.130638,0.264433
578,capital_count,20.0,-0.079447,0.079447,0.079447,0.343881
335,reply,0.4850713,-0.061017,0.061017,0.061017,0.404897
500,topic 0,4.49927e-06,0.059623,-0.059623,0.059623,0.345274
4,NUM,0.0,0.047942,-0.047942,0.047942,0.297332
575,length,154.0,-0.04074,0.04074,0.04074,0.338071
576,words,28.0,-0.027811,0.027811,0.027811,0.365882
57,but,0.2425356,0.024267,-0.024267,0.024267,0.341615
507,topic 7,0.03041276,0.019712,-0.019712,0.019712,0.321903
525,topic 25,0.04668768,-0.016471,0.016471,0.016471,0.338375


--------------------
