In [None]:
"""
  Create features and run models for spam detection. 
  author: MP
  date: 4/25/2021
  
  There are three features sets:
  - Trial A: Words manually selected
  - Trial B: Words pre-designed from homework spec
  - Trial C: All unigrams
  
  There are three models:
  - Logistic Regression
  - Decision Tree Classifier
  - Multinomial Naive Bayes
"""

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.datasets import dump_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
import os
from matplotlib import pyplot

In [2]:
""" Reading in the dataset and split into train and test """

corpus_df = pd.read_csv('C:/6200-IR/homework-7-mplatt27/corpus_df_clean.csv')

In [3]:
corpus_df.drop(['Unnamed: 0'], axis=1, inplace=True)
corpus_df.head()

Unnamed: 0,doc_path,raw_doc,subject,clean_doc,label
0,C:/6200-IR/homework-7-mplatt27//trec07p/data/i...,From RickyAmes@aol.com Sun Apr 8 13:07:32 20...,"Generic Cialis, branded quality@",Do you feel the pressure to perform and not ri...,spam
1,C:/6200-IR/homework-7-mplatt27//trec07p/data/i...,From bounce-debian-mirrors=ktwarwic=speedy.uwa...,Typo in /debian/README,Hi i just updated from the gulus and I check o...,ham
2,C:/6200-IR/homework-7-mplatt27//trec07p/data/i...,From 7stocknews@tractionmarketing.com Sun Apr...,authentic viagra,Mega authenticV I A G R A DISCOUNT priceC I A ...,spam
3,C:/6200-IR/homework-7-mplatt27//trec07p/data/i...,From vqucsmdfgvsg@ruraltek.com Sun Apr 8 13:...,Nice talking with ya,Hey Billy it was really fun going out the othe...,spam
4,C:/6200-IR/homework-7-mplatt27//trec07p/data/i...,From dcube@totalink.net Sun Apr 8 13:19:30 2...,or trembling; stomach cramps; trouble in sleep...,system of the home It will have the capabiliti...,spam


In [4]:
# we only need columns with clean_doc and labels
all_data = corpus_df.iloc[:,[3,4]]
all_data.head()

Unnamed: 0,clean_doc,label
0,Do you feel the pressure to perform and not ri...,spam
1,Hi i just updated from the gulus and I check o...,ham
2,Mega authenticV I A G R A DISCOUNT priceC I A ...,spam
3,Hey Billy it was really fun going out the othe...,spam
4,system of the home It will have the capabiliti...,spam


In [5]:
# change labels to spam --> 1 and ham --> 0
all_data.loc[all_data['label'] == 'spam', 'label'] = 1
all_data.loc[all_data['label'] == 'ham', 'label'] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [6]:
all_data.head()

Unnamed: 0,clean_doc,label
0,Do you feel the pressure to perform and not ri...,1
1,Hi i just updated from the gulus and I check o...,0
2,Mega authenticV I A G R A DISCOUNT priceC I A ...,1
3,Hey Billy it was really fun going out the othe...,1
4,system of the home It will have the capabiliti...,1


In [7]:
# change type so CV can read it
all_data['clean_doc'] = all_data['clean_doc'].astype('U')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['clean_doc'] = all_data['clean_doc'].astype('U')


In [8]:
# split into train and test sets
train_x, test_x, train_y, test_y = train_test_split(all_data['clean_doc'], all_data['label'], test_size=0.2)

In [12]:
# should have 1/3 ham and 2/3 spam
print('Total: ',len(train_y))
train_y.value_counts()

Total:  60335


1    40094
0    20241
Name: label, dtype: int64

In [13]:
# should have 1/3 ham and 2/3 spam
print('Total: ',len(test_y))
test_y.value_counts()

Total:  15084


1    10105
0     4979
Name: label, dtype: int64

In [14]:
# change type again so CV can read it
train_x = train_x.astype('str')

In [15]:
train_y = train_y.astype('int')
test_y = test_y.astype('int')

In [None]:
""" TRIAL A: Create features, first with manually chosen words """

In [16]:
# TRIAL A: manually create a list of ngrams (there are 17 ngrams)

trial_a_vocab = ['free', 'click here', 'winner', 'won', 'prize', 'money', 'cash', 'porn', 'access now', 'apply now',
                 'credit', 'buy now', 'limited time', 'call now', 'congratulations', 'earn money', 'give away']


In [17]:
# create CV
vectorizer = CountVectorizer(analyzer='word', min_df=0.005, max_df=0.995, vocabulary=trial_a_vocab)
fitted_x_train_A = vectorizer.fit_transform(train_x)

In [18]:
# check the vocab words we want
vectorizer.vocabulary_

{'free': 0,
 'click here': 1,
 'winner': 2,
 'won': 3,
 'prize': 4,
 'money': 5,
 'cash': 6,
 'porn': 7,
 'access now': 8,
 'apply now': 9,
 'credit': 10,
 'buy now': 11,
 'limited time': 12,
 'call now': 13,
 'congratulations': 14,
 'earn money': 15,
 'give away': 16}

In [19]:
# check this is the size we expect
fitted_x_train_A.toarray().shape

(60335, 17)

In [20]:
# create for test
transformed_x_test_A = vectorizer.transform(test_x)

In [21]:
# dump files for later
dump_svmlight_file(fitted_x_train_A, train_y, 'trainA_dump.txt')

In [22]:
""" TRIAL B: Create features, second with pre-designed words """

def read_trainB_words(file_name):
    f = open(file_name, 'r')
    words = []
    for line in f:
        words.append(line.strip())
        
    f.close()
    return words

trial_b_vocab = read_trainB_words('C:/6200-IR/homework-7-mplatt27/trainB_words.txt')
print(trial_b_vocab)

['free', 'spam', 'click', 'buy', 'clearance', 'shopper', 'order', 'earn', 'cash', 'extra', 'money', 'double', 'collect', 'credit', 'check', 'affordable', 'fast', 'price', 'loans', 'profit', 'refinance', 'hidden', 'freedom', 'chance', 'miracle', 'lose', 'home', 'remove', 'success', 'virus', 'malware', 'ad', 'subscribe', 'sales', 'performance', 'valium', 'medicine', 'diagnostics', 'million', 'join', 'deal', 'unsolicited', 'trial', 'prize', 'now', 'legal', 'bonus', 'limited', 'instant', 'luxury', 'celebrity', 'only', 'compare', 'win', 'viagra', '$$$', '$discount', 'click here', 'meet singles', 'incredible deal', 'lose weight', 'act now', '100% free', 'fast cash', 'million dollars', 'lower interest rate', 'visit our website', 'no credit check', '0']


In [23]:
# create CV
vectorizer2 = CountVectorizer(analyzer='word', min_df=0.005, max_df=0.995, vocabulary=trial_b_vocab)
fitted_x_train_B = vectorizer2.fit_transform(train_x)
# check the vocab words we want
vectorizer2.vocabulary_

{'free': 0,
 'spam': 1,
 'click': 2,
 'buy': 3,
 'clearance': 4,
 'shopper': 5,
 'order': 6,
 'earn': 7,
 'cash': 8,
 'extra': 9,
 'money': 10,
 'double': 11,
 'collect': 12,
 'credit': 13,
 'check': 14,
 'affordable': 15,
 'fast': 16,
 'price': 17,
 'loans': 18,
 'profit': 19,
 'refinance': 20,
 'hidden': 21,
 'freedom': 22,
 'chance': 23,
 'miracle': 24,
 'lose': 25,
 'home': 26,
 'remove': 27,
 'success': 28,
 'virus': 29,
 'malware': 30,
 'ad': 31,
 'subscribe': 32,
 'sales': 33,
 'performance': 34,
 'valium': 35,
 'medicine': 36,
 'diagnostics': 37,
 'million': 38,
 'join': 39,
 'deal': 40,
 'unsolicited': 41,
 'trial': 42,
 'prize': 43,
 'now': 44,
 'legal': 45,
 'bonus': 46,
 'limited': 47,
 'instant': 48,
 'luxury': 49,
 'celebrity': 50,
 'only': 51,
 'compare': 52,
 'win': 53,
 'viagra': 54,
 '$$$': 55,
 '$discount': 56,
 'click here': 57,
 'meet singles': 58,
 'incredible deal': 59,
 'lose weight': 60,
 'act now': 61,
 '100% free': 62,
 'fast cash': 63,
 'million dollars': 64

In [24]:
# check this is the size we expect
fitted_x_train_B.toarray().shape

(60335, 69)

In [25]:
# create for test
transformed_x_test_B = vectorizer2.transform(test_x)

In [26]:
# dump files for later
dump_svmlight_file(fitted_x_train_B, train_y, 'trainB_dump.txt')

In [None]:
""" TRIAL C: Create features using all unigrams """

In [27]:
# create CV
vectorizer3 = CountVectorizer(analyzer='word', min_df=0.005, max_df=0.995)
fitted_x_train = vectorizer3.fit_transform(train_x)
# check the vocab words we want
vectorizer3.vocabulary_

{'style': 2691,
 'got': 1197,
 'http': 1322,
 'img': 1353,
 'html': 1321,
 'head': 1258,
 'title': 2853,
 'mouvement': 1787,
 'desjardins': 724,
 'meta': 1725,
 'fr': 1118,
 'generator': 1164,
 'microsoft': 1731,
 'frontpage': 1138,
 'progid': 2169,
 'level': 1558,
 'intermediate': 1425,
 'keywords': 1494,
 'description': 719,
 'theme': 2807,
 'zero': 3182,
 'default': 687,
 'border': 311,
 'verdana': 3010,
 'arial': 167,
 'helvetica': 1277,
 'bold': 305,
 'table': 2753,
 'tbody': 2770,
 'tr': 2880,
 'td': 2771,
 'https': 1323,
 'justify': 1483,
 'span': 2598,
 'font': 1092,
 'cher': 446,
 'membre': 1713,
 'accèsd': 20,
 'sansserif': 2408,
 'br': 320,
 'le': 1535,
 'département': 836,
 'de': 667,
 'vérification': 3047,
 'comptable': 545,
 'du': 828,
 'groupe': 1216,
 'un': 2943,
 'problème': 2151,
 'transaction': 2891,
 'dans': 655,
 'votre': 3045,
 'compte': 546,
 'montant': 1772,
 'été': 3188,
 'et': 936,
 'par': 1973,
 'notre': 1868,
 'nous': 1869,
 'vous': 3046,
 'avisons': 224,
 '

In [28]:
# save vocabulary for later
trial_c_vocab = list(vectorizer3.vocabulary_.keys())

In [29]:
# check this is the size we expect
fitted_x_train.toarray().shape

(60335, 3191)

In [30]:
# create for test
transformed_x_test = vectorizer3.transform(test_x)

In [31]:
# dump files for later
dump_svmlight_file(fitted_x_train, train_y, 'trainC_dump.txt')

In [None]:
""" Run classification models """

# TRIAL A (manual features): fitted_x_train_A, transformed_x_test_A
# TRIAL B (pre-designed feature set): fitted_x_train_B, transformed_x_test_B
# TRIAL C (all unigrams): fitted_x_train, transformed_x_test

In [40]:
def write_results(scores, trial, y):
    """ Write scores to doc so we can view first few spam docs 
        Score is probability that it is spam
    """
    y_list = y.tolist()
    x_list = test_x.tolist()

    name = 'trial_' + str(trial) + '.txt'
    if os.path.exists('C:/6200-IR/homework-7-mplatt27/' + name):
        os.remove('C:/6200-IR/homework-7-mplatt27/' + name)
    o = open('C:/6200-IR/homework-7-mplatt27/' + name, "w")
    
    for i in range(10):
        o.write("Preidctied probability of being spam: " + str(scores[i]) + "\n")
        if y_list[i] == 1:
            o.write("Is actually: Spam \n")
        else:
            o.write("Is actually: Ham \n")
        o.write("Text: " + x_list[i] + "\n")
        o.write("**********************************************************\n")
    o.close()
    return

def classify_documents(x_train_data, x_test_data, y_train_labels, y_test_labels, trial):
    """ Classify documents with various models. """
    
    # run logistic regression
    lr = LogisticRegression(max_iter=1000, C=0.01, penalty='l1', solver='liblinear')
    lr.fit(x_train_data, y_train_labels)
    lr_probs = lr.predict_proba(x_test_data)[:, 1]
    lr_score = roc_auc_score(np.array(y_test_labels), lr_probs)
    write_results(lr_probs, trial, y_test_labels)
    print('ROC AUC score for Logistic Regression: ', lr_score)
    print('Most important features: ')
    importance_lr = lr.coef_[0]
    features_lr = {}
    for i,v in enumerate(importance_lr):
        if trial == 'A':
            features_lr[trial_a_vocab[i]] = v
        elif trial == 'B':
            features_lr[trial_b_vocab[i]] = v
        elif trial == 'C':
            features_lr[trial_c_vocab[i]] = v
            
    features_lr = {k: v for k, v in sorted(features_lr.items(), key=lambda item: item[1], reverse=True)}
    i = 0
    for feat, sc in features_lr.items():
        if i > 5:
            break
        print(feat, "\t", round(sc,2))
        i += 1
    print("**************************************************\n")
        
    
    # run decision tree
    dt = DecisionTreeClassifier()
    dt.fit(x_train_data, y_train_labels)
    dt_probs = dt.predict_proba(x_test_data)[:, 1]
    dt_score = roc_auc_score(np.array(y_test_labels), dt_probs)
    write_results(dt_probs, trial, y_test_labels)
    print('ROC AUC score for Decision Tree: ', dt_score)
    print('Most important features: ')
    importance_dt = dt.feature_importances_
    features_dt = {}
    for i,v in enumerate(importance_dt):
        if trial == 'A':
            features_dt[trial_a_vocab[i]] = v
        elif trial == 'B':
            features_dt[trial_b_vocab[i]] = v
        elif trial == 'C':
            features_dt[trial_c_vocab[i]] = v
    features_dt = {k: v for k, v in sorted(features_dt.items(), key=lambda item: item[1], reverse=True)}
    i = 0
    for feat, sc in features_dt.items():
        if i > 5:
            break
        print(feat, "\t", round(sc,2))
        i += 1
    print("**************************************************\n")
    
    # run naive bayes
    nb = MultinomialNB()
    nb.fit(x_train_data, y_train_labels)
    nb_probs = nb.predict_proba(x_test_data)[:, 1]
    nb_score = roc_auc_score(np.array(y_test_labels), nb_probs)
    write_results(nb_probs, trial, y_test_labels)
    print('ROC AUC score for Naive Bayes: ', nb_score)
    importance_nb = nb.coef_[0]
    features_nb = {}
    for i,v in enumerate(importance_nb):
        if trial == 'A':
            features_nb[trial_a_vocab[i]] = v
        elif trial == 'B':
            features_nb[trial_b_vocab[i]] = v
        elif trial == 'C':
            features_nb[trial_c_vocab[i]] = v
    features_nb = {k: v for k, v in sorted(features_nb.items(), key=lambda item: item[1], reverse=True)}
    i = 0
    for feat, sc in features_nb.items():
        if i > 5:
            break
        print(feat, "\t", round(sc,2))
        i += 1
    print("**************************************************\n")

In [41]:
# run classifiers on TRIAL A (manual features)
classify_documents(fitted_x_train_A, transformed_x_test_A, train_y, test_y, "A")

ROC AUC score for Logistic Regression:  0.5915500420916787
Most important features: 
money 	 1.09
credit 	 0.49
cash 	 0.03
click here 	 0.0
winner 	 0.0
won 	 0.0
**************************************************

ROC AUC score for Decision Tree:  0.6118970631625615
Most important features: 
money 	 0.58
free 	 0.13
credit 	 0.12
won 	 0.04
cash 	 0.04
congratulations 	 0.03
**************************************************

ROC AUC score for Naive Bayes:  0.5957629366446447
money 	 -0.74
free 	 -1.36
credit 	 -1.88
cash 	 -2.97
won 	 -3.93
prize 	 -4.28
**************************************************





In [42]:
# run classifiers on TRIAL B (pre-designed features)
classify_documents(fitted_x_train_B, transformed_x_test_B, train_y, test_y, "B")

ROC AUC score for Logistic Regression:  0.7486095733699549
Most important features: 
viagra 	 1.47
money 	 1.08
buy 	 0.84
fast 	 0.57
refinance 	 0.56
bonus 	 0.55
**************************************************

ROC AUC score for Decision Tree:  0.8351095978667058
Most important features: 
click 	 0.12
money 	 0.1
viagra 	 0.08
buy 	 0.07
now 	 0.05
price 	 0.05
**************************************************

ROC AUC score for Naive Bayes:  0.7342426613349546
price 	 -1.98
now 	 -2.12
only 	 -2.34
viagra 	 -2.37
money 	 -2.61
buy 	 -2.98
**************************************************





In [43]:
# run classifiers on TRIAL C (all unigrams)
classify_documents(fitted_x_train, transformed_x_test, train_y, test_y, "C")

ROC AUC score for Logistic Regression:  0.9905844129708952
Most important features: 
forum 	 1.0
continuously 	 0.8
becomes 	 0.55
interested 	 0.47
hex 	 0.4
difficult 	 0.31
**************************************************

ROC AUC score for Decision Tree:  0.9814649335223774
Most important features: 
experience 	 0.28
ordering 	 0.19
os 	 0.11
guarantees 	 0.11
careful 	 0.04
forum 	 0.03
**************************************************

ROC AUC score for Naive Bayes:  0.979672258319181
scope 	 -3.16
protéger 	 -3.64
closer 	 -3.65
degree 	 -3.82
industrious 	 -4.0
developing 	 -4.27
**************************************************



