# Scam Detector: Decision Tree

    This Jupyter Notebook will be used to run a Decision Tree Algorithm to predict if a given email is a scam or a ham(a normal email).

## Import Packages

In [232]:
#import the packages we need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

## Retrieve Data

In [234]:
col_names = ['sender', 'receiver', 'subject', 'body', 'label', 'urls']
path = "./data/CEAS_08.csv"
data = pd.read_csv(path)
data = data.drop('date', axis=1)
col_names[-1], col_names[-2] = col_names[-2], col_names[-1]
data = data[col_names]
data.head(10)

Unnamed: 0,sender,receiver,subject,body,urls,label
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,1,0
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1
5,Caroline Aragon <dwthaidomainnamesm@thaidomain...,user7-ext5@gvc.ceas-challenge.cc,From Caroline Aragon,\n\n\n\n\nYo wu urS mo ou go rc ebo eForM rgi ...,0,1
6,Replica Watches <jhorton@thebakercompanies.com>,user2.10@gvc.ceas-challenge.cc,Replica Watches,We have fake Swiss Men's and Ladie's Replica \...,0,1
7,Daily Top 10 <acidirev_1972@tcwpg.com>,user2.3@gvc.ceas-challenge.cc,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
8,qydlqcws-iacfym@issues.apache.org,xrh@spamassassin.apache.org,[Bug 5780] URI processing turns uuencoded stri...,http://issues.apache.org/SpamAssassin/show_bug...,1,0
9,Daily Top 10 <orn|dent_1973@musicaedischi.it>,user7@gvc.ceas-challenge.cc,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1


In [235]:
rec_mode = data['receiver'].mode()[0]
data['receiver'].fillna(rec_mode, inplace=True)
sub_mode = data['subject'].mode()[0]
data['subject'].fillna(sub_mode, inplace=True)

## TF-IDF

In [236]:
corpus_sender = data['sender'][0:]
send_lengths = np.array([len(text.split()) for text in corpus_sender])
vectorizer_send = TfidfVectorizer()
send = vectorizer_send.fit_transform(corpus_sender)
print(vectorizer_send.get_feature_names_out().shape)
print(send.shape)

(49703,)
(39154, 49703)


In [237]:
send_sum = np.array(send.sum(axis=1)).flatten()
send_normalized = send_sum / (send_lengths + 1e-6)
send_normalized.shape

(39154,)

In [238]:
corpus_rec = data['receiver'][0:]
rec_lengths = np.array([len(text.split()) for text in corpus_rec])
vectorizer_rec = TfidfVectorizer()
rec = vectorizer_rec.fit_transform(corpus_rec)
print(vectorizer_rec.get_feature_names_out().shape)
print(rec.shape)

(9516,)
(39154, 9516)


In [239]:
rec_sum = np.array(rec.sum(axis=1)).flatten()
rec_normalized = rec_sum / (rec_lengths + 1e-6)
rec_normalized.shape

(39154,)

In [240]:
corpus_sub = data['subject'][0:]
sub_lengths = np.array([len(text.split()) for text in corpus_sub])
vectorizer_sub = TfidfVectorizer()
sub = vectorizer_sub.fit_transform(corpus_sub)
print(vectorizer_sub.get_feature_names_out().shape)
print(sub.shape)

(15339,)
(39154, 15339)


In [241]:
sub_sum = np.array(sub.sum(axis=1)).flatten()
sub_normalized = sub_sum / (sub_lengths + 1e-6)
sub_normalized.shape

(39154,)

In [242]:
corpus_body = data['body'][0:]
body_lengths = np.array([len(text.split()) for text in corpus_body])
vectorizer_body = TfidfVectorizer()
body = vectorizer_body.fit_transform(corpus_body)
print(vectorizer_body.get_feature_names_out().shape)
print(body.shape)

(183381,)
(39154, 183381)


In [243]:
body_sum = np.array(body.sum(axis=1)).flatten()
body_normalized = body_sum / (body_lengths + 1e-6)
body_normalized.shape

(39154,)

In [244]:
data['body']= body_normalized
data['subject']= sub_normalized
data['receiver']= rec_normalized
data['sender']= send_normalized

In [245]:
data.head(10)

Unnamed: 0,sender,receiver,subject,body,urls,label
0,0.604142,1.830326,0.353418,0.12523,1,1
1,1.113127,2.181456,0.576909,0.318952,1,1
2,0.64356,2.181456,0.558653,0.026979,1,1
3,0.679424,0.698137,0.342154,0.001695,1,0
4,0.690642,2.181456,0.999999,1.435365,1,1
5,0.689706,2.072763,0.540761,0.169657,0,1
6,0.687507,2.083376,0.706872,0.2009,0,1
7,0.512329,2.181456,0.558653,0.027012,1,1
8,2.19652,1.94804,0.245374,0.15752,1,0
9,0.5939,2.073301,0.558653,0.027004,1,1


## Decision Class

In [246]:
class Decision:
    """ A decision is used to ask the question at a decision node to split the data.
    This class records column number and values and matches the stored feature value to a give feature value
    """
    
    def __init__(self, feature_index, threshold):
        self.feature_index = feature_index
        self.threshold = threshold
        
    def ask(self, input):
        # Compares input feature value to stored value
        feature_val = input[self.feature_index]
        if isinstance(feature_val, (int, float, np.number)):
            return feature_val >= self.threshold
        else:
            return feature_val == self.threshold
        

## Helper Functions for Splitting

In [247]:
def divide_df(rows, decision):
    # Partitions a data frame
    # Check if each row matches decision, divide into true and false
    col = rows[:, decision.feature_index]
    if np.issubdtype(col.dtype, np.number):
        mask = col >= decision.threshold
    else:
        mask = col == decision.threshold
    left, right = rows[mask],rows[~mask]
    return left, right

In [248]:
def label_count(rows):
    # Counts the number of each classification in data frame
    y = rows[:, -1]
    unique, label_counts = np.unique(y, return_counts=True)
    return dict(zip(unique,label_counts))

In [249]:
def gini_impurity(rows):
    #Calculates Gini Impurity for a data frame of rows.
    y = rows[:, -1]
    _, label_counts = np.unique(y, return_counts=True)
    probs = label_counts/label_counts.sum()
    return 1.0 - np.sum(probs**2)

In [250]:
def info_gain(left, right, curr_gini):
    #Information gain: Gini of the root node subtracted by the impurty of the two children nodes.
    if len(left) + len(right) == 0:
        return 0
    prob = float(len(left) / (len(left) + len(right)))
    return curr_gini - prob * gini_impurity(left) - (1 - prob) * gini_impurity(right)
                 

In [251]:
def threshold_candidates(col, max_thresh=5):
    #Choose candidate threshold split
    unique = np.unique(col)
    if len(unique) > max_thresh:
        quantile = np.linspace(0, 100, max_thresh + 2)[1:-1]
        unique = np.percentile(unique, quantile)
    if len(unique) > 1:
        return (unique[:-1] + unique[1:])/2
    else:
        return unique

In [252]:
def info_gain_split(rows):
    #Find best decision to make based on informaiton gain
    X = rows[:, :-1]
    y = rows[:, -1]
    curr_gini = gini_impurity(rows)
    feature_count = X.shape[1]
    
    highest_gain = 0
    optimal_decision = None
    
    for feature_index in range(feature_count):
        col = X[:, feature_index]
        
        #Candidate Thresholds
        thresholds = threshold_candidates(col) if np.issubdtype(col.dtype, np.number) else np.unique(col)
        
        for candidate in thresholds:
            if np.issubdtype(col.dtype, np.number):
                mask = col >= candidate
            else:
                mask = col == candidate
            
            if mask.sum() == 0 or mask.sum() == len(mask):
                continue
        
            left, right = rows[mask], rows[~mask]
            gain = info_gain(left, right, curr_gini)
            
            if gain > highest_gain:
                highest_gain, optimal_decision = gain, Decision(feature_index, candidate)
                
    return highest_gain, optimal_decision

## Build Tree and Node Classes

In [253]:
class LeafNode:
    # A leaf Node holdes classified data.
    # Holds a dictionary with class counts in the leaf.
    
    def __init__(self,rows):
        self.pred = label_count(rows)

In [254]:
class DecisionNode:
    # A Decision Node asks a Decision to be made.
    # Holds reference to a Decision, and two child nodes.
    
    def __init__(self, decision, left, right):
        self.decision = decision
        self.left = left
        self.right = right

In [255]:
def build_tree(rows, depth=0, max_depth=10, min_sample_split=5):
    # Recursively Builds tree.
    if len(rows) < min_sample_split or depth >= max_depth:
        return LeafNode(rows)
    
    highest_gain, optimal_decision = info_gain_split(rows)
    
    #Base case no further gain
    if highest_gain == 0 or optimal_decision is None:
        return LeafNode(rows)
    
    #Found Partition
    left, right = divide_df(rows, optimal_decision)
    
    #Recurse Left Subtree
    left_subtree = build_tree(left, depth+1, max_depth, min_sample_split)
    
    #Recurse Right Subtree
    right_subtree = build_tree(right, depth+1, max_depth, min_sample_split)
    
    #Return Decision Node
    return DecisionNode(optimal_decision, left_subtree, right_subtree)

In [256]:
#print(data.shape)
#x = data.iloc[:, :-1]
#x
my_tree = build_tree(data.to_numpy())

In [284]:
def predict(row, curr_node):
    #Base Case: Curr node is a leaf
    if isinstance(curr_node, LeafNode):
        total = sum(curr_node.pred.values())
        return max(curr_node.pred, key=curr_node.pred.get), {k: v/total for k,v in curr_node.pred.items()}
    
    #Recurse the left or right subtree
    if curr_node.decision.ask(row):
        return predict(row, curr_node.left)
    else:
        return predict(row, curr_node.right)

In [296]:
test_data = {'sender': ['d@gmail.com'], 'receiver': ['a@gmail.com'], 'subject': ['URGENT QUICK MONEY'], 'body':['Buy my stuff, ILLEGAL, Great opportunity, make money, own your life, passive income, money quick URGENT OPPORTUNITY'], 'urls':[0]}
test_df = pd.DataFrame(test_data)

In [297]:
new_body = vectorizer_body.transform(test_df['body'])
new_body_sum = new_body.sum(axis=1).flatten()
nonzero_body = (new_body > 0).sum(axis=1).flatten()

new_body_normalized = (new_body_sum/nonzero_body if nonzero_body > 0 else 0)

new_send = vectorizer_send.transform(test_df['sender'])
new_send_sum = new_send.sum(axis=1).flatten()
nonzero_send = (new_send > 0).sum(axis=1).flatten()

new_send_normalized = (new_send_sum/nonzero_send if nonzero_send > 0 else 0)

new_rec = vectorizer_rec.transform(test_df['receiver'])
new_rec_sum = new_rec.sum(axis=1).flatten()
nonzero_rec = (new_rec > 0).sum(axis=1).flatten()

new_rec_normalized = (new_rec_sum/nonzero_rec if nonzero_rec > 0 else 0)

new_sub = vectorizer_sub.transform(test_df['subject'])
new_sub_sum = new_sub.sum(axis=1).flatten()
nonzero_sub = (new_sub > 0).sum(axis=1).flatten()

new_sub_normalized = (new_sub_sum/nonzero_sub if nonzero_sub > 0 else 0)

test_df['body'] = new_body_normalized
test_df['sender'] = new_send_normalized
test_df['receiver'] = new_rec_normalized
test_df['subject'] = new_sub_normalized
test_df

Unnamed: 0,sender,receiver,subject,body,urls
0,0.66105,0.699575,0.574969,0.23439,0


In [298]:
predict(test_df.to_numpy()[0], my_tree)

(1.0, {0.0: 0.1111111111111111, 1.0: 0.8888888888888888})