# Scam Detector: Decision Tree

    This Jupyter Notebook will be used to run a Decision Tree Algorithm to predict if a given email is a scam or a ham(a normal email).

## Import Packages

In [1]:
#import the packages we need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

## Retrieve Data

In [2]:
"""
test_col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
test_path = "IRIS.csv"
test = pd.read_csv(path)
test["species"] = pd.factorize(test['species'])[0]
test.head(100)
"""

'\ntest_col_names = [\'sepal_length\', \'sepal_width\', \'petal_length\', \'petal_width\', \'species\']\ntest_path = "IRIS.csv"\ntest = pd.read_csv(path)\ntest["species"] = pd.factorize(test[\'species\'])[0]\ntest.head(100)\n'

In [3]:
col_names = ['sender', 'receiver', 'subject', 'body', 'label', 'urls']
path = "./data/CEAS_08.csv"
data = pd.read_csv(path)
data = data.drop('date', axis=1)
col_names[-1], col_names[-2] = col_names[-2], col_names[-1]
data = data[col_names]
data.head(10)

Unnamed: 0,sender,receiver,subject,body,urls,label
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,1,0
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1
5,Caroline Aragon <dwthaidomainnamesm@thaidomain...,user7-ext5@gvc.ceas-challenge.cc,From Caroline Aragon,\n\n\n\n\nYo wu urS mo ou go rc ebo eForM rgi ...,0,1
6,Replica Watches <jhorton@thebakercompanies.com>,user2.10@gvc.ceas-challenge.cc,Replica Watches,We have fake Swiss Men's and Ladie's Replica \...,0,1
7,Daily Top 10 <acidirev_1972@tcwpg.com>,user2.3@gvc.ceas-challenge.cc,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
8,qydlqcws-iacfym@issues.apache.org,xrh@spamassassin.apache.org,[Bug 5780] URI processing turns uuencoded stri...,http://issues.apache.org/SpamAssassin/show_bug...,1,0
9,Daily Top 10 <orn|dent_1973@musicaedischi.it>,user7@gvc.ceas-challenge.cc,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1


In [4]:
rec_mode = data['receiver'].mode()[0]
data['receiver'].fillna(rec_mode, inplace=True)
sub_mode = data['subject'].mode()[0]
data['subject'].fillna(sub_mode, inplace=True)

## TF-IDF

In [5]:
corpus_sender = data['sender'][0:]
vectorizer_send = TfidfVectorizer()
send = vectorizer_send.fit_transform(corpus_sender)
print(vectorizer_send.get_feature_names_out().shape)
print(send.shape)
send_means = np.mean(send.toarray(), axis=1)

(49703,)
(39154, 49703)


In [6]:
corpus_rec = data['receiver'][0:]
vectorizer_rec = TfidfVectorizer()
rec = vectorizer_rec.fit_transform(corpus_rec)
print(vectorizer_rec.get_feature_names_out().shape)
print(rec.shape)
rec_means = np.mean(rec.toarray(), axis=1)

(9516,)
(39154, 9516)


In [7]:
corpus_sub = data['subject'][0:]
vectorizer_sub = TfidfVectorizer()
sub = vectorizer_sub.fit_transform(corpus_sub)
print(vectorizer_sub.get_feature_names_out().shape)
print(sub.shape)
sub_means = np.mean(sub.toarray(), axis=1)

(15339,)
(39154, 15339)


In [8]:
corpus_body = data['body'][0:]
vectorizer_body = TfidfVectorizer()
body = vectorizer_body.fit_transform(corpus_body)
print(vectorizer_body.get_feature_names_out().shape)
print(body.shape)

(183381,)
(39154, 183381)


In [None]:
body_arr = body.toarray().astype(np.float16)
body_means = np.mean(body_arr, axis=1)

In [68]:
testdf = pd.DataFrame(body.toarray(), columns=vectorizer_body.get_feature_names_out())
testdf.head()

Unnamed: 0,00,000,0000,00000,000000,0000000,00000000,000000000,000000000000,0000000000000000,...,胜是神仙,自分自身と向き合ったり,详细信息,质量保证,走质量路线,这两天用gdb很多,遇事不钻牛角尖,野宮,首选广州新桥中英文笔译专家,벬9랷
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Decision Class

In [34]:
class Decision:
    """ A decision is used to ask the question at a decision node to split the data.
    This class records column number and values and matches the stored feature value to a give feature value
    """
    
    def __init__(self, feature_index, threshold):
        self.feature_index = feature_index
        self.threshold = threshold
        
    def ask(self, input):
        # Compares input feature value to stored value
        feature_val = input[self.feature_index]
        if isinstance(feature_val, (int, float)):
            try:
                feature_val >= self.threshold
            except TypeError:
                print("Error: Type")
                print(feature_val)
                print(self.threshold)
            return feature_val >= self.threshold
        else:
            return feature_val == self.threshold
        

## Helper Functions for Splitting

In [35]:
def divide_df(rows, decision):
    # Partitions a data frame
    # Check if each row matches decision, divide into true and false
    
    left, right = [], []
    for row in rows:
        left.append(row) if decision.ask(row) else right.append(row)
    return left, right

In [36]:
def label_count(rows):
    # Counts the number of each classification in data frame
    label_counts = {}
    for row in rows:
        label = row[-1]
        if label not in label_counts:
            label_counts[label] = 0
        label_counts[label] += 1
    return label_counts

In [37]:
def gini_impurity(rows):
    #Calculates Gini Impurity for a data frame of rows.
    
    label_counts = label_count(rows)
    gini = 1.0
    for label in label_counts:
        label_prob = label_counts[label]/float(len(rows))
        gini -= label_prob**2
    return gini

In [38]:
def info_gain(left, right, curr_gini):
    #Information gain: Gini of the root node subtracted by the impurty of the two children nodes.
    prob = float(len(left) / (len(left) + len(right)))
    return curr_gini - prob * gini_impurity(left) - (1 - prob) * gini_impurity(right)
                 

In [39]:
def info_gain_split(rows):
    #Find best decision to make based on informaiton gain
    highest_gain = 0
    optimal_decision = None
    curr_gini = gini_impurity(rows)
    feature_count = len(rows[0]) -1
    
    for feature_index in range(0, feature_count):
        
        unique = set(row[feature_index] for row in rows) #unique column values
        
        for candidate in unique:
            decision = Decision(feature_index, candidate)
            
            left, right = divide_df(rows, decision)
            
            #continue if no split
            if(len(left) == 0 or len(right) == 0):
                continue
            
            #information gain from this split
            gain = info_gain(left, right, curr_gini)
            
            if gain > highest_gain:
                highest_gain, optimal_decision = gain, decision
                
    return highest_gain, optimal_decision

## Build Tree and Node Classes

In [40]:
class LeafNode:
    # A leaf Node holdes classified data.
    # Holds a dictionary with class counts in the leaf.
    
    def __init__(self,rows):
        self.pred = label_count(rows)

In [41]:
class DecisionNode:
    # A Decision Node asks a Decision to be made.
    # Holds reference to a Decision, and two child nodes.
    
    def __init__(self, decision, left, right):
        self.decision = decision
        self.left = left
        self.right = right

In [42]:
def build_tree(rows):
    # Recursively Builds tree.
    
    highest_gain, optimal_decision = info_gain_split(rows)
    
    #Base case no further gain
    if highest_gain == 0:
        return LeafNode(rows)
    
    #Found Partition
    left, right = divide_df(rows, optimal_decision)
    
    #Recurse Left Subtree
    left_subtree = build_tree(left)
    
    #Recurse Right Subtree
    right_subtree = build_tree(right)
    
    #Return Decision Node
    return DecisionNode(optimal_decision, left_subtree, right_subtree)

In [51]:
#print(data.shape)
df_list = data[0:1000].values.tolist()
my_tree = build_tree(df_list)

In [52]:
def predict(row, curr_node):
    #Base Case: Curr node is a leaf
    if isinstance(curr_node, LeafNode):
        return curr_node.pred
    
    #Recurse the left or right subtree
    if curr_node.decision.ask(row):
        return predict(row, curr_node.left)
    else:
        return predict(row, curr_node.right)

In [69]:
test_data = {'sender': ['d@gmail.com'], 'receiver': ['a@gmail.com'], 'subject': ['Hi'], 'body':['Buy my stuff, ILLEGAL']}
test_df = pd.DataFrame(test_data)
test_df.head()
test_list = test_df.values.tolist()[0]
print(test_list)
predict(test_list, my_tree)

['d@gmail.com', 'a@gmail.com', 'Hi', 'Buy my stuff, ILLEGAL']


IndexError: list index out of range