# Scam Detector: Decision Tree

    This Jupyter Notebook will be used to run a Decision Tree Algorithm to predict if a given email is a scam or a ham(a normal email).

## Import Packages

In [131]:
#import the packages we need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import sys
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [132]:
csv.field_size_limit(sys.maxsize)

path = "./data/Enron.csv"
data2 = pd.read_csv(path, engine='python', on_bad_lines='skip', encoding='latin1')

In [133]:
data2.insert(loc=0, column='sender', value='Empty')
data2.insert(loc=3, column='url', value=np.random.randint(0, 2, size=data2.shape[0]))
#print(svd.explained_variance_ratio_.sum())  

In [134]:
data2.shape

(29767, 5)

## Retrieve Data

In [135]:
col_names = ['sender', 'receiver', 'subject', 'body', 'label', 'urls']
path = "./data/CEAS_08.csv"
data1 = pd.read_csv(path)
data1 = data1.drop('date', axis=1)
col_names[-1], col_names[-2] = col_names[-2], col_names[-1]
data1 = data1[col_names]
data1.head(10)
counts = data1['urls'].value_counts()
print(counts)
data1.head()

urls
1    26232
0    12922
Name: count, dtype: int64


Unnamed: 0,sender,receiver,subject,body,urls,label
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,1,0
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1


In [136]:
data1 = data1.drop('receiver', axis=1)

In [137]:
data = data1
data.shape

(39154, 5)

In [138]:
#send_mode = data['sender'].mode()[0]
data['sender'].fillna('Empty', inplace=True)
#sub_mode = data['subject'].mode()[0]
data['subject'].fillna('Empty', inplace=True)
#body_mode = data['body'].mode()[0]
data['body'].fillna('Empty', inplace=True)

data['sender'].isnull().any()

False

## TF-IDF

In [139]:
# Text Vectorization for numeric interpretations
corpus_sender = data['sender'][0:]
vectorizer_send = TfidfVectorizer()
send = vectorizer_send.fit_transform(corpus_sender)
print(vectorizer_send.get_feature_names_out().shape)
print(send.shape)

(49703,)
(39154, 49703)


In [140]:
svd_send = TruncatedSVD(n_components=5, random_state=42)
send_reduced = svd_send.fit_transform(send)

In [141]:
# Text Vectorization for numeric interpretations
corpus_sub = data['subject'][0:]
vectorizer_sub = TfidfVectorizer()
sub = vectorizer_sub.fit_transform(corpus_sub)
print(vectorizer_sub.get_feature_names_out().shape)
print(sub.shape)

(15339,)
(39154, 15339)


In [142]:
svd_sub = TruncatedSVD(n_components=50, random_state=42)
sub_reduced = svd_sub.fit_transform(sub)

In [143]:
# Text Vectorization for numeric interpretations
corpus_body = data['body'][0:]
vectorizer_body = TfidfVectorizer()
body = vectorizer_body.fit_transform(corpus_body)
print(vectorizer_body.get_feature_names_out().shape)
print(body.shape)

(183381,)
(39154, 183381)


In [144]:
svd = TruncatedSVD(n_components=300, random_state=42)
body_reduced = svd.fit_transform(body)

In [145]:
dupes_data = data.index[data.index.duplicated()]
print("data duplicates:", dupes_data)

data duplicates: Index([], dtype='int64')


In [146]:
body_df = pd.DataFrame(body_reduced, columns=range(1,301))
send_df = pd.DataFrame(send_reduced, columns=range(1,6))
sub_df = pd.DataFrame(sub_reduced, columns=range(1,51))

new_data = pd.concat([body_df, send_df, sub_df, data], axis=1)
new_data = new_data.drop('body', axis=1)
new_data = new_data.drop('sender', axis=1)
new_data = new_data.drop('subject', axis=1)
new_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,43,44,45,46,47,48,49,50,urls,label
0,0.063286,0.130304,-0.022210,-0.005088,-0.021140,-0.086163,-0.025422,0.049411,-0.009487,0.007434,...,-4.764230e-02,-7.083443e-03,2.977597e-02,2.493743e-02,-3.583672e-02,-1.147733e-02,-1.518206e-02,-8.340481e-03,1,1
1,0.091167,0.038318,-0.022531,0.022039,-0.029642,-0.005703,0.064485,0.063031,-0.026433,0.021969,...,-6.947093e-04,1.107725e-03,1.382440e-03,-3.354624e-03,2.064797e-04,2.642337e-03,-4.453137e-04,-5.294903e-04,1,1
2,0.885237,-0.261488,-0.148659,-0.008257,-0.004082,-0.004655,-0.014338,-0.022398,0.284830,-0.191905,...,7.896102e-04,1.134033e-04,-8.621442e-04,1.621947e-04,1.372798e-04,1.056232e-03,-1.293172e-04,-2.332366e-04,1,1
3,0.087471,0.014409,-0.053204,0.003458,0.008816,-0.016598,0.010920,0.020323,0.016892,0.057482,...,1.498694e-03,-3.121696e-02,-1.137660e-02,1.542312e-03,-3.588918e-03,3.282780e-02,3.599037e-02,9.373170e-05,1,0
4,0.081588,-0.029447,-0.041692,0.007864,0.000755,0.001088,0.014785,0.007192,0.007662,0.015371,...,1.160800e-12,-6.270052e-14,-1.896029e-12,-3.245247e-12,9.334881e-13,-2.032660e-12,5.976050e-13,-2.425939e-12,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39149,0.485460,0.129866,0.795481,-0.044375,0.109069,-0.004141,0.015776,-0.004536,0.038486,-0.030253,...,-3.794575e-04,2.223317e-04,1.878732e-04,-2.828636e-04,-2.928563e-04,-3.381019e-04,1.602866e-04,3.355469e-05,0,1
39150,0.498714,0.129111,0.793061,-0.037759,0.100212,0.021168,-0.000929,-0.004003,-0.017608,0.001443,...,-3.794575e-04,2.223317e-04,1.878732e-04,-2.828636e-04,-2.928563e-04,-3.381019e-04,1.602866e-04,3.355469e-05,0,1
39151,0.068910,0.181543,-0.031943,-0.003263,-0.058700,-0.040715,-0.064277,-0.008586,-0.011058,-0.010870,...,9.661311e-04,-1.178296e-03,-3.836210e-03,-8.758776e-04,3.468365e-04,-7.777360e-04,1.695609e-04,-1.870242e-03,0,0
39152,0.069078,0.165408,-0.013783,0.000583,-0.043534,-0.053345,-0.018408,0.018679,0.005152,0.006375,...,2.226965e-02,1.191468e-02,-1.097488e-02,-2.147922e-02,-2.401658e-03,-2.895141e-03,-6.864627e-03,1.939919e-02,0,0


In [147]:
data = new_data

In [148]:
data.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,43,44,45,46,47,48,49,50,urls,label
0,0.063286,0.130304,-0.02221,-0.005088,-0.02114,-0.086163,-0.025422,0.049411,-0.009487,0.007434,...,-0.0476423,-0.007083443,0.02977597,0.02493743,-0.03583672,-0.01147733,-0.01518206,-0.008340481,1,1
1,0.091167,0.038318,-0.022531,0.022039,-0.029642,-0.005703,0.064485,0.063031,-0.026433,0.021969,...,-0.0006947093,0.001107725,0.00138244,-0.003354624,0.0002064797,0.002642337,-0.0004453137,-0.0005294903,1,1
2,0.885237,-0.261488,-0.148659,-0.008257,-0.004082,-0.004655,-0.014338,-0.022398,0.28483,-0.191905,...,0.0007896102,0.0001134033,-0.0008621442,0.0001621947,0.0001372798,0.001056232,-0.0001293172,-0.0002332366,1,1
3,0.087471,0.014409,-0.053204,0.003458,0.008816,-0.016598,0.01092,0.020323,0.016892,0.057482,...,0.001498694,-0.03121696,-0.0113766,0.001542312,-0.003588918,0.0328278,0.03599037,9.37317e-05,1,0
4,0.081588,-0.029447,-0.041692,0.007864,0.000755,0.001088,0.014785,0.007192,0.007662,0.015371,...,1.1608e-12,-6.270052e-14,-1.896029e-12,-3.245247e-12,9.334881e-13,-2.03266e-12,5.97605e-13,-2.425939e-12,1,1
5,0.004048,0.007168,-0.002666,2e-06,-0.004115,-0.007377,0.000176,0.001339,0.002738,0.003298,...,0.007622347,0.02249529,-0.001773222,0.0278151,-0.02497209,-0.02838489,0.01075055,0.01363592,0,1
6,0.060204,0.140164,0.007587,0.329133,0.088225,-0.023446,0.015108,0.040557,-0.017742,-0.016814,...,-0.001302608,-0.05662889,0.01171058,-0.05614383,0.006045651,-0.0140136,-0.03115993,-0.03991532,0,1
7,0.884718,-0.261513,-0.148607,-0.008399,-0.003513,-0.00565,-0.014232,-0.022161,0.284683,-0.19166,...,0.0007896102,0.0001134033,-0.0008621442,0.0001621947,0.0001372798,0.001056232,-0.0001293172,-0.0002332366,1,1
8,0.066816,0.144864,-0.026336,-0.005721,-0.011312,-0.041102,-0.052131,-0.007147,0.043906,0.086705,...,0.0004453657,-0.005174961,-0.006072182,0.005174396,0.01931305,-0.01841665,-0.01640152,0.002721182,1,0
9,0.884873,-0.261542,-0.14863,-0.008386,-0.003568,-0.005556,-0.014242,-0.022186,0.284729,-0.191698,...,0.0007896102,0.0001134033,-0.0008621442,0.0001621947,0.0001372798,0.001056232,-0.0001293172,-0.0002332366,1,1


## Decision Class

In [149]:
class Decision:
    """ A decision is used to ask the question at a decision node to split the data.
    This class records column number and values and matches the stored feature value to a give feature value
    """
    
    def __init__(self, feature_index, threshold):
        self.feature_index = feature_index
        self.threshold = threshold
        
    def ask(self, input):
        # Compares input feature value to stored value
        feature_val = input[self.feature_index]
        if isinstance(feature_val, (int, float, np.number)):
            return feature_val >= self.threshold
        else:
            return feature_val == self.threshold
        

## Helper Functions for Splitting

In [150]:
def divide_df(rows, decision):
    # Partitions a data frame
    # Check if each row matches decision, divide into true and false
    col = rows[:, decision.feature_index]
    if np.issubdtype(col.dtype, np.number):
        mask = col >= decision.threshold
    else:
        mask = col == decision.threshold
    left, right = rows[mask],rows[~mask]
    return left, right

In [151]:
def label_count(rows):
    # Counts the number of each classification in data frame
    y = rows[:, -1]
    unique, label_counts = np.unique(y, return_counts=True)
    return dict(zip(unique,label_counts))

In [152]:
def gini_impurity(rows):
    #Calculates Gini Impurity for a data frame of rows.
    y = rows[:, -1]
    _, label_counts = np.unique(y, return_counts=True)
    probs = label_counts/label_counts.sum()
    return 1.0 - np.sum(probs**2)

In [153]:
def info_gain(left, right, curr_gini):
    #Information gain: Gini of the root node subtracted by the impurty of the two children nodes.
    if len(left) + len(right) == 0:
        return 0
    prob = float(len(left) / (len(left) + len(right)))
    return curr_gini - prob * gini_impurity(left) - (1 - prob) * gini_impurity(right)
                 

In [154]:
def threshold_candidates(col, max_thresh=5):
    #Choose candidate threshold split
    unique = np.unique(col)
    if len(unique) > max_thresh:
        quantile = np.linspace(0, 100, max_thresh + 2)[1:-1]
        unique = np.percentile(unique, quantile)
    if len(unique) > 1:
        return (unique[:-1] + unique[1:])/2
    else:
        return unique

In [155]:
def info_gain_split(rows):
    #Find best decision to make based on informaiton gain
    X = rows[:, :-1]
    y = rows[:, -1]
    curr_gini = gini_impurity(rows)
    feature_count = X.shape[1]
    
    highest_gain = 0
    optimal_decision = None
    
    for feature_index in range(feature_count):
        col = X[:, feature_index]
        
        #Candidate Thresholds
        thresholds = threshold_candidates(col) if np.issubdtype(col.dtype, np.number) else np.unique(col)
        
        for candidate in thresholds:
            if np.issubdtype(col.dtype, np.number):
                mask = col >= candidate
            else:
                mask = col == candidate
            
            if mask.sum() == 0 or mask.sum() == len(mask):
                continue
        
            left, right = rows[mask], rows[~mask]
            gain = info_gain(left, right, curr_gini)
            
            if gain > highest_gain:
                highest_gain, optimal_decision = gain, Decision(feature_index, candidate)
                
    return highest_gain, optimal_decision

## Build Tree and Node Classes

In [156]:
class LeafNode:
    # A leaf Node holdes classified data.
    # Holds a dictionary with class counts in the leaf.
    
    def __init__(self,rows):
        self.pred = label_count(rows)

In [157]:
class DecisionNode:
    # A Decision Node asks a Decision to be made.
    # Holds reference to a Decision, and two child nodes.
    
    def __init__(self, decision, left, right):
        self.decision = decision
        self.left = left
        self.right = right

In [158]:
def build_tree(rows, depth=0, max_depth=10, min_sample_split=2):
    # Recursively Builds tree.
    if len(rows) < min_sample_split or depth >= max_depth:
        return LeafNode(rows)
    
    highest_gain, optimal_decision = info_gain_split(rows)
    
    #Base case no further gain
    if highest_gain == 0 or optimal_decision is None:
        return LeafNode(rows)
    
    #Found Partition
    left, right = divide_df(rows, optimal_decision)
    
    #Recurse Left Subtree
    left_subtree = build_tree(left, depth+1, max_depth, min_sample_split)
    
    #Recurse Right Subtree
    right_subtree = build_tree(right, depth+1, max_depth, min_sample_split)
    
    #Return Decision Node
    return DecisionNode(optimal_decision, left_subtree, right_subtree)

In [159]:
def predict(row, curr_node):
    #Base Case: Curr node is a leaf
    if isinstance(curr_node, LeafNode):
        total = sum(curr_node.pred.values())
        return max(curr_node.pred, key=curr_node.pred.get), {k: v/total for k,v in curr_node.pred.items()}
    
    #Recurse the left or right subtree
    if curr_node.decision.ask(row):
        return predict(row, curr_node.left)
    else:
        return predict(row, curr_node.right)

## Prediction and Testing

In [160]:
#test_data = {'sender': ['luna_prado@gmail.com'], 'subject': ['Advisor Help'], 'body':['Hello Dr. Athienitis, can you help me with choosing classes for the upcoming semester. Look forward to staying in contact.'], 'urls':[0]}
#test_df = pd.DataFrame(test_data)

In [161]:
test_data = {'sender': ['asjfnakjsnfkanf@gmail.com'], 'subject': ['SCAM ALERT'], 'body':['Make money quick, urgent new opportunity. Please buy now for your future. Passive Income, Easy life.'], 'urls':[1]}
test_df = pd.DataFrame(test_data)

In [162]:
new_body = vectorizer_body.transform(test_df['body'])
new_body_reduced = svd.transform(new_body)
new_body_df = pd.DataFrame(new_body_reduced, columns=range(1,301))

new_send = vectorizer_send.transform(test_df['sender'])
new_send_reduced = svd_send.transform(new_send)
new_send_df = pd.DataFrame(new_send_reduced, columns=range(1,6))

new_sub = vectorizer_sub.transform(test_df['subject'])
new_sub_reduced = svd_sub.transform(new_sub)
new_sub_df = pd.DataFrame(new_sub_reduced, columns=range(1,51))

new_data = pd.concat([new_body_df, new_send_df, new_sub_df,test_df], axis=1)
new_data = new_data.drop('body', axis=1)
new_data = new_data.drop('subject', axis=1)
new_data = new_data.drop('sender', axis=1)

In [163]:
test_df = new_data
test_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,42,43,44,45,46,47,48,49,50,urls
0,0.02593,0.058458,0.004948,0.028521,-0.045145,0.019482,0.046096,0.048495,-0.003999,0.003845,...,0.002289,-0.000244,-2.4e-05,0.000166,-0.000151,-0.00102,0.000422,0.001585,0.001279,1


## Random Forest Draft:

In [177]:
class RandomForest:
    def __init__(self, tree_count=10, max_depth=10, min_sample_split=2, feature_count=None):
        self.tree_count = tree_count
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        self.feature_count = feature_count
        self.trees = []
        
    def fit(self, X, y):
        self.trees = []
        
        for _ in range(self.tree_count):
            X_partial, y_partial = self.bootstrap(X, y)
            X_subspace = self.subspace(X_partial)
            rows = np.concatenate((X_subspace, y_partial), axis=1)
            
            tree = build_tree(rows, max_depth=self.max_depth, min_sample_split=self.min_sample_split)
            self.trees.append(tree)
        
        
    def bootstrap(self, X, y):
        sample_count = X.shape[0]
        row_index = np.random.choice(sample_count, sample_count, replace=True)
        return X[row_index], y[row_index]
        
    
    def subspace(self, X):
        feature_index = np.random.choice(self.feature_count, int(self.feature_count**0.5), replace=False)
        return X[:, feature_index]
        
        
    def random_predict(self, X):
        preds = np.array([predict(X, tree) for tree in self.trees])
        final_preds = np.swapaxes(preds, 0, 1)
        #Need to implement mode getter
        values, counts = np.unique(final_preds[0], return_counts=True)
        mode_index = np.argmax(counts)
        mode_value = values[mode_index]
        mode_count = counts[mode_index]/100
        sol = np.array((mode_value,mode_count))
        return sol

In [178]:
X = data.to_numpy()[:,:-1]
y = data.to_numpy()[:, -1].reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

(31323, 356)
(7831, 356)


In [180]:
my_forest = RandomForest(tree_count=30, max_depth=10, min_sample_split=2, feature_count=X_train.shape[1])
my_forest.fit(X_train,y_train)

In [185]:
arr = np.array([my_forest.random_predict(X) for X in X_test])
acc_sum = 0
for i in range(arr.shape[0]):
    if y_test[i][0] == arr[i][0]:
        acc_sum += 1

print(acc_sum/arr.shape[0])
print(arr.shape)
print(y_test.shape)

0.5022347082109565
(7831, 2)
(7831, 1)
