# Scam Detector: Decision Tree

    This Jupyter Notebook will be used to run a Decision Tree Algorithm to predict if a given email is a scam or a ham(a normal email).

## Import Packages

In [196]:
#import the packages we need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import sys
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Retrieve Data

In [200]:
col_names = ['sender', 'receiver', 'subject', 'body', 'label', 'urls']
path = "./data/CEAS_08.csv"
data1 = pd.read_csv(path)
data1 = data1.drop('date', axis=1)
col_names[-1], col_names[-2] = col_names[-2], col_names[-1]
data1 = data1[col_names]
data1.head(10)
counts = data1['urls'].value_counts()
print(counts)
data1.head()

urls
1    26232
0    12922
Name: count, dtype: int64


Unnamed: 0,sender,receiver,subject,body,urls,label
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,1,0
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1


In [201]:
data1 = data1.drop('receiver', axis=1)

In [202]:
data = data1
data.shape

(39154, 5)

In [203]:
#send_mode = data['sender'].mode()[0]
data['sender'].fillna('Empty', inplace=True)
#sub_mode = data['subject'].mode()[0]
data['subject'].fillna('Empty', inplace=True)
#body_mode = data['body'].mode()[0]
data['body'].fillna('Empty', inplace=True)

data['sender'].isnull().any()

False

## TF-IDF

In [204]:
# Text Vectorization for numeric interpretations
corpus_sender = data['sender'][0:]
vectorizer_send = TfidfVectorizer()
send = vectorizer_send.fit_transform(corpus_sender)
print(vectorizer_send.get_feature_names_out().shape)
print(send.shape)

(49703,)
(39154, 49703)


In [205]:
svd_send = TruncatedSVD(n_components=5, random_state=42)
send_reduced = svd_send.fit_transform(send)

In [206]:
# Text Vectorization for numeric interpretations
corpus_sub = data['subject'][0:]
vectorizer_sub = TfidfVectorizer()
sub = vectorizer_sub.fit_transform(corpus_sub)
print(vectorizer_sub.get_feature_names_out().shape)
print(sub.shape)

(15339,)
(39154, 15339)


In [207]:
svd_sub = TruncatedSVD(n_components=50, random_state=42)
sub_reduced = svd_sub.fit_transform(sub)

In [208]:
# Text Vectorization for numeric interpretations
corpus_body = data['body'][0:]
vectorizer_body = TfidfVectorizer()
body = vectorizer_body.fit_transform(corpus_body)
print(vectorizer_body.get_feature_names_out().shape)
print(body.shape)

(183381,)
(39154, 183381)


In [209]:
svd = TruncatedSVD(n_components=300, random_state=42)
body_reduced = svd.fit_transform(body)

In [210]:
dupes_data = data.index[data.index.duplicated()]
print("data duplicates:", dupes_data)

data duplicates: Index([], dtype='int64')


In [211]:
scaler_body = StandardScaler()
body_reduced = scaler_body.fit_transform(body_reduced)

In [212]:
scaler_send = StandardScaler()
send_reduced = scaler_send.fit_transform(send_reduced)

In [213]:
scaler_sub = StandardScaler()
sub_reduced = scaler_sub.fit_transform(sub_reduced)

In [214]:
body_df = pd.DataFrame(body_reduced, columns=range(1,301))
send_df = pd.DataFrame(send_reduced, columns=range(1,6))
sub_df = pd.DataFrame(sub_reduced, columns=range(1,51))

new_data = pd.concat([body_df, send_df, sub_df, data], axis=1)
new_data = new_data.drop('body', axis=1)
new_data = new_data.drop('sender', axis=1)
new_data = new_data.drop('subject', axis=1)
new_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,43,44,45,46,47,48,49,50,urls,label
0,-0.426917,-0.004104,-0.076846,-0.203992,-0.122698,-0.698500,-0.352484,0.591961,-0.177576,0.010746,...,-1.072639,-0.137006,0.644205,0.531124,-0.850184,-0.286374,-0.385870,-0.205839,1,1
1,-0.301420,-0.604644,-0.078867,-0.003527,-0.189298,0.027241,0.650487,0.776408,-0.426351,0.226446,...,-0.034326,0.044819,0.003294,-0.114293,-0.026271,0.038869,-0.042575,-0.023058,1,1
2,3.272722,-2.561960,-0.871368,-0.227408,0.010922,0.036696,-0.228835,-0.380469,4.142873,-2.947278,...,-0.001498,0.022748,-0.047372,-0.034065,-0.027853,0.002334,-0.035214,-0.016125,1,1
3,-0.318055,-0.760740,-0.271595,-0.140837,0.111953,-0.071034,0.052940,0.198062,0.209648,0.753429,...,0.014184,-0.672714,-0.284708,-0.002581,-0.113030,0.734184,0.806202,-0.008474,1,0
4,-0.344538,-1.047056,-0.199258,-0.108279,0.048807,0.088499,0.096057,0.020240,0.074161,0.128524,...,-0.018962,0.020230,-0.027911,-0.037765,-0.030991,-0.021996,-0.032201,-0.010667,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39149,1.473310,-0.006965,5.060952,-0.494319,0.897272,0.041328,0.107115,-0.138582,0.526641,-0.548493,...,-0.027354,0.025166,-0.023670,-0.044218,-0.037685,-0.029784,-0.028467,-0.009882,0,1
39150,1.532967,-0.011893,5.045746,-0.445428,0.827892,0.269618,-0.079244,-0.131366,-0.296801,-0.078146,...,-0.027354,0.025166,-0.023670,-0.044218,-0.037685,-0.029784,-0.028467,-0.009882,0,1
39151,-0.401602,0.330418,-0.138006,-0.190510,-0.416924,-0.288563,-0.785936,-0.193427,-0.200648,-0.260859,...,0.002406,-0.005925,-0.114503,-0.057746,-0.023063,-0.039911,-0.028251,-0.054432,0,0
39152,-0.400845,0.225076,-0.023898,-0.162085,-0.298123,-0.402489,-0.274232,0.175800,0.037307,-0.004968,...,0.473563,0.284709,-0.275641,-0.527762,-0.085891,-0.088685,-0.192114,0.443284,0,0


In [215]:
data = new_data

In [216]:
data.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,43,44,45,46,47,48,49,50,urls,label
0,-0.426917,-0.004104,-0.076846,-0.203992,-0.122698,-0.6985,-0.352484,0.591961,-0.177576,0.010746,...,-1.072639,-0.137006,0.644205,0.531124,-0.850184,-0.286374,-0.38587,-0.205839,1,1
1,-0.30142,-0.604644,-0.078867,-0.003527,-0.189298,0.027241,0.650487,0.776408,-0.426351,0.226446,...,-0.034326,0.044819,0.003294,-0.114293,-0.026271,0.038869,-0.042575,-0.023058,1,1
2,3.272722,-2.56196,-0.871368,-0.227408,0.010922,0.036696,-0.228835,-0.380469,4.142873,-2.947278,...,-0.001498,0.022748,-0.047372,-0.034065,-0.027853,0.002334,-0.035214,-0.016125,1,1
3,-0.318055,-0.76074,-0.271595,-0.140837,0.111953,-0.071034,0.05294,0.198062,0.209648,0.753429,...,0.014184,-0.672714,-0.284708,-0.002581,-0.11303,0.734184,0.806202,-0.008474,1,0
4,-0.344538,-1.047056,-0.199258,-0.108279,0.048807,0.088499,0.096057,0.02024,0.074161,0.128524,...,-0.018962,0.02023,-0.027911,-0.037765,-0.030991,-0.021996,-0.032201,-0.010667,1,1
5,-0.693548,-0.808012,0.045951,-0.166378,0.010664,0.012139,-0.066918,-0.059027,0.001882,-0.050618,...,0.149617,0.519573,-0.067937,0.596771,-0.601829,-0.675835,0.218235,0.308421,0,1
6,-0.440787,0.060271,0.110373,2.265851,0.733996,-0.132796,0.099662,0.47206,-0.298767,-0.349066,...,-0.047771,-1.236799,0.236425,-1.318554,0.107207,-0.344796,-0.758078,-0.944707,0,1
7,3.270388,-2.562125,-0.871041,-0.228462,0.01538,0.027721,-0.227645,-0.377261,4.140709,-2.943642,...,-0.001498,0.022748,-0.047372,-0.034065,-0.027853,0.002334,-0.035214,-0.016125,1,1
8,-0.411026,0.090955,-0.102771,-0.208674,-0.045716,-0.292051,-0.650438,-0.17394,0.606204,1.187066,...,-0.009112,-0.094642,-0.164975,0.080277,0.410487,-0.446219,-0.414278,0.05301,1,0
9,3.271083,-2.562309,-0.871182,-0.228368,0.014948,0.028572,-0.22776,-0.377589,4.141385,-2.944214,...,-0.001498,0.022748,-0.047372,-0.034065,-0.027853,0.002334,-0.035214,-0.016125,1,1


## Decision Class

In [217]:
class Decision:
    """ A decision is used to ask the question at a decision node to split the data.
    This class records column number and values and matches the stored feature value to a give feature value
    """
    
    def __init__(self, feature_index, threshold):
        self.feature_index = feature_index
        self.threshold = threshold
        
    def ask(self, input):
        # Compares input feature value to stored value
        feature_val = input[self.feature_index]
        if isinstance(feature_val, (int, float, np.number)):
            return feature_val >= self.threshold
        else:
            return feature_val == self.threshold
        

## Helper Functions for Splitting

In [218]:
def divide_df(rows, decision):
    # Partitions a data frame
    # Check if each row matches decision, divide into true and false
    col = rows[:, decision.feature_index]
    if np.issubdtype(col.dtype, np.number):
        mask = col >= decision.threshold
    else:
        mask = col == decision.threshold
    left, right = rows[mask],rows[~mask]
    return left, right

In [219]:
def label_count(rows):
    # Counts the number of each classification in data frame
    y = rows[:, -1]
    unique, label_counts = np.unique(y, return_counts=True)
    return dict(zip(unique,label_counts))

In [220]:
def gini_impurity(rows):
    #Calculates Gini Impurity for a data frame of rows.
    y = rows[:, -1]
    _, label_counts = np.unique(y, return_counts=True)
    probs = label_counts/label_counts.sum()
    return 1.0 - np.sum(probs**2)

In [221]:
def info_gain(left, right, curr_gini):
    #Information gain: Gini of the root node subtracted by the impurty of the two children nodes.
    if len(left) + len(right) == 0:
        return 0
    prob = float(len(left) / (len(left) + len(right)))
    return curr_gini - prob * gini_impurity(left) - (1 - prob) * gini_impurity(right)
                 

In [222]:
def threshold_candidates(col, max_thresh=5):
    #Choose candidate threshold split
    unique = np.unique(col)
    if len(unique) > max_thresh:
        quantile = np.linspace(0, 100, max_thresh + 2)[1:-1]
        unique = np.percentile(unique, quantile)
    if len(unique) > 1:
        return (unique[:-1] + unique[1:])/2
    else:
        return unique

In [223]:
def info_gain_split(rows):
    #Find best decision to make based on informaiton gain
    X = rows[:, :-1]
    y = rows[:, -1]
    curr_gini = gini_impurity(rows)
    feature_count = X.shape[1]
    
    highest_gain = 0
    optimal_decision = None
    
    for feature_index in range(feature_count):
        col = X[:, feature_index]
        
        #Candidate Thresholds
        thresholds = threshold_candidates(col) if np.issubdtype(col.dtype, np.number) else np.unique(col)
        
        for candidate in thresholds:
            if np.issubdtype(col.dtype, np.number):
                mask = col >= candidate
            else:
                mask = col == candidate
            
            if mask.sum() == 0 or mask.sum() == len(mask):
                continue
        
            left, right = rows[mask], rows[~mask]
            gain = info_gain(left, right, curr_gini)
            
            if gain > highest_gain:
                highest_gain, optimal_decision = gain, Decision(feature_index, candidate)
                
    return highest_gain, optimal_decision

## Build Tree and Node Classes

In [224]:
class LeafNode:
    # A leaf Node holdes classified data.
    # Holds a dictionary with class counts in the leaf.
    
    def __init__(self,rows):
        self.pred = label_count(rows)

In [225]:
class DecisionNode:
    # A Decision Node asks a Decision to be made.
    # Holds reference to a Decision, and two child nodes.
    
    def __init__(self, decision, left, right):
        self.decision = decision
        self.left = left
        self.right = right

In [254]:
def build_tree(rows, depth=0, max_depth=10, min_sample_split=2):
    # Recursively Builds tree.
    if len(rows) < min_sample_split or depth >= max_depth:
        return LeafNode(rows)
    
    highest_gain, optimal_decision = info_gain_split(rows)
    
    #Base case no further gain
    if highest_gain < 1e-6 or optimal_decision is None:
        return LeafNode(rows)
    
    #Found Partition
    left, right = divide_df(rows, optimal_decision)
    
    #Recurse Left Subtree
    left_subtree = build_tree(left, depth+1, max_depth, min_sample_split)
    
    #Recurse Right Subtree
    right_subtree = build_tree(right, depth+1, max_depth, min_sample_split)
    
    #Return Decision Node
    return DecisionNode(optimal_decision, left_subtree, right_subtree)

In [227]:
def predict(row, curr_node):
    #Base Case: Curr node is a leaf
    if isinstance(curr_node, LeafNode):
        total = sum(curr_node.pred.values())
        return max(curr_node.pred, key=curr_node.pred.get), {k: v/total for k,v in curr_node.pred.items()}
    
    #Recurse the left or right subtree
    if curr_node.decision.ask(row):
        return predict(row, curr_node.left)
    else:
        return predict(row, curr_node.right)

## Prediction and Testing

In [297]:
#test_data = {'sender': ['luna_prado@gmail.com'], 'subject': ['Advisor Help'], 'body':['Hello Dr. Athienitis, can you help me with choosing classes for the upcoming semester. Look forward to staying in contact.'], 'urls':[0]}
#test_df = pd.DataFrame(test_data)

In [309]:
test_data = {'sender': ['asjfnakjsnfkanf@gmail.com'], 'subject': ['SCAM URGENT'], 'body':['Make money quick, urgent new opportunity. Please buy now for your future. Passive Income, Easy life. Venmo.com'], 'urls':[1]}
test_df = pd.DataFrame(test_data)

In [310]:
new_body = vectorizer_body.transform(test_df['body'])
new_body_reduced = svd.transform(new_body)
new_body_df = pd.DataFrame(new_body_reduced, columns=range(1,301))

new_send = vectorizer_send.transform(test_df['sender'])
new_send_reduced = svd_send.transform(new_send)
new_send_df = pd.DataFrame(new_send_reduced, columns=range(1,6))

new_sub = vectorizer_sub.transform(test_df['subject'])
new_sub_reduced = svd_sub.transform(new_sub)
new_sub_df = pd.DataFrame(new_sub_reduced, columns=range(1,51))

In [311]:
new_scaler_body = StandardScaler()
new_body_reduced = new_scaler_body.fit_transform(new_body_reduced)

new_scaler_send = StandardScaler()
new_send_reduced = new_scaler_send.fit_transform(new_send_reduced)

new_scaler_sub = StandardScaler()
new_sub_reduced = new_scaler_sub.fit_transform(new_sub_reduced)

In [312]:
new_data = pd.concat([new_body_df, new_send_df, new_sub_df,test_df], axis=1)
new_data = new_data.drop('body', axis=1)
new_data = new_data.drop('subject', axis=1)
new_data = new_data.drop('sender', axis=1)

In [313]:
test_df = new_data
test_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,42,43,44,45,46,47,48,49,50,urls
0,0.040516,0.059594,0.002825,0.035057,-0.045219,0.02167,0.055005,0.0523,0.00079,0.015054,...,-0.002516,7.8e-05,0.000762,0.002078,0.006102,-0.001195,0.005409,0.005108,-0.003093,1


## Random Forest Draft:

In [314]:
class RandomForest:
    def __init__(self, tree_count=10, max_depth=10, min_sample_split=2, feature_count=None):
        self.tree_count = tree_count
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        self.feature_count = feature_count
        self.trees = []
        
    def fit(self, X, y):
        self.trees = []
        self.feature_count = X.shape[1]
        self.feature_subspaces = []
        
        for _ in range(self.tree_count):
            X_partial, y_partial = self.bootstrap(X, y)
            feature_index = np.random.choice(self.feature_count, int(np.sqrt(self.feature_count)), replace=False)
            self.feature_subspaces.append(feature_index)
            X_subspace = X_partial[:, feature_index]
            rows = np.concatenate((X_subspace, y_partial), axis=1)
            
            tree = build_tree(rows, max_depth=self.max_depth, min_sample_split=self.min_sample_split)
            self.trees.append(tree)
        
        
    def bootstrap(self, X, y):
        sample_count = X.shape[0]
        row_index = np.random.choice(sample_count, sample_count, replace=True)
        return X[row_index], y[row_index]
        
    
    def subspace(self, X):
        feature_index = np.random.choice(self.feature_count, int(self.feature_count**0.5), replace=False)
        return X[:, feature_index]
       
                                  
    def predict_one(self, X):
        votes = []
        for tree, features in zip(self.trees, self.feature_subspaces):
            X_subspace = X[features]
            pred, _ = predict(X_subspace, tree)
            votes.append(pred)
        return max(set(votes), key=votes.count)
                                  
                                  
    def random_predict(self, X):
        preds = np.array([predict(X, tree) for tree in self.trees])
        final_preds = np.swapaxes(preds, 0, 1)
        #Need to implement mode getter
        values, counts = np.unique(final_preds[0], return_counts=True)
        mode_index = np.argmax(counts)
        mode_value = values[mode_index]
        mode_count = counts[mode_index]/self.tree_count
        sol = np.array((mode_value,mode_count))
        return sol

In [315]:
X = data.to_numpy()[:,:-1]
y = data.to_numpy()[:, -1].reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

(31323, 356)
(7831, 356)


In [316]:
my_forest = RandomForest(tree_count=10, max_depth=10, min_sample_split=2, feature_count=X_train.shape[1])
my_forest.fit(X_train,y_train)

In [317]:
arr = np.array([my_forest.predict_one(X) for X in X_test])
print(arr.shape)
print(y_test.shape)

(7831,)
(7831, 1)


In [318]:
accuracy = accuracy_score(y_test, arr)
print(accuracy)

0.9532626739879965


In [319]:
input_x = test_df.to_numpy()[0,:]
pred = my_forest.predict_one(input_x)
pred

1.0