In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('onlinefraud.csv')

# Convert categorical variables into numerical
Type_Mapping = {
    "PAYMENT": 0,
    "TRANSFER": 1,
    "CASH_OUT": 2,
    "DEBIT": 3,
    "CASH_IN" : 4
}
data["type"] = data["type"].replace(Type_Mapping)

# Discretize continuous
bins = [0, 1000, 10000, 100000, 1000000, np.inf]
data["amount"] = np.digitize(data["amount"], bins) - 1
data["oldbalanceOrg"] = np.digitize(data["oldbalanceOrg"], bins) - 1
data["newbalanceOrig"] = np.digitize(data["newbalanceOrig"], bins) - 1
data["oldbalanceDest"] = np.digitize(data["oldbalanceDest"], bins) - 1
data["newbalanceDest"] = np.digitize(data["newbalanceDest"], bins) - 1

data = data.drop('nameOrig', axis=1)
data = data.drop('nameDest', axis=1)

In [3]:
class DecisionTreeNode:
    def __init__(self, attribute=None, threshold=None, label=None):
        self.split_attribute = attribute  # Attribute used for splitting the node
        self.split_threshold = threshold  # Value used for splitting the node
        self.label = label  # Class label if the node is a leaf, None otherwise
        self.child_nodes = {}  # Dictionary to hold the child nodes

    def is_leaf_node(self):
        """Check if the node is a leaf node (i.e., doesn't have any children)"""
        return not bool(self.child_nodes)

    def add_child(self, attribute_value, node):
        """Add a child node for a specific attribute value"""
        self.child_nodes[attribute_value] = node


In [4]:
def determine_most_common_value(examples):
    return np.argmax(np.bincount(examples.iloc[:, -1]))

In [5]:
def find_attribute_index(attribute, attributes):
    index = list(attributes).index(attribute)
    return index

# attributes = data.columns
# find_attribute_index("type", attributes)

In [6]:
def find_best_entropy(attribute_list, data_samples):
    # Initialize the best attribute and the minimum entropy
    optimal_splitting_attribute = None
    minimum_entropy = float('inf')

    # Iterate over each attribute
    for attribute in attribute_list:
        # Calculate the unique values and their counts for the attribute
        unique_values, value_counts = np.unique(data_samples[attribute], return_counts=True)
        
        # Calculate the probabilities of each unique value
        value_probabilities = value_counts / len(data_samples[attribute])
        
        # Calculate the entropy of the attribute
        attribute_entropy = -np.sum(value_probabilities * np.log2(value_probabilities))

        # If the entropy is less than the current minimum, update the minimum and the best attribute
        if attribute_entropy < minimum_entropy:
            minimum_entropy = attribute_entropy
            optimal_splitting_attribute = attribute

    # Return the attribute with the minimum entropy
    return optimal_splitting_attribute


In [7]:
def find_best_gini(attribute_list, data_samples):
    # Initialize the best attribute and the minimum Gini index
    optimal_splitting_attribute = None
    minimum_gini_index = float('inf')

    for attribute in attribute_list:
        # Get the unique values of the attribute
        attribute_values = np.unique(data_samples.iloc[:, find_attribute_index(attribute, attribute_list)])
        attribute_gini_index = 0

        for value in attribute_values:
            # Get the samples that have the current value for the current attribute
            value_samples = data_samples[data_samples.iloc[:, find_attribute_index(attribute, attribute_list)] == value]
            # Calculate the probability of the value
            value_probability = len(value_samples) / len(data_samples)
            # Calculate the Gini index for the value
            value_gini_index = calculate_gini_index(value_samples.iloc[:, -1])

            # Add the weighted Gini index to the total Gini index for the attribute
            attribute_gini_index += value_probability * value_gini_index

        # If the Gini index for the attribute is less than the current minimum, update the minimum and the best attribute
        if attribute_gini_index < minimum_gini_index:
            minimum_gini_index = attribute_gini_index
            optimal_splitting_attribute = attribute

    # Return the attribute with the minimum Gini index
    return optimal_splitting_attribute

def calculate_gini_index(labels):
    labels = labels.astype(int)
    # Count the number of each label
    label_counts = np.bincount(labels)
    # Calculate the probability of each label
    label_probabilities = label_counts / len(labels)
    # Calculate the Gini index
    gini_index = 1 - np.sum(label_probabilities ** 2)
    return gini_index


In [8]:
attributes = data.columns

In [9]:
def decision_tree(data_samples, attribute_list, parent_samples, criterion):
    # If there are no data samples, return a node with the most common value in the parent samples
    if len(data_samples) == 0:
        return DecisionTreeNode(label=determine_most_common_value(parent_samples))
    
    # If all data samples belong to the same class, return a node with that class
    elif len(data_samples['isFraud'].unique()) == 1:
        return DecisionTreeNode(label=data_samples['isFraud'].values[0])
    
    # If there are no more attributes to consider, return a node with the most common value in the data samples
    elif len(attribute_list) == 0:
        return DecisionTreeNode(label=determine_most_common_value(data_samples))

    else:
        # Find the best attribute based on the specified criterion (entropy or Gini index)
        if criterion == 'entropy':
            best_attribute = find_best_entropy(attribute_list, data_samples)
        else:  # Default to Gini index if no valid criterion is specified
            best_attribute = find_best_gini(attribute_list, data_samples)
        
        # Get the index and unique values of the best attribute
        best_attribute_index = find_attribute_index(best_attribute, attribute_list)
        best_attribute_values = data_samples.iloc[:, best_attribute_index].unique()
        
        # Create a new internal node for the best attribute
        tree = DecisionTreeNode(attribute=best_attribute)
        
        # For each value of the best attribute, create a new branch of the tree
        for value in best_attribute_values:
            # Get the subset of data samples that have the current value for the best attribute
            subset_data_samples = data_samples[data_samples.iloc[:, best_attribute_index] == value]
            
            # Recursively build the subtree for the current branch
            subtree = decision_tree(subset_data_samples, attribute_list[:best_attribute_index] + attribute_list[best_attribute_index+1:], data_samples, criterion)
            
            # Add the subtree to the current branch of the tree
            tree.add_child(value, subtree)
        
        # Return the root of the decision tree
        return tree


In [10]:
def print_tree(node, depth=0, file=None):
    indent = "  " * depth
    if node.is_leaf_node():  # This node is a leaf
        print(f"{indent}Predict {node.label}", file=file)
    else:
        for value, child_node in node.child_nodes.items():
            print(f"{indent}If {node.split_attribute} == {value}", file=file)
            print_tree(child_node, depth + 1, file=file)


In [11]:
def predict(node, instance, attribute_list):
    # If the node is a leaf node, return its label
    if node.is_leaf_node():
        return node.label
    # Get the value of the splitting attribute for the instance
    attribute_value = instance[attribute_list.index(node.split_attribute)]
    # If the attribute value is in the child nodes of the current node
    if attribute_value in node.child_nodes:
        # Get the child node corresponding to the attribute value
        child_node = node.child_nodes[attribute_value]
        # Recursively predict the class label for the instance
        return predict(child_node, instance, attribute_list)
    else:
        # If the attribute value is not in the child nodes, return None
        return None


In [12]:
def test_decision_tree(decision_tree, test_data, attribute_list):
    # Initialize an empty list to store the predictions
    predictions = []
    
    # Iterate over each instance in the test data
    for instance in test_data:
        # Predict the class label for the instance using the decision tree
        predicted_label = predict(decision_tree, instance, attribute_list)
        
        # Add the predicted label to the list of predictions
        predictions.append(predicted_label)
    
    # Return the list of predictions
    return predictions


In [13]:
def calculate_accuracy(predictions, actual_labels):
    correct_predictions = 0
    total_predictions = len(predictions)

    for i in range(total_predictions):
        if predictions[i] == actual_labels[i]:
            correct_predictions += 1

    accuracy = correct_predictions / total_predictions
    return accuracy

In [14]:
# Define the number of training samples
num_train_samples = 2000

# Split the data into training and test sets
train_data = data.iloc[num_train_samples:]
test_data = data.iloc[:num_train_samples]

# Separate the target variable from the predictors
train_target = train_data.iloc[:, -1]
train_predictors = train_data.iloc[:, :-1]

# Get the list of attributes, excluding the target attribute
attribute_list = list(train_data.columns)[:-1]

# Build the decision tree using the training data
decision_tree_model = decision_tree(data_samples=train_data, 
                                    attribute_list=attribute_list, 
                                    parent_samples=None, 
                                    criterion="gini")

# Print the decision tree
print("Decision Tree:")
print_tree(decision_tree_model)

# Save the decision tree to a text file
with open("tree.txt", "w") as file:
    print_tree(decision_tree_model, file=file)

# Prepare the test data
test_data_results = test_data.iloc[:, -1].values
test_data_predictors = test_data.iloc[:, :-1].values

# Use the decision tree to make predictions on the test data
predictions = test_decision_tree(decision_tree_model, test_data_predictors, attribute_list)
print("Predictions:")
print(predictions)

# Calculate the accuracy of the predictions
accuracy = calculate_accuracy(predictions, test_data_results)
print(f"Accuracy: {accuracy * 100:.2f}%")


Decision Tree:
If step == 1
  If newbalanceDest == 0
    If amount == 0
      If oldbalanceDest == 4
        Predict 0
      If oldbalanceDest == 1
        If newbalanceOrig == 2
          Predict 0
        If newbalanceOrig == 1
          If type == 1
            If oldbalanceOrg == 1
              Predict 0
        If newbalanceOrig == 0
          Predict 0
      If oldbalanceDest == 2
        Predict 0
      If oldbalanceDest == 0
        If newbalanceOrig == 1
          If type == 1
            If oldbalanceOrg == 1
              Predict 0
        If newbalanceOrig == 2
          If type == 1
            If oldbalanceOrg == 1
              Predict 0
        If newbalanceOrig == 0
          Predict 0
      If oldbalanceDest == 3
        If newbalanceOrig == 1
          If type == 1
            If oldbalanceOrg == 1
              Predict 0
        If newbalanceOrig == 0
          Predict 0
        If newbalanceOrig == 2
          Predict 0
    If amount == 4
      Predict 0
    If am