In [8]:
import math
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import statistics
import pandas as pd
import numpy as np
%run Isolation_Template.ipynb
import os


# Build the path to creditcard.csv inside the current user's Downloads folder.
file_path = os.path.join(
    os.path.expanduser("~"),
    "Downloads",
    "creditcard.csv",
)
# Parse the CSV into a pandas DataFrame.
df = pd.read_csv(file_path)
# CC_data holds the same table (the Credit Card Fraud detection dataset) as a NumPy array.
CC_data = df.to_numpy()

def runPipeline(dataset):
    """Tune and evaluate the Isolation Forest on ``dataset`` and print the results.

    Every row of ``dataset`` is split into its leading values (the features)
    and its last value (the label). Both are passed to ``runTuneTest`` together
    with the hyper-parameter grid defined below, and each entry of the returned
    list is printed as parameters, confusion-matrix data and precision.

    Returns nothing; the function only prints.
    """
    grid = {
        "features": [.1, .25, .5],
        "percentOfData": [.1, .25, .50],
        "z-score": [-4.4, -4.5, -4.7, -4.8],
    }

    # Last value of each row -> label; everything before it -> feature vector.
    targets = np.array([row[-1] for row in dataset])
    featureRows = np.array([row[:-1] for row in dataset])

    foldResults = runTuneTest("IsolationForest", grid, featureRows, targets)

    print("Isolation Forest Grid Search CV:")
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    for position in range(len(foldResults)):
        entry = foldResults[position]
        print("Parameters:")
        print(entry[0])
        print("Confusion Matrix Data:")
        print(entry[2])
        print("precision: ")
        print(entry[1])
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    

def _flagAnomalies(anomalyScores, mu, sigma, threshold):
    """Convert raw anomaly scores into 0/1 predictions using a z-score cut-off.

    An instance is labelled 1 when ``(score - mu) / sigma <= threshold`` and 0
    otherwise. If that leaves no instance labelled 1, the instance with the
    lowest z-score (the first one on ties) is labelled 1, so a non-empty
    prediction list always contains at least one positive.
    """
    zScores = [(score - mu) / sigma for score in anomalyScores]
    predictions = [1 if z <= threshold else 0 for z in zScores]
    if zScores and 1 not in predictions:
        # Computed from this call's scores only, so the fallback position can
        # never be unbound or left over from an earlier fold.
        predictions[zScores.index(min(zScores))] = 1
    return predictions


def runTuneTest(learner, parameters, data, labels):
    """Tune on the training half and score on the held-out half, for two stratified folds.

    For each of the two ``StratifiedKFold`` splits:
      1. ``Grid_Search`` picks hyper-parameters using the training part only.
      2. An ``IsolationForest`` is built on the training features with the
         chosen ``"features"`` and ``"percentOfData"`` values.
      3. Every test instance gets an anomaly score; mean and standard deviation
         of those scores are used to z-score them against the chosen
         ``"z-score"`` threshold.
      4. The forest's ``precision`` and ``confusionMatrix`` methods are called
         on the test labels.

    Parameters:
        learner: only ``"IsolationForest"`` is handled; any other value yields
            an empty list.
        parameters: hyper-parameter grid forwarded unchanged to ``Grid_Search``.
        data: feature rows (anything ``np.asarray`` accepts).
        labels: one label per row of ``data``.

    Returns:
        A list with one ``(chosen_params, precision_value, confusion_data)``
        tuple per fold.
    """
    res = []
    if learner != "IsolationForest":
        # Nothing is implemented for other learners; skip the splitting work.
        return res

    data = np.asarray(data)
    labels = np.asarray(labels)

    skf = StratifiedKFold(n_splits=2)
    for train_index, test_index in skf.split(X=data, y=labels):
        # Index-array lookup selects each subset in one step instead of
        # copying rows one at a time in Python.
        trainFeatures = data[train_index]
        trainLabels = labels[train_index]
        testFeatures = data[test_index]
        testLabels = labels[test_index]

        clf = Grid_Search(learner, parameters, 2, trainFeatures, trainLabels)
        isoForest = IsolationForest(forestSize = 100, maxDepth = 50, numFeatures = clf["features"], percentOfData = clf["percentOfData"])
        isoForest.createForest(trainFeatures)

        anomalyScores = [isoForest.anamolyScore(instance) for instance in testFeatures]
        sigma = statistics.stdev(anomalyScores)
        mu = statistics.mean(anomalyScores)

        # The metric comes from the forest's precision method.
        precisionValue = isoForest.precision(testLabels, anomalyScores, clf["z-score"], mu, sigma)

        predictions = _flagAnomalies(anomalyScores, mu, sigma, clf["z-score"])
        confusionData = isoForest.confusionMatrix(testLabels, predictions)
        res.append((clf, precisionValue, confusionData))
    return res
        
    
    
def Grid_Search(learner, parameters, cv, data, labels):
    """Search the Isolation Forest hyper-parameter grid with stratified folds.

    For every fold and every ``(features, percentOfData)`` pair an
    ``IsolationForest`` is built on the fold's training rows and used to score
    the fold's test rows. Each ``"z-score"`` threshold is then evaluated with
    the forest's ``precision`` method, and the combination with the highest
    value seen is returned (the earliest one wins ties).

    Parameters:
        learner: only ``"IsolationForest"`` is handled; any other value
            returns ``None``.
        parameters: dict with the keys ``"features"``, ``"percentOfData"`` and
            ``"z-score"``, each mapping to a sequence of candidate values.
        cv: number of ``StratifiedKFold`` splits.
        data: feature rows (anything ``np.asarray`` accepts).
        labels: one label per row of ``data``.

    Returns:
        A dict with the keys ``"features"``, ``"percentOfData"``, ``"z-score"``
        and ``"f1score"``. The ``"f1score"`` entry holds the value returned by
        the forest's ``precision`` method. When no combination scores above 0,
        the first value of each candidate list is returned with a score of 0
        instead of all-zero hyper-parameters, so the result can always be used
        to build a forest.

    NOTE(review): the winner is the best score on any single fold, not the
    mean across folds -- confirm that this is the intended selection rule.
    """
    if learner != "IsolationForest":
        # Only the Isolation Forest search is implemented.
        return None

    data = np.asarray(data)
    labels = np.asarray(labels)

    # Start from the first grid point so the fallback is a usable
    # configuration; an empty candidate list keeps the previous fallback of 0.
    best_params = {
        "features": next(iter(parameters["features"]), 0),
        "percentOfData": next(iter(parameters["percentOfData"]), 0),
        "z-score": next(iter(parameters["z-score"]), 0),
        "f1score": 0,
    }

    skf = StratifiedKFold(n_splits=cv)
    for train_index, test_index in skf.split(X=data, y=labels):
        trainFeatures = data[train_index]
        testFeatures = data[test_index]
        testLabels = labels[test_index]

        for featureFraction in parameters["features"]:
            for dataFraction in parameters["percentOfData"]:
                isolationForest = IsolationForest(forestSize=100, maxDepth = 50, numFeatures = featureFraction, percentOfData = dataFraction)
                isolationForest.createForest(trainFeatures)

                # Score the test rows once; all thresholds reuse these scores.
                anomalyScores = [isolationForest.anamolyScore(instance) for instance in testFeatures]
                sigma = statistics.stdev(anomalyScores)
                mu = statistics.mean(anomalyScores)

                for thresh in parameters["z-score"]:
                    score = isolationForest.precision(testLabels, anomalyScores, thresh, mu, sigma)
                    # Strict comparison keeps the earliest combination on ties.
                    if score > best_params["f1score"]:
                        best_params = {
                            "features": featureFraction,
                            "percentOfData": dataFraction,
                            "z-score": thresh,
                            "f1score": score,
                        }
    return best_params

In [6]:
# Evaluate on a random subsample of at most 20,000 rows of the full dataset.
# Row indices are drawn without replacement, so no index is picked twice and
# copies of one row cannot end up in both halves of a cross-validation split.
# A seeded generator makes re-running the cell select the same rows.
sample_rng = np.random.default_rng(0)
sample_size = min(20000, CC_data.shape[0])
data = CC_data[sample_rng.choice(CC_data.shape[0], size=sample_size, replace=False)]
runPipeline(data)

Isolation Forest Grid Search CV:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Parameters:
{'features': 0.1, 'percentOfData': 0.1, 'z-score': -4.4, 'f1score': 1.0}
Confusion Matrix Data:
[array([[9979,    3],
       [  16,    2]]), 'True Negative: 9979', 'False Positive: 3', 'False Negative: 16', 'True Positive: 2']
precision: 
0.4
Parameters:
{'features': 0.1, 'percentOfData': 0.25, 'z-score': -4.8, 'f1score': 1.0}
Confusion Matrix Data:
[array([[9974,    8],
       [  14,    4]]), 'True Negative: 9974', 'False Positive: 8', 'False Negative: 14', 'True Positive: 4']
precision: 
0.3333333333333333
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
