In [102]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import confusion_matrix
from numba import njit,prange
from itertools import product
from sklearn.model_selection import StratifiedKFold,cross_val_score
import pickle

In [2]:
df=pd.read_csv('creditcard.csv')

In [3]:
X=df.values[:,0:-1]
y=df.values[:,-1]

In [105]:
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,test_size=0.2,random_state=77)

In [86]:
# Define a function to compute the Euclidean distance between training and test data
@njit(parallel=True)
def euclidean_distance(X_train, X_test):
    num_train, num_features = X_train.shape  # Get the number of training samples and features
    num_test = X_test.shape[0]  # Get the number of test samples
    distances = np.zeros((num_test, num_train))  # Initialize the distance matrix
    
    # Loop over each test sample
    for i in prange(num_test):
        # Loop over each training sample
        for j in prange(num_train):
            distance = 0.0  # Initialize the distance for the current test-train pair
            # Loop over each feature
            for k in prange(num_features):
                distance += (X_test[i, k] - X_train[j, k]) ** 2  # Compute the squared difference
            distances[i, j] = np.sqrt(distance)  # Compute the Euclidean distance
    
    return distances  # Return the distance matrix

# Define a function to predict labels based on the distances
@njit(parallel=True)
def predict_labels(distances, y_train, k, threshold):
    num_test = distances.shape[0]  # Get the number of test samples
    predictions = np.zeros(num_test, dtype=np.int32)  # Initialize the predictions array
    
    # Loop over each test sample
    for i in prange(num_test):
        # Get the indices of the k nearest neighbors
        neighbors_indices = np.argsort(distances[i])[:k]
        # Get the labels of the k nearest neighbors
        neighbor_labels = y_train[neighbors_indices]
        # Predict the most common label among the neighbors
        count_1 = 0  # Initialize the count for label 1
        for label in neighbor_labels:
            if label == 1:
                count_1 += 1
        # Assign the label based on the threshold
        predictions[i] = 1 if count_1 > k * threshold else 0
    
    return predictions  # Return the predictions

# Define a class for the k-Nearest Neighbors classifier
class KNNClassifier:
    
    def __init__(self, k=3, threshold=0.5):
        self.k = k  # Set the number of neighbors
        self.threshold = threshold  # Set the threshold for prediction
    
    def fit(self, X_train, y_train):
        self.X_train = X_train  # Store the training data
        self.y_train = y_train  # Store the training labels
    
    def predict(self, X_test, batch_size=100):
        num_samples = X_test.shape[0]  # Get the number of test samples
        predictions = np.zeros(num_samples, dtype=np.int32)  # Initialize the predictions array
        
        # Loop over test samples in batches
        for i in range(0, num_samples, batch_size):
            end_index = min(i + batch_size, num_samples)  # Determine the end index for the current batch
            batch_X_test = X_test[i:end_index]  # Get the current batch of test samples
            distances = euclidean_distance(self.X_train, batch_X_test)  # Compute distances
            batch_predictions = predict_labels(distances, self.y_train, self.k, self.threshold)  # Predict labels for the batch
            predictions[i:end_index] = batch_predictions  # Store the batch predictions
        
        return predictions  # Return the predictions
    
    def evaluate(self, X_test, y_test):
        self._predictions = self.predict(X_test)  # Get predictions for the test data
        self._accuracy = np.sum(self._predictions == y_test) / len(y_test)  # Compute accuracy
        
        # Compute the number of true positives, false positives, and false negatives
        true_positives = np.sum((self._predictions == 1) & (y_test == 1))
        false_positives = np.sum((self._predictions == 1) & (y_test == 0))
        false_negatives = np.sum((self._predictions == 0) & (y_test == 1))
        
        # Compute precision and recall
        self._precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        self._recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        
    def predictions(self):
        return self._predictions  # Return the predictions
    
    def metrics(self):
        # Return a dictionary containing accuracy, precision, and recall
        return np.array([self._accuracy,self._precision,self._recall])

In [94]:
k_array=np.arange(5,20,5)

In [95]:
threshold_array=np.linspace(0.05,0.5,3)

In [96]:
parameter_array=product(k_array,threshold_array)

In [97]:
kfold=StratifiedKFold(n_splits=3,random_state=98,shuffle=True)
hyper_parameter_search=np.array([])

In [98]:
for k,t in parameter_array:
    scores=np.array([])
    for train_index,val_index in kfold.split(X,y):
        X_train,X_val,y_train,y_val=X[train_index],X[val_index],y[train_index],y[val_index]
        model=KNNClassifier(k=k,threshold=t)
        model.fit(X_train,y_train)
        model.evaluate(X_val,y_val)
        scores=np.append(scores,model.metrics())
    print({'k':k,'threshold':t,'metrics':np.mean(scores.reshape(3,3),axis=0)})
    hyper_parameter_search=np.append(hyper_parameter_search,{'k':k,'threshold':t,'metrics':np.mean(scores.reshape(3,3),axis=0)})

{'k': 5, 'threshold': 0.05, 'metrics': array([0.99545306, 0.1070228 , 0.22154472])}
{'k': 5, 'threshold': 0.275, 'metrics': array([0.99837083, 0.70147563, 0.0995935 ])}
{'k': 5, 'threshold': 0.5, 'metrics': array([0.99836732, 0.93265993, 0.05894309])}
{'k': 10, 'threshold': 0.05, 'metrics': array([0.99121159, 0.05306266, 0.24186992])}
{'k': 10, 'threshold': 0.275, 'metrics': array([0.99832869, 0.64164332, 0.07520325])}
{'k': 10, 'threshold': 0.5, 'metrics': array([0.99829007, 0.66666667, 0.0101626 ])}
{'k': 15, 'threshold': 0.05, 'metrics': array([0.98609936, 0.03465352, 0.26219512])}
{'k': 15, 'threshold': 0.275, 'metrics': array([0.99829358, 0.62142857, 0.02642276])}
{'k': 15, 'threshold': 0.5, 'metrics': array([0.99827954, 0.33333333, 0.00406504])}


In [100]:
model_test=KNNClassifier(k=15,threshold=0.05)
model_test.fit(X_train,y_train)
model_test.evaluate(X_test,y_test)

In [101]:
model_test.metrics()

array([0.98767599, 0.04242424, 0.28571429])

In [103]:
with open('hyper_parameter_search.pkl','wb') as f:
    pickle.dump(hyper_parameter_search,f)

In [104]:
with open('test_set_metrics.pkl','wb') as f:
    pickle.dumps(model_test.metrics())