In [19]:
import pandas as pd
import numpy as np
from numba import jit,njit,prange
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import RobustScaler

In [20]:
# Load the raw transactions, then keep only the label and the eleven features whose
# absolute correlation with `Class` exceeds 0.1 in the correlation table of this notebook.
df = pd.read_csv('creditcard.csv')
df_clean = df.loc[:, [
    'V11', 'V4', 'V1', 'V18', 'V7', 'V3',
    'V16', 'V10', 'V12', 'V14', 'V17',
    'Class',
]]

In [42]:
# Correlation of every column with the `Class` label, sorted from the most
# positive to the most negative coefficient.
target_correlations = (
    df.corr()
    .loc[:, 'Class']
    .sort_values(ascending=False)
)
target_correlations

Class     1.000000
V11       0.154876
V4        0.133447
V2        0.091289
V21       0.040413
V19       0.034783
V20       0.020090
V8        0.019875
V27       0.017580
V28       0.009536
Amount    0.005632
V26       0.004455
V25       0.003308
V22       0.000805
V23      -0.002685
V15      -0.004223
V13      -0.004570
V24      -0.007221
Time     -0.012323
V6       -0.043643
V5       -0.094974
V9       -0.097733
V1       -0.101347
V18      -0.111485
V7       -0.187257
V3       -0.192961
V16      -0.196539
V10      -0.216883
V12      -0.260593
V14      -0.302544
V17      -0.326481
Name: Class, dtype: float64

In [21]:
# Split the selected columns into a feature matrix and a label vector (NumPy arrays).
X = df_clean.drop(columns='Class').to_numpy()
y = df_clean['Class'].to_numpy()

In [25]:
# Hold out 20% of the rows for testing; stratify on the label so both splits keep
# the same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=77,
)

In [26]:
# Fit the scaler on the training split only, then apply the same fitted transform
# to the test split so no test-set statistics are used for scaling.
scaler = RobustScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

In [17]:
@njit(parallel=True, fastmath=True)
def euclidean_distance(X_train, X_test):
    """Return the Euclidean distance from every test row to every training row.

    Parameters
    ----------
    X_train : 2-D array, one training sample per row.
    X_test : array whose first axis indexes the test samples.

    Returns
    -------
    float64 array of shape ``(X_test.shape[0], X_train.shape[0])`` where entry
    ``[i, j]`` is the distance between ``X_test[i]`` and ``X_train[j]``.
    """
    # Tuple unpacking keeps the requirement that X_train is two-dimensional.
    n_refs, _n_features = X_train.shape
    n_queries = X_test.shape[0]
    out = np.empty((n_queries, n_refs), dtype=np.float64)

    # Each query row is independent, so the outer loop is the parallel one.
    for q in prange(n_queries):
        query = X_test[q]
        for r in range(n_refs):
            delta = X_train[r] - query
            out[q, r] = np.sqrt(np.sum(delta ** 2))

    return out

@njit(parallel=True, fastmath=True)
def predict_labels(distances, y_train, k):
    """Classify each test row by majority vote among its k nearest training rows.

    Parameters
    ----------
    distances : 2-D array, ``distances[i, j]`` is the distance from test row ``i``
        to training row ``j``.
    y_train : 1-D array of training labels; only the values 0 and 1 are counted.
    k : number of nearest neighbours that vote.

    Returns
    -------
    int32 array with one label per test row: 1 when strictly more neighbours are
    labelled 1 than 0, otherwise 0 (so a tied vote yields 0).
    """
    n_queries = distances.shape[0]
    predictions = np.empty(n_queries, dtype=np.int32)

    for row in prange(n_queries):
        # Full ascending sort of the row; the first k positions are the nearest.
        order = np.argsort(distances[row])
        nearest = order[:k]
        votes = y_train[nearest]
        positives = np.sum(votes == 1)
        negatives = np.sum(votes == 0)
        if positives > negatives:
            predictions[row] = 1
        else:
            predictions[row] = 0

    return predictions

class KNNClassifier:
    """Brute-force binary k-nearest-neighbours classifier.

    The class stores the training data, feeds test rows in batches to the
    module-level ``euclidean_distance`` and ``predict_labels`` helpers, and computes
    accuracy, precision and recall with label 1 treated as the positive class.
    """

    def __init__(self, k=5):
        """Store the number of neighbours that vote on each prediction.

        Raises:
            ValueError: if ``k`` is smaller than 1 (an empty neighbourhood would
                silently predict 0 for every row).
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training samples and their labels.

        Args:
            X_train: 2-D array-like, one sample per row.
            y_train: 1-D array-like of labels, one per row of ``X_train``.

        Raises:
            ValueError: if the inputs are not 2-D / 1-D or their lengths differ.
        """
        # np.asarray returns the same object for ndarrays, so valid callers see no
        # copy; array-likes (lists, DataFrames) are converted for the numba helpers.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if X_train.ndim != 2:
            raise ValueError(f"X_train must be 2-D, got {X_train.ndim} dimension(s)")
        if y_train.ndim != 1 or y_train.shape[0] != X_train.shape[0]:
            raise ValueError(
                "y_train must be 1-D with one label per row of X_train "
                f"(X_train has {X_train.shape[0]} rows, y_train has shape {y_train.shape})"
            )
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test, batch_size=100):
        """Predict a 0/1 label for every row of ``X_test``.

        Rows are processed ``batch_size`` at a time so that only a
        ``(batch_size, n_train)`` distance matrix is held in memory at once.

        Args:
            X_test: 2-D array-like with the same number of columns as the training data.
            batch_size: number of test rows per batch; must be at least 1.

        Returns:
            int32 array with one predicted label per test row.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1 or ``X_test`` has the
                wrong shape.
            AttributeError: if ``fit`` has not been called.
        """
        # A negative step would skip the loop entirely and return the uninitialised
        # buffer below, so reject it up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")
        if not hasattr(self, "X_train"):
            raise AttributeError("KNNClassifier.fit() must be called before predict()")

        X_test = np.asarray(X_test)
        if X_test.ndim != 2 or X_test.shape[1] != self.X_train.shape[1]:
            raise ValueError(
                f"X_test must be 2-D with {self.X_train.shape[1]} columns, "
                f"got shape {X_test.shape}"
            )

        num_samples = X_test.shape[0]
        predictions = np.empty(num_samples, dtype=np.int32)

        for start in range(0, num_samples, batch_size):
            stop = min(start + batch_size, num_samples)
            distances = euclidean_distance(self.X_train, X_test[start:stop])
            predictions[start:stop] = predict_labels(distances, self.y_train, self.k)

        return predictions

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and store accuracy, precision and recall.

        The results are kept on the instance and read back through ``predictions()``
        and ``metrics()``. Precision and recall fall back to 0 when their
        denominator is zero.

        Args:
            X_test: test samples, passed to ``predict``.
            y_test: 1-D array-like of true labels, one per test row.

        Raises:
            ValueError: if ``y_test`` is not 1-D with one label per row of ``X_test``.
        """
        # Comparing a plain Python list with a scalar yields a single bool, which
        # would zero out every confusion count; an ndarray compares element-wise.
        y_true = np.asarray(y_test)
        if y_true.ndim != 1 or y_true.shape[0] != np.shape(X_test)[0]:
            raise ValueError(
                "y_test must be 1-D with one label per row of X_test "
                f"(got shape {y_true.shape})"
            )

        self._predictions = self.predict(X_test)

        self._accuracy = np.sum(self._predictions == y_true) / len(y_true)

        predicted_pos = self._predictions == 1
        actual_pos = y_true == 1
        true_positives = np.sum(predicted_pos & actual_pos)
        false_positives = np.sum(predicted_pos & (y_true == 0))
        false_negatives = np.sum((self._predictions == 0) & actual_pos)

        flagged = true_positives + false_positives
        relevant = true_positives + false_negatives
        self._precision = true_positives / flagged if flagged > 0 else 0
        self._recall = true_positives / relevant if relevant > 0 else 0

    def _require_evaluated(self):
        """Raise a descriptive AttributeError when ``evaluate`` has not run yet."""
        if not hasattr(self, "_predictions"):
            raise AttributeError(
                "KNNClassifier.evaluate() must be called before reading results"
            )

    def predictions(self):
        """Return the labels predicted by the most recent ``evaluate`` call."""
        self._require_evaluated()
        return self._predictions

    def metrics(self):
        """Return ``[accuracy, precision, recall]`` from the most recent ``evaluate`` call."""
        self._require_evaluated()
        return np.array([self._accuracy, self._precision, self._recall])

In [40]:
def cross_validate_knn(X, y, k_values, n_splits=5):
    """Score a KNNClassifier for each candidate ``k`` with stratified k-fold CV.

    For every ``k`` in ``k_values`` one classifier is fitted and evaluated on each
    of ``n_splits`` shuffled, stratified folds (fixed seed 42). The per-fold
    accuracy, precision and recall are averaged, printed, and collected.

    Args:
        X: feature array indexable by integer index arrays.
        y: label array aligned with ``X``.
        k_values: iterable of neighbour counts to try.
        n_splits: number of folds.

    Returns:
        A 1-D object array of ``{'k', 'accuracy', 'precision', 'recall'}`` dicts,
        one per ``k``; an empty list when ``k_values`` is empty.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    summaries = []

    for k in k_values:
        classifier = KNNClassifier(k=k)
        fold_accuracy, fold_precision, fold_recall = [], [], []

        for fit_idx, holdout_idx in splitter.split(X, y):
            X_fit, X_holdout = X[fit_idx], X[holdout_idx]
            y_fit, y_holdout = y[fit_idx], y[holdout_idx]

            classifier.fit(X_fit, y_fit)
            classifier.evaluate(X_holdout, y_holdout)
            acc, prec, rec = classifier.metrics()

            fold_accuracy.append(acc)
            fold_precision.append(prec)
            fold_recall.append(rec)

        summary = {
            'k': k,
            'accuracy': np.mean(fold_accuracy),
            'precision': np.mean(fold_precision),
            'recall': np.mean(fold_recall)
        }
        summaries.append(summary)
        print(summary)

    # Same container types as before: an object array once at least one k was
    # scored, the untouched empty list otherwise.
    return np.array(summaries, dtype=object) if summaries else summaries

In [41]:
# Cross-validate on the scaled training split for k = 5, 10, 15, 20, 25.
results = cross_validate_knn(
    X_train_s,
    y_train,
    k_values=np.arange(5, 30, 5),
)

{'k': 5, 'accuracy': 0.9995479382913823, 'precision': 0.9296246639473441, 'recall': 0.7994157740993184}
{'k': 10, 'accuracy': 0.9995128267023634, 'precision': 0.9363889822693068, 'recall': 0.7714703018500486}
{'k': 15, 'accuracy': 0.9994864930105993, 'precision': 0.9012977827947101, 'recall': 0.791755923401493}
{'k': 20, 'accuracy': 0.9994645482674626, 'precision': 0.8916900049875156, 'recall': 0.7892242778318728}
{'k': 25, 'accuracy': 0.9994162698325617, 'precision': 0.8589745963900747, 'recall': 0.7968516715352159}


In [43]:
# Final model: k=5, fitted on the scaled training split and scored on the scaled test split.
model = KNNClassifier(k=5)
model.fit(X_train_s, y_train)
model.evaluate(X_test_s, y_test)

In [44]:
# Accuracy, precision and recall (in that order) from the evaluate() call on the test split.
model.metrics()

array([0.999526  , 0.91764706, 0.79591837])