In [1]:
import sys
sys.path.append('../')
import numpy as np
import pandas as pd

from algorithms.utils import train_test_split

from algorithms.ConfusionMatrix import ConfusionMatrix

from algorithms.Gaussian import QuadraticGaussianClassifier, NormalNaiveBayes
from algorithms.NearestNeighbors import NearestCentroidClassifier, KNNClassifier, NNClassifier
from algorithms.Quadratic import QuadraticClassifier

import logging

# Notebook-wide logging: each record shows timestamp, logger name, level and
# message; the root logger's threshold is set to DEBUG.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.basicConfig(format=LOG_FORMAT)

In [2]:
# Read the credit-card default dataset into a DataFrame and preview the first rows.
df = pd.read_csv("../datasets/default of credit card clients.csv", sep=',')
df.head()

Unnamed: 0,ID,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
# Number of rows for each value of the target column Y (class balance).
target_column = df['Y']
target_column.value_counts()

0    23364
1     6636
Name: Y, dtype: int64

In [4]:
# Load the same CSV as a plain float array for the classifiers; skiprows=1
# skips the header line so only numeric rows are parsed.
# NOTE(review): df.head() above lists an `ID` column before X1..X23. If it is
# the first CSV column it lands in data[:, 0] and is used as a feature by
# run_experiments (which treats every column but the last as a feature) --
# confirm whether it should be dropped before training.
data = np.loadtxt("../datasets/default of credit card clients.csv", delimiter=',', skiprows=1)
data.shape

(30000, 25)

In [5]:
def run_experiments(data, experiments_indices):
    """Benchmark the notebook's classifiers on repeated hold-out splits.

    For every classifier and every index array in ``experiments_indices``, the
    rows of ``data`` are reordered with that array, split 80/20 by
    ``train_test_split`` (without further shuffling), the classifier is fitted
    on the first part and its per-sample predictions on the second part are
    scored through ``ConfusionMatrix``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; the last column is used as the label and every other
        column as a feature.
    experiments_indices : iterable of index arrays
        One row ordering per experiment; each must be a valid row index for
        ``data``. Any iterable is accepted, including a one-shot generator.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First frame: mean, median, min/max and standard deviation of the
        accuracy per classifier. Second frame: mean sensitivity, specificity
        and precision per classifier. Values are multiplied by 100 and rounded
        to one decimal.

    Raises
    ------
    ValueError
        If ``experiments_indices`` contains no experiment.
    """
    # Materialise once: the splits are walked again for every model, so a
    # one-shot iterator would be exhausted after the first classifier.
    experiments_indices = list(experiments_indices)
    if not experiments_indices:
        raise ValueError("experiments_indices must contain at least one index array")

    acc_results = []
    metrics_results = []

    # Silence logging while the experiments run, but put the previous
    # threshold back afterwards (even on error) so the rest of the notebook
    # keeps the verbosity it configured.
    previous_level = logger.level
    logger.setLevel(logging.CRITICAL)
    try:
        models = {
            "NB": NormalNaiveBayes(),
            "CQ(P)":
                QuadraticClassifier(check_invertibility=True, pinv_mode="pooled"),
            "CQG(P)":
                QuadraticGaussianClassifier(check_invertibility=True, pinv_mode="pooled"),
            "CQ(F)":
                QuadraticClassifier(check_invertibility=True, pinv_mode="friedman"),
            "CQG(F)":
                QuadraticGaussianClassifier(check_invertibility=True, pinv_mode="friedman"),
            "DMC": NearestCentroidClassifier(),
            "KNN(k=11)": KNNClassifier(k=11),
            "NN": NNClassifier()
        }

        for model_name, model in models.items():
            scores = []
            sensitivities = []
            specificities = []
            precisions = []

            for indices in experiments_indices:
                # The row order comes from `indices`, so the split itself must
                # not shuffle again.
                train, test = train_test_split(data[indices, :], .8, shuffle=False)
                model.fit(train[:, :-1], train[:, -1])

                predicted = [model.predict(x) for x in test[:, :-1]]
                conf_matrix = ConfusionMatrix(test[:, -1], predicted)

                scores.append(conf_matrix.accuracy())
                sensitivities.append(conf_matrix.sensitivity())
                specificities.append(conf_matrix.specificity())
                precisions.append(conf_matrix.precision())

            acc_results.append({
                "1 - Alg": model_name,
                "2 - Média(%)": np.mean(scores)*100,
                "3 - Mediana(%)": np.median(scores)*100,
                "4 - Min/Max(%)": "{:.1f} / {:.1f}".format(min(scores)*100, max(scores)*100),
                "5 - Desv. Pad.(%)": np.std(scores)*100
            })

            metrics_results.append({
                "1 - Alg": model_name,
                "6 - Sensibilidade(%)": np.mean(sensitivities)*100,
                "7 - Especificidade(%)": np.mean(specificities)*100,
                "8 - Precisão(%)": np.mean(precisions)*100
            })
    finally:
        logger.setLevel(previous_level)

    return pd.DataFrame(acc_results).round(1), pd.DataFrame(metrics_results).round(1)

In [None]:
# One random row ordering per experiment. A fixed seed makes the orderings (and
# every table derived from them) reproducible after a kernel restart.
EXPERIMENTS_SEED = 42
N_EXPERIMENTS = 100
experiments_rng = np.random.RandomState(EXPERIMENTS_SEED)
experiments_indices = [experiments_rng.permutation(data.shape[0]) for _ in range(N_EXPERIMENTS)]
experiments_indices[0]

array([ 6006, 10220, 13293, ...,  7152, 19225, 10750])

In [None]:
# Evaluate every classifier on the prepared row orderings and show the
# accuracy summary; the second table is displayed in the next cell.
experiment_tables = run_experiments(data, experiments_indices)
acc_df, metrics_df = experiment_tables
acc_df

  r = _umath_linalg.det(a, signature=signature)


In [None]:
# Mean sensitivity / specificity / precision per classifier, as returned by
# the run_experiments call above.
metrics_df

In [None]:
from sklearn.cluster import KMeans

def clusterize_data(data, k, max_iter=10, random_state=None):
    """Summarise each class of ``data`` by ``k`` KMeans centroids.

    The last column of ``data`` is taken as the class label. For every
    distinct label, KMeans is fitted on that class's feature columns and the
    rows of the class are replaced by the ``k`` cluster centres, each tagged
    with the label.

    Parameters
    ----------
    data : array-like
        2-D array whose last column holds the class label.
    k : int
        Number of centroids to keep per class.
    max_iter : int, optional
        Passed to ``KMeans(max_iter=...)``; defaults to 10.
    random_state : int or None, optional
        Passed to ``KMeans(random_state=...)``; give an int to make the
        centroids reproducible. ``None`` leaves the KMeans default.

    Returns
    -------
    numpy.ndarray
        Array with ``k`` rows per class and the same number of columns as
        ``data`` (centroid features followed by the class label). Classes
        appear in the sorted order produced by ``np.unique``. An input with
        no rows yields an array with zero rows.
    """
    data = np.asarray(data)
    n_columns = data.shape[1]
    labels_column = data[:, -1]

    # Collect one (k, n_columns) block per class and join them once at the
    # end, instead of growing an array with np.append on every iteration.
    centroid_blocks = []
    for label in np.unique(labels_column):
        class_rows = data[labels_column == label]
        model = KMeans(n_clusters=k, max_iter=max_iter, random_state=random_state)
        model.fit(class_rows[:, :-1])
        centroid_labels = np.full(k, label)
        centroid_blocks.append(np.c_[model.cluster_centers_, centroid_labels])

    if not centroid_blocks:
        return np.empty((0, n_columns))
    return np.concatenate(centroid_blocks, axis=0)

In [None]:
# The clustered dataset (1000 KMeans centroids per class) is cached on disk.
# Load it when the file is there; otherwise build it from the dataset currently
# held in `data` and save it, so the notebook also runs on a fresh checkout.
CLUSTERED_DATA_PATH = "clustered-data-1000.csv"
try:
    data = np.loadtxt(CLUSTERED_DATA_PATH, delimiter=',')
except OSError:
    # np.loadtxt could not open the cache file: recompute and store it.
    data = clusterize_data(data, 1000)
    np.savetxt(CLUSTERED_DATA_PATH, data, delimiter=',')
# NOTE: from this cell on, `data` holds the clustered centroids, not the raw rows.
data.shape

In [None]:
# `experiments_indices` holds permutations of the full dataset's row numbers.
# The clustered `data` loaded above is expected to have far fewer rows (1000
# centroids per class according to the generation code), so indexing it with
# those permutations would go out of bounds. Draw the same number of
# permutations, sized to the current `data`, with a fixed seed for
# reproducibility.
clustered_rng = np.random.RandomState(42)
clustered_experiments_indices = [
    clustered_rng.permutation(data.shape[0]) for _ in range(len(experiments_indices))
]
acc_df, metrics_df = run_experiments(data, clustered_experiments_indices)
acc_df

In [None]:
# Mean sensitivity / specificity / precision per classifier for the run on the
# clustered data above.
metrics_df