In [1]:
import logging
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

# Must run before the `algorithms` imports below: it puts the parent directory
# on the module search path.
sys.path.append('../')

from algorithms.ConfusionMatrix import ConfusionMatrix
from algorithms.Gaussian import QuadraticGaussianClassifier, NormalNaiveBayes
from algorithms.NearestNeighbors import NearestCentroidClassifier
from algorithms.Quadratic import QuadraticClassifier
from algorithms.utils import train_test_split

# Root logger used throughout the notebook: DEBUG threshold, and a record
# format of "<time> - <logger name> - <level> - <message>".
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [2]:
# Read the data set into a DataFrame and preview its first rows.
df = pd.read_csv(
    "../datasets/default of credit card clients.csv",
    delimiter=',',
)
df.head()

Unnamed: 0,ID,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
# Number of rows per distinct value of the Y column.
df['Y'].value_counts()

0    23364
1     6636
Name: Y, dtype: int64

In [4]:
# Load the same CSV as a plain NumPy array for the classifiers.
# NOTE(review): df.head() shows an "ID" column first, and all 25 columns are
# kept here, so the row ID appears to reach the classifiers as a feature.
# Confirm whether it should be dropped before training.
data = np.loadtxt(
    "../datasets/default of credit card clients.csv",
    delimiter=',',
    skiprows=1,  # skip the first line of the file instead of parsing it as numbers
)
data.shape

(30000, 25)

In [18]:
def _scalar_label(y):
    """Reduce the prediction for a single sample to one label value."""
    if isinstance(y, np.ndarray):
        # A model fed a (1, n_features) row may answer with a length-1 array;
        # keep only the label so every entry of the prediction list is a scalar.
        return y.reshape(-1)[0]
    if isinstance(y, list):
        return y[0]
    return y


def _predict_one(model, x):
    """Predict the label of one sample, retrying with a 2-D row if needed."""
    try:
        y = model.predict(x)
    except Exception:
        # Some models reject a 1-D sample; retry with a (1, n_features) row.
        # A genuine failure is raised again by this second call.
        y = model.predict(x.reshape(1, -1))
    return _scalar_label(y)


def run_experiments(data, times=100):
    """Evaluate every classifier over repeated random train/test splits.

    For each model, `times` splits are drawn with
    `train_test_split(data, .8, shuffle=True)`. The model is refit on each
    training part and the test part is predicted one sample at a time, then
    scored through `ConfusionMatrix`.

    Parameters
    ----------
    data : 2-D array
        One sample per row; the last column is the class label and all other
        columns are used as features.
    times : int, default 100
        Number of random splits per model. Must be at least 1.

    Returns
    -------
    (acc_df, metrics_df) : tuple of pandas.DataFrame
        `acc_df` holds mean, median, min/max and standard deviation of the
        accuracy per model. `metrics_df` holds mean sensitivity, specificity
        and precision per model. All values are multiplied by 100 and rounded
        to one decimal (assumes ConfusionMatrix returns fractions, which is
        to be confirmed).

    Raises
    ------
    ValueError
        If `times` is smaller than 1.
    """
    if times < 1:
        raise ValueError("times must be >= 1, got {}".format(times))

    # Silence logging during the runs, but restore the caller's level after,
    # so the notebook's logger is not left at CRITICAL.
    previous_level = logger.level
    logger.setLevel(logging.CRITICAL)
    try:
        # Fresh model instances per call; each one is refit on every split.
        # The KNN and NN entries use scikit-learn's KNeighborsClassifier.
        models = {
            "NB": NormalNaiveBayes(),
            "CQ(P)":
                QuadraticClassifier(check_invertibility=True, pinv_mode="pooled"),
            "CQG(P)":
                QuadraticGaussianClassifier(check_invertibility=True, pinv_mode="pooled"),
            "CQ(F)":
                QuadraticClassifier(check_invertibility=True, pinv_mode="friedman"),
            "CQG(F)":
                QuadraticGaussianClassifier(check_invertibility=True, pinv_mode="friedman"),
            "DMC": NearestCentroidClassifier(),
            "KNN(k=11)": KNeighborsClassifier(n_neighbors=11),
            "NN": KNeighborsClassifier(n_neighbors=1),
        }

        acc_results = []
        metrics_results = []
        for model_name, model in models.items():
            print("Running model {}".format(model_name))

            scores = []
            sensitivities = []
            specificities = []
            precisions = []
            for _ in range(times):
                train, test = train_test_split(data, .8, shuffle=True)
                model.fit(train[:, :-1], train[:, -1])

                predicted = [_predict_one(model, x) for x in test[:, :-1]]
                conf_matrix = ConfusionMatrix(test[:, -1], predicted)

                sensitivities.append(conf_matrix.sensitivity())
                specificities.append(conf_matrix.specificity())
                precisions.append(conf_matrix.precision())
                scores.append(conf_matrix.accuracy())

            acc_results.append({
                "1 - Alg": model_name,
                "2 - Média(%)": np.mean(scores)*100,
                "3 - Mediana(%)": np.median(scores)*100,
                "4 - Min/Max(%)": "{:.1f} / {:.1f}".format(min(scores)*100, max(scores)*100),
                "5 - Desv. Pad.(%)": np.std(scores)*100
            })

            metrics_results.append({
                "1 - Alg": model_name,
                "6 - Sensibilidade(%)": np.mean(sensitivities)*100,
                "7 - Especificidade(%)": np.mean(specificities)*100,
                "8 - Precisão(%)": np.mean(precisions)*100
            })
    finally:
        logger.setLevel(previous_level)

    return pd.DataFrame(acc_results).round(1), pd.DataFrame(metrics_results).round(1)

In [13]:
# Evaluate all classifiers on `data` as currently bound, with 100 random
# splits per model, then show the accuracy summary.
acc_df, metrics_df = run_experiments(data, times=100)
acc_df

Running model NB
Running model CQ(P)
Running model CQG(P)
Running model CQ(F)
Running model CQG(F)


  r = _umath_linalg.det(a, signature=signature)


Running model DMC
Running model KNN(k=11)
Running model NN


Unnamed: 0,1 - Alg,2 - Média(%),3 - Mediana(%),4 - Min/Max(%),5 - Desv. Pad.(%)
0,NB,70.5,71.0,61.2 / 77.6,3.9
1,CQ(P),65.0,77.7,21.6 / 79.2,23.4
2,CQG(P),65.0,77.7,21.6 / 79.2,23.4
3,CQ(F),77.8,77.8,76.7 / 79.2,0.5
4,CQG(F),65.0,77.7,21.6 / 79.2,23.4
5,DMC,53.6,53.6,52.2 / 54.9,0.5
6,KNN(k=11),76.7,76.7,75.7 / 78.2,0.5
7,NN,68.7,68.7,67.7 / 70.0,0.5


In [14]:
# Mean sensitivity, specificity and precision per model from the latest run_experiments call.
metrics_df

Unnamed: 0,1 - Alg,6 - Sensibilidade(%),7 - Especificidade(%),8 - Precisão(%)
0,NB,65.3,72.0,40.6
1,CQ(P),23.0,77.0,5.1
2,CQG(P),23.0,77.0,5.1
3,CQ(F),0.0,100.0,0.0
4,CQG(F),23.0,77.0,5.1
5,DMC,67.1,49.8,27.6
6,KNN(k=11),12.5,95.1,42.0
7,NN,29.0,80.0,29.3


In [15]:
from sklearn.cluster import KMeans

def clusterize_data(data, k, max_iter=10, random_state=None):
    """Replace the samples of each class by `k` k-means centroids.

    Parameters
    ----------
    data : 2-D array
        One sample per row; the last column is the class label and the
        remaining columns are the features that get clustered.
    k : int
        Number of clusters fitted per class.
    max_iter : int, default 10
        Forwarded to `KMeans(max_iter=...)`.
    random_state : int or None, default None
        Forwarded to `KMeans(random_state=...)`. Pass an integer for a
        reproducible result; None keeps the previous unseeded behaviour.

    Returns
    -------
    2-D array
        `k` rows per class, in the order of `np.unique` over the label
        column. Each row is a centroid followed by the class label. An input
        with no rows yields an array of shape `(0, data.shape[1])`.
    """
    n_columns = data.shape[1]
    labels = data[:, -1]

    # Collect one block per class and stack once, rather than growing the
    # result with np.append, which copies the whole array on every iteration.
    blocks = []
    for label in np.unique(labels):
        members = data[labels == label]
        kmeans = KMeans(n_clusters=k, max_iter=max_iter, random_state=random_state)
        kmeans.fit(members[:, :-1])
        blocks.append(np.c_[kmeans.cluster_centers_, np.full(k, label)])

    if not blocks:
        return np.empty((0, n_columns))
    return np.vstack(blocks)

In [16]:
# Reduce each class to 1000 k-means centroids and cache the result on disk.
# When the cache file exists it is loaded instead of recomputing. This avoids
# repeating the expensive clustering and keeps the cell idempotent, since
# re-running it no longer re-clusters the already clustered `data`.
# Delete the file to force a fresh clustering, for example after changing the
# data set or clusterize_data.
clustered_path = Path("clustered-data-1000.csv")
if clustered_path.exists():
    data = np.loadtxt(clustered_path, delimiter=',')
else:
    data = clusterize_data(data, 1000)
    np.savetxt(clustered_path, data, delimiter=',')
data.shape

(2000, 25)

In [19]:
# Repeat the evaluation on `data` as currently bound, with 100 random splits
# per model. This rebinds acc_df and metrics_df from any earlier run.
acc_df, metrics_df = run_experiments(data, times=100)
acc_df

Running model NB
Running model CQ(P)
Running model CQG(P)
Running model CQ(F)
Running model CQG(F)


  r = _umath_linalg.det(a, signature=signature)


Running model DMC
Running model KNN(k=11)
Running model NN


Unnamed: 0,1 - Alg,2 - Média(%),3 - Mediana(%),4 - Min/Max(%),5 - Desv. Pad.(%)
0,NB,68.3,68.2,61.3 / 75.5,2.6
1,CQ(P),50.2,50.2,43.8 / 55.0,2.4
2,CQG(P),50.2,50.0,46.0 / 55.5,2.0
3,CQ(F),50.1,50.0,43.5 / 55.5,2.2
4,CQG(F),50.0,50.0,44.0 / 56.5,2.4
5,DMC,60.8,60.8,55.5 / 65.8,2.2
6,KNN(k=11),60.0,59.5,55.5 / 66.8,2.3
7,NN,35.7,35.5,31.0 / 40.8,2.0


In [20]:
# Mean sensitivity, specificity and precision per model from the latest run_experiments call.
metrics_df

Unnamed: 0,1 - Alg,6 - Sensibilidade(%),7 - Especificidade(%),8 - Precisão(%)
0,NB,88.9,47.8,63.0
1,CQ(P),41.0,59.0,20.8
2,CQG(P),59.0,41.0,29.6
3,CQ(F),0.0,100.0,0.0
4,CQG(F),55.0,45.0,27.5
5,DMC,66.6,55.1,59.7
6,KNN(k=11),72.0,48.2,58.0
7,NN,32.2,39.3,34.7
