In [37]:
#Librerias necesarias

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

#Modelos a explorar
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

#Metricas usadas
from sklearn.metrics import make_scorer,recall_score, precision_score, confusion_matrix, average_precision_score, confusion_matrix

import warnings

from math import ceil

warnings.simplefilter(action="ignore")

%matplotlib inline

mpl.rcParams['figure.dpi'] = 100
mpl.rcParams['figure.figsize'] = (6, 5)


In [47]:
#Funciones y clases requeridas

def dataFaltante(dataSet):
    '''
    Funcion que retorna informacion referente a data faltante de un dataset, en forma de string
    dataset: Objeto de tipo panda.DataFrame
    '''
    str_p = ""
    for col in dataSet:
        a = dataSet[col].isnull().sum()
        if a > 0:
            str_p += f"\nCol {str(col)}:\n\t Faltantes: {a}, Porcentaje: {a/dataSet.shape[0]}"
        if str_p == "":
            str_p = "Dataset no tiene datos en blanco"
    return str_p

def porcentajeClases(targetsSerie):
    '''
    retorna objeto tipo <pandas.DataFrame> con el conteo de elementos y su porcentaje con respecto a 
    targetsSeries.shape[0]
    targetsSeries: Elemento tipo <pandas.Series> que contiene etiquetas
    '''
    s1 = targetsSerie.value_counts().rename("Elementos por clase")
    s2 = pd.Series(
        data=[(x/targetsSerie.shape[0])*100 for x in targetsSerie.value_counts()], 
        index=list(s1.index),
        name="Porcentaje")
    
    return pd.concat([s1, s2], axis=1)

def showBoxPlots(dataFrame, features=None):
    '''
    Funcion que muestra diagrama de caja y bigotes para las caracteristicas contenidas en un dataset. Si no
    se suministra el parámetro features, se mostrará un diagrama para cada feature.
    dataFrame: Objeto <pandas.DataFrame> que contiene las caracteristicas.
    features: Objeto <List> que contiene los nombres de las caracteristicas a mostrar.
    '''
    def particion(feat):
        if len(feat)>3:
            return (ceil(len(feat)/3), 3)
        else:
            return (1,3)
    if features == None:
        features = dataFrame.keys()
        
    tp = particion(features)
    l = 5 * tp[0] 
    fig = plt.figure(figsize=(12, l))
    for i, feature in enumerate(features):
        ax=fig.add_subplot(tp[0],tp[1],i+1)
        #sns.boxplot(x=dataFrame[feature], ax=ax, width=0.2, fliersize=0.5)
        #sns.stripplot(x=dataFrame[feature], ax=ax, edgecolor="red", size=1)
        sns.violinplot(x=dataFrame[feature], ax=ax)
        plt.grid()
    plt.show()
    
class data_gridsearch:
    '''
    Clase encargada de entregar datos a constructor de objeto <sklearn.model_selection.GridSearchCV>
    '''
    def __init__(self, cv=3, scoring="average_precision", verbose=2, refit=True):
        self.estimator = None
        self.param_grid = None
        self.scoring = scoring
        self.cv = cv
        self.verbose = verbose
        self.refit = refit
        
    def to_dict(self):
        '''
        Retorna diccionario para ser usado en el constructor de objeto <sklearn.model_selection.GridSearchCV>
        '''
        return {"estimator": self.estimator, "param_grid": self.param_grid, "scoring": self.scoring, "cv": self.cv, "verbose": self.verbose, "refit": self.refit}
    
def FPR_(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    print(f"M:{cm}\n")
    return 1-(cm[0,0]/(cm[0,0]+cm[0,1])) #1 

def setBestResults(clf, n):
    '''
    Funcion que genera dataFrame con los n mejores clasificadores segun la metrica dispuesta en <Refit>
    de objeto <GridSearch>
    '''
    dfs = pd.DataFrame(clf.cv_results_)
    t = dfs[dfs["rank_test_average_p"] < n]
    filter_col = [col for col in t if col.startswith("mean")]
    filter_col.append("rank_test_average_p")
    
    return t[filter_col].sort_values("rank_test_average_p")

In [None]:
#Se trae dataset alojado en GoogleDrive. Nombre: creditcard.csv

!gdown --id 1WINdeQEoz6qlFkdxfXQLnLkyXYmyAzDQ

In [39]:
#carga de dataset a ambiente
dataSet = pd.read_csv("creditcard.csv")

In [None]:
#Informacion basica del set:

print(f"Número de caracteristicas: {dataSet.shape[1]}\n")
print(f"Número de ejemplos:        {dataSet.shape[0]}\n")
print(f"Tipos de datos presentes(columnas):\n\n{dataSet.dtypes.value_counts()}\n")
print(f"Data faltante:\n{dataFaltante(dataSet)}\n")
print(f"Registros duplicados:\n{dataSet.duplicated().sum()}\n")

In [40]:
#Eliminar registros no utiles
dataSet.drop_duplicates(keep="first", inplace=True, ignore_index=False)
print(f"Número de caracteristicas: {dataSet.shape[1]}\n")
print(f"Número de ejemplos:        {dataSet.shape[0]}\n")

#Revision de info:
dataSet.info()


Número de caracteristicas: 31

Número de ejemplos:        283726

<class 'pandas.core.frame.DataFrame'>
Int64Index: 283726 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    283726 non-null  float64
 1   V1      283726 non-null  float64
 2   V2      283726 non-null  float64
 3   V3      283726 non-null  float64
 4   V4      283726 non-null  float64
 5   V5      283726 non-null  float64
 6   V6      283726 non-null  float64
 7   V7      283726 non-null  float64
 8   V8      283726 non-null  float64
 9   V9      283726 non-null  float64
 10  V10     283726 non-null  float64
 11  V11     283726 non-null  float64
 12  V12     283726 non-null  float64
 13  V13     283726 non-null  float64
 14  V14     283726 non-null  float64
 15  V15     283726 non-null  float64
 16  V16     283726 non-null  float64
 17  V17     283726 non-null  float64
 18  V18     283726 non-null  float64
 19  V19     283726 non-n

In [None]:
#Revision de dominio de las variables y etiquetas:

#Etiquetas:
sns.countplot(data=dataSet, x="Class")
plt.show()
#Porcentaje de ejemplos por clase:
print(f"Revision de etiquetas:\n {porcentajeClases(dataSet['Class'])}")

In [None]:
#revision de features: Time', 'V1', 'V2', 'V3', 'V4', 'V5'
showBoxPlots(dataSet, ['Time', 'V1', 'V2', 'V3', 'V4', 'V5'])

In [None]:
#revision de features: 'V6', 'V7', 'V8', 'V9', 'V10','V11'
showBoxPlots(dataSet, ['V6', 'V7', 'V8', 'V9', 'V10','V11'])

In [None]:
#revision de features: 'V12', 'V13', 'V14', 'V15', 'V16', 'V17'
showBoxPlots(dataSet, ['V12', 'V13', 'V14', 'V15', 'V16', 'V17'])

In [None]:
#revision de features: 'V18', 'V19', 'V20', 'V21', 'V22', 'V23'
showBoxPlots(dataSet, ['V18', 'V19', 'V20', 'V21', 'V22', 'V23'])

In [None]:
#revision de features: 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'
showBoxPlots(dataSet, ['V24', 'V25', 'V26', 'V27', 'V28', 'Amount'])

In [None]:
#Montos en situaciones de fraude:
#sns.violinplot(x="Class", y="Amount", data=dataSet, scale="count")
fig, ax = plt.subplots(nrows=2, ncols=1)
fig.tight_layout(pad=3.0)

ax[0].set_title("Densidad de distribucion(según monto) para transacciones sin Fraude")
ax[1].set_title("Densidad de distribucion(según monto) para transacciones con Fraude")
am0 = dataSet["Amount"][dataSet["Class"] == 0]
am1 = dataSet["Amount"][dataSet["Class"] == 1]
sns.violinplot(x=am0, ax=ax[0])
print(f"Describe para Categoria 0, segun monto\n\n{am0.describe()}\n")
print(f"Describe para Categoria 1, segun monto\n\n{am1.describe()}\n")
sns.violinplot(x=am1, ax=ax[1])
plt.show()
del am0, am1

In [41]:
#Procesamiento manual de datos
X = dataSet.drop(labels=["Time","Class"], axis = 1)
y = dataSet["Class"]
print(f"Tamaño X: {X.shape}")
print(f"Tamaño y: {y.shape}")

Tamaño X: (283726, 29)
Tamaño y: (283726,)


In [43]:
#Particionamiento en prueba y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234, stratify=y)
print(f"Particiones:\n\n## Entrenamiento ##\n\tX: {X_train.shape}\n\ty: {y_train.shape}\nEjemplos por clase:\n{y_train.value_counts()}\n\n## Prueba ##\n\tX: {X_test.shape}\n\ty: {y_test.shape}\nEjemplos por clase:\n{y_test.value_counts()}")

Particiones:

## Entrenamiento ##
	X: (198608, 29)
	y: (198608,)
Ejemplos por clase:
0    198277
1       331
Name: Class, dtype: int64

## Prueba ##
	X: (85118, 29)
	y: (85118,)
Ejemplos por clase:
0    84976
1      142
Name: Class, dtype: int64


In [44]:
#Objetos comunes para los estimadores a explorar

dict_scoring = {"FPR": make_scorer(FPR_, greater_is_better=True),
                "rec": "recall",
                "prec": "precision",
                "average_p" : "average_precision"
               }

sc = ("columTransformer", ColumnTransformer([("standard", StandardScaler(), ["Amount"])], remainder="passthrough"))

data_svc = data_gridsearch(verbose=3)
data_svc.scoring = dict_scoring
data_svc.refit = "average_p"

In [None]:
#Pipeline y GridSearch para estimador: SVC 

data_svc.estimator = Pipeline(steps=[sc,("svc",SVC(random_state=123))])
data_svc.param_grid = {"svc__C": [10**x for x in range(-1,3)], #4 valores
                       "svc__kernel": ["linear"], #1 Valores
                       "svc__class_weight": [{1: w} for w in range(1, 200, 50)] #4valores
                       #"svc__gamma": [10**x for x in range(-1,3)] #4valores
                      }

print(f"Comienza\n")

gr = GridSearchCV(**data_svc.to_dict())
gr.fit(X_train, y_train)

print("\nTerminado")

In [8]:
#Pipeline y GridSearch para estimador: LogisticRegression

data_svc.estimator = Pipeline(steps=[sc,("LogReg",LogisticRegression(random_state=1234, max_iter=200))])
data_svc.param_grid = {"LogReg__C": [10**x for x in range(-1,3)], #4 valores
                       "LogReg__solver": ["lbfgs", "sag"], #2 Valores
                       "LogReg__class_weight": [{1: w} for w in range(1, 200, 50)] #4valores
                      }

print(f"Comienza\n")

gr = GridSearchCV(**data_svc.to_dict())
gr.fit(X_train, y_train)

print("\nTerminado")

Comienza

Fitting 3 folds for each of 32 candidates, totalling 96 fits
M:[[66083    10]
 [   41    69]]

[CV 1/3] END LogReg__C=0.1, LogReg__class_weight={1: 1}, LogReg__solver=lbfgs; FPR: (test=0.000) average_p: (test=0.731) prec: (test=0.873) rec: (test=0.627) total time=   1.9s
M:[[66083     9]
 [   47    64]]

[CV 2/3] END LogReg__C=0.1, LogReg__class_weight={1: 1}, LogReg__solver=lbfgs; FPR: (test=0.000) average_p: (test=0.750) prec: (test=0.877) rec: (test=0.577) total time=   1.5s
M:[[66081    11]
 [   43    67]]

[CV 3/3] END LogReg__C=0.1, LogReg__class_weight={1: 1}, LogReg__solver=lbfgs; FPR: (test=0.000) average_p: (test=0.747) prec: (test=0.859) rec: (test=0.609) total time=   1.5s
M:[[66084     9]
 [   45    65]]

[CV 1/3] END LogReg__C=0.1, LogReg__class_weight={1: 1}, LogReg__solver=sag; FPR: (test=0.000) average_p: (test=0.725) prec: (test=0.878) rec: (test=0.591) total time=  25.7s
M:[[66084     8]
 [   47    64]]

[CV 2/3] END LogReg__C=0.1, LogReg__class_weight={1: 

M:[[65832   261]
 [   18    92]]

[CV 1/3] END LogReg__C=1, LogReg__class_weight={1: 101}, LogReg__solver=sag; FPR: (test=0.004) average_p: (test=0.711) prec: (test=0.261) rec: (test=0.836) total time=  30.9s
M:[[65898   194]
 [   15    96]]

[CV 2/3] END LogReg__C=1, LogReg__class_weight={1: 101}, LogReg__solver=sag; FPR: (test=0.003) average_p: (test=0.739) prec: (test=0.331) rec: (test=0.865) total time=  34.8s


KeyboardInterrupt: 

In [None]:
#DataFrame con mejores resultados

setBestResults(gr, 30)

In [None]:
#Pipeline y GridSearch para estimador: RandomForest 

data_svc.estimator = Pipeline(steps=[sc,("rForest", RandomForestClassifier(random_state=123, criterion="gini"))])
data_svc.param_grid = {"rForest__n_estimators": [20, 30, 40, 50], #4 valores
                       "rForest__class_weight": [{0: 1, 1: w} for w in range(1, 200, 50)], #4valores
                       "rForest__max_depth": [4, 8, 16, 32, 64] #5valores
                      }

print(f"Comienza\n")

gr = GridSearchCV(**data_svc.to_dict())
gr.fit(X_train, y_train)

print("\nTerminado")

In [None]:
#DataFrame con mejores resultados

setBestResults(gr, 30)

In [45]:
#Pipeline y GridSearch para estimador: MLPClassifier

data_svc.estimator = Pipeline(steps=[sc,("NR",MLPClassifier(random_state=1234,
                                                                solver="adam",
                                                                learning_rate="constant",
                                                                learning_rate_init=0.0008,
                                                                max_iter=1000,
                                                                n_iter_no_change=30))])
data_svc.param_grid = {"NR__hidden_layer_sizes": [(20,5),(20,10),(20,15),(20,20)], #4 valores
                       "NR__activation": ["logistic", "relu", "tanh"] #3 Valores
                      }

print(f"Comienza\n")

gr = GridSearchCV(**data_svc.to_dict())
gr.fit(X_train, y_train)

print("\nTerminado")

Comienza

Fitting 3 folds for each of 12 candidates, totalling 36 fits
M:[[66074    19]
 [   25    85]]

[CV 1/3] END NR__activation=logistic, NR__hidden_layer_sizes=(20, 5); FPR: (test=0.000) average_p: (test=0.719) prec: (test=0.817) rec: (test=0.773) total time=  44.7s
M:[[66080    12]
 [   27    84]]

[CV 2/3] END NR__activation=logistic, NR__hidden_layer_sizes=(20, 5); FPR: (test=0.000) average_p: (test=0.841) prec: (test=0.875) rec: (test=0.757) total time=  36.0s
M:[[66080    12]
 [   25    85]]

[CV 3/3] END NR__activation=logistic, NR__hidden_layer_sizes=(20, 5); FPR: (test=0.000) average_p: (test=0.833) prec: (test=0.876) rec: (test=0.773) total time=  36.8s
M:[[66074    19]
 [   26    84]]

[CV 1/3] END NR__activation=logistic, NR__hidden_layer_sizes=(20, 10); FPR: (test=0.000) average_p: (test=0.713) prec: (test=0.816) rec: (test=0.764) total time=  36.5s
M:[[66081    11]
 [   26    85]]

[CV 2/3] END NR__activation=logistic, NR__hidden_layer_sizes=(20, 10); FPR: (test=0.00

In [48]:
#DataFrame con mejores resultados 

setBestResults(gr, 5)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_FPR,mean_test_rec,mean_test_prec,mean_test_average_p,rank_test_average_p
4,55.149791,0.235676,0.000182,0.758313,0.875215,0.818439,1
5,46.35546,0.303975,0.000252,0.785531,0.839077,0.813502,2
7,68.972586,0.40269,0.000151,0.773437,0.89807,0.810214,3
6,75.738294,0.327237,9.6e-05,0.75834,0.930686,0.809685,4


In [51]:
dfs = pd.DataFrame(gr.cv_results_)
t = dfs[dfs["rank_test_average_p"] < 5].sort_values("rank_test_average_p")
t

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_NR__activation,param_NR__hidden_layer_sizes,params,split0_test_FPR,split1_test_FPR,split2_test_FPR,...,split2_test_prec,mean_test_prec,std_test_prec,rank_test_prec,split0_test_average_p,split1_test_average_p,split2_test_average_p,mean_test_average_p,std_test_average_p,rank_test_average_p
4,55.149791,4.534699,0.235676,0.076981,relu,"(20, 5)","{'NR__activation': 'relu', 'NR__hidden_layer_s...",0.000242,0.000121,0.000182,...,0.877551,0.875215,0.031883,5,0.802652,0.833866,0.8188,0.818439,0.012745,1
5,46.35546,2.052634,0.303975,0.07748,relu,"(20, 10)","{'NR__activation': 'relu', 'NR__hidden_layer_s...",0.000318,0.000197,0.000242,...,0.850467,0.839077,0.02993,12,0.777677,0.832278,0.830551,0.813502,0.025342,2
7,68.972586,9.048872,0.40269,0.007826,relu,"(20, 20)","{'NR__activation': 'relu', 'NR__hidden_layer_s...",0.000272,0.000106,7.6e-05,...,0.945055,0.89807,0.052215,2,0.778253,0.821807,0.830583,0.810214,0.022882,3
6,75.738294,9.085864,0.327237,0.090648,relu,"(20, 15)","{'NR__activation': 'relu', 'NR__hidden_layer_s...",0.000136,0.000121,3e-05,...,0.976744,0.930686,0.032768,1,0.767705,0.816611,0.844738,0.809685,0.031828,4
