# Detección de transacciones fraudulentas

## Uso de librerías

In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

## Entrada y exploración de datos

In [13]:
df = pd.read_csv('dataset/creditcard.csv', dtype={'column_name': 'string'})
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [14]:
print("Información del dataset:")
print(f"Dimensiones: {df.shape}")
print(f"Distribución de clases: {df['Class'].value_counts()}")
print(f"Porcentaje de fraude: {df['Class'].mean()*100:.4f}%")
print(f"\nValores nulos: {df.isnull().sum().sum()}")

Información del dataset:
Dimensiones: (284807, 31)
Distribución de clases: Class
0    284315
1       492
Name: count, dtype: int64
Porcentaje de fraude: 0.1727%

Valores nulos: 0


## Preprocesamiento de datos

### Escalar datos

In [15]:
columnas_pca = [f'V{i}' for i in range(1, 29)]

print("VERIFICACIÓN DE COMPONENTES PCA:")
print("=" * 50)

for col in columnas_pca:
    media = df[col].mean()
    std = df[col].std()
    print(f"{col}: Media = {media:8.4f}, Std = {std:8.4f}")

VERIFICACIÓN DE COMPONENTES PCA:
V1: Media =   0.0000, Std =   1.9587
V2: Media =   0.0000, Std =   1.6513
V3: Media =  -0.0000, Std =   1.5163
V4: Media =   0.0000, Std =   1.4159
V5: Media =   0.0000, Std =   1.3802
V6: Media =   0.0000, Std =   1.3323
V7: Media =  -0.0000, Std =   1.2371
V8: Media =   0.0000, Std =   1.1944
V9: Media =  -0.0000, Std =   1.0986
V10: Media =   0.0000, Std =   1.0888
V11: Media =   0.0000, Std =   1.0207
V12: Media =  -0.0000, Std =   0.9992
V13: Media =   0.0000, Std =   0.9953
V14: Media =   0.0000, Std =   0.9586
V15: Media =   0.0000, Std =   0.9153
V16: Media =   0.0000, Std =   0.8763
V17: Media =  -0.0000, Std =   0.8493
V18: Media =   0.0000, Std =   0.8382
V19: Media =   0.0000, Std =   0.8140
V20: Media =   0.0000, Std =   0.7709
V21: Media =   0.0000, Std =   0.7345
V22: Media =  -0.0000, Std =   0.7257
V23: Media =   0.0000, Std =   0.6245
V24: Media =   0.0000, Std =   0.6056
V25: Media =   0.0000, Std =   0.5213
V26: Media =   0.0000, Std

In [16]:
# Verificación general
print(f"\nESTADÍSTICAS GENERALES PCA:")
print(f"Rango de medias: [{df[columnas_pca].mean().min():.4f}, {df[columnas_pca].mean().max():.4f}]")
print(f"Rango de std: [{df[columnas_pca].std().min():.4f}, {df[columnas_pca].std().max():.4f}]")


ESTADÍSTICAS GENERALES PCA:
Rango de medias: [-0.0000, 0.0000]
Rango de std: [0.3301, 1.9587]


Dado que las medias son igual a 0, en el caso de las variables PCA, pero sus desviaciones estándar no son iguales, se necesita escalar esos datos sin centrar.

In [17]:
escalado_sin_centrar = StandardScaler(with_mean=False)

preprocesador_optimo = ColumnTransformer([
    # PCA: Solo escalar, no centrar (ya están centradas)
    ('pca_features', escalado_sin_centrar, [f'V{i}' for i in range(1, 29)]),
    
    # Time y Amount: RobustScaler completo
    ('robust_features', RobustScaler(), ['Time', 'Amount'])
])

print("ESTRATEGIA APLICADA:")
print("   - V1-V28: StandardScaler(with_mean=False) → Solo escalar")
print("   - Time: RobustScaler → Manejar outliers temporales")  
print("   - Amount: RobustScaler → Manejar outliers monetarios")

ESTRATEGIA APLICADA:
   - V1-V28: StandardScaler(with_mean=False) → Solo escalar
   - Time: RobustScaler → Manejar outliers temporales
   - Amount: RobustScaler → Manejar outliers monetarios


In [18]:
# Aplicar el preprocesamiento
X_escalado = preprocesador_optimo.fit_transform(df.drop('Class', axis=1))

# Convertir a DataFrame para verificación
columnas_escaladas = [f'V{i}' for i in range(1, 29)] + ['Time', 'Amount']
df_verificacion = pd.DataFrame(X_escalado, columns=columnas_escaladas)

print("📊 VERIFICACIÓN POST-ESCALADO:")
print("Componentes PCA después del escalado:")
print(df_verificacion[[f'V{i}' for i in range(1, 29)]].std())

📊 VERIFICACIÓN POST-ESCALADO:
Componentes PCA después del escalado:
V1     1.000002
V2     1.000002
V3     1.000002
V4     1.000002
V5     1.000002
V6     1.000002
V7     1.000002
V8     1.000002
V9     1.000002
V10    1.000002
V11    1.000002
V12    1.000002
V13    1.000002
V14    1.000002
V15    1.000002
V16    1.000002
V17    1.000002
V18    1.000002
V19    1.000002
V20    1.000002
V21    1.000002
V22    1.000002
V23    1.000002
V24    1.000002
V25    1.000002
V26    1.000002
V27    1.000002
V28    1.000002
dtype: float64


### Dividir el conjunto de datos

In [19]:
X = df.drop('Class', axis=1)
y = df['Class']  

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42
)

print("DIVISIÓN ESTRATIFICADA 70-30:")
print(f"Dataset original - Clase 1: {y.mean():.4f}%")
print(f"Train - Clase 1: {y_train.mean():.4f}%")
print(f"Test - Clase 1: {y_test.mean():.4f}%")

DIVISIÓN ESTRATIFICADA 70-30:
Dataset original - Clase 1: 0.0017%
Train - Clase 1: 0.0017%
Test - Clase 1: 0.0017%


## Balanceo de Clases

In [20]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [None]:
# dividir los datos
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# aplicar el preprocesamiento
X_train_preprocesado = preprocesador_optimo.fit_transform(X_train)
X_test_preprocesado = preprocesador_optimo.transform(X_test)

# se aplica SMOTE solo al conjunto de entrenamiento
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_preprocesado, y_train)

y_train_balanced.value_counts()

#

from scipy.sparse import issparse
import numpy as np

def to_dense(X):
    from scipy.sparse import issparse
    return X.toarray() if issparse(X) else X

Xtr_bal = to_dense(X_train_balanced)          # <-- este es el correcto para fit de RFE/RFECV
Xte      = to_dense(X_test_preprocesado)      # test NO se resamplea

# Si necesitas nombres de columnas del preprocesador:
try:
    feature_names = preprocesador_optimo.get_feature_names_out()
except Exception:
    feature_names = np.array([f"f{i}" for i in range(Xtr_bal.shape[1])])


In [29]:
# =========================
# Selección de características por modelo + evaluación
# =========================
import numpy as np
from scipy.sparse import issparse
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector, SelectKBest, mutual_info_classif

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC  # SVM lineal con coef_, apto para RFE

# --- Utilidades ---
def to_dense(X):
    return X.toarray() if issparse(X) else X

def get_feature_names(preprocesador, n_cols):
    try:
        return preprocesador.get_feature_names_out()
    except Exception:
        return np.array([f"f{i}" for i in range(n_cols)])

def evaluar_modelo(nombre, clf, X_tr, y_tr, X_te, y_te, proba_ok=True):
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)

    print(f"\n===== {nombre} =====")
    print(classification_report(y_te, y_pred, digits=4))
    # ROC-AUC si el clasificador tiene predict_proba o decision_function
    auc = None
    if proba_ok and hasattr(clf, "predict_proba"):
        y_proba = clf.predict_proba(X_te)[:, 1]
        auc = roc_auc_score(y_te, y_proba)
    elif hasattr(clf, "decision_function"):
        y_scores = clf.decision_function(X_te)
        auc = roc_auc_score(y_te, y_scores)
    if auc is not None:
        print("ROC-AUC:", round(auc, 4))
    return clf

# Densificar si hace falta (RFE y SFS suelen requerir matrices densas)
Xtr = to_dense(X_train_preprocesado)
Xte = to_dense(X_test_preprocesado)
feature_names = get_feature_names(preprocesador_optimo, Xtr.shape[1])

# CV común
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

resultados = {}

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

dt_base = DecisionTreeClassifier(random_state=42)
rfecv_dt = RFECV(
    estimator=dt_base,
    step=1,
    cv=cv,
    scoring="f1",
    n_jobs=-1
)

# ✅ Ajusta con datos balanceados
rfecv_dt.fit(Xtr_bal, y_train_balanced)

# Transforma tanto train balanceado como test original
X_train_sel_dt = rfecv_dt.transform(Xtr_bal)
X_test_sel_dt  = rfecv_dt.transform(Xte)

mask_dt = rfecv_dt.support_
sel_names_dt = feature_names[mask_dt]
print("DT - #features:", mask_dt.sum())

KeyboardInterrupt: 

## Selección de características

In [None]:
# =============== 2) Random Forest + RFE/RFECV ===============
rf_base = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rfecv_rf = RFECV(
    estimator=rf_base,
    step=1,
    cv=cv,
    scoring="f1",
    n_jobs=-1
)
rfecv_rf.fit(Xtr, y_train_balanced)
mask_rf = rfecv_rf.support_
sel_names_rf = feature_names[mask_rf]
print("\n[Random Forest] Nº features seleccionadas:", mask_rf.sum())
print("Algunas features:", list(sel_names_rf[:20]))
Xtr_rf = rfecv_rf.transform(Xtr)
Xte_rf = rfecv_rf.transform(Xte)

clf_rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
clf_rf = evaluar_modelo("Random Forest (con RFE)", clf_rf, Xtr_rf, y_train_balanced, Xte_rf, y_test, proba_ok=True)
resultados["RandomForest"] = {"n_features": mask_rf.sum(), "features": sel_names_rf}

In [None]:
# =============== 3) SVM (lineal) + RFE/RFECV ===============
# Usamos LinearSVC (tiene coef_) -> apto para RFE; más estable que SVC(kernel='linear') en high-dim
svm_base = LinearSVC(C=1.0, random_state=42)
rfecv_svm = RFECV(
    estimator=svm_base,
    step=1,
    cv=cv,
    scoring="f1",
    n_jobs=-1
)
rfecv_svm.fit(Xtr, y_train_balanced)
mask_svm = rfecv_svm.support_
sel_names_svm = feature_names[mask_svm]
print("\n[SVM lineal] Nº features seleccionadas:", mask_svm.sum())
print("Algunas features:", list(sel_names_svm[:20]))
Xtr_svm = rfecv_svm.transform(Xtr)
Xte_svm = rfecv_svm.transform(Xte)

# Nota: LinearSVC no expone predict_proba; usamos decision_function para AUC
clf_svm = LinearSVC(C=1.0, random_state=42)
clf_svm = evaluar_modelo("SVM lineal (con RFE)", clf_svm, Xtr_svm, y_train_balanced, Xte_svm, y_test, proba_ok=False)
resultados["SVM_linear"] = {"n_features": mask_svm.sum(), "features": sel_names_svm}

In [None]:
# =============== 4) k-NN (NO soporta RFE) ===============
# Alternativa A (wrapper): Sequential Forward Selector con kNN
knn = KNeighborsClassifier(n_neighbors=5)
sfs_knn = SequentialFeatureSelector(
    estimator=knn,
    n_features_to_select=min(30, Xtr.shape[1]),  # ajusta este número
    direction="forward",
    scoring="f1",
    cv=cv,
    n_jobs=-1
)
sfs_knn.fit(Xtr, y_train_balanced)
mask_knn = sfs_knn.get_support()
sel_names_knn = feature_names[mask_knn]
print("\n[k-NN] (SFS) Nº features seleccionadas:", mask_knn.sum())
print("Algunas features:", list(sel_names_knn[:20]))
Xtr_knn = sfs_knn.transform(Xtr)
Xte_knn = sfs_knn.transform(Xte)

clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_knn = evaluar_modelo("k-NN (con SFS, alternativa a RFE)", clf_knn, Xtr_knn, y_train_balanced, Xte_knn, y_test, proba_ok=False)
resultados["kNN_SFS"] = {"n_features": mask_knn.sum(), "features": sel_names_knn}

# Alternativa B (filter): si prefieres algo más rápido para kNN, comenta SFS y usa SelectKBest:
# kbest = SelectKBest(mutual_info_classif, k=30)
# Xtr_knn = kbest.fit_transform(Xtr, y_train_balanced)
# Xte_knn = kbest.transform(Xte)
# mask_knn = kbest.get_support()
# sel_names_knn = feature_names[mask_knn]
# clf_knn = evaluar_modelo("k-NN (SelectKBest)", KNeighborsClassifier(n_neighbors=5),
#                          Xtr_knn, y_train_balanced, Xte_knn, y_test, proba_ok=False)
# resultados["kNN_KBest"] = {"n_features": mask_knn.sum(), "features": sel_names_knn}




In [None]:
# =============== Resumen final ===============
print("\n================ RESUMEN SELECCIÓN ================")
for k, v in resultados.items():
    print(f"{k}: {v['n_features']} features seleccionadas")

## Entrenamiento de clasificadores

## Búsqueda y selección de hiperparámetros

## Evaluar el desempeño de los modelos