<a href="https://colab.research.google.com/github/pablo-jph/DataScienceTFM/blob/main/Financial_Fraud_CNN_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random as python_random
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy, Precision, Recall, AUC
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
import seaborn as sns

# Fix every random number generator used here (NumPy, Python's `random`,
# TensorFlow) to one shared seed so repeated runs are reproducible.
SEED = 42
np.random.seed(SEED)
python_random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (uses the Colab-only `google.colab` module).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the dataset CSV from the mounted Google Drive folder.
file_path = '/content/drive/MyDrive/Dataset.csv'
df = pd.read_csv(file_path)

# Preview the first rows with the rich DataFrame rendering.
display(df.head())

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display(): that would render a stray "None" under the report.
df.info()

# Optional: List files in the drive to confirm the dataset path
# drive_path = '/content/drive/MyDrive/'
# print(os.listdir(drive_path))

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Data (X: numpy [n_samples, n_features], y: {0,1}) ---
# X and y are assumed to be already loaded and preprocessed (nulls / duplicates /
# outliers).
# TODO(review): the visible cells never derive X and y from `df`; define them
# before this cell so the notebook survives Restart & Run All.
X_arr = np.asarray(X)
y_arr = np.asarray(y)

# Split row indices *before* scaling so the scaler never sees validation/test rows.
# Stratified split with the same random_state and proportions as before:
# 30% is held out, and 2/3 of that hold-out becomes the test set
# => 70% / 10% / 20% (train / validation / test).
all_idx = np.arange(len(X_arr))
train_idx, tmp_idx = train_test_split(
    all_idx, test_size=0.30, stratify=y_arr, random_state=42
)
val_idx, test_idx = train_test_split(
    tmp_idx, test_size=2/3, stratify=y_arr[tmp_idx], random_state=42
)

# Fit the scaler on the training rows only. Fitting on the full matrix would leak
# validation/test mean and variance into training and inflate the reported metrics.
scaler = StandardScaler()
scaler.fit(X_arr[train_idx])
X_scaled = scaler.transform(X_arr)  # all rows, scaled with train-only statistics

# For Conv1D: [batch, timesteps(features), channels]
X_cnn = np.expand_dims(X_scaled.astype("float32"), axis=-1)  # shape: (N, F, 1)

# Materialize the splits; the hold-out (`*_tmp`) names are kept for any later use.
X_train, X_val, X_test = X_cnn[train_idx], X_cnn[val_idx], X_cnn[test_idx]
y_train, y_val, y_test = y_arr[train_idx], y_arr[val_idx], y_arr[test_idx]
X_tmp, y_tmp = X_cnn[tmp_idx], y_arr[tmp_idx]

def build_cnn_1d(input_shape):
    """Build and compile a 1D convolutional binary classifier.

    Architecture: two stages of Conv1D (kernel size 3, "same" padding, ReLU)
    -> BatchNormalization -> MaxPooling1D -> Dropout(0.25), with 32 and then
    64 filters, followed by Flatten -> Dense(64, ReLU) -> Dropout(0.5) and a
    single sigmoid output unit.

    Args:
        input_shape: Per-sample input shape handed to `layers.Input`.

    Returns:
        A compiled Keras model using Adam (learning rate 1e-3), binary
        cross-entropy loss, and two AUC metrics named "pr_auc" (PR curve)
        and "roc_auc" (ROC curve).
    """
    inputs = layers.Input(shape=input_shape)

    # Both convolutional stages are identical except for the filter count.
    features = inputs
    for n_filters in (32, 64):
        features = layers.Conv1D(n_filters, 3, padding="same", activation="relu")(features)
        features = layers.BatchNormalization()(features)
        features = layers.MaxPooling1D()(features)
        features = layers.Dropout(0.25)(features)

    # Dense classification head.
    hidden = layers.Flatten()(features)
    hidden = layers.Dense(64, activation="relu")(hidden)
    hidden = layers.Dropout(0.5)(hidden)
    outputs = layers.Dense(1, activation="sigmoid")(hidden)

    network = models.Model(inp := inputs, outputs) if False else models.Model(inputs, outputs)

    # The metric names matter: Keras reports them on validation data with a
    # "val_" prefix (e.g. "val_pr_auc").
    auc_metrics = [
        tf.keras.metrics.AUC(curve="PR", name="pr_auc"),
        tf.keras.metrics.AUC(curve="ROC", name="roc_auc"),
    ]
    network.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss="binary_crossentropy",
        metrics=auc_metrics,
    )
    return network

# Instantiate the network; the input shape is every axis of X_train after the first.
# NOTE(review): the training cell further down calls build_cnn_1d again and rebinds
# `model`, so this instance looks redundant — confirm before removing.
model = build_cnn_1d(X_train.shape[1:])


In [None]:
# Inverse-frequency class weights: class 1 is weighted by the negative/positive
# ratio of the training labels (e.g. 10–50 under heavy imbalance).
neg, pos = ((y_train == label).sum() for label in (0, 1))
w_pos = neg / max(pos, 1)  # max(..., 1) guards against division by zero
class_weight = {
    0: 1.0,
    1: float(w_pos),
}

# Both callbacks track the validation PR-AUC (higher is better).
cb = [
    # Stop after 5 epochs without improvement and restore the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_pr_auc",
        mode="max",
        patience=5,
        restore_best_weights=True,
    ),
    # Halve the learning rate after 2 stagnant epochs, down to a floor of 1e-6.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_pr_auc",
        mode="max",
        patience=2,
        factor=0.5,
        min_lr=1e-6,
    ),
]

# Build a fresh model so re-running this cell restarts training from new weights.
model = build_cnn_1d(X_train.shape[1:])

hist = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=60,
    batch_size=256,
    class_weight=class_weight,
    callbacks=cb,
    verbose=1,
)
