# Preprocesamiento

In [12]:
from pathlib import Path

import pandas as pd

# Load the original (raw) dataset from its CSV file
raw_csv_path = '../data/raw/creditcard.csv'
df = pd.read_csv(raw_csv_path)

# Print the (rows, columns) shape, then display the first rows as the cell output
print(df.shape)
df.head()


(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### Conclusiones del EDA:
- Dataset muy desbalanceado: solo el 0.172% de las transacciones son fraudes (492 de 284 807)
- Distribuciones de 'Amount' muy asimétricas
- Variables principales (V1-V28) ya transformadas (PCA) y estandarizadas
- Potenciales correlaciones con la variable objetivo ('Class') en V11, V4, V2, V3, V10, etc.

# Tratamiento del desbalance de clases

Dado que el dataset está muy desbalanceado, podemos considerar dos estrategias:

- Remuestreo aleatorio: undersampling de la clase mayoritaria u oversampling de la minoritaria
- Generación de muestras sintéticas con técnicas como SMOTE

Por ahora, generamos un dataset balanceado con undersampling:

In [13]:
# Split by class: keep every row with Class == 1 and draw an equally sized
# random sample of rows with Class == 0 (fixed seed so the draw is repeatable)
fraud_mask = df["Class"].eq(1)
non_fraud_mask = df["Class"].eq(0)
df_fraud = df.loc[fraud_mask]
df_no_fraud = df.loc[non_fraud_mask].sample(n=len(df_fraud), random_state=42)

# Stack both subsets, shuffle all rows, and renumber the index from 0
stacked = pd.concat([df_fraud, df_no_fraud], axis=0)
shuffled = stacked.sample(frac=1, random_state=42)
df_balanced = shuffled.reset_index(drop=True)

# Print the per-class row counts, then display the first rows
print(df_balanced["Class"].value_counts())
df_balanced.head()


Class
0    492
1    492
Name: count, dtype: int64


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,157278.0,1.984787,-1.937036,0.486613,-1.245536,-2.518536,-0.544524,-1.819244,-0.074875,-0.659777,...,0.174011,0.889289,0.218248,0.492384,-0.584599,-0.13504,0.070319,-0.000205,120.0,0
1,153875.0,-0.613696,3.698772,-5.534941,5.620486,1.649263,-2.335145,-0.907188,0.706362,-3.747646,...,0.319261,-0.471379,-0.07589,-0.667909,-0.642848,0.0706,0.48841,0.292345,0.0,1
2,56424.0,0.319007,-1.072867,-0.216146,1.494709,-0.627063,-0.761867,0.941687,-0.430272,-0.267431,...,0.332521,-0.022461,-0.537691,0.452072,0.555495,-0.383543,-0.06852,0.106578,484.0,0
3,150139.0,-6.682832,-2.714268,-5.77453,1.449792,-0.661836,-1.14865,0.849686,0.433427,-1.315646,...,0.220526,1.187013,0.335821,0.215683,0.80311,0.044033,-0.054988,0.082337,237.26,1
4,85285.0,-6.713407,3.921104,-9.746678,5.148263,-5.151563,-2.099389,-5.937767,3.57878,-4.684952,...,0.954272,-0.451086,0.127214,-0.33945,0.394096,1.075295,1.649906,-0.394905,252.92,1


# Feature Engineering

Dado que las variables principales ya están transformadas (PCA), nos centramos en Amount y Time:

In [14]:
# Standardize Amount into a new Amount_scaled column
from sklearn.preprocessing import StandardScaler

# Select Amount as a one-column DataFrame (double brackets keep it 2-D)
amount_frame = df_balanced[["Amount"]]

# NOTE(review): the scaler is fitted on the undersampled frame itself —
# confirm this is intended if a train/test split happens later.
scaler = StandardScaler()
scaler.fit(amount_frame)
df_balanced["Amount_scaled"] = scaler.transform(amount_frame)

In [15]:
# Turn Time (treated as seconds) into a fractional hour within a 24-hour cycle:
# divide by 3600 to get hours, then wrap modulo 24
elapsed_seconds = df_balanced["Time"]
df_balanced["Hour"] = elapsed_seconds.div(3600).mod(24)

In [16]:
# Drop the original Time and Amount columns; their derived versions
# (Hour, Amount_scaled) remain in the frame
df_balanced_final = df_balanced.drop(columns=["Time", "Amount"])

# Save the balanced dataset, ready for modelling
output_path = Path("../data/processed/creditcard_preprocessed.csv")
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (no-op when it is already there)
output_path.parent.mkdir(parents=True, exist_ok=True)
df_balanced_final.to_csv(output_path, index=False)
print("✅ Dataset preprocesado guardado.")

✅ Dataset preprocesado guardado.
