<a href="https://colab.research.google.com/github/paulatatian/JavaScript/blob/main/ml_analysis_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================
# 1. Montar Drive y crear estructura
# ==============================
from google.colab import drive
import os

# Mount Google Drive so the project folder persists across Colab sessions
drive.mount('/content/drive')

# Base path of the project inside Drive
base_path = '/content/drive/MyDrive/proyecto_fraude'

# Full required folder structure
folders = [
    f'{base_path}/notebooks',
    f'{base_path}/data/raw',
    f'{base_path}/data/processed',
    f'{base_path}/results/visualizations',
    f'{base_path}/results/models',
    f'{base_path}/results/reports',
    f'{base_path}/docs',
    f'{base_path}/video'
]

# Placeholder documents mapped to their initial template text
files = {
    f'{base_path}/README.md': "# Proyecto: Análisis de Datos Financieros\n\nDocumentación principal.",
    f'{base_path}/data/data_dictionary.md': "# Diccionario de Datos\n\nDescribe las variables del dataset creditcard.csv.",
    f'{base_path}/docs/methodology.md': "# Metodología\n\nExplicación paso a paso del proceso de análisis.",
    f'{base_path}/docs/conclusions.md': "# Conclusiones\n\nResumen de los hallazgos y recomendaciones finales."
}

# Create folders (exist_ok makes this a no-op for folders already present)
for folder in folders:
    os.makedirs(folder, exist_ok=True)

# Create each placeholder document only when it is missing, so re-running
# this cell never overwrites documentation that has been edited since.
for file_path, content in files.items():
    if os.path.exists(file_path):
        continue
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

# Create an empty video file (marker only); append mode leaves an existing file untouched
with open(f'{base_path}/video/presentation.mp4', 'a'):
    pass

print("✅ Estructura completa creada con éxito.\n")
for folder in folders:
    print(folder)

In [None]:
# ==============================
# 2. Cargar librerías necesarias
# ==============================
!pip install -q imbalanced-learn xgboost seaborn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
import joblib

In [None]:
# ==============================
# 3. Load the dataset
# ==============================
# The raw CSV is read from data/raw inside the project folder.
data_path = f'{base_path}/data/raw/creditcard.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Report the (rows, columns) shape, then preview the first rows as the cell output.
print(f"Filas y columnas: {df.shape}")
df.head(5)

In [None]:
# ==============================
# 4. Initial exploration
# ==============================
# Column dtypes, non-null counts and memory usage (printed by pandas itself).
df.info()

# A bare expression is only rendered when it is the last statement of a cell,
# so the summary statistics are passed to display() to make them visible here.
display(df.describe().T)

# Total number of missing cells and of fully duplicated rows.
print("Nulos:", df.isnull().sum().sum())
print("Duplicados:", df.duplicated().sum())

In [None]:
# ==============================
# 5. Visualizations (EDA)
# ==============================
# Class balance
plt.figure(figsize=(5,4))
sns.countplot(x='Class', data=df)
plt.title('Distribución de clases (0=Normal, 1=Fraude)')
plt.show()

# Share of rows labelled 1, expressed as a percentage
print("Porcentaje de fraude:", df['Class'].value_counts(normalize=True)[1]*100, "%")

# Amount distribution
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
# log10(0) is undefined, so rows with Amount == 0 cannot be placed on a log
# axis; only strictly positive amounts are drawn and the title says so.
positive_amounts = df.loc[df['Amount'] > 0, 'Amount']
sns.histplot(positive_amounts, bins=50, log_scale=True)
plt.title('Distribución de Amount > 0 (escala log)')
plt.subplot(1,2,2)
# The box plot uses a linear axis, so it keeps every row (zeros included).
sns.boxplot(x='Class', y='Amount', data=df)
plt.title('Amount por clase')
plt.tight_layout()
plt.show()

# Correlation matrix; the colour range is clipped to [-0.6, 0.6]
plt.figure(figsize=(12,10))
sns.heatmap(df.corr(), cmap='coolwarm', vmax=0.6, vmin=-0.6)
plt.title('Matriz de correlación')
plt.show()


In [None]:
# ==============================
# 6. Preprocessing (optimized)
# ==============================
# Features are every column except the target; the target is 'Class'.
X = df.drop('Class', axis=1)
y = df['Class']

# Columns to scale
num_features = ['Amount', 'Time']

# Preprocessor: standardise the listed columns and pass the others through
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), num_features)
    ],
    remainder='passthrough'  # Keeps the remaining columns unchanged
)

# ColumnTransformer outputs the transformed columns first, followed by the
# passthrough columns in their original order; mirror that for the labels.
scaled_columns = num_features + [c for c in X.columns if c not in num_features]

# Split BEFORE fitting the scaler so that the mean/std used for scaling are
# estimated from the training rows only (no test-set information leaks in).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit on the training split, then apply the same fitted scaler to the test split.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train_raw),
    columns=scaled_columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test_raw),
    columns=scaled_columns,
    index=X_test_raw.index,
)

# Full dataset transformed with the train-fitted scaler; kept because later
# cells reference X_scaled (for example, for its column names).
X_scaled = pd.DataFrame(preprocessor.transform(X), columns=scaled_columns, index=X.index)

print("Datos de entrenamiento:", X_train.shape, "y prueba:", X_test.shape)


In [None]:
# ==============================
# 7. Class balancing (SMOTE)
# ==============================
# Resample the training split only; the test split is left untouched.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Report the class counts before and after resampling.
for label, target in (
    ("Antes del balanceo:", y_train),
    ("Después del balanceo:", y_train_res),
):
    print(label, target.value_counts())


In [None]:
# ==============================
# 8. Optimized models (fast execution)
# ==============================
# NOTE(review): both models are fitted on X_train / y_train and rely on
# class_weight='balanced'; the SMOTE-resampled X_train_res / y_train_res from
# the previous section are not used here — confirm which strategy is intended.
print("Entrenando modelos, esto tardará menos de 2 minutos...")

# Logistic Regression
lr = LogisticRegression(
    max_iter=800,
    class_weight='balanced',
    solver='lbfgs',
    random_state=42
)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (label 1)
y_proba_lr = lr.predict_proba(X_test)[:, 1]

print("\n=== Logistic Regression ===")
print("ROC AUC:", round(roc_auc_score(y_test, y_proba_lr), 4))
print(classification_report(y_test, y_pred_lr, digits=4))

# Random Forest
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("\n=== Random Forest (optimizado) ===")
print("ROC AUC:", round(roc_auc_score(y_test, y_proba_rf), 4))
print(classification_report(y_test, y_pred_rf, digits=4))

# Confusion matrix (Random Forest), also saved under results/visualizations
fig_cm = f'{base_path}/results/visualizations/confusion_matrix.png'
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_rf, cmap='Blues')
plt.title('Matriz de confusión - Random Forest')
plt.savefig(fig_cm, dpi=200)
plt.show()

# Comparative ROC curve. from_estimator creates a new figure whenever no axes
# is supplied, so both curves are drawn on one explicitly shared axes; without
# ax= the result is an empty figure plus two separate single-model plots.
fig_roc, ax_roc = plt.subplots(figsize=(6,5))
RocCurveDisplay.from_estimator(lr, X_test, y_test, name='Logistic Regression', ax=ax_roc)
RocCurveDisplay.from_estimator(rf, X_test, y_test, name='Random Forest', color='darkorange', ax=ax_roc)
ax_roc.set_title('Curva ROC comparativa')
plt.show()

print("✅ Modelos entrenados y evaluados correctamente.")

In [None]:
# ==============================
# 9. Feature importance
# ==============================
# Pair each Random Forest importance with its column name, highest first.
importances = rf.feature_importances_
feat_imp = pd.Series(importances, index=X_scaled.columns).sort_values(ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
fig_imp, ax_imp = plt.subplots(figsize=(8,5))
feat_imp.iloc[:10].plot(kind='barh', ax=ax_imp)
ax_imp.set_title('Top 10 variables más importantes')
fig_imp.tight_layout()
plt.show()

In [None]:
# ==============================
# 10. Save the model
# ==============================
# Serialise the fitted Random Forest under results/models.
model_path = f'{base_path}/results/models/random_forest.joblib'
joblib.dump(rf, filename=model_path)
print("Modelo guardado en: " + model_path)

In [1]:
# Clone the GitHub repository into the current working directory.
# NOTE(review): no visible cell reads from the cloned folder — confirm this step is still needed.
!git clone https://github.com/paulatatian/Machine-Learning-usando-Google-Colab-SaaS-.git


Cloning into 'Machine-Learning-usando-Google-Colab-SaaS-'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 6 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), 170.49 KiB | 1.54 MiB/s, done.
Resolving deltas: 100% (1/1), done.
