In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    roc_auc_score
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Load the dataset and derive the binary target: 1 when a row has more online
# purchases than in-store purchases, 0 otherwise.
df = pd.read_csv("retail_data_sample.csv")
buys_more_online = df["online_purchases"].gt(df["in_store_purchases"])
df["compra_online"] = buys_more_online.astype(int)


In [None]:
# Predictor columns taken from the raw frame; the engineered target
# "compra_online" is appended when building the modelling frame.
selected_features = [
    "age", "gender", "income_bracket", "education_level",
    "purchase_frequency", "avg_discount_used", "preferred_store",
    "avg_items_per_transaction", "app_usage", "website_visits",
    "social_media_engagement", "season"
]

df_model = df[selected_features + ["compra_online"]].copy()

# Choose the hold-out rows BEFORE fitting any preprocessing so that medians and
# min/max ranges are learned from training rows only (no test-set leakage).
# The shuffle depends only on the row count and the seed, so splitting row
# positions selects the same rows as splitting the feature matrix would.
train_pos, test_pos = train_test_split(
    np.arange(len(df_model)), test_size=0.3, random_state=42
)

# Numeric predictors only: the target is deliberately excluded so it is never
# passed through the imputer (which would also cast it to float).
num_cols = df_model[selected_features].select_dtypes(include=["float64", "int64"]).columns
imputer = SimpleImputer(strategy="median")
imputer.fit(df_model.iloc[train_pos][num_cols])
df_model[num_cols] = imputer.transform(df_model[num_cols])

# One-hot encode the categorical predictors, dropping the first level of each.
df_model = pd.get_dummies(df_model, drop_first=True)

X = df_model.drop(columns=["compra_online"])
y = df_model["compra_online"]

# The scaler learns its ranges from the training rows and is then applied to
# every row; hold-out values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(X.iloc[train_pos])
X_scaled = scaler.transform(X)

X_train, X_test = X_scaled[train_pos], X_scaled[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]


In [None]:
# Fit a random forest on the training split and evaluate it on the hold-out split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

rf_accuracy = round(accuracy_score(y_test, rf_pred), 4)
print("🌲 Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, rf_pred))


In [None]:
# Correlation heat map over the float64/int64 columns of the modelling frame.
numeric_df = df_model.select_dtypes(include=["float64", "int64"])
corr = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    corr,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Mapa de calor de correlación numérica")
fig.tight_layout()
plt.show()


In [None]:
# Age distribution: 30-bin histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['age'], kde=True, bins=30, ax=ax)
ax.set_title('Distribución de Edad')
ax.set_xlabel('Edad')
ax.set_ylabel('Frecuencia')
ax.grid(True)
plt.show()


In [None]:
# Box plot of average items per transaction, grouped by the online-purchase flag.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x="compra_online", y="avg_items_per_transaction", ax=ax)
ax.set_title("Promedio de Ítems por Transacción vs Preferencia de Compra")
ax.set_xlabel("Compra Online (1=Sí)")
ax.set_ylabel("Ítems por Transacción")
ax.grid(True)
plt.show()


In [None]:
# Customer counts per education level, bars ordered from most to least frequent.
education_order = df["education_level"].value_counts().index

fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x="education_level", order=education_order, ax=ax)
ax.set_title("Frecuencia por Nivel Educativo")
ax.set_xlabel("Nivel Educativo")
ax.set_ylabel("Cantidad de Clientes")
plt.xticks(rotation=45)  # tilt the category labels so they do not overlap
ax.grid(True)
plt.show()


In [None]:
# Violin plot comparing website visits between the two purchase-preference groups;
# the inner lines mark the quartiles.
fig, ax = plt.subplots(figsize=(8, 5))
sns.violinplot(
    data=df,
    x="compra_online",
    y="website_visits",
    inner="quartile",
    ax=ax,
)
ax.set_title("Visitas al Sitio Web según Preferencia de Compra")
ax.set_xlabel("Compra Online (1=Sí)")
ax.set_ylabel("Visitas al Sitio Web")
ax.grid(True)
plt.show()


In [None]:
# Fit logistic regression on the training split and evaluate on the hold-out split.
log_model = LogisticRegression(max_iter=300).fit(X_train, y_train)
log_pred = log_model.predict(X_test)
# Keep only the second probability column (the larger class label) for the ROC cell.
log_prob = log_model.predict_proba(X_test)[:, 1]

log_accuracy = round(accuracy_score(y_test, log_pred), 4)
print("📊 Logistic Regression Accuracy:", log_accuracy)
print(classification_report(y_test, log_pred))


In [None]:
# Fit a 5-nearest-neighbours classifier and evaluate it on the hold-out split.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

knn_accuracy = round(accuracy_score(y_test, knn_pred), 4)
print("🔎 KNN Accuracy:", knn_accuracy)
print(classification_report(y_test, knn_pred))


In [None]:
# Fit a decision tree, report hold-out metrics and draw the top levels of the tree.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
tree_pred = tree_model.predict(X_test)

print("🌳 Árbol de Decisión Accuracy:", round(accuracy_score(y_test, tree_pred), 4))
print(classification_report(y_test, tree_pred))

plt.figure(figsize=(14, 6))
plot_tree(
    tree_model,
    filled=True,
    max_depth=3,  # only the first levels are drawn to keep the figure readable
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index with InvalidParameterError.
    feature_names=list(X.columns),
    class_names=["Tienda", "Online"],
)
plt.title("Árbol de Decisión (primeros niveles)")
plt.show()


In [None]:
# ROC curve and AUC for the logistic-regression probabilities on the hold-out split.
roc_auc = roc_auc_score(y_test, log_prob)
fpr, tpr, _ = roc_curve(y_test, log_prob)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel('Falsos Positivos')
ax.set_ylabel('Verdaderos Positivos')
ax.set_title('Curva ROC - Regresión Logística')
ax.legend()
ax.grid(True)
plt.show()

# Confusion matrix of the same model on the hold-out split.
cm_display = ConfusionMatrixDisplay.from_estimator(log_model, X_test, y_test, cmap="Blues")
cm_display.ax_.set_title("Matriz de Confusión - Regresión Logística")
plt.show()


In [None]:
# Rank the random forest's feature importances (highest first) and plot them
# as horizontal bars.
features = X.columns
importances = rf_model.feature_importances_
feat_importance = pd.Series(importances, index=features)
feat_importance = feat_importance.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feat_importance.values, y=feat_importance.index, ax=ax)
ax.set_title("Importancia de variables (Random Forest)")
ax.set_xlabel("Importancia")
ax.set_ylabel("Variable")
fig.tight_layout()
plt.show()


In [None]:
# Candidate estimators compared with 5-fold cross-validation.
# cross_val_score clones each estimator, so the already-fitted rf_model is not
# modified and is retrained from scratch on every fold.
models = {
    "Random Forest": rf_model,
    "Regresión Logística": LogisticRegression(max_iter=300),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    # Seeded like tree_model so the reported score is reproducible across runs.
    "Árbol de Decisión": DecisionTreeClassifier(random_state=42)
}

print("📊 Comparación de exactitud (Validación Cruzada - 5 folds):\n")
for name, model in models.items():
    # Scale inside the pipeline so MinMaxScaler is re-fit on each training fold
    # only; scoring a matrix that was scaled on all rows would leak every
    # validation fold's min/max into training.
    cv_pipeline = make_pipeline(MinMaxScaler(), model)
    scores = cross_val_score(cv_pipeline, X, y, cv=5, scoring="accuracy")
    print(f"{name:20}: {scores.mean():.3f} ± {scores.std():.3f}")
