# DSMarket - Clustering

## 1. Instalación de paquetes

In [None]:
!pip install seaborn
!pip install scikit-learn

## 2. Importación de librerías

In [None]:
# silence warnings
import warnings
warnings.filterwarnings("ignore")

# operating system
import os

# time calculation to track some processes
import time

# numeric and matrix operations
import numpy as np
import pandas as pd

# loading ploting libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# python core library for machine learning and data science
import sklearn
from sklearn import set_config
set_config(transform_output = "pandas")

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## 3. Carga de datos

In [None]:
# Load the sales data from a local Parquet file using the pyarrow engine.
df = pd.read_parquet("df_sales_week.parquet", engine="pyarrow")
# Print (rows, columns) as a quick sanity check on the load.
print(df.shape)
# Preview the first three rows (shown as the cell output).
df.head(3)

## 4. Exploratory Data Analysis (EDA)

In [None]:
def report_df(df, verbose = True):
    '''
    Print a simple report about the given DataFrame.

    Shows the ``DataFrame.info`` summary followed by the total number of
    null values across all columns.

    Parameters:
        df (pd.DataFrame): DataFrame to summarise.
        verbose (bool): Forwarded to ``DataFrame.info``.

    Returns:
        None. The report is written to standard output.
    '''
    # DataFrame.info() prints its summary itself and returns None, so wrapping
    # it in print() would emit a spurious "None" line after the summary.
    df.info(verbose = verbose)
    total_nulos = df.isnull().sum().sum()
    print()
    print(f"Tenemos un total de {total_nulos} nulos")

def unique_counts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a DataFrame with the number of distinct values in each column.

    Missing values count as a value of their own (``dropna=False``).

    Parameters:
        df (pd.DataFrame): input DataFrame

    Returns:
        pd.DataFrame: one row per input column, with the column name in
        "columna" and its distinct-value count in "valores_unicos"
    """
    column_names = df.columns
    distinct_per_column = []
    for name in column_names:
        distinct_per_column.append(df[name].nunique(dropna=False))

    return pd.DataFrame({
        "columna": column_names,
        "valores_unicos": distinct_per_column,
    })

In [None]:
report_df(df)

In [None]:
unique_counts(df)

In [None]:
# Use the "id" column as the row index (set_index removes it from the columns by default).
df = df.set_index("id")

In [None]:
df.head(3)

## 5. Clustering productos

### 5.1. Agrupar ventas por producto-semana

In [None]:
# Collapse the sales rows to one record per
# item / category / department / week / event combination.
item_week_keys = ["item", "category", "department", "yearweek", "event"]
item_week_aggregations = {
    "n_sales": ("n_sales", "sum"),
    "revenue": ("revenue", "sum"),
    "avg_sell_price": ("avg_sell_price", "mean"),
}
df_items = df.groupby(item_week_keys, as_index=False).agg(**item_week_aggregations)

In [None]:
df_items

### 5.2. Agregar features

In [None]:
# Basic per-item features: summary statistics of n_sales, revenue and
# avg_sell_price computed over all of the item's rows in df_items.
features_item = (
    df_items.groupby("item")
    .agg(
        mean_sales=("n_sales", "mean"),
        std_sales=("n_sales", "std"),
        max_sales=("n_sales", "max"),
        min_sales=("n_sales", "min"),
        mean_revenue=("revenue", "mean"),
        std_revenue=("revenue", "std"),
        total_sales=("n_sales", "sum"),
        total_revenue=("revenue", "sum"),
        avg_price=("avg_sell_price", "mean"),
    )
    .reset_index()
)

# Coefficient of variation (std / mean). Zeros in the denominator are replaced
# with NaN first so the division never divides by zero.
# The cv_sales variant is currently disabled:
# features_item["cv_sales"] = features_item["std_sales"] / features_item["mean_sales"].replace(0, np.nan)
features_item["cv_revenue"] = features_item["std_revenue"] / features_item["mean_revenue"].replace(0, np.nan)

# Share of each item's rows in df_items whose n_sales is 0
# (the mean of a boolean mask is the fraction of True values).
pct_zero = (
    df_items.groupby("item")["n_sales"]
    .apply(lambda x: (x == 0).mean())
    .reset_index(name="pct_weeks_zero")
)

# Share of each item's sales that falls on rows tagged with an event.
# Step 1: total n_sales per (item, event).
sales_event = (
    df_items.groupby(["item", "event"])["n_sales"].sum().reset_index()
)

# Step 2: per item, sales on every event other than "Without event"
# divided by the item's total sales.
sales_event_flag = (
    sales_event.groupby("item")
    .apply(lambda x: x.loc[x["event"] != "Without event", "n_sales"].sum() / x["n_sales"].sum())
    .reset_index(name="pct_sales_event")
)

# Combine everything into a single table with one row per item.
# Only pct_sales_event is zero-filled here; other columns keep their NaN values.
features_item = (
    features_item
    .merge(pct_zero, on="item", how="left")
    .merge(sales_event_flag, on="item", how="left")
    .fillna({"pct_sales_event": 0})
)


In [None]:
features_item.head(3)

### 5.3. Establecer número de clusters

In [None]:
# Build the clustering matrix from the numeric feature columns only.
# The "cluster" label column (added to features_item once the model is fitted)
# is dropped when present, so re-running this cell never feeds a previous
# cluster assignment back into the model as a feature.
X = (
    features_item
    .drop(columns=["cluster"], errors="ignore")
    .select_dtypes(include=[np.number])   # numeric columns only
    .replace([np.inf, -np.inf], np.nan)   # turn infinities into NaN ...
    .fillna(0.0)                          # ... then zero-fill every missing value
)

In [None]:
# Standardise the feature matrix; the scaled copy feeds the elbow analysis below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Elbow method: fit K-Means for k = 2..14 and record each model's inertia (SSE).
sse = {}
for n_clusters in range(2, 15):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
    sse[n_clusters] = kmeans.fit(X_scaled).inertia_


In [None]:
# Elbow plot: inertia (SSE) against the number of clusters tried.
fig = plt.figure(figsize=(8, 4))
fig.suptitle("Método del Codo (Elbow Method)", fontsize=16)
ax = fig.add_subplot()

# Dict iteration preserves insertion order, so x and y stay paired.
x_values = list(sse)
y_values = [sse[n] for n in x_values]

ax.set_xlabel("Número de clusters (k)", fontsize=12)
ax.set_ylabel("Inercia (SSE)", fontsize=12)
ax.plot(x_values, y_values, marker="o", label="Inercia (SSE)")
ax.legend()
plt.show()

### 5.4. Creación de pipeline

In [None]:
# Scaling and clustering chained in one pipeline (4 clusters).
clustering_steps = [
    ("scaler", StandardScaler()),
    ("kmeans", KMeans(n_clusters=4, random_state=175, n_init="auto")),
]
pipeline = Pipeline(clustering_steps)

# Fit on the unscaled matrix: the pipeline applies the scaler itself.
pipeline.fit(X)

# Store each item's cluster label next to its features.
kmeans_step = pipeline["kmeans"]
features_item["cluster"] = kmeans_step.labels_

In [None]:
# Distribución por cluster
print(features_item["cluster"].value_counts())

In [None]:
features_item.head(3)

In [None]:
# One row per item (first occurrence) carrying its category and department.
df_cats = df_items.drop_duplicates("item")[["item", "category", "department"]]

# Attach those descriptors to the per-item feature table.
features_item = features_item.merge(df_cats, how="left", on="item")

In [None]:
df_items.shape

In [None]:
features_item.head(3)

In [None]:
features_item.to_csv("clustering_data.csv", index=False, encoding="utf-8")

In [None]:
pipeline["kmeans"].labels_.shape

In [None]:
pipeline[:2]

In [None]:
features_item["cluster"] = pipeline["kmeans"].labels_

In [None]:
features_item.head(3)

### 5.5. Verificación de clusters por feature

In [None]:
# Descriptive statistics of every feature per cluster, transposed so that the
# clusters become columns, rendered as a shaded table.
cluster_profile = features_item.groupby(["cluster"]).describe().T
(
    cluster_profile
    .style
    .format('{:.2f}')
    .background_gradient(cmap = 'Blues', axis = 1)
)

## 6. Ficha de productos

In [None]:
ficha_df = pd.DataFrame()

In [None]:
# Per-cluster summary statistics for each indicator shown in the product sheet.
# `.T[1:]` drops the first row of the transposed describe() output, leaving the
# seven statistics that are labelled further down.
# The table is rebuilt from scratch with a single concat, so the cell can be
# re-run safely and does not rely on concatenating onto an empty DataFrame.
ficha_df = pd.concat([
    features_item[["cluster", col]].groupby("cluster").describe().T[1:]
    for col in ["pct_weeks_zero", "total_sales", "pct_sales_event", "avg_price"]
])

In [None]:
ficha_df

In [None]:
# Build the (group, indicator, statistic) tuples used as the row MultiIndex of
# the product sheet.

# Business-facing group label for each indicator. The two lists are paired by
# position, so their order must match: pct_weeks_zero describes turnover
# ("Rotacion") and total_sales describes sales volume ("Ventas").
out_index = [
    "Rotacion",
    "Ventas",
    "Estacionalidad",
    "Costo"
]

inner_index = [
    "pct_weeks_zero", "total_sales", "pct_sales_event", "avg_price"
]

# Display names for the seven statistics kept for each indicator.
estadisticos = ["Media", "Desviación", "Mínimo", "Perc. 25", "Perc. 50", "Perc. 75", "Máximo"]

# One tuple per (indicator, statistic), grouped by indicator.
new_multi_index = [
    (group, indicator, statistic)
    for group, indicator in zip(out_index, inner_index)
    for statistic in estadisticos
]

In [None]:
def generate_multiindex(list_of_tuples, names):
    """Build a ``pd.MultiIndex`` from ``list_of_tuples`` with levels named by ``names``."""
    multi_index = pd.MultiIndex.from_tuples(list_of_tuples, names=names)
    return multi_index

In [None]:
# Level names for the sheet's row index.
names = ["Grupo Indicadores", "Indicador", "Estadístico"]

# Replace the sheet's row labels with the (group, indicator, statistic) index.
index_ficha = generate_multiindex(new_multi_index, names)
ficha_df.index = index_ficha

In [None]:
# Single-row frame with the number of items in each cluster, labelled with the
# same three-level index as the rest of the sheet.
cluster_sizes = features_item.groupby("cluster").size()
tamaño_clusters = cluster_sizes.to_frame().T
tamaño_clusters.index = generate_multiindex([("General", "Clúster", "Tamaño")], names)

In [None]:
ficha_df = pd.concat([tamaño_clusters, ficha_df])

In [None]:
ficha_df

In [None]:
float_format = '{:.2f}'

In [None]:
# Render the product sheet with descriptive cluster names.
# The integer keys are the cluster ids used as column labels of ficha_df.
# NOTE(review): the id -> name mapping presumably reflects the clusters obtained
# with the current data and model settings; re-check it if either changes.
(
    ficha_df
    .rename(columns = {
        0 : "Menor Rotacion",
        3 : "Eventos",
        1 : "Top Ventas",
        2 : "Alto valor"
    })
    .style
    .format(float_format)
    .background_gradient(cmap = 'Blues', axis = 1)
)

## 7. Clustering tiendas

In [None]:
df.head(3)

### 7.1. Agrupar ventas por tienda-semana

In [None]:
# Collapse the sales rows to one record per
# store_code / store / region / event / week combination.
store_week_keys = ["store_code", "store", "region", "event", "yearweek"]
store_week_aggregations = {
    "n_sales": ("n_sales", "sum"),
    "revenue": ("revenue", "sum"),
}
df_store = df.groupby(store_week_keys, as_index=False).agg(**store_week_aggregations)

In [None]:
df_store

### 7.2. Agregar features

In [None]:
# Basic per-store features: summary statistics of n_sales and revenue computed
# over all of the store's rows in df_store.
features_store = (
    df_store.groupby("store_code")
    .agg(
        mean_sales=("n_sales", "mean"),
        std_sales=("n_sales", "std"),
        max_sales=("n_sales", "max"),
        min_sales=("n_sales", "min"),
        mean_revenue=("revenue", "mean"),
        std_revenue=("revenue", "std"),
        total_sales=("n_sales", "sum"),
        total_revenue=("revenue", "sum"),
    )
    .reset_index()
)

# Coefficient of variation (std / mean). Zeros in the denominator are replaced
# with NaN first so the division never divides by zero.
# The cv_sales variant is disabled (the line below still refers to the item table):
# features_item["cv_sales"] = features_item["std_sales"] / features_item["mean_sales"].replace(0, np.nan)
features_store["cv_revenue"] = features_store["std_revenue"] / features_store["mean_revenue"].replace(0, np.nan)

# Share of each store's rows in df_store whose n_sales is 0
# (the mean of a boolean mask is the fraction of True values).
pct_zero_store = (
    df_store.groupby("store_code")["n_sales"]
    .apply(lambda x: (x == 0).mean())
    .reset_index(name="pct_weeks_zero_store")
)

# Share of each store's sales that falls on rows tagged with an event.
# Step 1: total n_sales per (store_code, event).
sales_event_store = (
    df_store.groupby(["store_code", "event"])["n_sales"].sum().reset_index()
)

# Step 2: per store, sales on every event other than "Without event"
# divided by the store's total sales.
sales_event_flag_store = (
    sales_event_store.groupby("store_code")
    .apply(lambda x: x.loc[x["event"] != "Without event", "n_sales"].sum() / x["n_sales"].sum())
    .reset_index(name="pct_sales_event_store")
)

# Combine everything into a single table with one row per store.
# Only pct_sales_event_store is zero-filled here; other columns keep their NaN values.
features_store = (
    features_store
    .merge(pct_zero_store, on="store_code", how="left")
    .merge(sales_event_flag_store, on="store_code", how="left")
    .fillna({"pct_sales_event_store": 0})
)


In [None]:
features_store.head(3)

### 7.3. Establecer número de clusters para tiendas

In [None]:
# Build the clustering matrix from the numeric store features only.
# The "cluster" label column (added to features_store once the model is fitted)
# is dropped when present, so re-running this cell never feeds a previous
# cluster assignment back into the model as a feature.
X = (
    features_store
    .drop(columns=["cluster"], errors="ignore")
    .select_dtypes(include=[np.number])   # numeric columns only
    .replace([np.inf, -np.inf], np.nan)   # turn infinities into NaN ...
    .fillna(0.0)                          # ... then zero-fill every missing value
)

In [None]:
# Standardise the store feature matrix; the scaled copy feeds the elbow analysis below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Elbow method for the stores: fit K-Means for each k and keep its inertia (SSE).
sse = {}
for k in range(2, 10):   # try 2 to 9 clusters
    kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
    kmeans.fit(X_scaled)
    sse[k] = kmeans.inertia_


In [None]:
# Elbow plot for the stores: inertia (SSE) against the number of clusters tried.
fig = plt.figure(figsize=(8, 4))
fig.suptitle("Método del Codo (Elbow Method)", fontsize=16)
ax = fig.add_subplot()

# Dict iteration preserves insertion order, so x and y stay paired.
x_values = list(sse)
y_values = [sse[n] for n in x_values]

ax.set_xlabel("Número de clusters (k)", fontsize=12)
ax.set_ylabel("Inercia (SSE)", fontsize=12)
ax.plot(x_values, y_values, marker="o", label="Inercia (SSE)")
ax.legend()
plt.show()

### 7.4. Creación de pipeline

In [None]:
# Scaling and clustering chained in one pipeline (4 clusters).
clustering_steps = [
    ("scaler", StandardScaler()),
    ("kmeans", KMeans(n_clusters=4, random_state=175, n_init="auto")),
]
pipeline = Pipeline(clustering_steps)

# Fit on the unscaled matrix: the pipeline applies the scaler itself.
pipeline.fit(X)

# Store each store's cluster label next to its features.
kmeans_step = pipeline["kmeans"]
features_store["cluster"] = kmeans_step.labels_

In [None]:
# Distribución por cluster
print(features_store["cluster"].value_counts())

In [None]:
features_store.head(3)

In [None]:
df_store.head()

In [None]:
# One row per store (first occurrence) carrying its name and region.
df_cats_store = df_store.drop_duplicates("store_code")[["store_code", "store", "region"]]

# Attach those descriptors to the per-store feature table.
features_store = features_store.merge(df_cats_store, how="left", on="store_code")

In [None]:
df_store.shape

In [None]:
features_store.head(3)

In [None]:
features_store.to_csv("clustering_store_data.csv", index=False, encoding="utf-8")

In [None]:
pipeline["kmeans"].labels_.shape

In [None]:
pipeline[:2]

In [None]:
features_store["cluster"] = pipeline["kmeans"].labels_

In [None]:
features_store.head(3)

### 7.5. Verificación de clusters por feature

In [None]:
# Descriptive statistics of every store feature per cluster, transposed so that
# the clusters become columns, rendered as a shaded table.
store_cluster_profile = features_store.groupby(["cluster"]).describe().T
(
    store_cluster_profile
    .style
    .format('{:.2f}')
    .background_gradient(cmap = 'Blues', axis = 1)
)

## 8. Ficha de tiendas

In [None]:
ficha_store = pd.DataFrame()

In [None]:
# Per-cluster summary statistics for each indicator shown in the store sheet.
# `.T[1:]` drops the first row of the transposed describe() output, leaving the
# seven statistics that are labelled further down.
# The table is rebuilt from scratch with a single concat, so the cell can be
# re-run safely and does not rely on concatenating onto an empty DataFrame.
ficha_store = pd.concat([
    features_store[["cluster", col]].groupby("cluster").describe().T[1:]
    for col in ["mean_sales", "total_sales", "cv_revenue", "mean_revenue"]
])

In [None]:
ficha_store

In [None]:
# Build the (group, indicator, statistic) tuples used as the row MultiIndex of
# the store sheet. The group and indicator lists are paired by position.
out_index_store = ["Rendimiento base", "Volumen total", "Estabilidad", "Rentabilidad"]

inner_index_store = ["mean_sales", "total_sales", "cv_revenue", "mean_revenue"]

# Display names for the seven statistics kept for each indicator.
estadisticos_store = [
    "Media",
    "Desviación",
    "Mínimo",
    "Perc. 25",
    "Perc. 50",
    "Perc. 75",
    "Máximo",
]

# One tuple per (indicator, statistic), grouped by indicator.
new_multi_index_store = [
    (group, indicator, statistic)
    for group, indicator in zip(out_index_store, inner_index_store)
    for statistic in estadisticos_store
]

In [None]:
# def generate_multiindex(list_of_tuples, names):
#     return pd.MultiIndex.from_tuples(list_of_tuples, names = names)

In [None]:
# Level names for the sheet's row index.
names = ["Grupo Indicadores", "Indicador", "Estadístico"]

# Replace the sheet's row labels with the (group, indicator, statistic) index.
index_ficha_store = generate_multiindex(new_multi_index_store, names)
ficha_store.index = index_ficha_store

In [None]:
# Single-row frame with the number of stores in each cluster, labelled with the
# same three-level index as the rest of the sheet.
store_cluster_sizes = features_store.groupby("cluster").size()
tamaño_clusters_store = store_cluster_sizes.to_frame().T
tamaño_clusters_store.index = generate_multiindex([("General", "Clúster", "Tamaño")], names)

In [None]:
ficha_store = pd.concat([tamaño_clusters_store, ficha_store])

In [None]:
ficha_store

In [None]:
float_format = '{:.2f}'

In [None]:
# Render the store sheet with descriptive cluster names.
# The integer keys are the cluster ids used as column labels of ficha_store.
# NOTE(review): the id -> name mapping presumably reflects the clusters obtained
# with the current data and model settings; re-check it if either changes.
(
    ficha_store
    .rename(columns = {
        0 : "Tiendas irregulares",
        1 : "Tiendas estables",
        2 : "Tiendas premium",
        3 : "Tiendas bajo rendimiento"
    })
    .style
    .format(float_format)
    .background_gradient(cmap = 'Blues', axis = 1)
)

## 9. Creación de fichero para forecasting

In [None]:
df.head(3)

In [None]:
df = df.reset_index()

In [None]:
features_store.head(3)

In [None]:
features_item.head(3)

In [None]:
# Lookup tables mapping each store / item key to its cluster label.
cluster_store = features_store.loc[:, ["store_code", "cluster"]].drop_duplicates()
cluster_item = features_item.loc[:, ["item", "cluster"]].drop_duplicates()

In [None]:
cluster_store

In [None]:
cluster_item

In [None]:
df.shape

In [None]:
# Attach the store and item cluster labels to every sales row.
# validate="many_to_one" makes pandas raise a MergeError if a lookup table ever
# holds more than one row per key, which would otherwise silently duplicate
# sales rows. Each "cluster" column is renamed right after its merge so the
# second merge does not collide with the first.
df = (
    df
    .merge(cluster_store, on="store_code", how="left", validate="many_to_one")
    .rename(columns={'cluster': 'cluster_store'})
    .merge(cluster_item, on="item", how="left", validate="many_to_one")
    .rename(columns={'cluster': 'cluster_item'})
)

In [None]:
df.shape

In [None]:
df.head(3)

In [None]:
# Export the sales data, now carrying both cluster columns, as a snappy-compressed
# Parquet file; index=False leaves the row index out of the file.
df.to_parquet('df_forecasting.parquet', engine='pyarrow', index=False, compression='snappy')

## Conclusion

### Hemos realizado un clustering de tiendas. Aunque sus resultados no son relevantes para el análisis de negocio, consideramos que puede influir en la predicción de ventas, por lo cual mantenemos esta variable en el archivo exportable del que se alimentará el forecasting.