In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from IPython import get_ipython
from keras.preprocessing import image
from keras.applications.resnet50 import ResNet50, preprocess_input
import os

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.optim as optim
import missingno as msno

**Understanding the data**

In [None]:
# Load the two input tables; paths are relative to the notebook's working directory.
df = pd.read_csv("../datathon/dataset/product_data.csv")
outfit_data = pd.read_csv("../datathon/dataset/outfit_data.csv")
# Print the first rows of both tables in a single call.
print(df.head(), outfit_data.head())

In [None]:
# Summary statistics of df; as the last expression of the cell it is rendered by the notebook.
df.describe()

In [None]:
# Print df's column names, non-null counts and dtypes.
df.info()

**Preprocessing Data**

In [None]:
# Create a new feature: for every product, the list of outfits it appears in.
outfits_agrupados = (
    outfit_data.groupby('cod_modelo_color')['cod_outfit']
    .apply(list)
    .reset_index()
)

# Drop any 'cod_outfit' column left by a previous run of this cell first, so
# that re-running it does not produce 'cod_outfit_x' / 'cod_outfit_y' columns.
# The left join keeps every product; products absent from outfit_data get NaN.
df = pd.merge(
    df.drop(columns='cod_outfit', errors='ignore'),
    outfits_agrupados,
    on='cod_modelo_color',
    how='left',
)
df.head()


In [None]:
# save
# NOTE: the "# save" line above is the same string that is passed to
# guardar_outputs() at the end of the notebook; presumably it tags this cell's
# output for export, so keep it as the first line of the cell.
# Count the missing values in each column of df.
df.isna().sum()

In [None]:
# missingno bar chart for df (per-column data completeness).
msno.bar(df)

In [8]:
# Columns that hold discrete labels; each one is converted to the pandas
# 'category' dtype below.
categorical_columns = [
    "cod_color_code",
    "des_color_specification_esp",
    "des_agrup_color_eng",
    "des_sex",
    "des_age",
    "des_line",
    "des_fabric",
    "des_product_category",
    "des_product_aggregated_family",
    "des_product_family",
    "des_product_type",
]

for column in categorical_columns:
    as_category = df[column].astype('category')
    df[column] = as_category

In [None]:
# Count how many unique values each categorical column has.
for x in categorical_columns:
    unique_count = df[x].nunique()
    print(x, ' = ', unique_count)


Podemos ver que existe una gran cantidad de categorías.

In [None]:
# One count plot per categorical column, laid out on a 3x4 grid (12 panels max).
plt.figure(figsize=(20, 15))
plotnumber = 1

for column in df:
    is_categorical = isinstance(df[column].dtype, pd.CategoricalDtype)
    # Skip non-categorical columns, and stop drawing once the grid is full.
    if plotnumber > 12 or not is_categorical:
        continue

    ax = plt.subplot(3, 4, plotnumber)
    sns.countplot(x=df[column])
    plt.xlabel(column)
    plt.xticks(rotation=45)
    plotnumber += 1

plt.tight_layout()
plt.show()


In [11]:
# Step 1: Initialize ResNet50 (pre-trained on ImageNet, remove the final classification layer)
# Load the pre-trained ResNet model with locally saved weights
# include_top=False drops the classification head and pooling='avg' applies global
# average pooling, so predict() yields one feature vector per input image.
resnet_model = ResNet50(weights='../model/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5', include_top=False, pooling='avg')

# Step 2: Define function to process each image
def process_image(img_path):
    """Load one image from disk and return it as a preprocessed single-image batch.

    Parameters:
    - img_path: Path of the image file to load.

    Returns:
    - The image as an array with a leading batch axis of size 1, after
      `preprocess_input` has been applied.
    """
    # The image is resized to target_size=(239, 334) while loading.
    # NOTE(review): Keras reads target_size as (height, width) -- confirm that
    # this order matches the product photos. The model is built with
    # include_top=False, so it is not restricted to 224x224 inputs.
    loaded = image.load_img(img_path, target_size=(239, 334))
    # Pixel array -> add the batch axis the model expects.
    batch = np.expand_dims(image.img_to_array(loaded), axis=0)
    return preprocess_input(batch)

# Step 3: Define function to extract image embeddings
def extract_image_embedding(img_path):
    """Return the ResNet feature vector of the image stored at `img_path`.

    Parameters:
    - img_path: Path of the image file to embed.

    Returns:
    - A 1-D array: the model output for the image, flattened.
    """
    # Preprocess the image and run it through the pre-trained network.
    features = resnet_model.predict(process_image(img_path))
    # Collapse the (batch, features) output into a flat vector.
    return features.flatten()

# Step 4: Load image paths and extract embeddings
image_embeddings = []


def _serialize_embedding(embedding):
    """Render an embedding as '[v0 v1 v2]' text with every value written out.

    Letting pandas stringify the arrays is lossy: NumPy abbreviates arrays of
    more than 1000 elements as '[a b c ... x y z]', so only a handful of values
    would reach the CSV. This format keeps all of them and can still be parsed
    with np.fromstring(text.strip('[]'), sep=' ').
    """
    return '[' + ' '.join(repr(float(value)) for value in embedding) + ']'


# Iterate through all image filenames in the dataset.
# The extraction is expensive, so it only runs when the cached CSV is missing.
# A cache written before this fix may hold truncated vectors; delete the file
# to regenerate it.
i = 0
if not os.path.isfile('df_with_embeddings.csv'):
    for img_filename in df['des_filename']:
        print ('image number ', i)
        img_path = os.path.join(os.pardir, img_filename)
        embedding = extract_image_embedding(img_path)
        image_embeddings.append(embedding)
        i += 1

    # Keep the raw arrays on the in-memory frame ...
    df['image_embedding'] = list(image_embeddings)
    # ... and write the full-length text form of each vector to the cache.
    df.assign(
        image_embedding=[_serialize_embedding(vector) for vector in image_embeddings]
    ).to_csv('df_with_embeddings.csv', index=False)



In [12]:
# Reload df from the cached CSV. Every column round-trips through text, so
# 'image_embedding' arrives as a string and is parsed back into arrays later.
df = pd.read_csv('df_with_embeddings.csv')

In [None]:
# One-hot encode the categorical columns as a dense array.
encoder = OneHotEncoder(sparse_output=False)

print(df[categorical_columns].shape)
categorical_columns_encoded = encoder.fit_transform(df[categorical_columns])


def _parse_embedding(raw):
    """Turn a cached embedding ('[v0 v1 v2]' text) into a 1-D float array.

    Values that are already arrays are returned unchanged, so this cell can be
    re-run without reloading df.
    """
    import warnings  # local import: only needed for the truncation warning

    if isinstance(raw, np.ndarray):
        return raw
    if '...' in raw:
        # NumPy abbreviates long arrays when they are converted to text, so the
        # cached string holds only the first and last few values.
        warnings.warn(
            "Cached image embedding contains '...': it was truncated when it was "
            "written, so only its leading values can be parsed. Regenerate "
            "'df_with_embeddings.csv' with the vectors written out in full.",
            RuntimeWarning,
            stacklevel=2,
        )
    return np.fromstring(raw.strip('[]'), sep=' ')


df['image_embedding'] = df['image_embedding'].apply(_parse_embedding)
image_embeddings = np.vstack(df['image_embedding'].values)

# Standardise every embedding dimension before combining with the one-hot block.
scaler = StandardScaler()
embeddings_normalized = scaler.fit_transform(image_embeddings)
print("Categorical columns encoded shape:", categorical_columns_encoded.shape)
print("Embeddings normalized shape:", embeddings_normalized.shape)

# Combine embeddings with encoded categorical features
combined_features = np.hstack((embeddings_normalized, categorical_columns_encoded))

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Define the DBSCAN model.
# DBSCAN takes no `max_samples` argument (passing one raises TypeError), so
# only `eps` and `min_samples` are set here.
dbscan_model = DBSCAN(eps=1.5, min_samples=2)
dbscan_labels = dbscan_model.fit_predict(combined_features)

# DBSCAN marks noise with the label -1. Count only the real clusters, instead
# of subtracting one on the assumption that a noise label is always present.
distinct_labels = np.unique(dbscan_labels)
n_clusters = int(np.count_nonzero(distinct_labels != -1))
print(len(dbscan_labels))
print(f'Número de clusters: {n_clusters}')
# silhouette_score needs at least two distinct labels. As before, noise points
# (label -1) are scored as one more group.
if len(distinct_labels) >= 2:
    # Compute the Silhouette Score
    silhouette_dbscan = silhouette_score(combined_features, dbscan_labels)
    print(f'Silhouette Score for DBSCAN: {silhouette_dbscan}')

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Define the hierarchical clustering model: Ward linkage, with the number of
# clusters decided by the distance threshold rather than fixed in advance.
hierarchical_model = AgglomerativeClustering(
    linkage='ward',
    distance_threshold=1.5,
    n_clusters=None,
)
hierarchical_labels = hierarchical_model.fit_predict(combined_features)

# Compute the Silhouette Score
silhouette_hierarchical = silhouette_score(combined_features, hierarchical_labels)
print(f'Numero de clusters: {np.unique(hierarchical_labels).size}')
print(f'Silhouette Score for Hierarchical Clustering: {silhouette_hierarchical}')

In [None]:
from collections import Counter

def cluster_purity(labels, outfits=None):
    """Share of samples whose outfit list equals the most common one in their cluster.

    For every cluster, the most frequent outfit list among its members is
    found; the sizes of those majority groups are summed and divided by the
    total number of samples.

    Parameters:
    - labels: Cluster label of every sample (array-like, one entry per sample).
    - outfits: Optional sequence with one outfit list per sample, aligned
      positionally with `labels`. Entries may be lists/tuples or their text
      form (e.g. "[1, 2]"). Defaults to the global `df['cod_outfit']`.

    Returns:
    - The purity as a float in [0, 1]; 0.0 when `labels` is empty.

    Raises:
    - ValueError: if `outfits` and `labels` differ in length, or if a text
      entry is not a valid Python literal.
    """
    import ast  # local import: only this function needs it

    def _as_outfit_key(raw):
        # Text read back from CSV: parse it with literal_eval, which accepts
        # Python literals only and cannot execute code the way eval() does.
        if isinstance(raw, str):
            return tuple(ast.literal_eval(raw))
        if isinstance(raw, (list, tuple, np.ndarray)):
            return tuple(raw)
        # Missing value (e.g. NaN for a product with no outfit): empty key.
        return ()

    labels = np.asarray(labels)
    total_samples = len(labels)
    if total_samples == 0:
        return 0.0

    if outfits is None:
        outfits = df['cod_outfit']
    # Parse every entry once instead of once per cluster.
    outfit_keys = [_as_outfit_key(raw) for raw in outfits]
    if len(outfit_keys) != total_samples:
        raise ValueError(
            f"labels has {total_samples} entries but outfits has {len(outfit_keys)}"
        )

    majority_total = 0
    for label in np.unique(labels):
        member_positions = np.flatnonzero(labels == label)
        outfit_counts = Counter(outfit_keys[position] for position in member_positions)
        majority_total += outfit_counts.most_common(1)[0][1]

    return majority_total / total_samples

# Evaluate purity for DBSCAN and Hierarchical clusters
purity_dbscan, purity_hierarchical = (
    cluster_purity(found_labels)
    for found_labels in (dbscan_labels, hierarchical_labels)
)

print(f'Purity Score for DBSCAN: {purity_dbscan}')
print(f'Purity Score for Hierarchical Clustering: {purity_hierarchical}')

Vemos un score de pureza mayor en el hierarchical clustering; por lo tanto, vamos a escribir un código que pruebe combinaciones de los tres parámetros del HC para ver qué valores nos pueden proporcionar los mejores resultados.

In [None]:
# Define parameter grids
distance_thresholds = [0.5, 1.0, 1.5, 2.0]
linkages = ['ward', 'complete', 'average', 'single']
n_clusters_list = [5, 10, 15, 20]

# AgglomerativeClustering accepts exactly one of `distance_threshold` and
# `n_clusters` (the other must be None), so the two grids are explored as
# alternative stopping rules rather than as a three-way product.
stopping_rules = (
    [{'distance_threshold': threshold, 'n_clusters': None} for threshold in distance_thresholds]
    + [{'distance_threshold': None, 'n_clusters': count} for count in n_clusters_list]
)

# List to store the results
results = []

# Loop through every linkage / stopping-rule combination
for linkage_method in linkages:
    for stopping_rule in stopping_rules:
        # Every record carries the same keys, so results_df always has the
        # 'silhouette_score' and 'error' columns.
        record = {
            'distance_threshold': stopping_rule['distance_threshold'],
            'linkage': linkage_method,
            'n_clusters': stopping_rule['n_clusters'],
            'silhouette_score': None,
            'error': None,
        }
        try:
            # Apply Agglomerative Clustering. A separate name is used so that
            # the earlier `hierarchical_model` is not overwritten by the search.
            candidate_model = AgglomerativeClustering(linkage=linkage_method, **stopping_rule)
            labels = candidate_model.fit_predict(combined_features)

            # Score the labels produced by THIS configuration.
            record['silhouette_score'] = silhouette_score(combined_features, labels)
        except Exception as e:
            # Record the failure (e.g. a labelling silhouette_score rejects)
            # and keep going with the remaining configurations.
            record['error'] = str(e)
        results.append(record)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

results_df

In [None]:
# Rows whose 'cod_modelo_color' repeats a value seen in an earlier row
# (the first occurrence of each value is not included).
duplicated_ids = df.loc[df['cod_modelo_color'].duplicated(keep='first')]

# Show the duplicated rows
print(duplicated_ids)

In [None]:
# Show the error recorded for the first grid-search run.
# The 'error' column exists only if some record carried that key, and the
# frame may be empty, so guard the lookup instead of raising KeyError/IndexError.
has_error_column = 'error' in results_df.columns and len(results_df) > 0
x = results_df['error'].iloc[0] if has_error_column else None
print(x)

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster

# Assuming `combined_features` contains the features used for clustering
# and `hierarchical_labels` contains the labels from the hierarchical clustering


def predict_cluster(new_data_point, Z=None, threshold=1.5):
    """
    Predict the cluster for a new data point based on hierarchical clustering.

    Agglomerative clustering has no native predict step, so the data is
    re-clustered with the new point appended. Label numbers from that refit are
    not comparable with `hierarchical_labels`, so the answer is translated
    back: the existing samples that share the new point's refit cluster vote
    with their original labels, and the most common one is returned.

    Parameters:
    - new_data_point: Feature vector of the new point, with the same columns
      as `combined_features`.
    - Z: Unused; accepted only so existing calls that pass a linkage matrix
      keep working.
    - threshold: The distance threshold for forming clusters in the refit.

    Returns:
    - The label, in the numbering of `hierarchical_labels`, of the cluster the
      new point joins, or -1 if it ends up in a cluster of its own.
    """
    from sklearn.cluster import AgglomerativeClustering

    # Combine the new data point with the existing data
    new_row = np.asarray(new_data_point, dtype=float).reshape(1, -1)
    combined_data = np.vstack([combined_features, new_row])

    # Refit with a fresh Ward model (the linkage used for `hierarchical_labels`)
    # rather than the global `hierarchical_model`, which other cells may have
    # replaced with a different configuration.
    refit_model = AgglomerativeClustering(
        n_clusters=None, distance_threshold=threshold, linkage='ward'
    )
    refit_labels = refit_model.fit_predict(combined_data)

    # The new point is the last row; find the existing samples grouped with it.
    shares_cluster = refit_labels[:-1] == refit_labels[-1]
    if not shares_cluster.any():
        return -1

    # Majority vote over the original labels of those samples.
    original_labels, votes = np.unique(
        np.asarray(hierarchical_labels)[shares_cluster], return_counts=True
    )
    return original_labels[np.argmax(votes)]


# Example usage:
new_data_point = combined_features[0]  # Replace with actual new data point
predicted_cluster = predict_cluster(new_data_point)
print(f'The predicted cluster for the new data point is: {predicted_cluster}')

In [None]:
# Keep only the rows of df whose hierarchical label equals the predicted cluster
items_in_cluster = df.loc[hierarchical_labels == predicted_cluster]

# Show a heading followed by the product codes of the selected rows
print(
    "Printing cod_modelo_color of each item in the cluster",
    items_in_cluster["cod_modelo_color"],
    sep="\n",
)

In [None]:
def guardar_outputs(archivo_salida, caracter_inicial):
    """Save to a text file the outputs of the cells whose source starts with a marker.

    A cell is selected when its source code (ignoring leading whitespace)
    begins with `caracter_inicial`, e.g. a first line reading "# save".
    The marker is matched against the cell source, not the cell number:
    cell numbers are integers and can never start with a text marker.

    Parameters:
    - archivo_salida: Path of the text file to write (overwritten if present).
    - caracter_inicial: Marker string a cell's source must start with.

    Returns:
    - None. Writes the file and prints a confirmation message.
    """
    # Get the execution history: raw cell sources indexed by execution count,
    # and the displayed results keyed by that same count.
    ipython_historial = get_ipython().history_manager
    cell_sources = ipython_historial.input_hist_raw
    outputs = get_ipython().user_ns['Out']

    # Open the file the outputs are saved to
    with open(archivo_salida, 'w', encoding='utf-8') as f:
        for cell_number, output in outputs.items():
            # Skip keys that do not map to a recorded cell source.
            if not isinstance(cell_number, int) or not 0 <= cell_number < len(cell_sources):
                continue
            # Check whether the cell's source starts with the given marker
            if cell_sources[cell_number].lstrip().startswith(caracter_inicial):
                f.write(f"Output de la celda [{cell_number}]:\n")
                f.write(str(output))
                f.write("\n" + "="*50 + "\n")

    print(f"Outputs guardados en {archivo_salida}")

# Function call: write to "outputs_notebook.txt", passing "# save" as the marker string
guardar_outputs("outputs_notebook.txt","# save")