In [2]:
# Celda 1: Importar las librerías necesarias
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Input, concatenate
from tensorflow.keras.preprocessing import image as keras_img
from tqdm import tqdm
import ssl


ssl._create_default_https_context = ssl._create_unverified_context


In [3]:
# Celda 2: Cargar los datos
# Cargar los datasets
product_data = pd.read_csv('../archive/product_data.csv')
attribute_data = pd.read_csv('../archive/attribute_data.csv')
test_data = pd.read_csv('../archive/test_data.csv')


In [4]:
# Celda 3: Fusionar los datasets
# Fusionar los datos de atributos con la información de productos basado en 'cod_modelo_color'
merged_data = pd.merge(attribute_data, product_data, on='cod_modelo_color', how='left')
# Seleccionar solo las columnas necesarias
merged_data = merged_data[['des_filename'] + attribute_data.columns.tolist()]
merged_data = merged_data.drop(columns=['cod_modelo_color', 'cod_value'])
merged_data


Unnamed: 0,des_filename,attribute_name,des_value
0,85_1202950_37036315-99_B.jpg,silhouette_type,Slim
1,85_1202950_37036315-99_.jpg,silhouette_type,Slim
2,86_1217677_47024408-95_.jpg,silhouette_type,Oversize
3,86_1217677_47024408-95_B.jpg,silhouette_type,Oversize
4,84_1168477_27075766-99_B.jpg,silhouette_type,Slim
...,...,...,...
279850,86_1226349_47110061-09_.jpg,closure_placement,Cierre Delantero
279851,86_1226349_47110061-09_B.jpg,closure_placement,Cierre Delantero
279852,86_1213782_47054396-TM_B.jpg,closure_placement,Cierre Delantero
279853,86_1213782_47054396-TM_.jpg,closure_placement,Cierre Delantero


In [5]:
# Codificar la variable objetivo 'des_value' (para entrenamiento)
le_value = LabelEncoder()
merged_data['des_value_encoded'] = le_value.fit_transform(merged_data['des_value'])
target_classes = len(le_value.classes_)
merged_data = merged_data.drop(columns=['des_value'])
merged_data

Unnamed: 0,des_filename,attribute_name,des_value_encoded
0,85_1202950_37036315-99_B.jpg,silhouette_type,105
1,85_1202950_37036315-99_.jpg,silhouette_type,105
2,86_1217677_47024408-95_.jpg,silhouette_type,70
3,86_1217677_47024408-95_B.jpg,silhouette_type,70
4,84_1168477_27075766-99_B.jpg,silhouette_type,105
...,...,...,...
279850,86_1226349_47110061-09_.jpg,closure_placement,24
279851,86_1226349_47110061-09_B.jpg,closure_placement,24
279852,86_1213782_47054396-TM_B.jpg,closure_placement,24
279853,86_1213782_47054396-TM_.jpg,closure_placement,24


In [6]:
test_data

Unnamed: 0,cod_modelo_color,des_filename,cod_color,des_color,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,attribute_name,test_id
0,88_49711373,88_49711373_67080432-99_.jpg,99,NEGRO,Female,Adult,WOMAN,ACCESSORIES,"Accesories, Swim and Intimate",Accessories,Footwear,Boots,cane_height_type,88_49711373_cane_height_type
1,88_49718802,88_49718802_67030656-99_.jpg,99,NEGRO,Male,Adult,MAN,ACCESSORIES,"Accesories, Swim and Intimate",Accessories,Footwear,Ankle Boots,cane_height_type,88_49718802_cane_height_type
2,88_49709572,88_49709572_67030418-01_B.jpg,01,BLANCO,Female,Kids,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,cane_height_type,88_49709572_cane_height_type
3,88_49722701,88_49722701_67066002-02_.jpg,02,OFFWHITE,Female,Baby,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,cane_height_type,88_49722701_cane_height_type
4,88_49724926,88_49724926_67056330-02_B.jpg,02,OFFWHITE,Male,Newborn,KIDS,WOVEN,Tops,Shirts,Shirt,Shirt,cane_height_type,88_49724926_cane_height_type
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71814,88_49727540,88_49727540_67069223-56_.jpg,56,NAVY,Male,Adult,MAN,WOVEN,Tops,Shirts,Shirt,Shirt,knit_structure,88_49727540_knit_structure
71815,88_49733648,88_49733648_67017145-56_.jpg,56,NAVY,Female,Adult,WOMAN,CIRCULAR,Tops,T-shirts,Poloshirts,Poloshirt,knit_structure,88_49733648_knit_structure
71816,88_49735572,88_49735572_67076755-81_.jpg,81,ROSA PASTEL,Female,Adult,WOMAN,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,knit_structure,88_49735572_knit_structure
71817,88_49713624,88_49713624_67092528-70_.jpg,70,ROJO,Female,Adult,WOMAN,WOVEN,"Dresses, jumpsuits and Complete set",Dresses and jumpsuits,Dresses,Dress,knit_structure,88_49713624_knit_structure


In [7]:
test_data = test_data[['des_filename', 'attribute_name']]
test_data.head()

Unnamed: 0,des_filename,attribute_name
0,88_49711373_67080432-99_.jpg,cane_height_type
1,88_49718802_67030656-99_.jpg,cane_height_type
2,88_49709572_67030418-01_B.jpg,cane_height_type
3,88_49722701_67066002-02_.jpg,cane_height_type
4,88_49724926_67056330-02_B.jpg,cane_height_type


In [8]:
image_dir = "../archive/images/images"
image_files = os.listdir(image_dir)
print(image_files[:5])

['83_1124794_17001131-56_.jpg', '88_49720760_67060683-05_.jpg', '82_1118361_87077646-41_B.jpg', '84_1183038_27055806-99_.jpg', '84_1190357_27038646-99_B.jpg']


In [12]:
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.preprocessing import image as keras_img
from tensorflow.keras.applications.mobilenet import preprocess_input
import numpy as np
import os
from tqdm import tqdm

# Inicializar MobileNet para extracción de características
mobilenet = MobileNet(include_top=False, pooling='avg', input_shape=(224, 224, 3))

# Función para extraer características por lotes
def extract_image_features_batch_mobilenet(image_paths, batch_size=32):
    features = []
    batch = []
    failed_indexes = []

    for idx, image_path in enumerate(tqdm(image_paths, desc="Procesando imágenes")):
        try:
            # Cargar y procesar cada imagen
            img = keras_img.load_img(image_path, target_size=(160, 224))
            img_data = keras_img.img_to_array(img)
            img_data = keras_img.smart_resize(img_data, (224, 224))  # Esto hace el redimensionado a 224x224
            img_data = preprocess_input(img_data)  # Preprocesamiento de MobileNet
            batch.append(img_data)

            # Si el lote alcanza el tamaño definido
            if len(batch) == batch_size:
                batch_array = np.array(batch)
                batch_features = mobilenet.predict(batch_array)  # Predicción en lote
                features.extend(batch_features)
                batch = []  # Reiniciar el lote
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
            failed_indexes.append(idx)

    # Procesar el último lote si no está vacío
    if batch:
        batch_array = np.array(batch)
        batch_features = mobilenet.predict(batch_array)
        features.extend(batch_features)

    return np.array(features), failed_indexes

# Rutas de imágenes (asume que 'des_filename' contiene nombres de archivo)
# image_dir = "path_to_images/"
train_image_paths = [os.path.join(image_dir, fname) for fname in merged_data['des_filename']]
test_image_paths = [os.path.join(image_dir, fname) for fname in test_data['des_filename']]

# Extracción de características para entrenamiento y prueba
train_image_features, train_failed_indexes = extract_image_features_batch_mobilenet(train_image_paths, batch_size=32)
test_image_features, test_failed_indexes = extract_image_features_batch_mobilenet(test_image_paths, batch_size=32)





Procesando imágenes:  46%|████▋     | 130085/279855 [20:25<33:37, 74.23it/s]



Procesando imágenes:  46%|████▋     | 130117/279855 [20:25<31:56, 78.13it/s]



Procesando imágenes:  47%|████▋     | 130149/279855 [20:26<31:17, 79.74it/s]



Procesando imágenes:  47%|████▋     | 130181/279855 [20:26<30:17, 82.34it/s]



Procesando imágenes:  47%|████▋     | 130213/279855 [20:26<29:45, 83.82it/s]



Procesando imágenes:  47%|████▋     | 130245/279855 [20:27<29:27, 84.63it/s]



Procesando imágenes:  47%|████▋     | 130277/279855 [20:27<29:44, 83.80it/s]



Procesando imágenes:  47%|████▋     | 130309/279855 [20:27<29:51, 83.47it/s]



Procesando imágenes:  47%|████▋     | 130372/279855 [20:28<24:24, 102.06it/s]



Procesando imágenes:  47%|████▋     | 130387/279855 [20:28<30:29, 81.70it/s] 



Procesando imágenes:  47%|████▋     | 130405/279855 [20:30<1:08:09, 36.55it/s]



Procesando imágenes:  47%|████▋     | 130437/279855 [20:30<54:49, 45.43it/s]  



Procesando imágenes:  47%|████▋     | 130469/279855 [20:31<46:25, 53.63it/s]



Procesando imágenes:  47%|████▋     | 130501/279855 [20:31<43:10, 57.64it/s]



Procesando imágenes:  47%|████▋     | 130533/279855 [20:31<38:46, 64.17it/s]



Procesando imágenes:  47%|████▋     | 130565/279855 [20:32<35:24, 70.28it/s]



Procesando imágenes:  47%|████▋     | 130597/279855 [20:32<34:01, 73.13it/s]



Procesando imágenes:  47%|████▋     | 130660/279855 [20:33<23:28, 105.94it/s]


KeyboardInterrupt: 

In [None]:
# Guardar características
np.save('train_image_features.npy', train_image_features)
np.save('test_image_features.npy', test_image_features)

# Cargar características
train_image_features = np.load('train_image_features.npy')
test_image_features = np.load('test_image_features.npy')


In [None]:
# Celda 7: Combinar características tabulares e imágenes
# Combinar características tabulares con características de imagen para el conjunto de entrenamiento
X_train_tabular = merged_data[categorical_cols].values
X_train_image = train_image_features

# Para el conjunto de prueba
X_test_tabular = test_data[categorical_cols].values
X_test_image = test_image_features

# Estandarizar las características tabulares (escalado opcional)
scaler = StandardScaler()
X_train_tabular = scaler.fit_transform(X_train_tabular)
X_test_tabular = scaler.transform(X_test_tabular)

# Combinar las características (tabulares + imagen)
def combine_features(tabular_data, image_data):
    return [tabular_data, image_data]

X_train = combine_features(X_train_tabular, X_train_image)
X_test = combine_features(X_test_tabular, X_test_image)

# Variable objetivo
y_train = to_categorical(merged_data['des_value_encoded'], num_classes=target_classes)


In [None]:
# Celda 8: Desarrollo del modelo usando redes neuronales y ResNet
# Definir la forma de los datos tabulares e imagen
tabular_input = Input(shape=(len(categorical_cols),))
image_input = Input(shape=(2048,))  # Tamaño de las características de ResNet (pooling='avg' da 2048 dim)

# Rama de datos tabulares
x1 = Dense(256, activation='relu')(tabular_input)
x1 = Dense(128, activation='relu')(x1)

# Rama de datos de imagen
x2 = Dense(256, activation='relu')(image_input)
x2 = Dense(128, activation='relu')(x2)

# Combinar las ramas
combined = concatenate([x1, x2])
combined = Dense(128, activation='relu')(combined)
combined = Dense(64, activation='relu')(combined)

# Capa de salida para clasificación multiclase
output = Dense(target_classes, activation='softmax')(combined)

# Crear el modelo
model = Model(inputs=[tabular_input, image_input], outputs=output)

# Compilar el modelo
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Mostrar resumen del modelo
model.summary()


In [None]:
# Celda 9: Entrenamiento del modelo
# Entrenar el modelo utilizando una partición para validación
X_com = combine_features(X_train_tabular, X_train_image)
model.fit(X_com, y_train, epochs=10, batch_size=32, validation_split=0.2)


In [None]:
# Celda 10: Predicción y Generación de la Subida
# Predecir sobre el conjunto de prueba
X_test_com = combine_features(X_test_tabular, X_test_image)
predictions = model.predict(X_test_com)

# Decodificar las predicciones
predicted_classes = np.argmax(predictions, axis=1)
predicted_labels = le_value.inverse_transform(predicted_classes)

# Crear archivo de sumisión
submission = pd.DataFrame({
    'test_id': test_data['test_id'],
    'des_value': predicted_labels
})

# Guardar archivo de sumisión a CSV
submission.to_csv('submission.csv', index=False)


In [None]:
# Celda 11: Manejo de predicciones INVÁLIDAS
# Si es necesario, puedes volver a ejecutar una función de aplicabilidad que verifique si un atributo es válido para ciertos tipos de productos
def mark_as_invalid(df, rules):
    for idx, row in df.iterrows():
        if not rule_applies(row['attribute_name'], row['des_product_type']):  # Función personalizada de reglas
            df.at[idx, 'des_value'] = 'INVALID'
    return df

# Función de regla personalizada para reemplazar los inválidos
def rule_applies(attribute, product_type):
    # Definir las reglas de aplicabilidad aquí
    if attribute == 'heel_shape_type' and product_type not in ['Shoes', 'Sandals', 'Footwear']:
        return False
    return True

# Aplicarlo
submission = mark_as_invalid(submission, rules={})
