# Mango Attribute Prediction Model

### Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from IPython import get_ipython
from keras.preprocessing import image
from keras.applications.resnet50 import ResNet50, preprocess_input
import os

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.optim as optim
import missingno as msno

### Data Preprocessing

In [None]:
# Load the product table and the attribute table from the datathon archive.
# Both paths are relative to the notebook's working directory.
df = pd.read_csv("../datathon-fme-mango/archive/product_data.csv")
attribute_data = pd.read_csv("../datathon-fme-mango/archive/attribute_data.csv")

In [None]:
# Preview the first rows of the product table.
df.head()

In [None]:
# Summary statistics of the product table.
df.describe()

In [None]:
# Column names, non-null counts and dtypes of the product table.
df.info()

In [None]:
# Collapse the product table to one row per 'cod_modelo_color':
#   - 'des_filename' values of the group are gathered into a list
#   - every other column keeps the first value found in the group
grouping_key = 'cod_modelo_color'
agg_spec = {'des_filename': list}
agg_spec.update(
    {name: 'first' for name in df.columns if name not in (grouping_key, 'des_filename')}
)
df = df.groupby(grouping_key).agg(agg_spec).reset_index()

# Show the first rows of the grouped frame
print(df.head())

In [None]:
# Preview the frame after grouping by 'cod_modelo_color'.
df.head()

In [None]:
#df.describe()

In [None]:
# Reshape the long attribute table to wide form: one row per
# 'cod_modelo_color', one column per 'attribute_name', cells holding
# 'des_value'. The index is reset so the key becomes a regular column
# that can be merged on.
attribute_pivot = (
    attribute_data
    .pivot(index='cod_modelo_color', columns='attribute_name', values='des_value')
    .reset_index()
)

# Left join: every product row is kept, with or without attributes
df = df.merge(attribute_pivot, on='cod_modelo_color', how='left')

# Show the first rows of the merged frame
print(df.head())

In [None]:
# Boolean mask of missing cells, reused for both the counts and the plot
null_mask = df.isnull()

# Number of missing values per column
missing_values = null_mask.sum()
print(missing_values)

# Visualise where the missing cells are located
plt.figure(figsize=(12, 8))
sns.heatmap(null_mask, cbar=False, cmap='viridis')
plt.title('Heatmap of Missing Values in df')
plt.show()

In [None]:
# Categorical columns that are each replaced by a one-hot vector
categorical_columns = ['cod_color', 'des_sex', 'des_age', 'des_line', 'des_fabric', 'des_product_category', 'des_product_aggregated_family', 'des_product_family', 'des_product_type', 'des_color', 'cane_height_type', 'closure_placement', 'heel_shape_type', 'knit_structure', 'length_type', 'neck_lapel_type', 'silhouette_type', 'sleeve_length_type', 'toecap_type', 'waist_type', 'woven_structure']

# Work on a copy so the un-encoded frame stays available as `df`
df_encoded = df.copy()

# One fitted encoder per column. Re-fitting a single shared encoder would
# discard the categories learned for every column but the last one, making
# it impossible to decode the vectors or encode new data consistently.
encoders = {}

for col in categorical_columns:
    # Fresh encoder for this column; categories unseen at fit time are
    # encoded as an all-zero vector (handle_unknown='ignore')
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_columns = encoder.fit_transform(df[[col]])
    encoders[col] = encoder

    # Replace the original column: each cell now holds that row's array of 0s and 1s
    df_encoded[col] = list(encoded_columns)

# Display the updated dataframe
df_encoded.head()

In [None]:
# NOTE: change this CSV path if the embeddings file lives elsewhere
embeddings = pd.read_csv("../datathon-fme-mango/archive/64DimensionalEmbedding.csv")

# The 64 embedding columns selected here are 'embeddings_1' ... 'embeddings_64'
embedding_columns = ['embeddings_' + str(i) for i in range(1, 65)]

# Pack the 64 values of each row into a single list-valued column,
# then discard the 64 individual columns
embeddings['embeddings'] = embeddings[embedding_columns].to_numpy().tolist()
embeddings = embeddings.drop(columns=embedding_columns)

# Display the updated embeddings dataframe
embeddings.head()

In [None]:
# Unnest 'des_filename' so that every row refers to exactly one filename
df_exploded = df_encoded.explode('des_filename')

# Attach the embedding of each filename; the left join keeps rows whose
# filename has no match in the embeddings table
df_with_embeddings = df_exploded.merge(
    embeddings, how='left', left_on='des_filename', right_on='filename'
)

# Collapse back to one row per 'cod_modelo_color': filenames and embeddings
# are gathered into lists, every other column keeps its first value
per_product_agg = {'des_filename': list, 'embeddings': list}
per_product_agg.update(
    {name: 'first' for name in df.columns if name not in ('cod_modelo_color', 'des_filename')}
)
df_with_embeddings = (
    df_with_embeddings.groupby('cod_modelo_color').agg(per_product_agg).reset_index()
)

# Display the updated dataframe
df_with_embeddings.head()