In [None]:
import pandas as pd

# Assuming attribute_data and product_data are already loaded as DataFrames

# Reshape attribute_data from long to wide: one row per cod_modelo_color and
# one column per attribute_name, with des_value as the cell contents. The
# index is turned straight back into a regular column so it can be used as
# the merge key below.
# Note: DataFrame.pivot raises ValueError when the same
# (cod_modelo_color, attribute_name) pair appears more than once.
pivoted_attributes = attribute_data.pivot(
    index='cod_modelo_color',
    columns='attribute_name',
    values='des_value',
).reset_index()

# Left join: every product row is kept, even when it has no attributes
merged_data = product_data.merge(
    pivoted_attributes,
    on='cod_modelo_color',
    how='left',
)

# Write the enriched product table, omitting the positional index
merged_data.to_csv('updated_product_data.csv', index=False)


In [None]:
import pandas as pd

# Load the raw embeddings table (embedding columns plus a 'filename' column)
embeddings_raw = pd.read_csv('../datathon-fme-mango/archive/256DimensionalEmbedding.csv')

# Keep only the rows whose filename ends in _B.jpg. na=False makes a missing
# filename count as "no match"; without it the mask contains NaN and boolean
# indexing raises. Filtering first also avoids building lists for rows that
# are discarded anyway.
is_b_image = embeddings_raw['filename'].str.endswith('_B.jpg', na=False)
embeddings_b = embeddings_raw.loc[is_b_image]

# Collapse all embedding columns into a single list-valued 'embedding' column.
# The columns are selected by name (everything except 'filename') rather than
# by position, so the result does not depend on 'filename' being the last
# column. Assumes every other column is an embedding dimension — TODO confirm.
# to_numpy().tolist() converts the whole block at once instead of calling a
# Python lambda per row.
embedding_lists = embeddings_b.drop(columns='filename').to_numpy().tolist()

# Keep only the 'embedding' and 'filename' columns
embeddings_df = pd.DataFrame(
    {
        'embedding': embedding_lists,
        'filename': embeddings_b['filename'].to_numpy(),
    },
    index=embeddings_b.index,
)

# Save the processed embeddings; each list is written as its string form
embeddings_df.to_csv('256DimensionalEmbedding_processed.csv', index=False)

In [None]:
# Load the updated_product_data_sorted.csv file
updated_product_data_sorted_df = pd.read_csv('updated_product_data_sorted.csv')

# Keep only the rows whose des_filename ends in _B.jpg. na=False treats a
# missing des_filename as "no match"; without it the mask would contain NaN
# and the boolean indexing below would raise instead of dropping the row.
has_b_image = updated_product_data_sorted_df['des_filename'].str.endswith('_B.jpg', na=False)
updated_product_data_sorted_df = updated_product_data_sorted_df[has_b_image]

# Save the filtered dataframe to a new CSV file
updated_product_data_sorted_df.to_csv('updated_product_data_sorted_filtered.csv', index=False)

In [None]:
# NOTE(review): updated_product_data_sorted_filtered_df and embeddings_sorted_df
# are not created in the cells visible here — confirm an earlier cell defines
# them, otherwise this cell fails on a fresh kernel.

# Filenames that appear in both the product table and the embeddings table
common_filenames = set(updated_product_data_sorted_filtered_df['des_filename']).intersection(
    embeddings_sorted_df['filename']
)

# Restrict both dataframes to the rows with a common filename
updated_product_data_sorted_filtered_df = updated_product_data_sorted_filtered_df[
    updated_product_data_sorted_filtered_df['des_filename'].isin(common_filenames)
]
embeddings_sorted_df = embeddings_sorted_df[embeddings_sorted_df['filename'].isin(common_filenames)]

# Check if they have the same number of rows
same_number_of_rows = len(updated_product_data_sorted_filtered_df) == len(embeddings_sorted_df)
print(f"Do the files have the same number of rows? {same_number_of_rows}")

# Print the number of rows in each dataframe
print(f"Number of rows in updated_product_data_sorted_filtered_df: {len(updated_product_data_sorted_filtered_df)}")
print(f"Number of rows in embeddings_sorted_df: {len(embeddings_sorted_df)}")

# Attach each product row's embedding by matching on the filename instead of
# copying the column over by row position. A positional copy silently pairs
# products with the wrong embedding whenever the two frames are ordered
# differently, and raises when their row counts differ.
# validate='many_to_one' makes the merge fail loudly if a filename occurs more
# than once in the embeddings, rather than duplicating product rows.
embedding_lookup = embeddings_sorted_df[['filename', 'embedding']].rename(
    columns={'filename': 'des_filename'}
)
combined_df = updated_product_data_sorted_filtered_df.drop(
    columns=['embedding'], errors='ignore'  # avoid _x/_y suffixes if the column already exists
).merge(
    embedding_lookup,
    on='des_filename',
    how='left',  # keeps the product rows and their order; every key has a match after the filter above
    validate='many_to_one',
)

# Save the combined dataframe to a new CSV file
combined_df.to_csv('256database.csv', index=False)

In [None]:
# Categorical feature columns that are replaced by one-hot indicator arrays
categorical_columns = [
    'cod_color',
    'des_sex',
    'des_age',
    'des_line',
    'des_fabric',
    'des_product_category',
    'des_product_aggregated_family',
    'des_product_family',
    'des_product_type',
    'des_color',
    'cane_height_type',
    'closure_placement',
    'heel_shape_type',
    'knit_structure',
    'length_type',
    'neck_lapel_type',
    'silhouette_type',
    'sleeve_length_type',
    'toecap_type',
    'waist_type',
    'woven_structure',
]

# One encoder instance, re-fitted on every column in turn; after the loop it
# therefore only holds the categories of the last column.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Encode into a copy so the source frame `df` is left untouched
df_encoded = df.copy()

for col in categorical_columns:
    # Dense indicator matrix for this column (sparse_output=False)
    encoded_columns = encoder.fit_transform(df[[col]])

    # Swap the raw category for its indicator row: each cell now holds one array.
    # NOTE(review): to_csv below writes these arrays via their string form —
    # confirm downstream readers can parse that.
    df_encoded[col] = list(encoded_columns)

# Remove the identifier column, then write the encoded table
df_encoded = df_encoded.drop(columns=['cod_modelo_color'])

df_encoded.to_csv("updated256dataset.csv", index=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd

# Select the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Custom Dataset class
class MangoDataset(Dataset):
    """Torch dataset that serves the 'embedding' column of a DataFrame as features.

    Each 'embedding' cell is either the string form of a Python list of numbers
    (e.g. "[0.1, 0.2]", which is what a list-valued column looks like after a
    CSV round trip) or an already-parsed sequence of numbers.

    The targets are random 0/1 placeholders, so a model trained on this
    dataset cannot learn anything meaningful until real labels are supplied.
    """

    def __init__(self, dataframe):
        """Parse the embeddings and draw the placeholder targets.

        Args:
            dataframe: pandas DataFrame with an 'embedding' column.

        Raises:
            ValueError, SyntaxError: if a string cell is not a valid Python literal.
        """
        # Local import so the class works even if the enclosing file never imports ast
        import ast

        def _parse(cell):
            # literal_eval only accepts plain literals (lists, numbers, ...), so
            # text from the CSV can never run code the way eval() would.
            return np.array(ast.literal_eval(cell) if isinstance(cell, str) else cell)

        self.data = dataframe
        # One row per sample; every embedding must have the same length to stack
        self.features = np.stack([_parse(cell) for cell in self.data['embedding']])
        # Dummy target for this example (replace with actual target if available)
        self.targets = np.random.randint(0, 2, size=len(self.data))

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the (features, target) pair at idx as float32 tensors."""
        return torch.tensor(self.features[idx], dtype=torch.float32), torch.tensor(self.targets[idx], dtype=torch.float32)

# Neural Network Model
class MangoNet(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear -> Sigmoid."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: number of input features.
            hidden_size: width of the hidden layer.
            output_size: number of outputs, each squashed into (0, 1) by the sigmoid.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature vectors to sigmoid activations."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Dataset and DataLoader preparation
def load_data(csv_path):
    """Read the CSV at csv_path and wrap the resulting frame in a MangoDataset."""
    return MangoDataset(pd.read_csv(csv_path))

# Accuracy calculation
def calculate_accuracy(outputs, targets):
    """Return the fraction of samples whose thresholded output equals its target.

    Args:
        outputs: tensor of probabilities; values >= 0.5 are predicted as 1.
        targets: tensor of 0/1 labels holding as many elements as outputs.

    Returns:
        The accuracy as a float in [0, 1]; 0.0 when there are no samples.

    Raises:
        ValueError: if outputs and targets hold a different number of elements.
    """
    # Flatten both sides so that e.g. (N, 1) outputs and (N,) targets are
    # compared element by element. Without this, == broadcasts them into an
    # (N, N) grid and the match count (and the accuracy) is inflated.
    flat_outputs = outputs.reshape(-1)
    flat_targets = targets.reshape(-1)

    total = flat_targets.numel()
    if flat_outputs.numel() != total:
        raise ValueError(
            f"outputs has {flat_outputs.numel()} elements but targets has {total}"
        )
    if total == 0:
        # Nothing to score; avoid dividing by zero
        return 0.0

    # Convert probabilities to binary predictions
    predictions = (flat_outputs >= 0.5).float()
    correct = (predictions == flat_targets).sum().item()
    return correct / total

# Initialize and train the model
def train_model(dataset, input_size, hidden_size=128, output_size=1, epochs=50, batch_size=32, lr=0.001):
    """Train a MangoNet on dataset with binary cross-entropy and Adam, then save it.

    Args:
        dataset: torch Dataset yielding (features, target) pairs.
        input_size: number of features per sample.
        hidden_size: width of the network's hidden layer.
        output_size: number of network outputs.
        epochs: number of passes over the dataset.
        batch_size: samples per batch; the data is reshuffled every epoch.
        lr: Adam learning rate.

    Side effects:
        Prints the mean per-batch loss and accuracy after every epoch and
        writes the trained weights to "mangonet_model.pth" in the working
        directory.
    """
    # DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Model, Loss, Optimizer
    model = MangoNet(input_size, hidden_size, output_size).to(device)
    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for binary classification
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        total_loss = 0
        total_accuracy = 0

        for features, targets in dataloader:
            features, targets = features.to(device), targets.to(device)

            # Forward pass. Only the trailing size-1 output dimension is
            # dropped: a bare squeeze() would also remove the batch dimension
            # when the final batch holds a single sample, leaving a 0-d tensor
            # that BCELoss rejects against the (1,)-shaped targets.
            outputs = model(features).squeeze(-1)
            loss = criterion(outputs, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate loss and accuracy
            total_loss += loss.item()
            total_accuracy += calculate_accuracy(outputs, targets)

        # Both figures are plain means over batches, so a smaller final batch
        # weighs as much as a full one.
        avg_loss = total_loss / len(dataloader)
        avg_accuracy = total_accuracy / len(dataloader)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}")

    # Save model
    torch.save(model.state_dict(), "mangonet_model.pth")
    print("Model saved as 'mangonet_model.pth'.")

# Pre-check dataset
def sanity_check(dataframe):
    """Print a quick overview of dataframe: first rows, shape, columns and dtypes."""
    print("Preview:")
    print(dataframe.head())

    # Remaining facts share the same "label value" layout
    overview = (
        ("Dataset shape:", dataframe.shape),
        ("Columns:", dataframe.columns),
        ("Data types:", dataframe.dtypes),
    )
    for label, value in overview:
        print(label, value)

# Main function
def main():
    """Load the dataset CSV, print a sanity report and train the model on it."""
    csv_path = "updated256dataset.csv"  # Replace with your dataset path
    frame = pd.read_csv(csv_path)

    # Show what was loaded before doing any work on it
    sanity_check(frame)

    # The network's input width is the length of one feature vector
    dataset = MangoDataset(frame)
    first_features, _ = dataset[0]
    input_size = first_features.shape[0]

    train_model(dataset, input_size)

# Entry point: run main() only when this code executes as the top-level
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'