# Preprocessing the data

In [75]:
import pandas as pd
import numpy as np

In [76]:
# Load the raw outfit and product tables from the local dataset folder.
outfit_data, product_data = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/outfit_data.csv", "dataset/product_data.csv")
)

In [77]:
# Row/column counts of the raw tables, one per line (outfits first, then products).
print(outfit_data.shape, product_data.shape, sep="\n")

(43582, 2)
(9222, 13)


In [78]:
# Flag products that are out of scope: selected product families, Male/Unisex
# items, and Kids items.
excluded_families = ['Fragances', 'Deco Accessories', 'Intimate', 'Deco Textiles', 'Bedding']
is_excluded = (
    product_data['des_product_family'].isin(excluded_families)
    | product_data["des_sex"].isin(["Male", "Unisex"])
    | product_data["des_age"].isin(["Kids"])
)

# Product codes of the flagged rows (variable name kept as originally spelled).
non_interseting_codes = product_data.loc[is_excluded, 'cod_modelo_color']

# Drop the flagged codes from both tables so outfits only reference kept products.
keep_products = ~product_data['cod_modelo_color'].isin(non_interseting_codes)
keep_outfit_rows = ~outfit_data['cod_modelo_color'].isin(non_interseting_codes)
product_data = product_data[keep_products]
outfit_data = outfit_data[keep_outfit_rows]

In [79]:
# Row/column counts after filtering, one per line (outfits first, then products).
print(outfit_data.shape, product_data.shape, sep="\n")

(42827, 2)
(8986, 13)


## Product data

In [80]:
# product_data.head(10)

In [81]:
# Impute missing colour codes with the mean code of the product's colour group.
#
# Codes that cannot be parsed as numbers are coerced to NaN first, then every
# NaN is replaced by the mean numeric code of its `des_agrup_color_eng` group.
color_code = pd.to_numeric(product_data['cod_color_code'], errors='coerce')

# Mean numeric code per colour group (NaN codes are ignored by `mean`).
mean_values = color_code.groupby(product_data['des_agrup_color_eng']).mean()

# Vectorised fill: look up each row's group mean and use it only where the
# code is missing. Rows whose group has no mean simply stay NaN instead of
# raising a KeyError as a per-row lookup would.
# `assign` returns a new frame, so nothing is written into the filtered
# slice created earlier (which is what triggers SettingWithCopyWarning).
product_data = product_data.assign(
    cod_color_code=color_code.fillna(
        product_data['des_agrup_color_eng'].map(mean_values)
    )
)

In [82]:
# Preview the first five product rows (five is the default for `head`).
product_data.head()

Unnamed: 0,cod_modelo_color,cod_color_code,des_color_specification_esp,des_agrup_color_eng,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,des_filename
0,41085800-02,2.0,OFFWHITE,WHITE,Female,Adult,SHE,P-PLANA,Bottoms,Trousers & leggings,Trousers,Trousers,datathon/images/2019_41085800_02.jpg
1,53000586-TO,53.825949,TEJANO OSCURO,BLUE,Female,Adult,SHE,J-JEANS,Bottoms,Jeans,Jeans,Jeans,datathon/images/2019_53000586_TO.jpg
2,53030601-81,81.0,ROSA PASTEL,PINK,Female,Adult,SHE,P-PLANA,"Dresses, jumpsuits and Complete set",Dresses and jumpsuits,Dresses,Dress,datathon/images/2019_53030601_81.jpg
3,53050730-15,15.0,MOSTAZA,YELLOW,Female,Adult,SHE,P-PLANA,"Dresses, jumpsuits and Complete set",Dresses and jumpsuits,Dresses,Dress,datathon/images/2019_53050730_15.jpg
4,53070773-70,70.0,ROJO,RED,Female,Adult,SHE,P-PLANA,Tops,Shirts,Shirt,Shirt,datathon/images/2019_53070773_70.jpg


In [83]:
# Summary statistics of the numeric columns, as a sanity check on the imputed colour codes.
product_data.describe()

Unnamed: 0,cod_color_code
count,8986.0
mean,51.208092
std,39.038892
min,1.0
25%,8.0
50%,52.0
75%,98.009992
max,99.0


In [84]:
# Count missing values per product column.
product_data.isna().sum()

cod_modelo_color                 0
cod_color_code                   0
des_color_specification_esp      0
des_agrup_color_eng              0
des_sex                          0
des_age                          0
des_line                         0
des_fabric                       0
des_product_category             0
des_product_aggregated_family    0
des_product_family               0
des_product_type                 0
des_filename                     0
dtype: int64

## Outfit Data

In [85]:
# Preview the first seven outfit rows.
outfit_data.iloc[:7]

Unnamed: 0,cod_outfit,cod_modelo_color
0,1,51000622-02
1,1,43067759-01
2,1,53060518-02
3,1,53030594-08
4,1,43077762-01
5,1,43063724-OR
6,1,43075794-OR


In [86]:
# Count missing values per outfit column.
outfit_data.isna().sum()

cod_outfit          0
cod_modelo_color    0
dtype: int64

## Product data: column selection and one-hot encoding

In [87]:


# Product columns carried into the cleaned / encoded tables:
# the product code, the image path and the categorical descriptors.
columns = [
    'cod_modelo_color',
    'des_agrup_color_eng',
    'des_line',
    'des_fabric',
    'des_product_category',
    'des_product_type',
    'des_filename',
    'des_product_family',
]

In [90]:
# Keep only the columns selected in `columns`.
product_data = product_data[columns]

# Save the reduced, not-yet-encoded table.
# NOTE(review): this write uses the default `index=True`, so the row index is
# stored as an unnamed first column, unlike the `index=False` saves at the end
# of the notebook. Confirm downstream readers expect that before changing it.
product_data.to_csv("dataset/product_data_cleaned.csv")

# One-hot encode every column except the product code and the image path.
# `Index.difference` returns the remaining column names in sorted order,
# which fixes the order of the generated dummy columns.
columns_to_encode = product_data.columns.difference(['cod_modelo_color', 'des_filename'])
encoded_product_data = pd.get_dummies(product_data, columns=columns_to_encode, dtype=int)

# Only the last expression of a cell is displayed, so print the shape
# explicitly (as the earlier shape checks do) rather than leaving a bare
# expression whose value is discarded.
print(encoded_product_data.shape)
encoded_product_data.head(5)

Unnamed: 0,cod_modelo_color,des_filename,des_agrup_color_eng_BLUE,des_agrup_color_eng_BROWN,des_agrup_color_eng_GREEN,des_agrup_color_eng_GREY,des_agrup_color_eng_ORANGE,des_agrup_color_eng_PINK,des_agrup_color_eng_PURPLE,des_agrup_color_eng_RED,...,des_product_type_Tie,des_product_type_Top,des_product_type_Totes bag,des_product_type_Trainers,des_product_type_Trenchcoat,des_product_type_Trousers,des_product_type_Turban,des_product_type_Umbrella,des_product_type_Vest,des_product_type_Wallet
0,41085800-02,datathon/images/2019_41085800_02.jpg,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,53000586-TO,datathon/images/2019_53000586_TO.jpg,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,53030601-81,datathon/images/2019_53030601_81.jpg,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,53050730-15,datathon/images/2019_53050730_15.jpg,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,53070773-70,datathon/images/2019_53070773_70.jpg,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Save preprocessed files

In [73]:
# Write both preprocessed tables to disk without the row index.
for frame, target in (
    (encoded_product_data, "dataset/product_data_preprocessed.csv"),
    (outfit_data, "dataset/outfit_data_preprocessed.csv"),
):
    frame.to_csv(target, index=False)