# Preprocessing the data

In [19]:
import pandas as pd
import numpy as np

In [20]:
# Raw inputs: the product catalogue and the table linking outfits to products.
product_data = pd.read_csv("../dataset/product_data.csv")
outfit_data = pd.read_csv("../dataset/outfit_data.csv")

In [21]:
# (rows, columns) of both tables before any filtering, one tuple per line.
print(outfit_data.shape, product_data.shape, sep="\n")

(43582, 2)
(9222, 13)


We discard product families that are not clothing or that have very low value counts, as well as every product tagged as Male, Unisex, or Kids. The same products are also removed from the outfit data.

In [22]:
# Product families to discard (non-clothing or too rare to be useful).
# NOTE(review): 'Fragances' is presumably the spelling used in the dataset —
# do not "correct" it without checking the raw values.
excluded_families = ['Fragances', 'Deco Accessories', 'Intimate', 'Deco Textiles', 'Bedding']

# A product is excluded when it belongs to one of those families, or is
# tagged as Male/Unisex, or is tagged as Kids.
is_excluded = (
    product_data['des_product_family'].isin(excluded_families)
    | product_data["des_sex"].isin(["Male", "Unisex"])
    | product_data["des_age"].isin(["Kids"])
)

# Product codes of the excluded rows; filtering is done by code (not by the
# row mask) so the very same codes can be removed from the outfit table too.
non_interseting_codes = product_data.loc[is_excluded, 'cod_modelo_color']

# .copy() makes the filtered tables independent DataFrames, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
product_data = product_data[~product_data['cod_modelo_color'].isin(non_interseting_codes)].copy()
outfit_data = outfit_data[~outfit_data['cod_modelo_color'].isin(non_interseting_codes)].copy()

In [23]:
# (rows, columns) of both tables after dropping the excluded products.
print(outfit_data.shape, product_data.shape, sep="\n")

(42827, 2)
(8986, 13)


### Product data

In [24]:
# product_data.head(10)

We impute the color code of the products whose code is not numeric, using the mean numeric code of their color group (`des_agrup_color_eng`).

In [25]:
# Color codes that cannot be parsed as numbers become NaN so they can be imputed.
numeric_color_code = pd.to_numeric(product_data['cod_color_code'], errors='coerce')

# Mean numeric code of each color group, indexed by 'des_agrup_color_eng'.
mean_values = numeric_color_code.groupby(product_data['des_agrup_color_eng']).mean()

# Vectorised imputation: each missing code is replaced by the mean of its
# color group (looked up with .map), existing codes are left untouched.
# A row whose color group is itself missing stays NaN here instead of raising
# a KeyError; the null check in the following cells would expose it.
# .assign returns a new DataFrame, so nothing is written into a filtered
# slice of the original table.
product_data = product_data.assign(
    cod_color_code=numeric_color_code.fillna(
        product_data['des_agrup_color_eng'].map(mean_values)
    )
)

In [26]:
# Keep only the last path component of 'des_filename' (the text after the
# final '/'). The .str accessor is vectorised and leaves missing values as NaN
# instead of failing on them like a per-element lambda would.
product_data = product_data.assign(
    des_filename=product_data['des_filename'].str.rsplit('/', n=1).str[-1]
)
product_data.head(5)

Unnamed: 0,cod_modelo_color,cod_color_code,des_color_specification_esp,des_agrup_color_eng,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,des_filename
0,41085800-02,2.0,OFFWHITE,WHITE,Female,Adult,SHE,P-PLANA,Bottoms,Trousers & leggings,Trousers,Trousers,2019_41085800_02.jpg
1,53000586-TO,53.825949,TEJANO OSCURO,BLUE,Female,Adult,SHE,J-JEANS,Bottoms,Jeans,Jeans,Jeans,2019_53000586_TO.jpg
2,53030601-81,81.0,ROSA PASTEL,PINK,Female,Adult,SHE,P-PLANA,"Dresses, jumpsuits and Complete set",Dresses and jumpsuits,Dresses,Dress,2019_53030601_81.jpg
3,53050730-15,15.0,MOSTAZA,YELLOW,Female,Adult,SHE,P-PLANA,"Dresses, jumpsuits and Complete set",Dresses and jumpsuits,Dresses,Dress,2019_53050730_15.jpg
4,53070773-70,70.0,ROJO,RED,Female,Adult,SHE,P-PLANA,Tops,Shirts,Shirt,Shirt,2019_53070773_70.jpg


In [27]:
# Number of missing values per column of the product table.
product_data.isna().sum()

cod_modelo_color                 0
cod_color_code                   0
des_color_specification_esp      0
des_agrup_color_eng              0
des_sex                          0
des_age                          0
des_line                         0
des_fabric                       0
des_product_category             0
des_product_aggregated_family    0
des_product_family               0
des_product_type                 0
des_filename                     0
dtype: int64

In [28]:
# List the available columns before choosing the subset to keep.
product_data.columns

Index(['cod_modelo_color', 'cod_color_code', 'des_color_specification_esp',
       'des_agrup_color_eng', 'des_sex', 'des_age', 'des_line', 'des_fabric',
       'des_product_category', 'des_product_aggregated_family',
       'des_product_family', 'des_product_type', 'des_filename'],
      dtype='object')

In [29]:
# Number of products per value of 'des_line'.
product_data['des_line'].value_counts()

des_line
SHE        8979
VIOLETA       7
Name: count, dtype: int64

In [30]:
# Number of products per value of 'des_fabric'.
product_data['des_fabric'].value_counts()

des_fabric
P-PLANA           2951
C-COMPLEMENTOS    2479
K-CIRCULAR        1532
T-TRICOT          1250
J-JEANS            606
O-POLIPIEL          97
L-PIEL              71
Name: count, dtype: int64

In [31]:
# Columns retained in the cleaned product table, in output order.
columns = [
    'cod_modelo_color',      # product identifier
    'cod_color_code',        # numeric color code (imputed where it was not numeric)
    'des_agrup_color_eng',   # color group
    'des_fabric',
    'des_product_category',
    'des_product_type',
    'des_filename',          # image file name
    'des_product_family',
]

In [32]:
# Restrict the product table to the selected columns (in the order of `columns`).
product_data = product_data.loc[:, columns]

# Snapshot of the cleaned table, taken before one-hot encoding.
# NOTE(review): unlike the final "preprocessed" exports, this write keeps the
# DataFrame index as an extra unnamed CSV column — confirm whether any reader
# of this file relies on it before adding index=False.
product_data.to_csv("../dataset/product_data_cleaned.csv")

# Every column except the identifier, the image file name and the numeric
# color code is treated as categorical and will be one-hot encoded.
# Index.difference returns the remaining names sorted.
columns_to_encode = product_data.columns.difference(
    ['cod_modelo_color', 'des_filename', 'cod_color_code']
)

We one-hot encode the categorical columns.

In [33]:
# One-hot encode the categorical columns; dtype=int yields 0/1 integer
# indicator columns.
encoded_product_data = pd.get_dummies(product_data, columns=columns_to_encode, dtype=int)

# Only the last expression of a cell is displayed, so the shape has to be
# printed explicitly to show up next to the preview.
print(encoded_product_data.shape)
encoded_product_data.head(5)

Unnamed: 0,cod_modelo_color,cod_color_code,des_filename,des_agrup_color_eng_BLUE,des_agrup_color_eng_BROWN,des_agrup_color_eng_GREEN,des_agrup_color_eng_GREY,des_agrup_color_eng_ORANGE,des_agrup_color_eng_PINK,des_agrup_color_eng_PURPLE,...,des_product_type_Tie,des_product_type_Top,des_product_type_Totes bag,des_product_type_Trainers,des_product_type_Trenchcoat,des_product_type_Trousers,des_product_type_Turban,des_product_type_Umbrella,des_product_type_Vest,des_product_type_Wallet
0,41085800-02,2.0,2019_41085800_02.jpg,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,53000586-TO,53.825949,2019_53000586_TO.jpg,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,53030601-81,81.0,2019_53030601_81.jpg,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,53050730-15,15.0,2019_53050730_15.jpg,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,53070773-70,70.0,2019_53070773_70.jpg,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Outfit data

In [34]:
# Preview the first rows of the outfit table (outfit id and product code per row).
outfit_data.head(7)

Unnamed: 0,cod_outfit,cod_modelo_color
0,1,51000622-02
1,1,43067759-01
2,1,53060518-02
3,1,53030594-08
4,1,43077762-01
5,1,43063724-OR
6,1,43075794-OR


In [35]:
# Number of missing values per column of the outfit table.
outfit_data.isna().sum()

cod_outfit          0
cod_modelo_color    0
dtype: int64

### Save preprocessed files

In [36]:
# Persist both preprocessed tables; index=False keeps the DataFrame index
# out of the CSV files.
outfit_data.to_csv("../dataset/outfit_data_preprocessed.csv", index=False)
encoded_product_data.to_csv("../dataset/product_data_preprocessed.csv", index=False)