In [1]:
import numpy as np
import pandas as pd
import os

In [3]:
data_path = os.path.join('../data/fashion-dataset/')

In [4]:
image_df = pd.read_csv(os.path.join(data_path, 'images.csv'))

In [5]:
image_df.head()

Unnamed: 0,filename,link
0,15970.jpg,http://assets.myntassets.com/v1/images/style/p...
1,39386.jpg,http://assets.myntassets.com/v1/images/style/p...
2,59263.jpg,http://assets.myntassets.com/v1/images/style/p...
3,21379.jpg,http://assets.myntassets.com/v1/images/style/p...
4,53759.jpg,http://assets.myntassets.com/v1/images/style/p...


In [6]:
image_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44446 entries, 0 to 44445
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  44446 non-null  object
 1   link      44446 non-null  object
dtypes: object(2)
memory usage: 694.6+ KB


In [7]:
styles_df = pd.read_csv(os.path.join(data_path, 'styles.csv'), on_bad_lines='skip')

In [8]:
styles_df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [9]:
# Some are Null for product Display name, usage
styles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44425 entries, 0 to 44424
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44425 non-null  int64  
 1   gender              44425 non-null  object 
 2   masterCategory      44425 non-null  object 
 3   subCategory         44425 non-null  object 
 4   articleType         44425 non-null  object 
 5   baseColour          44410 non-null  object 
 6   season              44404 non-null  object 
 7   year                44424 non-null  float64
 8   usage               44108 non-null  object 
 9   productDisplayName  44418 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.4+ MB


In [10]:
# unique master categories
print(np.unique(styles_df.masterCategory))

['Accessories' 'Apparel' 'Footwear' 'Free Items' 'Home' 'Personal Care'
 'Sporting Goods']


In [11]:
# master categories distribution.. Home, sporting goods and free items are very less in number
styles_df.groupby('masterCategory')['masterCategory'].count()

masterCategory
Accessories       11274
Apparel           21397
Footwear           9219
Free Items          105
Home                  1
Personal Care      2404
Sporting Goods       25
Name: masterCategory, dtype: int64

In [12]:
# lets remove these
styles_df = styles_df[styles_df['masterCategory'].isin(['Accessories', 'Apparel', 'Footwear', 'Personal Care'])]

In [13]:
styles_df.groupby('masterCategory')['masterCategory'].count()

masterCategory
Accessories      11274
Apparel          21397
Footwear          9219
Personal Care     2404
Name: masterCategory, dtype: int64

In [14]:
# unique subcategories
print(np.unique(styles_df.subCategory))

['Accessories' 'Apparel Set' 'Bags' 'Bath and Body' 'Beauty Accessories'
 'Belts' 'Bottomwear' 'Cufflinks' 'Dress' 'Eyes' 'Eyewear' 'Flip Flops'
 'Fragrance' 'Gloves' 'Hair' 'Headwear' 'Innerwear' 'Jewellery' 'Lips'
 'Loungewear and Nightwear' 'Makeup' 'Mufflers' 'Nails' 'Perfumes'
 'Sandal' 'Saree' 'Scarves' 'Shoe Accessories' 'Shoes' 'Skin' 'Skin Care'
 'Socks' 'Sports Accessories' 'Stoles' 'Ties' 'Topwear' 'Umbrellas'
 'Wallets' 'Watches' 'Water Bottle']


In [15]:
# checking subcategory distribution.
styles_df.groupby('subCategory')['subCategory'].count()

subCategory
Accessories                   129
Apparel Set                   106
Bags                         3055
Bath and Body                  12
Beauty Accessories              4
Belts                         811
Bottomwear                   2694
Cufflinks                     108
Dress                         478
Eyes                           43
Eyewear                      1073
Flip Flops                    913
Fragrance                    1012
Gloves                         20
Hair                           19
Headwear                      293
Innerwear                    1808
Jewellery                    1079
Lips                          527
Loungewear and Nightwear      470
Makeup                        307
Mufflers                       38
Nails                         329
Perfumes                        6
Sandal                        963
Saree                         427
Scarves                       118
Shoe Accessories               24
Shoes                        7343
Sk

In [16]:
# for simplicity while training lets delete instances having less than 15 samples
styles_df = styles_df.groupby('subCategory').filter(lambda x : len(x)>20)

In [17]:
styles_df.groupby('subCategory')['subCategory'].count()

subCategory
Accessories                   129
Apparel Set                   106
Bags                         3055
Belts                         811
Bottomwear                   2694
Cufflinks                     108
Dress                         478
Eyes                           43
Eyewear                      1073
Flip Flops                    913
Fragrance                    1012
Headwear                      293
Innerwear                    1808
Jewellery                    1079
Lips                          527
Loungewear and Nightwear      470
Makeup                        307
Mufflers                       38
Nails                         329
Sandal                        963
Saree                         427
Scarves                       118
Shoe Accessories               24
Shoes                        7343
Skin                           69
Skin Care                      77
Socks                         698
Stoles                         90
Ties                          258
To

In [18]:
styles_df.groupby('articleType')['articleType'].count()

articleType
Accessory Gift Set      97
Baby Dolls              16
Backpacks              722
Bangle                  85
Bath Robe               20
                      ... 
Waist Pouch             17
Waistcoat               15
Wallets                935
Watches               2542
Water Bottle             4
Name: articleType, Length: 131, dtype: int64

In [19]:
# removing items having less than 10
styles_df = styles_df.groupby('articleType').filter(lambda x : len(x)>10)

In [20]:
styles_df.groupby('articleType')['articleType'].count()

articleType
Accessory Gift Set      97
Baby Dolls              16
Backpacks              722
Bangle                  85
Bath Robe               20
                      ... 
Tunics                 229
Waist Pouch             17
Waistcoat               15
Wallets                935
Watches               2542
Name: articleType, Length: 102, dtype: int64

In [21]:
styles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44097 entries, 0 to 44424
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44097 non-null  int64  
 1   gender              44097 non-null  object 
 2   masterCategory      44097 non-null  object 
 3   subCategory         44097 non-null  object 
 4   articleType         44097 non-null  object 
 5   baseColour          44082 non-null  object 
 6   season              44076 non-null  object 
 7   year                44096 non-null  float64
 8   usage               43794 non-null  object 
 9   productDisplayName  44090 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.7+ MB


In [22]:
styles_df.groupby('baseColour')['baseColour'].count()

baseColour
Beige                 745
Black                9645
Blue                 4884
Bronze                 94
Brown                3468
Burgundy               42
Charcoal              226
Coffee Brown           31
Copper                 85
Cream                 381
Fluorescent Green       5
Gold                  621
Green                2100
Grey                 2736
Grey Melange          146
Khaki                 139
Lavender              162
Lime Green              6
Magenta               128
Maroon                576
Mauve                  29
Metallic               42
Multi                 392
Mushroom Brown         16
Mustard                96
Navy Blue            1784
Nude                   23
Off White             182
Olive                 407
Orange                525
Peach                 195
Pink                 1844
Purple               1628
Red                  2436
Rose                   27
Rust                   66
Sea Green              22
Silver               1087
S

In [23]:
# removing items with less than 10 colours
styles_df = styles_df.groupby('baseColour').filter(lambda x : len(x)>10)

In [24]:
styles_df.groupby('baseColour')['baseColour'].count()

baseColour
Beige              745
Black             9645
Blue              4884
Bronze              94
Brown             3468
Burgundy            42
Charcoal           226
Coffee Brown        31
Copper              85
Cream              381
Gold               621
Green             2100
Grey              2736
Grey Melange       146
Khaki              139
Lavender           162
Magenta            128
Maroon             576
Mauve               29
Metallic            42
Multi              392
Mushroom Brown      16
Mustard             96
Navy Blue         1784
Nude                23
Off White          182
Olive              407
Orange             525
Peach              195
Pink              1844
Purple            1628
Red               2436
Rose                27
Rust                66
Sea Green           22
Silver            1087
Skin               176
Steel              314
Tan                114
Taupe               11
Teal               120
Turquoise Blue      68
White             5491


In [25]:
styles_df.groupby('season')['season'].count()

season
Fall      11381
Spring     2901
Summer    21317
Winter     8451
Name: season, dtype: int64

In [26]:
# no need of year column.. delete it
styles_df.groupby('year')['year'].count()

year
2007.0        2
2008.0        7
2009.0       20
2010.0      826
2011.0    13636
2012.0    16173
2013.0     1207
2014.0      230
2015.0     2764
2016.0     5936
2017.0     2832
2018.0      405
2019.0       32
Name: year, dtype: int64

In [27]:
styles_df = styles_df.drop(columns='year')

In [28]:
styles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44071 entries, 0 to 44424
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  44071 non-null  int64 
 1   gender              44071 non-null  object
 2   masterCategory      44071 non-null  object
 3   subCategory         44071 non-null  object
 4   articleType         44071 non-null  object
 5   baseColour          44071 non-null  object
 6   season              44050 non-null  object
 7   usage               43777 non-null  object
 8   productDisplayName  44069 non-null  object
dtypes: int64(1), object(8)
memory usage: 3.4+ MB


In [29]:
styles_df.groupby('usage')['usage'].count()

usage
Casual          34163
Ethnic           3175
Formal           2332
Party              28
Smart Casual       67
Sports           3986
Travel             26
Name: usage, dtype: int64

In [30]:
# removing all items with product display name as null
styles_df = styles_df.dropna(subset=['productDisplayName'])
styles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44069 entries, 0 to 44424
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  44069 non-null  int64 
 1   gender              44069 non-null  object
 2   masterCategory      44069 non-null  object
 3   subCategory         44069 non-null  object
 4   articleType         44069 non-null  object
 5   baseColour          44069 non-null  object
 6   season              44048 non-null  object
 7   usage               43775 non-null  object
 8   productDisplayName  44069 non-null  object
dtypes: int64(1), object(8)
memory usage: 3.4+ MB


In [32]:
styles_df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,Casual,Puma Men Grey T-shirt


In [42]:
styles_df[styles_df['usage'].isnull()]

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName,productNameNew
87,56489,Women,Personal Care,Nails,Nail Polish,Bronze,Spring,,Streetwear Ash Nail Polish # 31,nan Streetwear Ash Nail Polish # 31
92,52029,Unisex,Apparel,Topwear,Rain Jacket,Coffee Brown,Summer,,Just Natural Unisex Charcoal Rain Jacket,nan Just Natural Unisex Charcoal Rain Jacket
292,55001,Women,Personal Care,Lips,Lipstick,Pink,Spring,,Lakme Absolute Lip Last Day Kiss Lip Colour,nan Lakme Absolute Lip Last Day Kiss Lip Colour
479,57563,Women,Personal Care,Lips,Lipstick,Brown,Spring,,Lotus Herbals Pure Colours Nutty Brown Lipstic...,nan Lotus Herbals Pure Colours Nutty Brown Lip...
511,55006,Women,Personal Care,Lips,Lip Gloss,Copper,Spring,,Lakme Sheer Satin Lip Gloss 51,nan Lakme Sheer Satin Lip Gloss 51
...,...,...,...,...,...,...,...,...,...,...
43569,51044,Women,Apparel,Loungewear and Nightwear,Baby Dolls,Navy Blue,Summer,,Enamor Women Navy Blue Baby Doll Nightdress,nan Enamor Women Navy Blue Baby Doll Nightdress
43634,56605,Women,Personal Care,Makeup,Kajal and Eyeliner,Black,Spring,,Streetwear Black Eye Liner 01,nan Streetwear Black Eye Liner 01
44080,57715,Women,Personal Care,Lips,Lip Gloss,Red,Spring,,Lotus Herbals Seduction Sappy Watermelon Lip G...,nan Lotus Herbals Seduction Sappy Watermelon L...
44228,55045,Women,Personal Care,Lips,Lipstick,Purple,Spring,,Lakme Enrich Satins Lipstick 461,nan Lakme Enrich Satins Lipstick 461


In [43]:
styles_df = styles_df.fillna('')

In [44]:
styles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44069 entries, 0 to 44424
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  44069 non-null  int64 
 1   gender              44069 non-null  object
 2   masterCategory      44069 non-null  object
 3   subCategory         44069 non-null  object
 4   articleType         44069 non-null  object
 5   baseColour          44069 non-null  object
 6   season              44069 non-null  object
 7   usage               44069 non-null  object
 8   productDisplayName  44069 non-null  object
 9   productNameNew      44069 non-null  object
dtypes: int64(1), object(9)
memory usage: 3.7+ MB


In [45]:
# Lets remove column usage and merge it with product display name creating new column productNameNew
styles_df['productNameNew'] = styles_df['usage'].astype(str) + ' ' + styles_df['productDisplayName']

In [46]:
styles_df.head(10)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName,productNameNew
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,Casual,Turtle Check Men Navy Blue Shirt,Casual Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,Casual,Peter England Men Party Blue Jeans,Casual Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,Casual,Titan Women Silver Watch,Casual Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,Casual,Manchester United Men Solid Black Track Pants,Casual Manchester United Men Solid Black Track...
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,Casual,Puma Men Grey T-shirt,Casual Puma Men Grey T-shirt
5,1855,Men,Apparel,Topwear,Tshirts,Grey,Summer,Casual,Inkfruit Mens Chain Reaction T-shirt,Casual Inkfruit Mens Chain Reaction T-shirt
6,30805,Men,Apparel,Topwear,Shirts,Green,Summer,Ethnic,Fabindia Men Striped Green Shirt,Ethnic Fabindia Men Striped Green Shirt
7,26960,Women,Apparel,Topwear,Shirts,Purple,Summer,Casual,Jealous 21 Women Purple Shirt,Casual Jealous 21 Women Purple Shirt
8,29114,Men,Accessories,Socks,Socks,Navy Blue,Summer,Casual,Puma Men Pack of 3 Socks,Casual Puma Men Pack of 3 Socks
9,30039,Men,Accessories,Watches,Watches,Black,Winter,Casual,Skagen Men Black Watch,Casual Skagen Men Black Watch


In [48]:
import re

In [80]:
# Lets check numbers and special characters from productNameNew
def special_present(x):
    return bool(re.search(r'[\d#!*?.\/]', x))

In [81]:
styles_df['special_present'] = styles_df['productNameNew'].apply(lambda x : special_present(x))

In [82]:
styles_df[styles_df.special_present == True]

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName,productNameNew,special_present
7,26960,Women,Apparel,Topwear,Shirts,Purple,Summer,Casual,Jealous 21 Women Purple Shirt,Casual Jealous 21 Women Purple Shirt,True
8,29114,Men,Accessories,Socks,Socks,Navy Blue,Summer,Casual,Puma Men Pack of 3 Socks,Casual Puma Men Pack of 3 Socks,True
14,46885,Boys,Footwear,Flip Flops,Flip Flops,Navy Blue,Fall,Casual,Ben 10 Boys Navy Blue Slippers,Casual Ben 10 Boys Navy Blue Slippers,True
16,29928,Men,Accessories,Watches,Watches,Black,Winter,Casual,Police Men Black Dial Watch PL12889JVSB,Casual Police Men Black Dial Watch PL12889JVSB,True
18,51832,Women,Apparel,Innerwear,Bra,Beige,Summer,Casual,Bwitch Beige Full-Coverage Bra BW335,Casual Bwitch Beige Full-Coverage Bra BW335,True
...,...,...,...,...,...,...,...,...,...,...,...
44395,37603,Girls,Apparel,Topwear,Tshirts,Purple,Summer,Casual,Madagascar3 Girls Purple Printed T-Shirt,Casual Madagascar3 Girls Purple Printed T-Shirt,True
44397,56634,Women,Personal Care,Skin Care,Face Wash and Cleanser,Black,Spring,Casual,Olay Women Total Effects 7 in 1 Foaming Cleanser,Casual Olay Women Total Effects 7 in 1 Foaming...,True
44408,32143,Men,Apparel,Innerwear,Briefs,Red,Summer,Casual,Playboy Men Duet Pack of 2 Briefs,Casual Playboy Men Duet Pack of 2 Briefs,True
44414,40563,Women,Accessories,Watches,Watches,White,Winter,Casual,Titan Women White Dial Watch NB9701WM01,Casual Titan Women White Dial Watch NB9701WM01,True


In [106]:
def remove_special(x):
    sample = re.sub(r'[\d#!*?.\/]', '', x)
    sample = re.sub(r'\b[A-Z]+\b', '', sample)
    sample = sample.lower()
    return " ".join(sample.split())

In [107]:
styles_df['cleanProductName'] = styles_df['productNameNew'].apply(lambda x : remove_special(x))

In [109]:
styles_df.loc[44397]

id                                                                56634
gender                                                            Women
masterCategory                                            Personal Care
subCategory                                                   Skin Care
articleType                                      Face Wash and Cleanser
baseColour                                                        Black
season                                                           Spring
usage                                                            Casual
productDisplayName     Olay Women Total Effects 7 in 1 Foaming Cleanser
productNameNew        Casual Olay Women Total Effects 7 in 1 Foaming...
special_present                                                    True
cleanProductName      casual olay women total effects in foaming cle...
Name: 44397, dtype: object

In [110]:
styles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44069 entries, 0 to 44424
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  44069 non-null  int64 
 1   gender              44069 non-null  object
 2   masterCategory      44069 non-null  object
 3   subCategory         44069 non-null  object
 4   articleType         44069 non-null  object
 5   baseColour          44069 non-null  object
 6   season              44069 non-null  object
 7   usage               44069 non-null  object
 8   productDisplayName  44069 non-null  object
 9   productNameNew      44069 non-null  object
 10  special_present     44069 non-null  bool  
 11  cleanProductName    44069 non-null  object
dtypes: bool(1), int64(1), object(10)
memory usage: 5.1+ MB


In [111]:
# remove the intermediate columns
styles_df = styles_df.drop(columns=['productDisplayName', 'productNameNew', 'special_present', 'usage'])
styles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44069 entries, 0 to 44424
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                44069 non-null  int64 
 1   gender            44069 non-null  object
 2   masterCategory    44069 non-null  object
 3   subCategory       44069 non-null  object
 4   articleType       44069 non-null  object
 5   baseColour        44069 non-null  object
 6   season            44069 non-null  object
 7   cleanProductName  44069 non-null  object
dtypes: int64(1), object(7)
memory usage: 4.0+ MB


In [143]:
styles_df[styles_df.season == '']

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,cleanProductName
1903,5376,Unisex,Footwear,Shoes,Sports Shoes,Black,,sports btwin mtb shoes
6122,5355,Men,Footwear,Shoes,Sports Shoes,Grey,,sports kalenji kapteren greywhite
9483,5391,Men,Footwear,Shoes,Sports Shoes,Blue,,sports kalenji kiprun blue ah
10095,5398,Men,Footwear,Shoes,Sports Shoes,Grey,,sports quechua arpenaz flex novadry grey
10450,5389,Unisex,Footwear,Shoes,Sports Shoes,Yellow,,sports kalenji kapteren yellow fw
11358,53781,Men,Apparel,Topwear,Tshirts,Blue,,sports puma men blue sless round neck -shirt
17424,5385,Women,Footwear,Shoes,Sports Shoes,Blue,,sports quechua arpenaz flex lady grey
18766,5354,Men,Footwear,Shoes,Sports Shoes,Grey,,sports kalenji ekiden whtgrey
21681,5370,Men,Footwear,Shoes,Sports Shoes,Black,,sports inesis canaveral black
22350,5401,Women,Footwear,Shoes,Sports Shoes,Blue,,sports kalenji ekiden lady blue


In [116]:
styles_df.groupby('gender')['gender'].count()

gender
Boys        818
Girls       643
Men       22051
Unisex     2094
Women     18463
Name: gender, dtype: int64

In [112]:
styles_df = styles_df.sample(frac=1).reset_index(drop=True)

In [117]:
# save prepared
styles_df.to_csv(os.path.join(data_path, 'final-styles_df.csv'), index=False)

In [145]:
styles_df.loc[styles_df.season == '', 'season'] = 'any'

In [146]:
styles_df.groupby('season')['season'].count()

season
Fall      11381
Spring     2901
Summer    21315
Winter     8451
any          21
Name: season, dtype: int64

In [118]:
from sklearn.preprocessing import LabelEncoder

In [119]:
le_gender = LabelEncoder()
le_gender.fit(styles_df.gender)

LabelEncoder()

In [124]:
gender_values = list(le_gender.classes_)
gender_values

['Boys', 'Girls', 'Men', 'Unisex', 'Women']

In [129]:
gender_keys = [i for i in range(len(gender_values))]
gender_keys

[0, 1, 2, 3, 4]

In [134]:
gender_dict = zip(gender_keys, gender_values)
dict(gender_dict)

{0: 'Boys', 1: 'Girls', 2: 'Men', 3: 'Unisex', 4: 'Women'}

In [147]:
def return_dict(column):
    le = LabelEncoder()
    le.fit(column)
    values = list(le.classes_)
    keys = [i for i in range(len(values))]
    return dict(zip(keys, values)), dict(zip(values, keys))

In [148]:
gender_dict, gender_inv = return_dict(styles_df.gender)
master_dict, master_inv = return_dict(styles_df.masterCategory)
sub_dict, sub_inv = return_dict(styles_df.subCategory)
article_dict, article_inv = return_dict(styles_df.articleType)
color_dict, color_inv = return_dict(styles_df.baseColour)
season_dict, season_inv = return_dict(styles_df.season)

In [149]:
gender_dict, master_dict, sub_dict, article_dict, color_dict, season_dict

({0: 'Boys', 1: 'Girls', 2: 'Men', 3: 'Unisex', 4: 'Women'},
 {0: 'Accessories', 1: 'Apparel', 2: 'Footwear', 3: 'Personal Care'},
 {0: 'Accessories',
  1: 'Apparel Set',
  2: 'Bags',
  3: 'Belts',
  4: 'Bottomwear',
  5: 'Cufflinks',
  6: 'Dress',
  7: 'Eyes',
  8: 'Eyewear',
  9: 'Flip Flops',
  10: 'Fragrance',
  11: 'Headwear',
  12: 'Innerwear',
  13: 'Jewellery',
  14: 'Lips',
  15: 'Loungewear and Nightwear',
  16: 'Makeup',
  17: 'Mufflers',
  18: 'Nails',
  19: 'Sandal',
  20: 'Saree',
  21: 'Scarves',
  22: 'Shoe Accessories',
  23: 'Shoes',
  24: 'Skin',
  25: 'Skin Care',
  26: 'Socks',
  27: 'Stoles',
  28: 'Ties',
  29: 'Topwear',
  30: 'Wallets',
  31: 'Watches'},
 {0: 'Accessory Gift Set',
  1: 'Baby Dolls',
  2: 'Backpacks',
  3: 'Bangle',
  4: 'Bath Robe',
  5: 'Belts',
  6: 'Booties',
  7: 'Boxers',
  8: 'Bra',
  9: 'Bracelet',
  10: 'Briefs',
  11: 'Camisoles',
  12: 'Capris',
  13: 'Caps',
  14: 'Casual Shoes',
  15: 'Churidar',
  16: 'Clutches',
  17: 'Compact',
 

In [150]:
gender_inv, master_inv, sub_inv, article_inv, color_inv, season_inv

({'Boys': 0, 'Girls': 1, 'Men': 2, 'Unisex': 3, 'Women': 4},
 {'Accessories': 0, 'Apparel': 1, 'Footwear': 2, 'Personal Care': 3},
 {'Accessories': 0,
  'Apparel Set': 1,
  'Bags': 2,
  'Belts': 3,
  'Bottomwear': 4,
  'Cufflinks': 5,
  'Dress': 6,
  'Eyes': 7,
  'Eyewear': 8,
  'Flip Flops': 9,
  'Fragrance': 10,
  'Headwear': 11,
  'Innerwear': 12,
  'Jewellery': 13,
  'Lips': 14,
  'Loungewear and Nightwear': 15,
  'Makeup': 16,
  'Mufflers': 17,
  'Nails': 18,
  'Sandal': 19,
  'Saree': 20,
  'Scarves': 21,
  'Shoe Accessories': 22,
  'Shoes': 23,
  'Skin': 24,
  'Skin Care': 25,
  'Socks': 26,
  'Stoles': 27,
  'Ties': 28,
  'Topwear': 29,
  'Wallets': 30,
  'Watches': 31},
 {'Accessory Gift Set': 0,
  'Baby Dolls': 1,
  'Backpacks': 2,
  'Bangle': 3,
  'Bath Robe': 4,
  'Belts': 5,
  'Booties': 6,
  'Boxers': 7,
  'Bra': 8,
  'Bracelet': 9,
  'Briefs': 10,
  'Camisoles': 11,
  'Capris': 12,
  'Caps': 13,
  'Casual Shoes': 14,
  'Churidar': 15,
  'Clutches': 16,
  'Compact': 17,
 

In [154]:
#### Rough work
styles_df[styles_df.id == 39401]

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,cleanProductName
37377,39401,Men,Apparel,Bottomwear,Jeans,Blue,Winter,casual polo assn denim co men blue slim straig...


In [156]:
image_df[image_df.filename == '39401.jpg']

Unnamed: 0,filename,link
32324,39401.jpg,undefined


In [160]:
undefined_files = image_df[image_df.link == 'undefined']['filename']
undefined_files = list(undefined_files)
undefined_files

['39403.jpg', '39410.jpg', '39401.jpg', '39425.jpg', '12347.jpg']

In [161]:
undefined_files = [39403, 39410, 39401, 39425, 12347]

In [162]:
# the filename with undefined link are not present in the dataset of images.Remove them from styles
styles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44069 entries, 0 to 44068
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                44069 non-null  int64 
 1   gender            44069 non-null  object
 2   masterCategory    44069 non-null  object
 3   subCategory       44069 non-null  object
 4   articleType       44069 non-null  object
 5   baseColour        44069 non-null  object
 6   season            44069 non-null  object
 7   cleanProductName  44069 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.7+ MB


In [164]:
styles_df = styles_df[~styles_df['id'].isin(undefined_files)]

In [165]:
styles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44065 entries, 0 to 44068
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                44065 non-null  int64 
 1   gender            44065 non-null  object
 2   masterCategory    44065 non-null  object
 3   subCategory       44065 non-null  object
 4   articleType       44065 non-null  object
 5   baseColour        44065 non-null  object
 6   season            44065 non-null  object
 7   cleanProductName  44065 non-null  object
dtypes: int64(1), object(7)
memory usage: 3.0+ MB


In [167]:
styles_df.tail()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,cleanProductName
44064,28559,Men,Apparel,Topwear,Tshirts,Red,Summer,casual nike men blow up red -shirt
44065,56041,Women,Personal Care,Lips,Lipstick,Brown,Spring,casual colorbar coco liscious lipstick
44066,42209,Women,Apparel,Dress,Dresses,Purple,Summer,casual sepia women purple dress
44067,25534,Men,Apparel,Topwear,Shirts,Red,Summer,casual wrangler men american flag red shirt
44068,28563,Men,Apparel,Topwear,Tshirts,Orange,Summer,casual nike men brushed orange -shirt


In [168]:
styles_df.to_csv(os.path.join(data_path, 'final-styles_df.csv'), index=False)