In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from scipy.spatial.distance import cosine
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


# 1. Load Dataset

## 1.1 Product

In [2]:
%%time
# load product information
product = pd.read_excel('Behold+product+data+04262021.xlsx')
product.head(3)

Wall time: 11.2 s


Unnamed: 0,product_id,brand,brand_category,name,details,created_at,brand_canonical_url,description,brand_description,brand_name,product_active
0,01EX0PN4J9WRNZH5F93YEX6QAF,Two,Unknown,Khadi Stripe Shirt-our signature shirt,,2021-01-27 01:17:19.305 UTC,https://two-nyc.myshopify.com/products/white-k...,Our signature khadi shirt\navailable in black ...,Our signature khadi shirt\n\navailable in blac...,Khadi Stripe Shirt-our signature shirt,True
1,01F0C4SKZV6YXS3265JMC39NXW,Collina Strada,Unknown,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,,2021-03-09 18:43:10.457 UTC,https://collina-strada-2.myshopify.com/product...,Mid-length dress with ruffles and adjustable s...,Mid-length dress with ruffles and adjustable s...,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,True
2,01EY4Y1BW8VZW51BWG5VZY82XW,Cariuma,Unknown,IBI Slip On Raw Red Knit Sneaker Women,,2021-02-10 02:58:59.591 UTC,https://cariuma.myshopify.com/products/ibi-sli...,IBI Slip On Raw Red Knit Sneaker Women,IBI Slip On Raw Red Knit Sneaker Women,IBI Slip On Raw Red Knit Sneaker Women,False


In [3]:
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61355 entries, 0 to 61354
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           61355 non-null  object
 1   brand                61355 non-null  object
 2   brand_category       60896 non-null  object
 3   name                 61354 non-null  object
 4   details              9200 non-null   object
 5   created_at           61355 non-null  object
 6   brand_canonical_url  61355 non-null  object
 7   description          51238 non-null  object
 8   brand_description    51234 non-null  object
 9   brand_name           61354 non-null  object
 10  product_active       61355 non-null  bool  
dtypes: bool(1), object(10)
memory usage: 4.7+ MB


## 1.2 Brand

In [4]:
# load brand information
brand = pd.read_csv('behold_brands.csv')
brand.head(3)

Unnamed: 0,brand_id,brand,brand_value,bio,quote,quote_attribute,intro,lifestyle_copy,short_bio,listing_bio
0,01ESKR0CH2KYC7KBNTN0S38EQA,Mari Giudicelli,Handmade / Artisan Crafted,,,,,,,Behold Mari Giudicelli! This Brazilian shoe de...
1,01ESKR0CH2KYC7KBNTN0S38EQA,Mari Giudicelli,Sustainable,,,,,,,Behold Mari Giudicelli! This Brazilian shoe de...
2,01ESKR0CH2KYC7KBNTN0S38EQA,Mari Giudicelli,Women Owned,,,,,,,Behold Mari Giudicelli! This Brazilian shoe de...


In [5]:
brand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   brand_id         162 non-null    object
 1   brand            162 non-null    object
 2   brand_value      154 non-null    object
 3   bio              157 non-null    object
 4   quote            157 non-null    object
 5   quote_attribute  154 non-null    object
 6   intro            154 non-null    object
 7   lifestyle_copy   156 non-null    object
 8   short_bio        157 non-null    object
 9   listing_bio      157 non-null    object
dtypes: object(10)
memory usage: 12.8+ KB


## 1.3 Outfit

In [6]:
outfit = pd.read_csv('outfit_combinations.csv')
outfit.head(5)

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory1,kate spade new york,medium margaux leather satchel
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
4,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt


In [7]:
outfit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5291 entries, 0 to 5290
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   outfit_id          5291 non-null   object
 1   product_id         5291 non-null   object
 2   outfit_item_type   5291 non-null   object
 3   brand              5291 non-null   object
 4   product_full_name  5291 non-null   object
dtypes: object(5)
memory usage: 206.8+ KB


## 1.3 Expert Tags

In [8]:
tags = pd.read_csv('usc_additional_tags.csv')
tags.head(3)

Unnamed: 0,product_id,product_color_id,attribute_name,attribute_value
0,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,materialclothing,linenblend
1,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,materialclothing,cottonblend
2,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,style,modern


In [9]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97420 entries, 0 to 97419
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   product_id        97420 non-null  object
 1   product_color_id  97420 non-null  object
 2   attribute_name    97420 non-null  object
 3   attribute_value   97420 non-null  object
dtypes: object(4)
memory usage: 3.0+ MB


# 2. EDA

There is no duplicate product id in product.csv

In [10]:
len(product.product_id.unique())==len(product)

True

We have more brands in product.csv compared to brand.csv and outfit.csv

In [11]:
# number of unique brands in product.csv
len(product.brand.unique())

386

In [12]:
# number of unique brands in brand.csv
len(brand.brand.unique())

74

In [13]:
# number of unique brands in outfit.csv
len(outfit.brand.unique())

306

There are 6 different brand values which could be set as a new feature later on

In [14]:
brand.brand_value.value_counts()

Women Owned                   47
Emerging                      41
Handmade / Artisan Crafted    29
Sustainable                   27
BIPOC Owned                    6
Vegan                          4
Name: brand_value, dtype: int64

There are 7 different outfit item types, and five different outfit types:
- shoe
- accessory
- top
- bottom
- onepiece

In [15]:
outfit.outfit_item_type.value_counts()

shoe          1149
accessory1    1064
accessory2     978
top            950
bottom         928
onepiece       221
accessory3       1
Name: outfit_item_type, dtype: int64

There are 86 different attribute types, each type is assocaites with several values. Attributes could be added to features for different products later on.

In [16]:
len(tags.attribute_name.unique())

86

In [17]:
tags.attribute_name.value_counts()[:20]

style               14462
occasion            12624
materialclothing     5589
category             5522
primarycolor         5466
gender               5433
fit                  3814
sizing               3701
drycleanonly         3488
sleevelength         2619
additionalcolor      2572
neckline             2417
lengthtop            1837
pattern              1604
subcategorytop       1387
uppermaterial        1252
subcategoryshoe      1246
shoewidth            1221
toestyle             1217
closureshoe          1163
Name: attribute_name, dtype: int64

# 3. Data Preprocessing

In [18]:
%%time
# need to download 'en_core_web_lg' first
nlp = spacy.load('en_core_web_lg',disable=['ner','parser'])

Wall time: 2.43 s


### 3.1 lemmatization

In [19]:
def spacy_lem(data, field):
    '''
    do lemmatization for data[field]
    '''
    for i in range(len(data)):
        if type(data.loc[i,field])==str:
            rows = []
            doc = nlp(data.loc[i,field])
            for token in doc:
                rows.append(token.lemma_)
            data.loc[i,field] = ' '.join(rows)

In [20]:
%%time
# need to take ~7mins
spacy_lem(product,'description')

Wall time: 9min 46s


In [21]:
%%time
spacy_lem(product,'details')

Wall time: 1min 15s


In [22]:
%%time
for field in brand.columns[2:]:
    spacy_lem(brand,field)

Wall time: 6.35 s


### 3.2 Regex Cleaning

In [23]:
def removePunctuation(text, punctuations=string.punctuation+"``"+"’"+"”"):
    words=nltk.word_tokenize(text)
    newWords = [word for word in words if word.lower() not in punctuations]
    cleanedText = " ".join(newWords)
    return cleanedText

In [24]:
nltk_stopwords = set(stopwords.words("English"))
def removeStopwords(text, stopwords=nltk_stopwords):
    words = nltk.word_tokenize(text)
    newWords = [word for word in words if word.lower() not in stopwords]
    cleanedText = " ".join(newWords)
    return cleanedText

In [25]:
%%time
product = product.fillna("UNKNOWN_TOKEN")
for col in ['brand_category','name','details','description']:
    product[col]= product[col].astype('str')
    product[col] = product[col].str.replace('\.|\,|\:|\•|\d|\-|\n|\=',' ')
    product[col] = product[col].str.replace('\/',' ')
    product[col] = product[col].str.replace('\s\s',' ')
    product[col] = product[col].apply(lambda x: x.lower())
    product[col] = product[col].str.replace('Unknown','UNKNOWN_TOKEN')

Wall time: 2.33 s


# 4. Product Classification

### Preprocess category values in tag file

In [26]:
# filter out category information from tag file
tags.head()

Unnamed: 0,product_id,product_color_id,attribute_name,attribute_value
0,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,materialclothing,linenblend
1,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,materialclothing,cottonblend
2,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,style,modern
3,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,style,businesscasual
4,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,style,classic


In [27]:
category_tag = tags[tags.attribute_name.str.contains('category')]

In [28]:
category_tag.attribute_value.unique()

array(['top', 'tee', 'active', 'sneakersathletic', 'shoe', 'jacket',
       'blazerscoatsjackets', 'onepiece', 'dress', 'bottom',
       'pantsleggings', 'blazer', 'skirts', 'croptop', 'buttondown',
       'handbags', 'accessory', 'tunic', 'sweater', 'pullover',
       'turtleneck', 'hooded', 'sweatshirthoodie', 'boots', 'camisole',
       'pumpsheels', 'jumpsuit', 'sandals', 'laceup', 'wrap', 'shorts',
       'blouse', 'coat', 'cropped', 'puffsleeve', 'knit', 'mulesslides',
       'flats', 'cardigan', 'booties', 'peplum', 'mockneck',
       'scarveswraps', 'shell', 'tank', 'sunglasses', 'open', 'bodysuit',
       'romper', 'vest', 'henley', 'belts', 'sportsbra', 'polo', 'poncho',
       'sweatercoat', 'wedges', 'duster', 'oversized', 'slippers',
       'bustier'], dtype=object)

In [29]:
category_tag.attribute_name.unique()

array(['category', 'subcategorytop', 'subcategoryshoe',
       'subcategoryblazerscoatsandjackets', 'subcategoryonepiece',
       'subcategorybottom', 'subcategoryaccessory', 'subcategorysweater',
       'subcategorysweatshirtandhoodie'], dtype=object)

In [30]:
category_tag.loc[category_tag["attribute_name"].str.contains(r"\bsubcategorytop|subcategoryblazerscoatsandjackets|subcategorysweater|subcategorysweatshirtandhoodie\b"),"attribute_value"] = "top"
category_tag.loc[category_tag["attribute_name"] == 'subcategoryshoe',"attribute_value"] = "shoe" 
category_tag.loc[category_tag["attribute_name"] == 'subcategoryonepiece',"attribute_value"] = "onepiece" 
category_tag.loc[category_tag["attribute_name"] == 'subcategorybottom',"attribute_value"] = "bottom" 
category_tag.loc[category_tag["attribute_name"] == 'subcategoryaccessory',"attribute_value"] = "accessory" 

In [31]:
category_tag.attribute_value.unique()

array(['top', 'shoe', 'blazerscoatsjackets', 'onepiece', 'bottom',
       'accessory', 'sweater', 'sweatshirthoodie'], dtype=object)

In [32]:
category_tag.loc[category_tag["attribute_value"].str.contains(r"\bblazerscoatsjackets|sweater|sweatshirthoodie\b"),"attribute_value"] = "top"
category_tag.attribute_value.unique()

array(['top', 'shoe', 'onepiece', 'bottom', 'accessory'], dtype=object)

### Preprocess category values in outfit file

In [33]:
outfit.outfit_item_type.unique()

array(['bottom', 'top', 'accessory1', 'shoe', 'onepiece', 'accessory2',
       'accessory3'], dtype=object)

In [34]:
outfit.product_full_name = outfit.product_full_name.str.lower()
outfit.loc[outfit["outfit_item_type"].str.contains(r"\baccessory1|accessory2|accessory3\b"),"outfit_item_type"] = "accessory"

### Classify products into their most relevant categories


Now, we have created two dataframes having category information: outfit and category_tag, which we would first use to match category. Here are three steps.

1): if "product_id" can be found in outfit dataframe, we can match "product_category" values with "outfit_item_type" in outfit.

2): if product_id can be found in category_tag dataframe, we can match can match "product_category" values with "attribute_value" in category_tag.

3): if both methods above do not work, we would check if "brand_category","name","description" values contain key words in clothing words lists corresponding to each category. It is needed to be mentioned that clothing words lists have been created based on information from brand websites.

Admittedly, some products, like gift card, can not be reasonably classified into any clothing category, so we just fill their category value with "UNKNOWN_TOKEN".

In [44]:
product['product_category'] = "UNKNOWN_TOKEN"
for i in range(len(product)):
    productid = product['product_id'].iloc[i]
    if productid in outfit.product_id.unique():
        product['product_category'].iloc[i] = outfit.loc[outfit["product_id"]== productid,"outfit_item_type"].iloc[-1]
    elif productid in category_tag.product_id.unique():
        product['product_category'].iloc[0] = category_tag.loc[category_tag["product_id"]== productid,"attribute_value"].iloc[-1]

# 
for col in ['brand_category','name','description']:
    product.loc[(product["product_category"]=="UNKNOWN_TOKEN")&(product[col].str.contains(r'\bpullovers?|croptops?|camisoles?|mockneck\
                |vests?|henley|sportsbra|bustiercoats?|jackets?|cardigans?|blazers?|cardi|(denim\s?jackets?)|ponchos?\
                |(t?\s?shirts?)|blouses?|tee|crewneck|polo|tunics?|sweaters?|sweatshirts?|sweat|turtleneck|\
                 ~(sweater\s?(?:ponchos?|skirts?|dress(?:es)?)?)|(hood(?:ie|y|ed))\
                |bras?|lingeries?|tanks?|sleeves?|tops?\b')),"product_category"]="top"
    product.loc[(product["product_category"]=="UNKNOWN_TOKEN")&(product[col].str.contains(r'\b((?:sweater)?\s?dress(?:es)?)|archive|midi|kimono|caftan|kaftan|sleepdress\
                |rompers?|jumpsuits?|leotards?\b')),"product_category"]="onepiece"
    product.loc[(product["product_category"]=="UNKNOWN_TOKEN")&(product[col].str.contains(r'\bpants?|trousers?|joggers?\
                |jeans?|(slim\s?kick)|((?:faux|cln)?\s?(?:pkt|pckt))|bootcut|skinny|(kimmie\s?straight)|slimmy|shorts?|(cropped\s?leg)\
                |skirts?|(wide\s?leg)|tapered|(straight\s?leg)|(skinny\s?leg)|flared|leggings?\b')),"product_category"]="bottom"
    product.loc[(product["product_category"]=="UNKNOWN_TOKEN")&(product[col].str.contains(r'\b(open\s?toed)|(close\s?toed)|(pointed\s?toe)|slingbacks|(pumps\s?stilettos)|pumps?\
                |(open\s?toe)|(toe\s?ring|sandals?)|(mules\s?slides)|mules?|sneakers?|booties|boots?|blockheels?|(heel(?:s|ed)?)\
                |flats?|shoes?|chucks?\b')),"product_category"]="shoe"
    product.loc[(product["product_category"]=="UNKNOWN_TOKEN")&(product[col].str.contains(r'\b(accessor(?:y|ies))|bag|tote?|clutche?|(crossbody(?:bags?)?)\
            |beltbags|shoulderbag?|wallet|bag?|pocket?|purse?|mask?|belt|beltbag?|ring?|necklace?|bracelet|ear\s?cuff|collar?|jewelry|earrings?\
            |shawl?|scarf?|scarves?|((?:sun)?glasses)|optical|socks?|hat?|stewart\b')),"product_category"]="accessory"

In [45]:
len(product.loc[product["product_category"]=="UNKNOWN_TOKEN"]) # 4383 products can hardly be classified into a specific category

4217

# 5. Export Processed Product Dataframe to csv File

In [46]:
product.to_csv(r'C:\Users\Carmen\Desktop\DSO560\project\processed_product.csv', index = False)