## Data Loading

In [1]:
import pandas as pd

In [2]:
product=pd.read_excel("Behold+product+data+04262021.xlsx")

In [3]:
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61355 entries, 0 to 61354
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           61355 non-null  object
 1   brand                61355 non-null  object
 2   brand_category       60896 non-null  object
 3   name                 61354 non-null  object
 4   details              9200 non-null   object
 5   created_at           61355 non-null  object
 6   brand_canonical_url  61355 non-null  object
 7   description          51238 non-null  object
 8   brand_description    51234 non-null  object
 9   brand_name           61354 non-null  object
 10  product_active       61355 non-null  bool  
dtypes: bool(1), object(10)
memory usage: 4.7+ MB


In [4]:
product.head()

Unnamed: 0,product_id,brand,brand_category,name,details,created_at,brand_canonical_url,description,brand_description,brand_name,product_active
0,01EX0PN4J9WRNZH5F93YEX6QAF,Two,Unknown,Khadi Stripe Shirt-our signature shirt,,2021-01-27 01:17:19.305 UTC,https://two-nyc.myshopify.com/products/white-k...,Our signature khadi shirt\navailable in black ...,Our signature khadi shirt\n\navailable in blac...,Khadi Stripe Shirt-our signature shirt,True
1,01F0C4SKZV6YXS3265JMC39NXW,Collina Strada,Unknown,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,,2021-03-09 18:43:10.457 UTC,https://collina-strada-2.myshopify.com/product...,Mid-length dress with ruffles and adjustable s...,Mid-length dress with ruffles and adjustable s...,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,True
2,01EY4Y1BW8VZW51BWG5VZY82XW,Cariuma,Unknown,IBI Slip On Raw Red Knit Sneaker Women,,2021-02-10 02:58:59.591 UTC,https://cariuma.myshopify.com/products/ibi-sli...,IBI Slip On Raw Red Knit Sneaker Women,IBI Slip On Raw Red Knit Sneaker Women,IBI Slip On Raw Red Knit Sneaker Women,False
3,01EY50E27A0P5V6KCW01XPDB43,Cariuma,Unknown,IBI Slip On Black Knit Sneaker Women,,2021-02-10 03:40:52.842 UTC,https://cariuma.myshopify.com/products/ibi-sli...,IBI Slip On Black Knit Sneaker Women,IBI Slip On Black Knit Sneaker Women,IBI Slip On Black Knit Sneaker Women,False
4,01EY6DWHC2W5HPNEGXKEJ4A1CX,Cariuma,Unknown,CATIBA PRO Skate Black Suede and Canvas Contra...,,2021-02-10 16:55:13.024 UTC,https://cariuma.myshopify.com/products/catiba-...,,,CATIBA PRO Skate Black Suede and Canvas Contra...,False


In [5]:
tags=pd.read_csv('usc_additional_tags USC.csv')

In [6]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97420 entries, 0 to 97419
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   product_id        97420 non-null  object
 1   product_color_id  97420 non-null  object
 2   attribute_name    97420 non-null  object
 3   attribute_value   97420 non-null  object
dtypes: object(4)
memory usage: 3.0+ MB


In [7]:
tags.head()

Unnamed: 0,product_id,product_color_id,attribute_name,attribute_value
0,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,materialclothing,linenblend
1,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,materialclothing,cottonblend
2,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,style,modern
3,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,style,businesscasual
4,01E5ZXP5H0BTEZT9QD2HRZJ47A,01E5ZXP5JCREDC7WJVMWHK5Q40,style,classic


## Data Preprocessing

In [8]:
# Drop duplicates if applies

product.drop_duplicates(subset='product_id',keep='first',inplace=True)

In [9]:
# Remove irrelevant fields
cleaned_data=product.drop(['product_id','created_at','brand_canonical_url'],axis=1)

In [10]:
# Fill NaN values with "unknown"

cleaned_data = cleaned_data.fillna("unknown")

In [11]:
# Convert all the text values into lowercase and remove puctuations
import re

def clean(data):
    data=data.lower()
    data=re.sub(r'[^\w\s]','',data)
    return data

cols = cleaned_data.columns.values
for i in cols:
    if i=='brand':
        continue
    else:
        cleaned_data[i]=cleaned_data[i].astype(str).apply(clean)
# for i in cols:
#     if i=='product_active':
#         continue
#     else:
#         cleaned_data[i]=cleaned_data[i].astype(str).apply(lower)

In [12]:
# Perform lemmatization

import nltk
from nltk.stem import WordNetLemmatizer

def lemm(data):

    lemmatizer = WordNetLemmatizer()
    sentences = []
    
    for i in data:
        tokens = nltk.word_tokenize(i)
        words = []
        for word in tokens:
            words.append(lemmatizer.lemmatize(word))
        sentence = " ".join(words)
        sentences.append(sentence)
    return sentences

for i in cols:
    if i=='brand':
        continue
    else:
        cleaned_data[i]=lemm(cleaned_data[i])

In [13]:
# Get top 30 brands and classify the other brands as 'Others'

top_30_label = cleaned_data.groupby('brand')['brand'].count().sort_values(ascending=False).head(30).index.values

for i in range(len(cleaned_data)):
    if cleaned_data.iloc[i,0] in top_30_label:
        continue
    else:
        cleaned_data.iloc[i,0]='Others'

In [14]:
cleaned_data

Unnamed: 0,brand,brand_category,name,details,description,brand_description,brand_name,product_active
0,Others,unknown,khadi stripe shirtour signature shirt,unknown,our signature khadi shirt available in black a...,our signature khadi shirt available in black a...,khadi stripe shirtour signature shirt,true
1,Collina Strada,unknown,ruffle market dress loopy pink sistine tomato,unknown,midlength dress with ruffle and adjustable str...,midlength dress with ruffle and adjustable str...,ruffle market dress loopy pink sistine tomato,true
2,Others,unknown,ibi slip on raw red knit sneaker woman,unknown,ibi slip on raw red knit sneaker woman,ibi slip on raw red knit sneaker woman,ibi slip on raw red knit sneaker woman,false
3,Others,unknown,ibi slip on black knit sneaker woman,unknown,ibi slip on black knit sneaker woman,ibi slip on black knit sneaker woman,ibi slip on black knit sneaker woman,false
4,Others,unknown,catiba pro skate black suede and canvas contra...,unknown,unknown,unknown,catiba pro skate black suede and canvas contra...,false
...,...,...,...,...,...,...,...,...
61350,Others,sandalssales,bowvida mule in black suede kidskin,feminine flat mule square shape v line on the ...,the flat bowvida mule in black suede is the id...,the flat bowvida mule in black suede is the id...,bowvida mule in black suede kidskin,false
61351,Others,flat sandalsarchives,sandale vida mule in tangerine suede kidskin,feminine flat mule square shape v line on the ...,the flat vida mule in tangerine suede is the c...,the flat vida mule in tangerine suede is the c...,sandale vida mule in tangerine suede kidskin,false
61352,Others,flat sandalsarchives,bowvida mule in fuschia suede kidskin,feminine flat mule square shape v line on the ...,the flat bowvida mule in fuschia suede is the ...,the flat bowvida mule in fuschia suede is the ...,bowvida mule in fuschia suede kidskin,false
61353,Others,sandalssales,vida mule in silver metalized leather,feminine flat mule square shape v line on the ...,the flat vida mule in silver metalized leather...,the flat vida mule in silver metalized leather...,vida mule in silver metalized leather,false


In [15]:
# Look at the distribution of brands among all the observations

cleaned_data.groupby('brand')['brand'].count().sort_values(ascending=False)/61355*100

brand
Others                   25.407872
7 For All Mankind        14.686660
Rails                     4.667916
Intentionally Blank       4.130063
A.L.C.                    3.409665
Rachel Comey              3.391737
Misa                      3.308614
Studio 189                3.188004
ASTR the Label            3.165186
lemlem                    2.967973
Simon Miller              2.364925
Cynthia Rowley            2.195420
Outerknown                2.180751
Chufy                     1.970500
Faherty                   1.962350
M.M.LaFleur               1.942792
Janessa Leone             1.823812
Araks                     1.761878
Sea                       1.716242
BROCHU WALKER             1.631489
Tanya Taylor              1.615190
Clare V.                  1.502730
Nili Lotan                1.168609
Les Girls Les Boys        1.132752
Prism                     1.087116
Sandy Liang               1.080597
6397                      1.062668
Ancient Greek Sandals     1.007253
Alo Yoga      

In [16]:
# Split features and response variables

X=cleaned_data['details']+' '+cleaned_data['description']
y=cleaned_data['brand'].astype(str)

In [17]:
# Covert categorical labels into numerical values

from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = to_categorical(encoder.fit_transform(y))

In [18]:
# Count vetorize all the features

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 1), 
                             stop_words="english" 
                             ,max_features=3000
                             ,token_pattern=r'\b[a-zA-Z]{3,}\b'
                            )
X=vectorizer.fit_transform(X)
X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2, stratify=y)

## Build a Random Forest Model without Additional Feature Engineering

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

clf_rf = RandomForestClassifier(random_state=1, n_estimators = 200) 
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)
score_test_rf = accuracy_score(y_test, y_pred_rf) 
print(score_test_rf)

0.8758862358405998


## Loading Data

In [1]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

In [2]:
cleaned_data=pd.read_csv('after_feature_creation.csv')

In [3]:
cleaned_data.columns

Index(['product_id', 'brand', 'brand_category', 'name', 'details',
       'description', 'value_brand', 'bio', 'quote', 'quote_attribute',
       'intro', 'lifestyle_copy', 'short_bio', 'listing_bio', 'attribute_pair',
       'manuf_location', 'is_handmade', 'is_artisan_crafted', 'is_sustainable',
       'is_emerging', 'is_vegan', 'is_BIPOC', 'is_woman_owned', 'is_beach',
       'is_city', 'is_holiday', 'is_romance', 'is_chic', 'is_comfortable',
       'is_weekday', 'is_asymmetrical', 'is_timeless', 'is_classic', 'is_swim',
       'is_casual', 'is_pajama', 'is_sophisticated', 'is_modern', 'material'],
      dtype='object')

In [4]:
# Data for building classification models

model_data=cleaned_data[['brand','brand_category','details','description','manuf_location','is_beach'
                        ,'is_city', 'is_holiday', 'is_romance', 'is_chic', 'is_comfortable',
                       'is_weekday', 'is_asymmetrical', 'is_timeless', 'is_classic', 'is_swim',
                       'is_casual', 'is_pajama', 'is_sophisticated', 'is_modern', 'material']]
model_data

Unnamed: 0,brand,brand_category,details,description,manuf_location,is_beach,is_city,is_holiday,is_romance,is_chic,...,is_weekday,is_asymmetrical,is_timeless,is_classic,is_swim,is_casual,is_pajama,is_sophisticated,is_modern,material
0,Two,UNKNOWN_TOKEN,UNKNOWN_TOKEN,signature khadi shirt available black white ea...,UNKNOWN_TOKEN,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
1,Collina Strada,UNKNOWN_TOKEN,UNKNOWN_TOKEN,mid length dress ruffle adjustable strap bias ...,New York,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,100 rose
2,Cariuma,UNKNOWN_TOKEN,UNKNOWN_TOKEN,ibi slip Raw Red Knit Sneaker woman,UNKNOWN_TOKEN,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
3,Cariuma,UNKNOWN_TOKEN,UNKNOWN_TOKEN,ibi slip Black Knit Sneaker woman,UNKNOWN_TOKEN,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
4,Cariuma,UNKNOWN_TOKEN,UNKNOWN_TOKEN,UNKNOWN_TOKEN,UNKNOWN_TOKEN,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61350,Nomasei,SandalsSales,feminine flat mule Square shape v line front g...,flat BowVida mule black suede ideal spring sum...,UNKNOWN_TOKEN,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
61351,Nomasei,Flat sandalsArchives,feminine flat mule Square shape v line front g...,flat Vida mule tangerine suede comfortable fem...,UNKNOWN_TOKEN,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
61352,Nomasei,Flat sandalsArchives,feminine flat mule Square shape v line front g...,flat BowVida mule fuschia suede ideal spring s...,UNKNOWN_TOKEN,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
61353,Nomasei,SandalsSales,feminine flat mule Square shape v line front g...,flat Vida mule silver metalized leather comfor...,UNKNOWN_TOKEN,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN


In [5]:
model_data2=cleaned_data[['brand','brand_category','details','description','name','attribute_pair','manuf_location','is_beach'
                        ,'is_city', 'is_holiday', 'is_romance', 'is_chic', 'is_comfortable',
                       'is_weekday', 'is_asymmetrical', 'is_timeless', 'is_classic', 'is_swim',
                       'is_casual', 'is_pajama', 'is_sophisticated', 'is_modern', 'material']]
model_data2

Unnamed: 0,brand,brand_category,details,description,name,attribute_pair,manuf_location,is_beach,is_city,is_holiday,...,is_weekday,is_asymmetrical,is_timeless,is_classic,is_swim,is_casual,is_pajama,is_sophisticated,is_modern,material
0,Two,UNKNOWN_TOKEN,UNKNOWN_TOKEN,signature khadi shirt available black white ea...,Khadi Stripe Shirt-our signature shirt,UNKNOWN_TOKEN,UNKNOWN_TOKEN,True,True,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
1,Collina Strada,UNKNOWN_TOKEN,UNKNOWN_TOKEN,mid length dress ruffle adjustable strap bias ...,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,UNKNOWN_TOKEN,New York,False,False,False,...,False,False,False,False,False,False,False,False,False,100 rose
2,Cariuma,UNKNOWN_TOKEN,UNKNOWN_TOKEN,ibi slip Raw Red Knit Sneaker woman,IBI Slip On Raw Red Knit Sneaker Women,UNKNOWN_TOKEN,UNKNOWN_TOKEN,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
3,Cariuma,UNKNOWN_TOKEN,UNKNOWN_TOKEN,ibi slip Black Knit Sneaker woman,IBI Slip On Black Knit Sneaker Women,UNKNOWN_TOKEN,UNKNOWN_TOKEN,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
4,Cariuma,UNKNOWN_TOKEN,UNKNOWN_TOKEN,UNKNOWN_TOKEN,CATIBA PRO Skate Black Suede and Canvas Contra...,UNKNOWN_TOKEN,UNKNOWN_TOKEN,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61350,Nomasei,SandalsSales,feminine flat mule Square shape v line front g...,flat BowVida mule black suede ideal spring sum...,BowVida Mule in Black Suede Kidskin,UNKNOWN_TOKEN,UNKNOWN_TOKEN,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
61351,Nomasei,Flat sandalsArchives,feminine flat mule Square shape v line front g...,flat Vida mule tangerine suede comfortable fem...,Sandale Vida Mule In Tangerine Suede Kidskin,UNKNOWN_TOKEN,UNKNOWN_TOKEN,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
61352,Nomasei,Flat sandalsArchives,feminine flat mule Square shape v line front g...,flat BowVida mule fuschia suede ideal spring s...,BowVida Mule in Fuschia Suede Kidskin,UNKNOWN_TOKEN,UNKNOWN_TOKEN,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN
61353,Nomasei,SandalsSales,feminine flat mule Square shape v line front g...,flat Vida mule silver metalized leather comfor...,Vida Mule in Silver Metalized Leather,UNKNOWN_TOKEN,UNKNOWN_TOKEN,False,False,False,...,False,False,False,False,False,False,False,False,False,UNKNOWN_TOKEN


## Additional Preprocessing

In [6]:
# Remove special stopwords like 'description', 'details', 'product', 'item'

model_data['details']=model_data['details'].str.replace(r'\b(descriptions?|details?|products?|items?)\b','',case=False)
model_data['description']=model_data['description'].str.replace(r'\b(descriptions?|details?|products?|items?)\b','',case=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['details']=model_data['details'].str.replace(r'\b(descriptions?|details?|products?|items?)\b','',case=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['description']=model_data['description'].str.replace(r'\b(descriptions?|details?|products?|items?)\b','',case=False)


In [7]:
# Data Cleaning on the "brand_category" field

def removePunctuation(text, punctuations=string.punctuation+"``"+"’"+"”"):
    words=nltk.word_tokenize(text)
    newWords = [word for word in words if word.lower() not in punctuations]
    cleanedText = " ".join(newWords)
    return cleanedText

nltk_stopwords = set(stopwords.words("English"))
def removeStopwords(text, stopwords=nltk_stopwords):
    words = nltk.word_tokenize(text)
    newWords = [word for word in words if word.lower() not in stopwords]
    cleanedText = " ".join(newWords)
    return cleanedText

# Perform lemmatization

import nltk
from nltk.stem import WordNetLemmatizer

def lemm(data):

    lemmatizer = WordNetLemmatizer()
    sentences = []
    
    for i in data:
        tokens = nltk.word_tokenize(i)
        words = []
        for word in tokens:
            words.append(lemmatizer.lemmatize(word))
        sentence = " ".join(words)
        sentences.append(sentence)
    return sentences

In [8]:
model_data['brand_category']=model_data['brand_category'].apply(removePunctuation)
model_data['brand_category']=model_data['brand_category'].apply(removeStopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['brand_category']=model_data['brand_category'].apply(removePunctuation)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['brand_category']=model_data['brand_category'].apply(removeStopwords)


In [9]:
model_data['brand_category']=lemm(model_data['brand_category'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['brand_category']=lemm(model_data['brand_category'])


In [10]:
# Get top 50 brands and classify the other brands as 'Others'

top_50_label = model_data.groupby('brand')['brand'].count().sort_values(ascending=False).head(50).index.values

for i in range(len(model_data)):
    if model_data.iloc[i,0] in top_50_label:
        continue
    else:
        model_data.iloc[i,0]='Others'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [11]:
model_data.groupby('brand')['brand'].count().sort_values(ascending=False).head(50)

brand
7 For All Mankind        9011
Others                   8744
Rails                    2864
Intentionally Blank      2534
A.L.C.                   2092
Rachel Comey             2081
Misa                     2030
Studio 189               1956
ASTR the Label           1942
lemlem                   1821
Simon Miller             1451
Cynthia Rowley           1347
Outerknown               1338
Chufy                    1209
Faherty                  1204
M.M.LaFleur              1192
Janessa Leone            1119
Araks                    1081
Sea                      1053
BROCHU WALKER            1001
Tanya Taylor              991
Clare V.                  922
Nili Lotan                717
Les Girls Les Boys        695
Prism                     667
Sandy Liang               663
6397                      652
Ancient Greek Sandals     618
Alo Yoga                  525
Collina Strada            501
Whit                      489
Batsheva                  484
Jenni Kayne               483
Varl

In [12]:
# Pull response variable
# Covert categorical labels into numerical values

y=model_data['brand'].astype(str)
encoder = LabelEncoder()
y = to_categorical(encoder.fit_transform(y))

# Convert y to 1-D array
def get_new_labels(y):
    y_new = LabelEncoder().fit_transform([''.join(str(l)) for l in y])
    return y_new

y_new = get_new_labels(y)

In [13]:
encoder.classes_

array(['6397', '7 For All Mankind', 'A.L.C.', 'ANINE BING',
       'ASTR the Label', 'ATP Atelier', 'Aesther Ekme', 'Alo Yoga',
       'Ancient Greek Sandals', 'Andrea Iyamah', 'Araks', 'BROCHU WALKER',
       'Banjanan', 'Batsheva', 'Cariuma', 'Christina Lehr', 'Chufy',
       'Ciao Lucia', 'Citizens of Humanity', 'Clare V.', 'Collina Strada',
       'Cynthia Rowley', 'EMME PARSONS', 'Eleven Six', 'Esquivel',
       'Faherty', 'Frame', 'Gigi Burris', 'Intentionally Blank',
       'Janessa Leone', 'Jenni Kayne', 'Les Girls Les Boys',
       'M.M.LaFleur', 'Misa', 'Mother Denim', 'Nili Lotan', 'Others',
       'Outerknown', 'Prism', 'Rachel Comey', 'Rails', 'Sandy Liang',
       'Sea', 'Simon Miller', 'Studio 189', 'Tanya Taylor', 'Two',
       'Varley US', 'Want Les Essentiels', 'Whit', 'lemlem'], dtype=object)

In [14]:
y_new

array([ 4, 30, 36, ..., 14, 14, 14])

## Building Random Forest Models 

In [15]:
X=model_data['details']+' '+model_data['description']+' '+model_data['brand_category']

In [16]:
X

0        UNKNOWN_TOKEN signature khadi shirt available ...
1        UNKNOWN_TOKEN mid length dress ruffle adjustab...
2        UNKNOWN_TOKEN ibi slip Raw Red Knit Sneaker wo...
3        UNKNOWN_TOKEN ibi slip Black Knit Sneaker woma...
4                UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN
                               ...                        
61350    feminine flat mule Square shape v line front g...
61351    feminine flat mule Square shape v line front g...
61352    feminine flat mule Square shape v line front g...
61353    feminine flat mule Square shape v line front g...
61354    feminine flat mule Square shape v line front g...
Length: 61355, dtype: object

In [17]:
X2=model_data2['details']+' '+model_data2['description']+' '+model_data2['brand_category']+' '+model_data2['name']+' '+model_data2['attribute_pair']

In [18]:
X2

0        UNKNOWN_TOKEN signature khadi shirt available ...
1        UNKNOWN_TOKEN mid length dress ruffle adjustab...
2        UNKNOWN_TOKEN ibi slip Raw Red Knit Sneaker wo...
3        UNKNOWN_TOKEN ibi slip Black Knit Sneaker woma...
4        UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN CATI...
                               ...                        
61350    feminine flat mule Square shape v line front g...
61351    feminine flat mule Square shape v line front g...
61352    feminine flat mule Square shape v line front g...
61353    feminine flat mule Square shape v line front g...
61354    feminine flat mule Square shape v line front g...
Length: 61355, dtype: object

### Count Vectorization on Brand_Category, Details, and Description

In [19]:
# Count vetorize all the features

vectorizer = CountVectorizer(ngram_range=(1, 1), 
                             stop_words="english" 
                             ,max_features=3000
                             ,token_pattern=r'\b[a-zA-Z]{3,}\b'
                            )
X_cv1=vectorizer.fit_transform(X)
X_cv1= pd.DataFrame(X_cv1.toarray(), columns=vectorizer.get_feature_names())

In [20]:
vectorizer = CountVectorizer(ngram_range=(1, 1), 
                             stop_words="english" 
                             ,max_features=5000
                             ,token_pattern=r'\b[a-zA-Z]{3,}\b'
                            )
X2_cv1=vectorizer.fit_transform(X2)
X2_cv1= pd.DataFrame(X2_cv1.toarray(), columns=vectorizer.get_feature_names())

In [21]:
vectorizer = CountVectorizer(ngram_range=(1, 1), 
                             stop_words="english" 
                             ,max_features=600
                             ,token_pattern=r'\b[a-zA-Z]{3,}\b'
                            )
X3_cv1=vectorizer.fit_transform(X2)
X3_cv1= pd.DataFrame(X3_cv1.toarray(), columns=vectorizer.get_feature_names())

In [22]:
vectorizer = CountVectorizer(ngram_range=(1, 1), 
                             stop_words="english" 
                             ,max_features=300
                             ,token_pattern=r'\b[a-zA-Z]{3,}\b'
                            )
X4_cv1=vectorizer.fit_transform(X2)
X4_cv1= pd.DataFrame(X4_cv1.toarray(), columns=vectorizer.get_feature_names())

### TF-IDF on Brand_Category, Details, and Description

In [23]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             max_features=3000)
X_tfidf1 = vectorizer.fit_transform(X)
X_tfidf1 = pd.DataFrame(X_tfidf1.toarray(), columns=vectorizer.get_feature_names())

In [24]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             max_features=5000)
X2_tfidf1 = vectorizer.fit_transform(X2)
X2_tfidf1 = pd.DataFrame(X2_tfidf1.toarray(), columns=vectorizer.get_feature_names())

In [25]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             max_features=600)
X3_tfidf1 = vectorizer.fit_transform(X2)
X3_tfidf1 = pd.DataFrame(X3_tfidf1.toarray(), columns=vectorizer.get_feature_names())

In [26]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             max_features=3000)
X4_tfidf1 = vectorizer.fit_transform(X2)
X4_tfidf1 = pd.DataFrame(X4_tfidf1.toarray(), columns=vectorizer.get_feature_names())

### One-Hot Encoding on New Features

In [27]:
X_new=pd.get_dummies(data=model_data
                     ,columns=['manuf_location','is_beach'
                                    ,'is_city', 'is_holiday', 'is_romance', 'is_chic', 'is_comfortable',
                                   'is_weekday', 'is_asymmetrical', 'is_timeless', 'is_classic', 'is_swim',
                                   'is_casual', 'is_pajama', 'is_sophisticated', 'is_modern', 'material']
                     ,drop_first=True)

In [28]:
X_new=X_new.drop(['brand','brand_category','details','description'],axis=1)

In [29]:
X_cv2= pd.concat([X_cv1, X_new], axis=1, join='inner')

In [30]:
X_tfidf2=pd.concat([X_tfidf1, X_new], axis=1, join='inner')

In [31]:
X2_cv2= pd.concat([X2_cv1, X_new], axis=1, join='inner')

In [32]:
X2_tfidf2=pd.concat([X2_tfidf1, X_new], axis=1, join='inner')

In [33]:
X3_tfidf2=pd.concat([X3_tfidf1, X_new], axis=1, join='inner')

In [34]:
X4_tfidf2=pd.concat([X4_tfidf1, X_new], axis=1, join='inner')

In [35]:
X3_cv2=pd.concat([X3_cv1, X_new], axis=1, join='inner')

In [36]:
X4_cv2=pd.concat([X4_cv1, X_new], axis=1, join='inner')

### Dimensionality Reduction

In [180]:
pca1 = PCA(n_components=10)
pca2 = PCA(n_components=0.9)

X_cv2_pca1 = pca1.fit_transform(X_cv2)
X_cv2_pca2 = pca2.fit_transform(X_cv2)

X_tfidf2_pca1 = pca1.fit_transform(X_tfidf2)
X_tfidf2_pca2 = pca2.fit_transform(X_tfidf2)

### Cross-Validation

In [38]:
kfolds = StratifiedKFold(n_splits = 10, random_state = 1, shuffle = True)

In [39]:
clf_rf = RandomForestClassifier(random_state=1, n_estimators = 100, criterion='gini') 


In [None]:
error_model_1_cv = cross_val_score(clf_rf, X_cv1,y_new, cv=kfolds, scoring = 'accuracy')

In [108]:
error_model_2_cv = cross_val_score(clf_rf, X_tfidf1,y_new, cv=kfolds, scoring = 'accuracy')

In [168]:
error_model_3_cv = cross_val_score(clf_rf, X_tfidf2,y_new, cv=kfolds, scoring = 'accuracy')

In [171]:
error_model_4_cv = cross_val_score(clf_rf, X_cv2,y_new, cv=kfolds, scoring = 'accuracy')

In [182]:
error_model_5_cv = cross_val_score(clf_rf, X_cv2_pca1,y_new, cv=kfolds, scoring = 'accuracy')

In [186]:
error_model_6_cv = cross_val_score(clf_rf, X_cv2_pca2,y_new, cv=kfolds, scoring = 'accuracy',n_jobs=-1)

In [188]:
error_model_7_cv = cross_val_score(clf_rf, X_tfidf2_pca1,y_new, cv=kfolds, scoring = 'accuracy',n_jobs=-1)

In [189]:
error_model_8_cv = cross_val_score(clf_rf, X_tfidf2_pca2,y_new, cv=kfolds, scoring = 'accuracy',n_jobs=-1)

In [206]:
error_model_9_cv = cross_val_score(clf_rf, X2_cv2,y_new, cv=kfolds, scoring = 'accuracy',n_jobs=-1)

In [40]:
error_model_10_cv = cross_val_score(clf_rf, X2_tfidf2,y_new, cv=kfolds, scoring = 'accuracy',n_jobs=-1)

In [219]:
error_model_11_cv = cross_val_score(clf_rf, X3_tfidf2,y_new, cv=kfolds, scoring = 'accuracy',n_jobs=-1)

In [223]:
error_model_12_cv = cross_val_score(clf_rf, X4_tfidf2,y_new, cv=kfolds, scoring = 'accuracy',n_jobs=-1)

In [24]:
error_model_13_cv = cross_val_score(clf_rf, X3_cv2,y_new, cv=kfolds, scoring = 'accuracy',n_jobs=-1)

In [29]:
error_model_14_cv = cross_val_score(clf_rf, X4_cv2,y_new, cv=kfolds, scoring = 'accuracy',n_jobs=-1)

In [None]:
print("mean accuracy for model_1_cv:",np.mean(error_model_1_cv)) # -> Count Vectorization without Adding New Features
print("mean accuracy for model_2_cv:",np.mean(error_model_2_cv)) # -> TF-IDF without Adding New Features
print("mean accuracy for model_3_cv:",np.mean(error_model_3_cv)) # -> TF-IDF + New Features
print("mean accuracy for model_4_cv:",np.mean(error_model_4_cv)) # -> Count Vectorization + New Features
print("mean accuracy for model_5_cv:",np.mean(error_model_5_cv)) # -> Count Vectorization + New Features +PCA(comp=10)
print("mean accuracy for model_6_cv:",np.mean(error_model_6_cv)) # -> Count Vectorization + New Features +PCA(comp=0.9)
print("mean accuracy for model_7_cv:",np.mean(error_model_7_cv)) # -> TF-IDF + New Features +PCA(comp=10)
print("mean accuracy for model_8_cv:",np.mean(error_model_8_cv)) # -> TF-IDF + New Features +PCA(comp=0.9)
print("mean accuracy for model_9_cv:",np.mean(error_model_9_cv)) # -> Count Vectorization（5000 features） on All Text + New Features

In [41]:
print("mean accuracy for model_10_cv:",np.mean(error_model_10_cv)) # -> TF-IDF (5000 features) on All Text + New Features

mean accuracy for model_10_cv: 0.9594165659875742


In [None]:
print("mean accuracy for model_11_cv:",np.mean(error_model_11_cv)) # -> TF-IDF (600 features) on All Text + New Features
print("mean accuracy for model_12_cv:",np.mean(error_model_12_cv)) # -> TF-IDF (3000 features) on All Text + New Features


In [26]:
print("mean accuracy for model_13_cv:",np.mean(error_model_13_cv))

mean accuracy for model_13_cv: 0.9407221480189861


In [30]:
print("mean accuracy for model_14_cv:",np.mean(error_model_14_cv))

mean accuracy for model_14_cv: 0.9223536487271933


## Building RNN Model

In [4]:
cleaned_data=pd.read_csv('after_feature_creation.csv')

In [5]:
model_data2=cleaned_data[['brand','brand_category','details','description','name','attribute_pair','manuf_location','is_beach'
                        ,'is_city', 'is_holiday', 'is_romance', 'is_chic', 'is_comfortable',
                       'is_weekday', 'is_asymmetrical', 'is_timeless', 'is_classic', 'is_swim',
                       'is_casual', 'is_pajama', 'is_sophisticated', 'is_modern', 'material']]

In [6]:
top_50_label = model_data2.groupby('brand')['brand'].count().sort_values(ascending=False).head(50).index.values

for i in range(len(model_data2)):
    if model_data2.iloc[i,0] in top_50_label:
        continue
    else:
        model_data2.iloc[i,0]='Others'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [7]:

def removePunctuation(text, punctuations=string.punctuation+"``"+"’"+"”"):
    words=nltk.word_tokenize(text)
    newWords = [word.lower() for word in words if word.lower() not in punctuations]
    cleanedText = " ".join(newWords)
    return cleanedText

# Perform lemmatization

import nltk
from nltk.stem import WordNetLemmatizer

def lemm(data):

    lemmatizer = WordNetLemmatizer()
    sentences = []
    
    for i in data:
        tokens = nltk.word_tokenize(i)
        words = []
        for word in tokens:
            words.append(lemmatizer.lemmatize(word))
        sentence = " ".join(words)
        sentences.append(sentence)
    return sentences

In [8]:
model_data2['brand_category']=model_data2['brand_category'].apply(removePunctuation)
model_data2['details']=model_data2['details'].apply(removePunctuation)
model_data2['description']=model_data2['description'].apply(removePunctuation)
model_data2['name']=model_data2['name'].apply(removePunctuation)
model_data2['attribute_pair']=model_data2['attribute_pair'].apply(removePunctuation)
model_data2['manuf_location']=model_data2['manuf_location'].apply(removePunctuation)
model_data2['material']=model_data2['material'].apply(removePunctuation)
model_data2['brand_category']=lemm(model_data2['brand_category'])
model_data2['details']=lemm(model_data2['details'])
model_data2['description']=lemm(model_data2['description'])
model_data2['name']=lemm(model_data2['name'])
model_data2['attribute_pair']=lemm(model_data2['attribute_pair'])
model_data2['manuf_location']=lemm(model_data2['manuf_location'])
model_data2['material']=lemm(model_data2['material'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data2['brand_category']=model_data2['brand_category'].apply(removePunctuation)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data2['details']=model_data2['details'].apply(removePunctuation)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data2['description']=model_data2['descripti

In [45]:
labels=model_data2['brand']
docs=model_data2['details']+' '+model_data2['description']+' '+model_data2['brand_category']+' '+model_data2['name']+' '+model_data2['attribute_pair']+' '+model_data2['manuf_location']+' '+model_data2['material']


In [46]:
encoder = LabelEncoder()
labels = to_categorical(encoder.fit_transform(labels))

In [47]:
from nltk.corpus import stopwords
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.tokenize import word_tokenize 
from typing import List

stop_words = set(stopwords.words('english')) 
stopwords_removed_docs1 = []
for doc in docs:
  word_tokens = word_tokenize(doc) 
  filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  stopwords_removed_docs1.append(" ".join(filtered_sentence))

In [48]:
import re
stopwords_removed_docs=[]
for i in stopwords_removed_docs1:
    data=re.sub(r'\b\d+\b', '', i)
#     data=re.sub(r'\b\w{,2}\b','',data)
    stopwords_removed_docs.append(data)

In [49]:
stopwords_removed_docs

['unknown_token signature khadi shirt available black white easy wear beach city promise top go warm weather item perfect blazer hand loom woven stripe khadi cotton slightly sheer get soft every wash ship first week april color white black — length  width . — one size fit — grid khadi cotton unknown_token khadi stripe shirt-our signature shirt unknown_token unknown_token unknown_token',
 'unknown_token mid length dress ruffle adjustable strap bias cut side seam invisible zipper make new york model wear size small  rose sylk rose sylk organic cellulose fiber make natural waste rise bush stem unknown_token ruffle market dress loopy pink sistine tomato unknown_token new york  rose',
 'unknown_token ibi slip raw red knit sneaker woman unknown_token ibi slip raw red knit sneaker woman unknown_token unknown_token unknown_token',
 'unknown_token ibi slip black knit sneaker woman unknown_token ibi slip black knit sneaker woman unknown_token unknown_token unknown_token',
 'unknown_token unknown

In [50]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(stopwords_removed_docs)

In [51]:
def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

In [52]:
from typing import List
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))

# get the max length in terms of token length
max_length = get_max_token_length_per_doc(docs)

In [32]:
max_length

654

In [63]:
from keras.preprocessing.sequence import pad_sequences
MAX_SEQUENCE_LENGTH = 300

encoded_docs = integer_encode_documents(stopwords_removed_docs, tokenizer)

padded_docs = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2,stratify=labels)

In [35]:
len(X_train)

49084

In [65]:
VOCAB_SIZE = int(len(tokenizer.word_index) * 1.1)

In [66]:
from numpy import array, argmax, asarray, zeros

def load_glove_vectors():
    embeddings_index = {}
    with open('glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index


embeddings_index = load_glove_vectors()

Loaded 400000 word vectors.


In [67]:
tokenizer.word_index

{'UNKNOWN_TOKEN': 1,
 'unknown': 2,
 'token': 3,
 '•': 4,
 'size': 5,
 'style': 6,
 'fit': 7,
 'make': 8,
 'wear': 9,
 'cotton': 10,
 'model': 11,
 'dry': 12,
 'detail': 13,
 'dress': 14,
 'waist': 15,
 'top': 16,
 'leather': 17,
 'sleeve': 18,
 'length': 19,
 'feature': 20,
 'clean': 21,
 'hand': 22,
 'wash': 23,
 'fabric': 24,
 'occasion': 25,
 'true': 26,
 'front': 27,
 'usa': 28,
 'pocket': 29,
 'back': 30,
 'small': 31,
 'black': 32,
 'button': 33,
 'print': 34,
 'high': 35,
 'classic': 36,
 'measurement': 37,
 'soft': 38,
 "'s": 39,
 'design': 40,
 'hip': 41,
 'long': 42,
 'hem': 43,
 'shoulder': 44,
 'closure': 45,
 'care': 46,
 'woman': 47,
 'x': 48,
 'polyester': 49,
 'bust': 50,
 'body': 51,
 'neckline': 52,
 'cm': 53,
 'one': 54,
 'skirt': 55,
 'look': 56,
 'pant': 57,
 'height': 58,
 'perfect': 59,
 'neck': 60,
 'short': 61,
 'cold': 62,
 'piece': 63,
 'shirt': 64,
 "'": 65,
 'strap': 66,
 'tie': 67,
 'white': 68,
 'relaxed': 69,
 'silk': 70,
 'silhouette': 71,
 'low': 72,


In [68]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((VOCAB_SIZE, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # check that it is an actual word that we have embeddings for
        embedding_matrix[i] = embedding_vector

In [69]:
len(padded_docs)

61355

In [70]:
from random import randint
from numpy import array, argmax, asarray, zeros
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.layers import Flatten, Masking
# define model
model = Sequential()
model.add(Embedding(embedding_matrix.shape[0], 
                    100, 
                    weights=[embedding_matrix], 
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
model.add(SimpleRNN(units=64, input_shape=(1, MAX_SEQUENCE_LENGTH)))
model.add(Dense(64))
model.add(Dense(51, activation='softmax'))

In [71]:
from keras.utils.vis_utils import plot_model

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 300, 100)          3327300   
_________________________________________________________________
masking_2 (Masking)          (None, 300, 100)          0         
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 64)                10560     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 51)                3315      
Total params: 3,345,335
Trainable params: 18,035
Non-trainable params: 3,327,300
_________________________________________________________________
None
('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab

In [72]:
model.fit(X_train, y_train, validation_split = 0.2, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fcca9e1d4f0>

In [73]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 79.227448
