In [47]:
import io
import os
import numpy as np
import pandas as pd
import ast

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import re
import gensim
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
from colour import Color

np.random.seed(42)

In [2]:
data = pd.read_csv('C:/Users/tring/Desktop/SSENSE Project/eda/all_info.csv', sep = '|', header = 0)

In [3]:
data.head()

Unnamed: 0,creation-date,sub-category,brand,name,sku,description,origin,composition,full-price,sale-price,discount-percent,remaining-sizes,image
0,2019-12-18,beanies,The Elder Statesman,Black Short Bunny Echo Beanie,201014M138001,Rib knit cashmere beanie in black. Rolled brim...,United States,100% cashmere.,355,355,0,['UNI'],https://img.ssensemedia.com/images/201014M1380...
1,2019-12-18,beanies,PS by Paul Smith,Red Wool Zebra Beanie,201422M138012,Rib knit lambswool beanie in red. Signature gr...,United Kingdom,100% lambswool.,125,125,0,['UNI'],https://img.ssensemedia.com/images/201422M1380...
2,2019-12-18,beanies,PS by Paul Smith,Black Wool Zebra Beanie,201422M138013,Rib knit lambswool beanie in navy. Signature g...,United Kingdom,100% lambswool.,125,125,0,['UNI'],https://img.ssensemedia.com/images/201422M1380...
3,2019-12-18,iphone cases,Maison Margiela,Black Pouch iPhone Case,201168M170261,Grained leather shoulder bag-style iPhone case...,Italy,Leather.,420,420,0,['UNI'],https://img.ssensemedia.com/images/201168M1702...
4,2019-12-18,necklaces,Maison Margiela,Silver Key Necklace,201168M145213,Curb chain necklace in sterling silver. Logo a...,Italy,925 sterling silver.,635,635,0,['UNI'],https://img.ssensemedia.com/images/201168M1452...


### Creation Date

Creation date does not seem to be neccessary for the recommendation system. It is however, useful for other tasks like trend analysis. Therefore, for the sake of the model, we drop the column.

In [4]:
data = data.drop(columns = ['creation-date'], axis = 1)
data.head()

Unnamed: 0,sub-category,brand,name,sku,description,origin,composition,full-price,sale-price,discount-percent,remaining-sizes,image
0,beanies,The Elder Statesman,Black Short Bunny Echo Beanie,201014M138001,Rib knit cashmere beanie in black. Rolled brim...,United States,100% cashmere.,355,355,0,['UNI'],https://img.ssensemedia.com/images/201014M1380...
1,beanies,PS by Paul Smith,Red Wool Zebra Beanie,201422M138012,Rib knit lambswool beanie in red. Signature gr...,United Kingdom,100% lambswool.,125,125,0,['UNI'],https://img.ssensemedia.com/images/201422M1380...
2,beanies,PS by Paul Smith,Black Wool Zebra Beanie,201422M138013,Rib knit lambswool beanie in navy. Signature g...,United Kingdom,100% lambswool.,125,125,0,['UNI'],https://img.ssensemedia.com/images/201422M1380...
3,iphone cases,Maison Margiela,Black Pouch iPhone Case,201168M170261,Grained leather shoulder bag-style iPhone case...,Italy,Leather.,420,420,0,['UNI'],https://img.ssensemedia.com/images/201168M1702...
4,necklaces,Maison Margiela,Silver Key Necklace,201168M145213,Curb chain necklace in sterling silver. Logo a...,Italy,925 sterling silver.,635,635,0,['UNI'],https://img.ssensemedia.com/images/201168M1452...


### Sub Category

In [5]:
category_mlb = MultiLabelBinarizer()

category_matrix = pd.DataFrame(category_mlb.fit_transform(data['sub-category'].str.split(' ')),columns=category_mlb.classes_, index=data.index)

print(category_matrix)

       accessories  backpacks  bags  bars  beanies  belts  biker  blankets  \
0                0          0     0     0        1      0      0         0   
1                0          0     0     0        1      0      0         0   
2                0          0     0     0        1      0      0         0   
3                0          0     0     0        0      0      0         0   
4                0          0     0     0        0      0      0         0   
...            ...        ...   ...   ...      ...    ...    ...       ...   
28987            0          0     0     0        0      0      0         0   
28988            0          0     0     0        0      0      0         0   
28989            0          0     0     0        0      0      0         0   
28990            0          0     0     0        0      0      0         0   
28991            0          0     0     0        0      0      0         0   

       blazers  boat  ...  up  ups  v  vests  waistcoats  walle

### Brand

Brands seem be unique to their own, however, there are certain collaborations, collections from the same designers seperated into different brands. We will convert them to the designers name manually as there are not that many cases.

In [6]:
data['brand'].unique()

array(['The Elder Statesman', 'PS by Paul Smith', 'Maison Margiela',
       'Fendi', 'Carhartt Work In Progress', 'AMI Alexandre Mattiussi',
       'R13', 'Alexander McQueen', 'Alan Crocetti', 'Paul Smith',
       'Raf Simons', 'Loewe', 'Ermenegildo Zegna', 'adidas Originals',
       'adidas Originals by Alexander Wang', 'Noah NYC', 'Saint Laurent',
       'Burberry', 'Dolce & Gabbana', 'Carne Bollente', 'Frenckenberger',
       'Givenchy', 'Vivienne Westwood', 'Balenciaga', 'Kuboraum', 'Dita',
       'Ray-Ban', 'Etro', 'Marni', 'Valentino', 'Juun.J',
       'Giorgio Armani', 'Eyevan 7285', 'Yuichi Toyama', 'Moschino',
       'Thom Browne', 'Tiger of Sweden', 'Norse Projects',
       'Saturdays NYC', 'A.P.C.', 'Gucci', 'Christian Louboutin',
       'Reebok by Pyer Moss', 'Stone Island', '1017 ALYX 9SM', 'Y-3',
       'Matsuda', 'Mykita', 'Bottega Veneta', 'Ralph Lauren Purple Label',
       'Prada', 'Moncler Genius', 'Versace', 'Heron Preston', '032c',
       'Marine Serre', 'McQ Alexa

In [7]:
data['brand'] = data['brand'].apply(lambda x: x.lower())

In [8]:
data.loc[data.brand == 'adidas originals by alexander wang' ,'brand'] = 'adidas,alexander wang'
data.loc[data.brand == 'adidas originals x pharrell williams' ,'brand'] = 'adidas' #PW does not have a private line on SSENSE
data.loc[data.brand == 'adidas x missoni' ,'brand'] = 'adidas,missoni' #PW does not have a private line on SSENSE
data.loc[(data.brand == '11 by boris bidjan saberi') & (data.name.str.contains('Salomon')) ,'brand'] = 'boris bidjan saberi,salomon'
data.loc[(data.brand == '424') & (data.name.str.contains('adidas')),'brand'] = '424,adidas'
data.loc[(data.brand == 'junya watanabe') & (data.name.str.contains('New Balance')) ,'brand'] = 'junya watanabe,new balance'
data.loc[(data.brand == 'harmony') & (data.name.str.contains('Asics')) ,'brand'] = 'harmony,asics'
data.loc[(data.brand == 'c2h4') & (data.name.str.contains('Asics')) ,'brand'] = 'c2h4,asics'
data.loc[(data.brand == 'kiko kostadinov') & (data.name.str.contains('Asics')) ,'brand'] = 'kiko kostadinov,asics'

In [9]:
def change_brand(df, old_brand, common_brand):
    df.loc[df.brand.str.contains(old_brand), 'brand'] = common_brand

In [10]:
name_change = {'11 by boris bidjan saberi': 'boris bidjan saberi',
             'adidas originals': 'adidas',
             'issey miyake': 'issey miyake',
             'boss': 'hugo boss',
             'hugo': 'hugo boss',
             'calvin klein': 'calvin klein',
             'comme des garçons': 'comme des garçons',
             'zegna': 'zegna',
             'jil sander': 'jil sander',
             'marni': 'marni',
             'alexander mcqueen': 'alexander mcqueen',
             'moncler': 'moncler',
             'oliver peoples': 'oliver peoples',
             'paul smith': 'paul smith',
             'ralph lauren': 'ralph lauren',
             'rick owens': 'rick owens',
             'versace': 'versace',
             'y-3': 'yohji yamamoto',
             'diesel': 'diesel',
             "levi's": "levi's",
             'mackintosh': 'mackintosh',
             'martine rose': 'martine rose',
             'nike': 'nike',
             'reebok': 'reebok',
             'stone island': 'stone island',
             'the north face': 'north face',
             'tiger of sweden': 'tiger of sweden',
             'yves salomon': 'yves salomon',
             'b by d':'kang dong',
             'd.gnak by kang.d': 'kang dong'
            }

In [11]:
for old_brand, common_brand in name_change.items():
    change_brand(data, old_brand, common_brand)

In [12]:
data['brand'].unique()

array(['the elder statesman', 'paul smith', 'maison margiela', 'fendi',
       'carhartt work in progress', 'ami alexandre mattiussi', 'r13',
       'alexander mcqueen', 'alan crocetti', 'raf simons', 'loewe',
       'zegna', 'adidas', 'adidas,alexander wang', 'noah nyc',
       'saint laurent', 'burberry', 'dolce & gabbana', 'carne bollente',
       'frenckenberger', 'givenchy', 'vivienne westwood', 'balenciaga',
       'kuboraum', 'dita', 'ray-ban', 'etro', 'marni', 'valentino',
       'juun.j', 'giorgio armani', 'eyevan 7285', 'yuichi toyama',
       'moschino', 'thom browne', 'tiger of sweden', 'norse projects',
       'saturdays nyc', 'a.p.c.', 'gucci', 'christian louboutin',
       'reebok', 'stone island', '1017 alyx 9sm', 'yohji yamamoto',
       'matsuda', 'mykita', 'bottega veneta', 'ralph lauren', 'prada',
       'moncler', 'versace', 'heron preston', '032c', 'marine serre',
       'off-white', 'le gramme', 'palm angels', 'jil sander', 'kenzo',
       'chin teo', 'thierry la

In [13]:
brand_mlb = MultiLabelBinarizer()

brand_matrix = pd.DataFrame(brand_mlb.fit_transform(data['brand'].str.split(',')),columns=brand_mlb.classes_, index=data.index)

print(brand_matrix)

       032c  1017 alyx 9sm  3.1 phillip lim  424  49winters  99% is  \
0         0              0                0    0          0       0   
1         0              0                0    0          0       0   
2         0              0                0    0          0       0   
3         0              0                0    0          0       0   
4         0              0                0    0          0       0   
...     ...            ...              ...  ...        ...     ...   
28987     0              0                0    0          0       0   
28988     0              0                0    0          0       0   
28989     0              0                0    0          0       0   
28990     0              0                0    0          0       0   
28991     0              0                0    0          0       0   

       a-cold-wall*  a. a. spectrum  a.p.c.  a_plan_application  ...  \
0                 0               0       0                   0  ...   
1  

### Name & Description

The goal for these 2 columns are similar: removing stopwords, punctuations, lemmatizing and vectorizing them. Then at the same time, extracting colors of of these 2 fields. In fact, these 2 fields could be combined together into one single column as they both contains detail of a product.

In [14]:
data['text_detail'] = data['name'] + data['description']

data['text_detail']

0        Black Short Bunny Echo BeanieRib knit cashmere...
1        Red Wool Zebra BeanieRib knit lambswool beanie...
2        Black Wool Zebra BeanieRib knit lambswool bean...
3        Black Pouch iPhone CaseGrained leather shoulde...
4        Silver Key NecklaceCurb chain necklace in ster...
                               ...                        
28987    Grey & Silver Asics Edition Gel-Kayano 25 Snea...
28988    Black Yearling BootsHandcrafted ankle-high buf...
28989    Black Crush Back LoafersGrained leather slip-o...
28990    Black No Cap Boat SneakersVegetable-tanned buf...
28991    Black Luis Mixed OxfordsPanelled buffed calfsk...
Name: text_detail, Length: 28992, dtype: object

We will perform the following steps:

- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- Words that have fewer than 2 characters are removed.
- All stopwords are removed.
- Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
- Words are stemmed — words are reduced to their root form.

In [15]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    text= re.sub("(\r|\n|-|&|approx|supplier|length|height)+"," ",text,flags=re.IGNORECASE) 
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(lemmatize_stemming(token))
    return " ".join(result)

In [16]:
text_sample = data.loc[2343,'text_detail']
text_sample

'Black Bee Card HolderTextured leather card holder in black. Logo stamp in gold-tone and signature hardware at face. Four card slots and one note slot. Tonal textile lining. Antiqued gold-tone hardware. Approx. 4" length x 3" height.\r\n\r\nSupplier color: Black'

In [17]:
print('original text: ')
words = []
for word in text_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized text: ')
print(preprocess(text_sample))

original text: 
['Black', 'Bee', 'Card', 'HolderTextured', 'leather', 'card', 'holder', 'in', 'black.', 'Logo', 'stamp', 'in', 'gold-tone', 'and', 'signature', 'hardware', 'at', 'face.', 'Four', 'card', 'slots', 'and', 'one', 'note', 'slot.', 'Tonal', 'textile', 'lining.', 'Antiqued', 'gold-tone', 'hardware.', 'Approx.', '4"', 'length', 'x', '3"', 'height.\r\n\r\nSupplier', 'color:', 'Black']


 tokenized and lemmatized text: 
black bee card holdertextur leather card holder black logo stamp gold tone signatur hardwar face card slot note slot tonal textil line antiqu gold tone hardwar color black


In [18]:
processed_detail = data['text_detail'].map(preprocess)
processed_detail[:10]

0    black short bunni echo beanierib knit cashmer ...
1    red wool zebra beanierib knit lambswool beani ...
2    black wool zebra beanierib knit lambswool bean...
3    black pouch iphon casegrain leather shoulder b...
4    silver key necklacecurb chain necklac sterl si...
5    black perf beltbuf calfskin belt black tonal t...
6    beig perfor logo beltbuf calfskin belt beig pe...
7    revers white logo beltrevers buff calfskin bel...
8    white buckl beltgrain leather belt white signa...
9    silver watch strap braceletsteel oyster link b...
Name: text_detail, dtype: object

Next, we will extract color from the processed text so color would have its own feature.

In [19]:
def check_color(color): #to check if a word is color
    try:
        Color(color)
        return True
    except ValueError:
        return False
    except AttributeError:
        return False

def get_color(text): #to extract colors from text
    
    if [i for i in text.split(' ') if check_color(i)]:
        return list(set([i for i in text.split(' ') if check_color(i)]))
    else: #return 'other' if nothing matches
        return ['other']

def remove_color(text): #to remove colors from text
    return " ".join([i for i in text.split(" ") if check_color(i) == False])

In [20]:
colors = processed_detail.map(lambda x: get_color(x))

In [21]:
processed_detail = processed_detail.map(lambda x: remove_color(x))

#### Topic Modelling based on Processed Text

Number of terms included in the bag of words matrix is restricted to the top 1000 words.

In [22]:
no_features = 1000

In [24]:
# NMF is able to use tf-idf  because it is a linear-algeabreic model

tfidf_vectorizer_model = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features) #Model to transform new text 
tfidf_matrix = tfidf_vectorizer_model.fit_transform(processed_detail)

tfidf_feature_names = tfidf_vectorizer_model.get_feature_names()
print(tfidf_feature_names[:10])

['accent', 'accordion', 'ace', 'acet', 'achill', 'acid', 'acn', 'adida', 'adjust', 'aglet']


In [25]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model

tf_vectorizer_model = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features) #Model to transform new text 
tf_matrix = tf_vectorizer_model.fit_transform(processed_detail)

tf_feature_names = tf_vectorizer_model.get_feature_names()
print(tf_feature_names[:10])

['accent', 'accordion', 'ace', 'acet', 'achill', 'acid', 'acn', 'adida', 'adjust', 'aglet']


Next, we are limiting the number of topics both NMF and LDA models can generate to 50.

In [26]:
no_topics = 50

In [27]:
topic_columns = []

for i in range(no_topics):
    topic_columns.append(f"Topic {i}")

In [28]:
# Run NMF
nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
nmf_matrix = nmf_model.fit_transform(tfidf_matrix)

print(nmf_matrix)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [8.39523667e-05 0.00000000e+00 0.00000000e+00 ... 2.83745713e-04
  0.00000000e+00 5.24625643e-02]
 [5.31523504e-05 0.00000000e+00 0.00000000e+00 ... 3.42108803e-04
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 1.90640147e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 2.52338802e-02 ... 2.00106552e-03
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 3.40462123e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]


In [29]:
# Run LDA
lda_model = LatentDirichletAllocation(n_components =no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
ldf_matrix = lda_model.fit_transform(tf_matrix)

print(ldf_matrix)

[[0.00222222 0.00222222 0.65610898 ... 0.00222222 0.00222222 0.00222222]
 [0.00133333 0.00133333 0.65874463 ... 0.00133333 0.00133333 0.00133333]
 [0.00133333 0.00133333 0.66004566 ... 0.00133333 0.00133333 0.00133333]
 ...
 [0.00142857 0.07353551 0.00142857 ... 0.07147497 0.07725732 0.00142857]
 [0.00095238 0.09680466 0.00095238 ... 0.11455262 0.07639048 0.00095238]
 [0.03887789 0.03402103 0.00066667 ... 0.07403356 0.00066667 0.00066667]]


In [30]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:{}" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10

display_topics(nmf_model, tfidf_feature_names, no_top_words)

Topic 0:{}
shirtshort jersey shirt crewneck cotton rib sleev knit collar chest
Topic 1:{}
fli trouser fit belt waistband rise mid trousersslim loop zip
Topic 2:{}
rubber pad heel sneaker lace tread tongu round toe sneakerslow
Topic 3:{}
button barrel spread singl shirtlong shirt shirttail closur poplin cuff
Topic 4:{}
compart face zipper strap width web main shoulder adjust interior
Topic 5:{}
hoodi hoodielong hood kangaroo drawstr terri french waist hem rib
Topic 6:{}
knit sweater sweaterlong rib hem crewneck sleev cuff collar turtleneck
Topic 7:{}
templ size nose acet case includ lens sunglass metal protect
Topic 8:{}
navi dark midnight colorblock ink cotton color flag stripe tiesilk
Topic 9:{}
slot bifold wallet note emboss interior card width face textil
Topic 10:{}
loung pant elastic fit drawstr waistband mid rise style pantsrelax
Topic 11:{}
card slot holder face note holderbuf calfskin holdergrain textil stamp
Topic 12:{}
multicolor graphic multi zebra print tiger appliqué wing 

In [31]:
display_topics(lda_model, tf_feature_names, no_top_words)

Topic 0:{}
strip orang signatur pull grosgrain stripe textil flag tricolor featur
Topic 1:{}
round sculpt core expos japanes clear leather wire ash color
Topic 2:{}
wool color blend virgin knit almond roll turtleneck turtlenecklong merino
Topic 3:{}
central face logo burgundi slot card color interior line textil
Topic 4:{}
shirt shirtshort crewneck pleat shirtlong garment asic tail chalk lyocel
Topic 5:{}
outer inner stack textur band color ring block logo stamp
Topic 6:{}
gunmet metal inset tone iridesc parkalong parka coyot plastic structur
Topic 7:{}
detail pale fine noir flip jacron oyster marin extra palladium
Topic 8:{}
sleev collar hem chest seam hood drawstr stand poplin neck
Topic 9:{}
pad mesh semi oxford runner mould eastpak backpackwat transpar backpack
Topic 10:{}
welt jacket jacketlong spread blazer blazerlong track armscy poloshort konst
Topic 11:{}
stitch tonal belt tone denim hardwar contrast logo antiqu topstitch
Topic 12:{}
fli loop twill fade silk color floral whisk

We now build a function to return topic with highest weight for a new text to test both models

In [32]:
def get_nmf_topic(text, features, no_top_words):
    text  = pd.Series(text).apply(preprocess).apply(remove_color)
    matrix = nmf_model.components_[np.argmax(nmf_model.transform(tfidf_vectorizer_model.transform(text)))]
    
    return [features[i] for i in matrix.argsort()[:-no_top_words-1:-1]]

def get_lda_topic(text, features, no_top_words):
    text  = pd.Series(text).apply(preprocess).apply(remove_color)
    matrix = lda_model.components_[np.argmax(lda_model.transform(tf_vectorizer_model.transform(text)))]
    
    return [features[i] for i in matrix.argsort()[:-no_top_words-1:-1]]

In [33]:
sample_text_1 = '''
burgundy wool crew neck panelled jumper from E. Tautz 
featuring a ribbed crew neck, long sleeves, elasticated cuffs, 
a panelled colour block design and a relaxed fit.
'''
get_nmf_topic(sample_text_1, tfidf_feature_names, 10)

['neck',
 'mock',
 'cardigan',
 'cardiganlong',
 'tank',
 'knit',
 'collar',
 'funnel',
 'rib',
 'topsleeveless']

In [34]:
get_lda_topic(sample_text_1, tf_feature_names, 10)

['logo',
 'collar',
 'heel',
 'toe',
 'color',
 'cotton',
 'knit',
 'jersey',
 'sole',
 'rib']

In [35]:
sample_text_2 = '''
Long sleeve stone wash calfskin jacket in black. 
Spread collar. Offset zip closure. Zippered pocket at chest. 
Zippered pockets and flap pocket at waist. Detachable epaulets 
featuring chain-link detailing with press-stud fastenings. Zippered 
vent at cuffs. Darts and tonal webbing carry handle-style trim at back. 
Zippered pockets at interior. Lined. Silver-tone hardware.
Supplier color: Black
'''
get_nmf_topic(sample_text_2, tfidf_feature_names, 10)

['compart',
 'face',
 'zipper',
 'strap',
 'width',
 'web',
 'main',
 'shoulder',
 'adjust',
 'interior']

In [36]:
get_lda_topic(sample_text_2, tf_feature_names, 10)

['pocket',
 'closur',
 'zip',
 'patch',
 'hardwar',
 'tonal',
 'lace',
 'line',
 'zipper',
 'tone']

In [37]:
sample_text_3 = '''
Rectangular metal-frame sunglasses in gold-tone. 
Transparent rubber nose pads. Green ZEISS© lenses with
100% UVA/UVB protection. Logo etched at temples. Tortoiseshell acetate temple 
tips in tones of brown. Size: 143.20 140.
Supplier color: Gold/Green
'''
get_nmf_topic(sample_text_3, tfidf_feature_names, 10)

['templ',
 'size',
 'nose',
 'acet',
 'case',
 'includ',
 'lens',
 'sunglass',
 'metal',
 'protect']

In [38]:
get_lda_topic(sample_text_3, tf_feature_names, 10)

['slip',
 'includ',
 'logo',
 'templ',
 'size',
 'pad',
 'case',
 'color',
 'nose',
 'acet']

NMF model seems to generate more meaningful topics than LDA in this particular case. This doesn't mean LDA, matematically is a worse model compared to NMF but it is generally easier to guess the items based on the topics generated by NMF. Therefore, we will use NMF for the recommendation system.

In [39]:
topic_matrix = pd.DataFrame(nmf_matrix,columns=topic_columns, index=data.index)

print(topic_matrix)

        Topic 0  Topic 1   Topic 2   Topic 3   Topic 4  Topic 5   Topic 6  \
0      0.000000      0.0  0.000000  0.000000  0.000000      0.0  0.004878   
1      0.000084      0.0  0.000000  0.000000  0.000000      0.0  0.005051   
2      0.000053      0.0  0.000000  0.000000  0.000000      0.0  0.005217   
3      0.000000      0.0  0.000000  0.000000  0.031042      0.0  0.000000   
4      0.000000      0.0  0.000440  0.000000  0.000000      0.0  0.000000   
...         ...      ...       ...       ...       ...      ...       ...   
28987  0.000000      0.0  0.062298  0.000000  0.000000      0.0  0.000000   
28988  0.000000      0.0  0.000000  0.000000  0.000000      0.0  0.000000   
28989  0.000000      0.0  0.019064  0.000000  0.000000      0.0  0.000000   
28990  0.000000      0.0  0.025234  0.000000  0.000000      0.0  0.000000   
28991  0.000000      0.0  0.034046  0.000882  0.000000      0.0  0.000000   

        Topic 7  Topic 8   Topic 9  ...  Topic 40  Topic 41  Topic 42  \
0 

### Colors

In [40]:
color_mlb = MultiLabelBinarizer()

color_matrix = color_mlb.fit_transform(colors)

color_matrix = pd.DataFrame(color_matrix ,columns = color_mlb.classes_, index = data.index)

print(color_matrix)

       aqua  black  blue  brown  coral  crimson  cyan  fuchsia  gold  gray  \
0         0      1     0      0      0        0     0        0     0     0   
1         0      0     0      0      0        0     0        0     0     0   
2         0      1     0      0      0        0     0        0     0     0   
3         0      1     1      0      0        0     0        0     0     0   
4         0      0     0      0      0        0     0        0     0     0   
...     ...    ...   ...    ...    ...      ...   ...      ...   ...   ...   
28987     0      1     0      0      0        0     0        0     0     0   
28988     0      1     0      0      0        0     0        0     0     0   
28989     0      1     0      1      0        0     0        0     0     0   
28990     0      1     0      0      0        0     0        0     0     0   
28991     0      1     0      0      0        0     0        0     0     0   

       ...  salmon  sienna  silver  snow  tan  tomato  violet  

### Origin

In [41]:
origin_mlb = MultiLabelBinarizer()

origin_matrix = origin_mlb.fit_transform(data['origin'].str.split(','))

origin_matrix = pd.DataFrame(origin_matrix ,columns = origin_mlb.classes_, index = data.index)

print(origin_matrix)

       Australia  Austria  Belgium  Canada  Denmark  France  Germany  \
0              0        0        0       0        0       0        0   
1              0        0        0       0        0       0        0   
2              0        0        0       0        0       0        0   
3              0        0        0       0        0       0        0   
4              0        0        0       0        0       0        0   
...          ...      ...      ...     ...      ...     ...      ...   
28987          0        0        0       0        0       0        0   
28988          1        0        0       0        0       0        0   
28989          0        0        0       0        0       0        0   
28990          0        0        0       0        0       0        0   
28991          0        0        0       0        0       0        0   

       Imported  Italy  Japan  Morocco  Netherlands  Portugal  South Korea  \
0             0      0      0        0            0      

### Material Composition

Composition text is a bit easier to process as we are disregarding the percentage, all the number inside and only want the name of the material. Specific items like jackets, shoes have words like Upper, Sole, Body, Trim, etc. in the composition text that are not related to material. To identify those words is a manual sampling process.

In [42]:
def process_composition(text):
    text= re.sub("(Upper|Sole|Body|Trim|Lining|Fill|American)"," ",text,flags=re.IGNORECASE) 
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(lemmatize_stemming(token))
    return " ".join(result)

In [43]:
processed_composition = data['composition'].astype(str).apply(process_composition).str.split()

processed_composition

0                         [cashmer]
1                       [lambswool]
2                       [lambswool]
3                         [leather]
4                   [sterl, silver]
                    ...            
28987              [textil, rubber]
28988                     [leather]
28989             [leather, rubber]
28990            [calfskin, rubber]
28991    [calfskin, textil, rubber]
Name: composition, Length: 28992, dtype: object

In [44]:
composition_mlb = MultiLabelBinarizer()

composition_matrix = pd.DataFrame(composition_mlb.fit_transform(processed_composition),columns=composition_mlb.classes_, index=data.index)

print(composition_matrix)

       aceat  acet  acid  acryl  agat  alloy  alpaca  aluminium  aluminum  \
0          0     0     0      0     0      0       0          0         0   
1          0     0     0      0     0      0       0          0         0   
2          0     0     0      0     0      0       0          0         0   
3          0     0     0      0     0      0       0          0         0   
4          0     0     0      0     0      0       0          0         0   
...      ...   ...   ...    ...   ...    ...     ...        ...       ...   
28987      0     0     0      0     0      0       0          0         0   
28988      0     0     0      0     0      0       0          0         0   
28989      0     0     0      0     0      0       0          0         0   
28990      0     0     0      0     0      0       0          0         0   
28991      0     0     0      0     0      0       0          0         0   

       angora  ...  weav  wood  wool  yak  yarn  yellow  zamac  zamak  zinc

### Remaining Sizes

In [45]:
size_mlb = MultiLabelBinarizer()

size_matrix = pd.DataFrame(size_mlb.fit_transform(data['remaining-sizes'].apply(ast.literal_eval)),columns=size_mlb.classes_, index=data.index)

print(size_matrix)

       0  00  1  10  10.5  10/12  100  105  11  11.5  ...  T  U  UNI  XL  \
0      0   0  0   0     0      0    0    0   0     0  ...  0  0    1   0   
1      0   0  0   0     0      0    0    0   0     0  ...  0  0    1   0   
2      0   0  0   0     0      0    0    0   0     0  ...  0  0    1   0   
3      0   0  0   0     0      0    0    0   0     0  ...  0  0    1   0   
4      0   0  0   0     0      0    0    0   0     0  ...  0  0    1   0   
...   ..  .. ..  ..   ...    ...  ...  ...  ..   ...  ... .. ..  ...  ..   
28987  0   0  0   0     0      0    0    0   0     0  ...  0  0    0   0   
28988  0   0  0   0     0      0    0    0   0     0  ...  0  0    0   0   
28989  0   0  0   0     0      0    0    0   0     0  ...  0  0    0   0   
28990  0   0  0   0     0      0    0    0   0     0  ...  0  0    0   0   
28991  0   0  0   0     0      0    0    0   0     0  ...  0  0    0   0   

       XL/XXL  XS  XS/S  XXL  XXS  XXXL  
0           0   0     0    0    0     0  
1  

### Prices

As we already established during the EDA process that prices are right skewed for both full and sale prices, a useful normalization for these values are log transformation.

In [116]:
data['full-price'] = data['full-price'].map(np.log)
data['sale-price'] = data['sale-price'].map(np.log)

### Putting Everything Together

Now we combine all matrices with new features we have created earlier into a single new dataframe for the recommendation system to train on later. We include the SKU numbers in the dataframe so the specific product can be looked up.

In [117]:
ssense_rec_df = pd.concat([data['sku'],
              category_matrix,brand_matrix,topic_matrix,
              color_matrix, origin_matrix,
              composition_matrix, size_matrix,
              data['full-price'], data['sale-price']], 
              axis=1, ignore_index=False)

print(ssense_rec_df)

                 sku  accessories  backpacks  bags  bars  beanies  belts  \
0      201014M138001            0          0     0     0        1      0   
1      201422M138012            0          0     0     0        1      0   
2      201422M138013            0          0     0     0        1      0   
3      201168M170261            0          0     0     0        0      0   
4      201168M145213            0          0     0     0        0      0   
...              ...          ...        ...   ...   ...      ...    ...   
28987  191678M237003            0          0     0     0        0      0   
28988  191993M223011            0          0     0     0        0      0   
28989  191358M237006            0          0     0     0        0      0   
28990  191232M237005            0          0     0     0        0      0   
28991  181813M225005            0          0     0     0        0      0   

       biker  blankets  blazers  ...  UNI  XL  XL/XXL  XS  XS/S  XXL  XXS  \
0         

From 9 orignal features that we deemed useful for the recommandation system, we have engineered and created 1069 new features that will be used to train the system.

In [119]:
import pickle

pickle_out = open("ssense_rec_df.pickle","wb")
pickle.dump(ssense_rec_df,pickle_out)
pickle_out.close()