# Outfit Recommendation For USERS (Collaborative Filtering)

# Cleaning + Exploration

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
!pip install implicit
import implicit
from sklearn.utils.extmath import randomized_svd

import string
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.metrics.pairwise import cosine_similarity
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/5a/d8/6b4f1374ffa2647b72ac76960c71b984c6f3238090359fb419d03827d87a/implicit-0.4.2.tar.gz (1.1MB)
[K     |▎                               | 10kB 17.8MB/s eta 0:00:01[K     |▋                               | 20kB 6.4MB/s eta 0:00:01[K     |▉                               | 30kB 8.9MB/s eta 0:00:01[K     |█▏                              | 40kB 10.9MB/s eta 0:00:01[K     |█▌                              | 51kB 7.2MB/s eta 0:00:01[K     |█▊                              | 61kB 8.4MB/s eta 0:00:01[K     |██                              | 71kB 9.5MB/s eta 0:00:01[K     |██▍                             | 81kB 10.4MB/s eta 0:00:01[K     |██▋                             | 92kB 8.4MB/s eta 0:00:01[K     |███                             | 102kB 9.2MB/s eta 0:00:01[K     |███▎                            | 112kB 9.2MB/s eta 0:00:01[K     |███▌                            | 122kB 9.2MB/s eta 0:00:01



In [2]:
outfit_combination_df = pd.read_csv('outfit_combinations_clean_type.csv')
outfit_combination_df.head(20)

Unnamed: 0.1,Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory,kate spade new york,medium margaux leather satchel
3,3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
4,4,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
5,5,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
6,6,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
7,7,01DMHCX50CFX5YNG99F3Y65GQW,01DMHCNT41E14QWP503V7CT9G6,accessory,Nina,Crystal Clutch
8,8,01DMHRX35M2DPVYVQ1PNER4S4B,01DMBRYVA2Q2ST7MNYR6EEY4TK,onepiece,Equipment,Chemelle Midi Dress
9,9,01DMHRX35M2DPVYVQ1PNER4S4B,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump


In [3]:
# check for unique item types
outfit_combination_df.outfit_item_type.unique()

array(['bottom', 'top', 'accessory', 'shoe', 'onepiece'], dtype=object)

In [4]:
# check for duplicated recommendations
outfit_combination_df.duplicated().sum()

0

In [0]:
# create lookup tables/dicts
item_lookup_tb = outfit_combination_df[['product_id', 'outfit_item_type', 'brand', 'product_full_name']].drop_duplicates()
outfit_ids = {num:outfit_id for num, outfit_id in enumerate(outfit_combination_df.outfit_id.unique())}
product_ids = {num:product_id for num, product_id in enumerate(outfit_combination_df.product_id.unique())}
product_ids_num = {product_id:num for num, product_id in enumerate(outfit_combination_df.product_id.unique())}
product_type_lookup = outfit_combination_df[['product_id', 'outfit_item_type']].drop_duplicates().set_index('product_id')['outfit_item_type']
product_name_lookup = outfit_combination_df[['product_id', 'product_full_name']].drop_duplicates().set_index('product_id')['product_full_name'].to_dict()
product_brand_lookup = outfit_combination_df[['product_id', 'brand']].drop_duplicates().set_index('product_id')['brand'].to_dict()

In [0]:
def factorize(outfit):
    products = outfit_combination_df[outfit_combination_df.outfit_id == outfit.name].product_id
    for product in products:
        outfit.loc[product] = 1
    return outfit

In [0]:
# construct sparse matrix
sparse_outfit_matrix = pd.DataFrame(
    0,
    columns=product_ids.values(),
    index=outfit_ids.values()).apply(factorize,axis=1).T

In [8]:
#sanity check
sparse_outfit_matrix.loc['01DMBRYVA2P5H24WK0HTK4R0A1','01DMHCX50CFX5YNG99F3Y65GQW']

1

In [9]:
# check for sparsity
print('Sparsity: {:4.3f}%'.format(float(sparse_outfit_matrix.shape[0])/float(len(outfit_ids)*len(product_ids))*100))

Sparsity: 0.088%


# Classic Factorization Models

In [0]:
# def create_SVD_dfs(df, n_components=50):
#     U, sig, V = randomized_svd(np.array(df), n_components=n_components)
#     U_df = pd.DataFrame(U, index=df.index)
#     V_df = pd.DataFrame(V, columns=df.columns)
#     return U_df, sig, V_df.T
# outfits_df, sig, products_df = create_SVD_dfs(sparse_outfit_matrix)

In [0]:
# recommend for items given 1 product id
def recommend_products(recommended_products, recommendations, outfits):
  for rec in recommendations:
    onepiece, bottom_top = True, True
    if 'bottom' in recommended_products or 'top' in recommended_products:
      onepiece = False
    if 'onepiece' in recommended_products:
      bottom_top = False
    if (len(recommended_products) == 3 and onepiece) | \
        (len(recommended_products) == 4 and bottom_top):
      return recommended_products
    if product_type_lookup.loc[product_ids[rec[0]]] in recommended_products:
      continue
    if (product_type_lookup.loc[product_ids[rec[0]]] in ['bottom', 'top'] and not bottom_top) | \
        (product_type_lookup.loc[product_ids[rec[0]]] == 'onepiece' and not onepiece):
      continue
    recommended_products[product_type_lookup.loc[product_ids[rec[0]]]] = product_ids[rec[0]]

In [0]:
def beautify_output(recommended_output):
  recommended_out_df = pd.DataFrame(
      [[product_brand_lookup[item_id], product_name_lookup[item_id]] for item_id in recommended_output.values()], 
      index=recommended_output.keys(),
      columns=['Brand','Item'])
  return recommended_out_df

In [13]:
# single id recommender. default model is ALS
def outfit_recommender(
    sparse_outfit_matrix, 
    item_id, 
    model=implicit.als.AlternatingLeastSquares(factors=50, regularization=0.1, iterations=50, use_gpu=False),
    verbose=True,
    N=100):
  sparse_outfit_matrix = sparse.csr_matrix(sparse_outfit_matrix)
  model.fit(sparse_outfit_matrix)
  recommendations = model.similar_items(product_ids_num[item_id], N=N)
  input_type = product_type_lookup.loc[item_id]
  outfits = product_type_lookup.unique().tolist()
  recommended_products = {input_type:item_id}
  recommended_output = recommend_products(recommended_products, recommendations, outfits)
  if verbose:
    print('---Based on your query, we recommend the following outfit---')
  return beautify_output(recommended_output)



In [0]:
# different recommendation models from implicit package
ALS = implicit.als.AlternatingLeastSquares(factors=50, iterations=50, use_gpu=False)
baysian_ranking = implicit.bpr.BayesianPersonalizedRanking(factors=50, iterations=50, use_gpu=False)
logisticMF = implicit.lmf.LogisticMatrixFactorization(factors=50, iterations=50, use_gpu=False)
models = [ALS, baysian_ranking, logisticMF]

In [0]:
# recommend multiple outfits given the same 1 unique item id
def multiple_outfits(models, item_id):
  outfits = []
  for model in models:
    outfit = outfit_recommender(sparse_outfit_matrix, item_id, model, verbose=False)
    outfits.append(outfit)
  output_outfits = [pd.DataFrame(), pd.DataFrame()]
  for outfit in outfits:
    if outfit.sort_values('Item').equals(output_outfits[-1]) | outfit.sort_values('Item').equals(output_outfits[-2]):
      continue
    output_outfits.append(outfit.sort_values('Item'))
  return output_outfits[2:]

In [0]:
# dynamic user prompts product_id_recommender with built in id spell correction and prompt user for multiple for single recommendation
def product_id_recommender(input_id=None,called=False):
  """
  Please input the correct Product ID and the system will recommend outfit(s) based on your query
  """
  import getpass
  sparse_outfit_matrix = pd.DataFrame(
    0,
    columns=product_ids.values(),
    index=outfit_ids.values()).apply(factorize,axis=1).T
  swapped = True
  if not called:
    input_id = str(getpass.getpass('Please enter your product ID to get a recommended fit from us'))
  if input_id not in product_ids_num:
    swapped = False
    suggested_ids = pd.Series(list(product_ids_num.keys()),index=product_ids_num.keys()).\
          apply(lambda idx: fuzz.ratio(idx.lower(),input_id.lower())).\
          sort_values(ascending=False)
    for i in range(3):
      switch_id = str(getpass.getpass(f'Did you mean {suggested_ids.index[i]}?\nY/N'))
      if switch_id.lower() == 'y':
        input_id = suggested_ids.index[i]
        swapped = True
        break
  if not swapped:
    return 'Could not find the item you were looking for'
  
  multiple_fits = str(getpass.getpass('Do you want multiple outfits?\nY/N'))
  if multiple_fits.lower() == 'y':
    multiple_outs = multiple_outfits(models, input_id)
    for outfit in multiple_outs:
      print(outfit)
      print('############################################################################')
    return multiple_outs
  final_outfit = outfit_recommender(sparse_outfit_matrix, input_id)
  print(final_outfit)
  return final_outfit

In [17]:
outfit_recommender(sparse_outfit_matrix, '01DPCWEJRBVWZE397FMF9QXBBY')

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


---Based on your query, we recommend the following outfit---


Unnamed: 0,Brand,Item
onepiece,J.Crew,Gingham button-front midi dress with ruffle he...
accessory,Veronica Beard,Theron Jacket
shoe,ANINE BING,Stella Boots


# Neural CF

In [18]:
!pip install git+https://github.com/maciejkula/spotlight.git
from spotlight.interactions import Interactions
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import rmse_score, mrr_score, precision_recall_score
from spotlight.factorization.representations import BilinearNet
from spotlight.layers import BloomEmbedding, ScaledEmbedding

Collecting git+https://github.com/maciejkula/spotlight.git
  Cloning https://github.com/maciejkula/spotlight.git to /tmp/pip-req-build-2raqowzi
  Running command git clone -q https://github.com/maciejkula/spotlight.git /tmp/pip-req-build-2raqowzi
Building wheels for collected packages: spotlight
  Building wheel for spotlight (setup.py) ... [?25l[?25hdone
  Created wheel for spotlight: filename=spotlight-0.1.6-cp36-none-any.whl size=33920 sha256=8fcaeab541c949bff106a6821ccf0202a4a242dff76069e3755d6b301517d431
  Stored in directory: /tmp/pip-ephem-wheel-cache-6prk33xs/wheels/0a/33/c8/e8510ea648aaacf6031e128dfa92bcd3750f02db2aaf0922fe
Successfully built spotlight
Installing collected packages: spotlight
Successfully installed spotlight-0.1.6


Testing implicit model

In [0]:
# look up tables
product_ids_num = {product_id:num for num, product_id in enumerate(outfit_combination_df.product_id.unique())}
outfit_ids_num = {outfit_id:num for num, outfit_id in enumerate(outfit_combination_df.outfit_id.unique())}
outfit_num = outfit_combination_df.outfit_id.map(outfit_ids_num)
product_num = outfit_combination_df.product_id.map(product_ids_num)

In [0]:
# dataset prep and train test split
implicit_interactions = Interactions(np.array(outfit_num),np.array(product_num))
train, validation = random_train_test_split(implicit_interactions,test_percentage=0.1)
validation, test = random_train_test_split(validation,test_percentage=0.5)

In [21]:
# testing out performance of classic factorization model
implicit_factorization_model = ImplicitFactorizationModel()
implicit_factorization_model.fit(train)
mrr_score(implicit_factorization_model,test).mean()

0.23540950329272706

Neural CF

In [0]:
#Cred to https://github.com/maciejkula/spotlight/blob/master/examples/bloom_embeddings/example.py

In [0]:
# neural cf model
def get_neural_model(train):

    item_embeddings = BloomEmbedding(train.num_items, 100,
                    compression_ratio=1,
                    num_hash_functions=4,
                    padding_idx=0)
    user_embeddings = BloomEmbedding(train.num_users, 100,
                    compression_ratio=1,
                    num_hash_functions=4,
                    padding_idx=0)

    network = BilinearNet(train.num_users,
                train.num_items,
                user_embedding_layer=user_embeddings,
                item_embedding_layer=item_embeddings)

    model = ImplicitFactorizationModel(n_iter=10,
                      l2=0,
                      representation=network)

    return model

In [0]:
# evaluate recommendation models
def evaluate_model(model, train, test, validation):
    model.fit(train, verbose=True)

    test_mrr = mrr_score(model, test)
    val_mrr = mrr_score(model, test.tocsr() + validation.tocsr())

    return test_mrr, val_mrr

In [25]:
# training and testing neural model
neural_model = get_neural_model(train)
neural_test_mrr, neural_val_mrr = evaluate_model(neural_model, train, test, validation) 

Epoch 0: loss 0.9859252760284826
Epoch 1: loss 0.7178048271881906
Epoch 2: loss 0.401774641714598
Epoch 3: loss 0.21184666376364858
Epoch 4: loss 0.1326316112750455
Epoch 5: loss 0.09493397489974373
Epoch 6: loss 0.07454693729155942
Epoch 7: loss 0.06013986507528707
Epoch 8: loss 0.05126650023617243
Epoch 9: loss 0.04500420244508668


In [26]:
# testing factorization model
factorization_test_mrr, factorization_mrr = evaluate_model(implicit_factorization_model, train, test, validation)

Epoch 0: loss 0.2603602048597838
Epoch 1: loss 0.23354175686836243
Epoch 2: loss 0.21534800843188637
Epoch 3: loss 0.19102981686592102
Epoch 4: loss 0.17305503315047213
Epoch 5: loss 0.16362100761187703
Epoch 6: loss 0.1473006362977781
Epoch 7: loss 0.13329527723161796
Epoch 8: loss 0.12258461902016088
Epoch 9: loss 0.11589096212073376


In [27]:
print(f'The MRR for the neural CF is {neural_test_mrr.mean()}')
print(f'The MRR for the classic CF is {factorization_mrr.mean()}')

The MRR for the neural CF is 0.08722703703618319
The MRR for the classic CF is 0.23042754545252622


***Neural Models might be unreliable for small datasets so going forward used classic factorization model***

# Run for recommendations

Single ID Recommendation

In [28]:
outfit_recommender(sparse_outfit_matrix, '01DMBRYVA2S5T9W793F4CY41HE')

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


---Based on your query, we recommend the following outfit---


Unnamed: 0,Brand,Item
accessory,kate spade new york,medium margaux leather satchel
onepiece,Equipment,Chemelle Midi Dress
shoe,Tory Burch,Penelope Mid Cap Toe Pump


In [29]:
outfit_recommender(sparse_outfit_matrix, product_ids[8], logisticMF)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


---Based on your query, we recommend the following outfit---


Unnamed: 0,Brand,Item
bottom,Reformation,Benson Skirt
top,Veronica Beard,Ashlynn Blouse
shoe,J.Crew,Pointed-toe flats in suede
accessory,J.Crew,Bembien® Jeanne leather woven market tote bag


Multiple outfits recommendation Demonstration

In [30]:
multiple_outs = multiple_outfits(models, '01DMBRYVA2S5T9W793F4CY41HE')
for outfit in multiple_outs:
  print(outfit)
  print('############################################################################')

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


                         Brand                                               Item
shoe          Loeffler Randall                                   Goldy Tall Boots
onepiece            Zimmermann  Zinnia halterneck polka-dot linen and cotton-b...
accessory  kate spade new york                     medium margaux leather satchel
############################################################################
                         Brand                                               Item
bottom            Envelope1976                     Lembongan wool wrap midi skirt
top              Eileen Fisher                                 Rib Mock Neck Tank
shoe           Common Projects  Special Edition Retro Low Nubuck and Suede Sne...
accessory  kate spade new york                     medium margaux leather satchel
############################################################################
                         Brand                            Item
shoe                Tory Burch       Penelop

Dynamic input system with product ID spell check Demonstration

In [32]:
user_outfits = product_id_recommender()

Please enter your product ID to get a recommended fit from us··········
Do you want multiple outfits?
Y/N··········


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


---Based on your query, we recommend the following outfit---
                         Brand                            Item
accessory  kate spade new york  medium margaux leather satchel
onepiece             Equipment             Chemelle Midi Dress
shoe                Tory Burch       Penelope Mid Cap Toe Pump


# Description / Detail / Brand Vectorization

In [33]:
# from https://radimrehurek.com/gensim/models/word2vec.html
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import spacy.cli
spacy.cli.download("en_core_web_md")
spacy.cli.download("en_core_web_sm")
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, f1_score, accuracy_score, hamming_loss
import re

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


Using TensorFlow backend.


In [0]:
# grab additional info from full data and join with outfit data
full_data = pd.read_csv('full_data.csv')
outfit_full_df = outfit_combination_df.merge(full_data,how='left',left_on='product_id',right_on='product_id')

In [0]:
# text preprocessing(lemmatization, stopword removal, lower, strip punctuations)
def deep_cleaned(dirty_text, return_tokens=False):
  lem = WordNetLemmatizer()
  remove_digits = str.maketrans('', '', string.digits)
  dirty_text = dirty_text.translate(remove_digits)

  dirty_text = dirty_text.lower()
  dirty_text = dirty_text.translate(str.maketrans('', '', string.punctuation))
  dirty_text = remove_stopwords(dirty_text)
  tokens = gensim.utils.tokenize(dirty_text)
  lemmed_tokens = [lem.lemmatize(token) for token in tokens]
  if return_tokens:
    return lemmed_tokens
  return ' '.join(lemmed_tokens)

outfit_full_df.index = outfit_full_df.product_id

# Description2Brand

In [0]:
# data prep for Description2Brand Classification model. Was not used later so can skip this section
nlp = spacy.load('en_core_web_md')
description_brand_df = outfit_full_df.set_index('product_id')[['brand_x','description','brand_category']].drop_duplicates().fillna('')
X = np.array([nlp(desc).vector for desc in description_brand_df.description]).reshape(-1,1,300)
Y = description_brand_df.brand_x.apply(str.lower).values
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
dummy_y = np_utils.to_categorical(encoded_Y)
#https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/

In [0]:
# dectionption 2 x classifier model
def Description2x():
  model = Sequential()
  model.add(LSTM(units=64,input_shape=(1,300)))
  model.add(Dense(16, activation='relu'))
  model.add(Dense(len(np.unique(Y)), activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [38]:
# training description 2 brand model
X_train, X_test, y_train, y_test = train_test_split(X,dummy_y,test_size=0.1,random_state=24)
brandclassifier = KerasClassifier(build_fn=Description2x, validation_split=0.1, epochs=15, batch_size=16, verbose=1)
brandclassifier.fit(X_train,y_train)

Train on 658 samples, validate on 74 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x7fe08e03ce80>

In [39]:
# test accurcy
y_pred = brandclassifier.predict(X_test)
y_test_list = [idx for y in y_test for idx in range(len(y)) if y[idx] == 1]
accuracy_score(y_test_list,y_pred)



0.12195121951219512

In [40]:
# prediction demonstration
encoder.inverse_transform(brandclassifier.predict(nlp('Beige cashmere Slips on 100% cashmere Dry clea').vector.reshape(-1,1,300)))[0]



'theory'

# Cleaning brand categories

In [0]:
#regex cleaning for brand category
outfit_full_df.brand_category = outfit_full_df.brand_category.\
                          apply(lambda x: ','.join(pd.Series\
                          (re.findall(r'[/:]?(\w+[\s:]\w+[\w+:\w+]*[\s]\w*|\w+[\s:]\w+[\w+:\w+]*|\w+)[/:]?', str(x).lower()))\
                          .drop_duplicates().tolist()))

In [0]:
# make a set of all the categories
def add_cat(x):
  cats = re.findall(r'[,]?(\w+[\s]\w+[\s]\w+|\w+[\s]?\w+|\w+)[,]?',x)
  for cat in cats:
    categories.add(cat)
  return

In [0]:
#remove duplicates
def remove_dupe(alist):
  blist = []
  for cat in alist:
    blist.append(cat.strip())
  return ','.join(pd.Series(blist).drop_duplicates().tolist())

In [0]:
#remove duplicates in a single line of brand category
remove_digits = str.maketrans('', '', string.digits)
outfit_full_df.brand_category = outfit_full_df.brand_category\
                  .apply(lambda x: x.replace(':', ','))\
                  .apply(lambda x: x.translate(remove_digits))\
                  .apply(lambda x: remove_dupe(x.split(',')))

In [0]:
#merge duplicate categories into ones
to_replace = [[r'(\bt,shirts\b)','tees'],
[r'(\baccessories\b|\baccessory\b|\ball_accessories\b)','accessories'],
[r'(\bbelt bags\b|\bbeltbags\b)','beltbags'],
[r'(\bcoatsjacketswa\b)','coats_and_jackets'],
[r'(\bcross body\b|\bcrossbody\b)','crossbody'],
[r'(\bflat shoes\b|\bflat\b)','flats'],
[r'(\bhandbagsshoes\b)','handbags,shoes'],
[r'(\bjacketsvests\b)','jackets,vests'],
[r'(\bjackets\b|\bjacket\b)','jackets'],
[r'(\bdressesandjumpsuits\b)','dresses,jumpsuits'],
[r'(\bbolerosjacketsvests\b)','boleros,jackets,vests'],
[r'(\bapparelaccessories\b)','apparel,accessories'],
[r'(\blow top\b|\blowtop\b)','lowtop'],
[r'(\bmini bags\b)','minibags'],
[r'(\bpantsshortsjumpsuits\b)','pants,shorts,jumpsuits'],
[r'(\bshirts_tops\b)','shirts,tops'],
[r'(\bshorts\b)','short'],
[r'(\bshoulder bags\b|\bshoulder_bags\b)','shoulderbags'],
[r'(\bsweatshirts_sweatpants\b)','sweatshirts,sweatpants'],
[r'(\btshirts_tanktops\b)','tees,tanktops'],
[r'(\btshirtstanks\b)','tees,tanktops'],
[r'(\bwomensapparel\b)','women'],
[r'(\bclutch bags\b)','clutches'],
[r'(\bjumpsuit\b)','jumpsuits'],
[r'(\bmidi dresses\b)','midi'],
[r'(\bromper\b)','rompers'],
[r'(\bskirt\b)','skirts'],
[r'(\bsweater\b)','sweaters'],
[r'(\btote bags\b)','totes'],
[r'(\btop\b)','tops']]

for replace in to_replace:
    outfit_full_df.brand_category = outfit_full_df.brand_category.apply(lambda x: re.sub(replace[0],replace[1], x.lower()))

# Word2Vec

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.utils import tokenize

In [47]:
# data prep for word2vec model
description_brand_df = outfit_full_df.set_index('product_id')[['brand_x','product_full_name_x','description','brand_category']].drop_duplicates().fillna('')
description_brand_df.columns = ['brand','product_full_name','description','brand_category']
all_brands = description_brand_df.brand.str.lower().unique()
categories = set()
description_brand_df.brand_category.apply(add_cat)
for col in description_brand_df.columns:
  description_brand_df[col] = description_brand_df[col].apply(deep_cleaned)
descriptions_tokens = [[token for token in tokenize(sent)] for sent in description_brand_df.description.drop_duplicates().values]
description_model = Word2Vec(descriptions_tokens, size=96, min_count=2)



In [0]:
def tfidf(series):
    vectorizer = TfidfVectorizer(max_df=0.3)
    X = vectorizer.fit_transform(series)

    tf_idf_lookup_table = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())


    DOCUMENT_SUM_COLUMN = "DOCUMENT_TF_IDF_SUM"

    tf_idf_lookup_table[DOCUMENT_SUM_COLUMN] = tf_idf_lookup_table.sum(axis=1)
    available_tf_idf_scores = tf_idf_lookup_table.columns 
    available_tf_idf_scores = list(map( lambda x: x.lower(), available_tf_idf_scores)) 
    return tf_idf_lookup_table, available_tf_idf_scores

In [0]:
#make embed function cr to Prof Yu Chen
def embed(textdf, available_tf_idf_scores, tf_idf_lookup_table, text_only=False):
    text_vectors = {}
    indices = textdf.index
    for idx, text in enumerate(textdf): 
        tokens = nlp(text)
        total_tf_idf_score_per_document = 0

        running_total_word_embedding = np.zeros(96) 
        for token in tokens:
            if token.text in description_model.wv.vocab and token.text.lower() in available_tf_idf_scores:

                tf_idf_score = tf_idf_lookup_table.loc[idx, token.text.lower()]
            
                running_total_word_embedding += tf_idf_score * description_model.wv.get_vector(token.text)

                total_tf_idf_score_per_document += tf_idf_score

        document_embedding = running_total_word_embedding / (total_tf_idf_score_per_document + 1e-6)
        if text_only:
          return document_embedding
        text_vectors[indices[idx]] = document_embedding
    return text_vectors

# Define final recommender class

In [0]:
# fuzzy matching function
def fuzzy_match(fuzz_list, query):
  fuzz_series = pd.Series(fuzz_list,index=fuzz_list)
  for f in fuzz_list:
    fuzz_return = fuzz_series.apply(lambda fz: fuzz.partial_ratio(fz.lower(),query.lower())).sort_values(ascending=False)
  if fuzz_return.iloc[0] > 85:
    return fuzz_return.index[0]
  return None

In [0]:
# final outfits_recommender class
class outfits_recommender:
  #initialize query
  def __init__(self, query):
    self.query = deep_cleaned(query)
    self.brand = None
    self.category = None
    self.name = None
    self.query_embed = None

  # get the closest id description item with given query vector
  def closest_id(self, compare_indices, d_text_vector):
    closest = 0
    closest_product = None
    for idx in compare_indices:
      closeness = cosine_similarity([d_text_vector[idx],self.query_embed])
      if closeness[0][1] > closest:
        closest = closeness[0][1]
        closest_product = idx
    return closest_product

  # main freeform seach
  def free_form_search(self):
    # grab all vectors for all descriotions and query
    query_desc_df = description_brand_df.drop_duplicates().description.apply(deep_cleaned).append(pd.Series([self.query],index=['query']))
    d_tf_idf_lookup_table, d_available_tf_idf_scores = tfidf(query_desc_df)
    d_text_vector = embed(query_desc_df, d_available_tf_idf_scores, d_tf_idf_lookup_table)

    # fuzzy match brand, category and name
    self.brand = fuzzy_match(all_brands, self.query)
    self.category = fuzzy_match(list(categories), self.query)
    self.name = fuzzy_match(description_brand_df.product_full_name.drop_duplicates().apply(deep_cleaned).tolist(), self.query)

    # setting query vector. if query vector has all vocabs not in our word2vec model, try spacy, if not random initialize
    self.query_embed = d_text_vector['query']
    if np.count_nonzero(self.query_embed) == 0:
      self.query_embed = spacy.load('en_core_web_sm')(self.query).vector
    if np.count_nonzero(self.query_embed) == 0:
      self.query_embed = np.random.randn(self.query_embed.shape[0])
    del d_text_vector['query']

    # if the query contains brand and category
    if self.brand and self.category and self.name:
      return_df = description_brand_df[(description_brand_df.brand==self.brand)&(description_brand_df.brand_category.isin([self.category]))]
      if not return_df.empty:
        closest_product = self.closest_id(return_df.index,d_text_vector)
        return product_id_recommender(input_id=closest_product,called=True)
    
    # if query contains brand
    if self.brand:
      return_df = description_brand_df[(description_brand_df.brand==self.brand)]
      if not return_df.empty:
        closest_product = self.closest_id(return_df.index,d_text_vector)
        return product_id_recommender(input_id=closest_product,called=True)
    
    # if query contains full name
    if self.name:
      return_df = description_brand_df[(description_brand_df.product_full_name==self.name)]
      if not return_df.empty:
        closest_product = self.closest_id(return_df.index,d_text_vector)
        return product_id_recommender(input_id=closest_product,called=True)

    # if query contains category
    if self.category:
      return_df = description_brand_df[(description_brand_df.brand_category.isin([self.category]))]
      if not return_df.empty:
        closest_product = self.closest_id(return_df.index,d_text_vector)
        return product_id_recommender(input_id=closest_product,called=True)

    # none of the above
    closest_product = self.closest_id(d_text_vector.keys(),d_text_vector)
    return product_id_recommender(input_id=closest_product,called=True)
  
  # method for doing pure id seach
  def product_id_search(self):
    return product_id_recommender(input_id=self.query,called=True)

# Final Demonstrations

In [52]:
# demostrating free-form brand search
outfitQ = outfits_recommender('bananarepub')
banana_outfits = outfitQ.free_form_search()

Do you want multiple outfits?
Y/N··········


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


                     Brand                                           Item
accessory  Banana Republic                  Herringbone Wool-Blend  Scarf
bottom     Banana Republic  JAPAN EXCLUSIVE Herringbone Midi Pencil Skirt
shoe       Banana Republic                    Madison 12-Hour Loafer Pump
top           Marissa Webb             Piper Mock Neck Wool-Blend Sweater
############################################################################
                         Brand                                               Item
shoe       Christian Louboutin                          Galativi Mesh Suede Pumps
bottom         Banana Republic      JAPAN EXCLUSIVE Herringbone Midi Pencil Skirt
top                  Derek Lam  Oversized Draped Silk Blouse With Self Tie Nec...
accessory      Banana Republic                     Unlined Double-Faced Maxi Coat
############################################################################
                     Brand                                       

In [53]:
# demostrating free-form brand search
outfitQ = outfits_recommender('sneakerz')
shoes_outfits = outfitQ.free_form_search()

Do you want multiple outfits?
Y/N··········


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


                        Brand                                    Item
accessory  Madeleine Thompson  Danny Wool And Cashmere-Blend Cardigan
bottom             NILI LOTAN              Paris Cashmere Track Pants
shoe           Veronica Beard           Sami Lace-Up Leather Sneakers
top                    Khaite                  Viola Cashmere Sweater
############################################################################
                        Brand                                    Item
accessory  Madeleine Thompson  Danny Wool And Cashmere-Blend Cardigan
top                NILI LOTAN      Jimi Hendrix Cotton-Jersey T-Shirt
bottom             NILI LOTAN              Paris Cashmere Track Pants
shoe           Veronica Beard           Sami Lace-Up Leather Sneakers
############################################################################


In [54]:
# demostrating free-form description search
outfitQ = outfits_recommender('Sexy silky, a-line mini skirt zipper Benson skirt')
product_id_outfits = outfitQ.free_form_search()

Do you want multiple outfits?
Y/N··········


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


                    Brand                       Item
top        Veronica Beard             Ashlynn Blouse
bottom        Reformation               Benson Skirt
shoe            ALLSAINTS         Donita Combat Boot
accessory           Frame  Les Second - Medium--NOIR
############################################################################
                               Brand                                      Item
accessory                Sam Edelman  65mm Gradient Oversize Square Sunglasses
bottom                   Reformation                              Benson Skirt
shoe                Loeffler Randall                          Goldy Tall Boots
top        REMAIN Birger Christensen         Halyn pussy-bow silk-satin blouse
############################################################################
                    Brand                          Item
top        Veronica Beard                Ashlynn Blouse
bottom        Reformation                  Benson Skirt
accessory    Sole

In [58]:
# demostrating free-form brand search
outfitQ = outfits_recommender('01E5ZYHZA7186DVWET99Q4D2PM')
product_id_outfits = outfitQ.product_id_search()

Did you mean 01E5ZYHZA7186DVWEJ99Q4D2PM?
Y/N··········
Do you want multiple outfits?
Y/N··········


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


                 Brand                                      Item
accessory  Sam Edelman  65mm Gradient Oversize Square Sunglasses
onepiece        Xirena                             Cameron Dress
shoe            Khaite                         Suede ankle boots
############################################################################
                 Brand                                       Item
accessory  Sam Edelman   65mm Gradient Oversize Square Sunglasses
shoe       DR. MARTENS                Fenimore Triple Buckle Boot
onepiece         GANNI  Floral Print Crepe Long Sleeve Shirtdress
############################################################################


In [56]:
# Trying for out of vocab search
outfitQ = outfits_recommender('Natural Language Processing')
product_id_outfits = outfitQ.free_form_search()

Do you want multiple outfits?
Y/N··········


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


                    Brand                                          Item
bottom     Veronica Beard  Beverly stretch-cotton corduroy flared pants
accessory      Rag & Bone                            CLASSIC WOOL SCARF
top                  Joie                    Dreamy Crewneck Sweatshirt
shoe                Chloé        Rylee snake-effect leather ankle boots
############################################################################
                Brand                                              Item
bottom     Nili Lotan  Arliss cropped Lyocell-blend twill tapered pants
accessory  Rag & Bone                                CLASSIC WOOL SCARF
shoe            Chloé            Rylee snake-effect leather ankle boots
top         The Great  The Painter's Smock checked cotton-flannel shirt
############################################################################


In [65]:
# Another free-form search
outfitQ = outfits_recommender('silky trousers classic tight fit')
product_id_outfits = outfitQ.free_form_search()

Do you want multiple outfits?
Y/N··········


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


                  Brand                           Item
top         Reformation                       Anne Top
accessory  Sole Society                     Chusy Tote
shoe             J.Crew              Lucie suede pumps
bottom            Frame  Plaid Cropped Perfect Trouser
############################################################################
                                 Brand                            Item
accessory  Aleksandre Akhalkatsishvili  Breast Cut Faux Leather Blazer
bottom                           Frame   Plaid Cropped Perfect Trouser
shoe             SARTO BY FRANCO SARTO                       Visa Mule
top                              Vince           Wool-Cashmere Sweater
############################################################################
                     Brand                           Item
shoe              MADEWELL    Boardwalk Post Slide Sandal
accessory     Sole Society                     Chusy Tote
top        ENGLISH FACTORY              E

In [67]:
outfit_full_df[outfit_full_df.product_full_name_x=='Plaid Cropped Perfect Trouser']

Unnamed: 0_level_0,Unnamed: 0,outfit_id,product_id,outfit_item_type,brand_x,product_full_name_x,brand_y,mpn,product_full_name_y,description,brand_category,created_at,updated_at,deleted_at,brand_canonical_url,details,labels,bc_product_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
01DPGV9PFVQBXZQ8BKT0EVK2DB,53,01DQBBXQ5N8TBD77VCRTR9NF8H,01DPGV9PFVQBXZQ8BKT0EVK2DB,bottom,Frame,Plaid Cropped Perfect Trouser,Frame,LWWB0374,Plaid Cropped Perfect Trouser,"Versatile And Sleek, These Classic Crop Trouse...",pants,2019-10-06 15:34:07.094000+00:00,2020-03-25 23:20:37.543000+00:00,2020-03-23 19:44:45.158000+00:00,https://frame-store.com/products/plaid-cropped...,,[],191.0
01DPGV9PFVQBXZQ8BKT0EVK2DB,53,01DQBBXQ5N8TBD77VCRTR9NF8H,01DPGV9PFVQBXZQ8BKT0EVK2DB,bottom,Frame,Plaid Cropped Perfect Trouser,Frame,LWWB0374,Plaid Cropped Perfect Trouser,"Versatile And Sleek, These Classic Crop Trouse...",pants,2019-10-06 15:34:07.094925+00,2019-12-19 20:40:30.786144+00,,https://frame-store.com/products/plaid-cropped...,,"{""Needs Attributes""}",
