In [16]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Single shared lemmatizer instance, reused by every call to lem_sentences.
lem = WordNetLemmatizer()


def lem_sentences(sentence):
    """Lemmatize every token of ``sentence`` and return them joined by single spaces.

    The sentence is split with ``nltk.word_tokenize`` and each token is passed
    through the module-level ``WordNetLemmatizer`` (default part of speech).
    """
    return ' '.join([lem.lemmatize(word) for word in nltk.word_tokenize(sentence)])

# Adapted from the following Stack Overflow question
# (presumably this refers to the per-word helper lem_sentences above):
# https://stackoverflow.com/questions/43795310/apply-porters-stemmer-to-a-pandas-column-for-each-word

def cleanHtml(sentence):
    """Replace every ``<...>`` tag in ``sentence`` with a single space.

    The argument is converted with ``str()`` first, so non-string values are
    accepted. The match is non-greedy and does not span newlines.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub(' ', str(sentence))

def keepAlpha(sentence):
    """Strip non-word characters from every token and re-join with single spaces.

    The sentence is split with ``word_tokenize``; inside each token every
    character that is not matched by ``\\w`` (letters, digits, underscore) is
    removed. Tokens that become empty (e.g. pure punctuation) are dropped so
    the result never contains runs of spaces or leading/trailing blanks.

    Note: despite the name, digits and underscores are kept as well.
    """
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    non_word = re.compile(r'[^\w]+')
    stripped_tokens = (non_word.sub('', token) for token in word_tokenize(sentence))
    return ' '.join(token for token in stripped_tokens if token)

def cleanPunc(sentence):
    """Clean a sentence of punctuation and special characters.

    * ``? | ! ' " #`` are deleted outright.
    * ``, ) ( /`` are replaced by a space.
    * Leading/trailing whitespace is stripped, then any remaining newline
      becomes a space.
    """
    # '|' inside a character class is a literal pipe, so pipes are deleted here.
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced_out = re.sub(r'[,|)|(|\|/]', r' ', without_marks)
    return spaced_out.strip().replace("\n", " ")

In [3]:
# Outfit combinations: one row per (outfit, product) pair.
outfit_df = pd.read_csv(
    'outfit_combinations_clean_type.csv',
    index_col=0,  # the first CSV column is used as the row index
)

# Product catalogue; supplies the description / brand_category / details text used later.
full_data = pd.read_csv('full_data.csv')

In [4]:
def id_based_recommendation(product_id):
    """Return the rows of one outfit that contains the given product.

    Parameters
    ----------
    product_id : str
        Product id to look up in the module-level ``outfit_df``. If it is not
        present verbatim, the closest known id (fuzzywuzzy ``extractOne``) is
        used instead, which tolerates typos.

    Returns
    -------
    pandas.DataFrame
        All rows of ``outfit_df`` belonging to the selected outfit.

    Notes
    -----
    When the product appears in several outfits one of them is drawn at
    random, so repeated calls may return different outfits.
    """
    # Only fall back to fuzzy matching when the id is not an exact hit, and
    # match against unique ids so duplicates are not scored repeatedly.
    if not (outfit_df['product_id'] == product_id).any():
        known_ids = outfit_df['product_id'].unique()
        product_id = process.extractOne(product_id, known_ids)[0]

    outfit_recommend_group = outfit_df.loc[outfit_df['product_id'] == product_id]['outfit_id'].unique()

    # Random pick gives every outfit containing the product a chance to be
    # shown; with a single outfit the choice is trivially that outfit.
    outfit_recommend = np.random.choice(outfit_recommend_group, 1, replace=False)[0]
    return outfit_df.loc[outfit_df['outfit_id'] == outfit_recommend]

In [5]:
# Look up an exact, valid product id: '01DMBRYVA2ZFDYRYY5TRQZJTBD'.
# This product belongs to 4 different outfits and one of them is picked at
# random, so the outfit shown below can differ from run to run.
id_based_recommendation('01DMBRYVA2ZFDYRYY5TRQZJTBD')

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
11,01DMHRYZDTF2NBJ13ZEARKHA3T,01DMBRYVA2Q2ST7MNYR6EEY4TK,onepiece,Equipment,Chemelle Midi Dress
12,01DMHRYZDTF2NBJ13ZEARKHA3T,01DMBRYVA2S5T9W793F4CY41HE,accessory,kate spade new york,medium margaux leather satchel
13,01DMHRYZDTF2NBJ13ZEARKHA3T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump


In [6]:
# Same id with a typo in the last character ('...QZJTBA' instead of the
# correct '...QZJTBD'); fuzzy matching should still resolve it to the right product.
id_based_recommendation('01DMBRYVA2ZFDYRYY5TRQZJTBA')

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
8,01DMHRX35M2DPVYVQ1PNER4S4B,01DMBRYVA2Q2ST7MNYR6EEY4TK,onepiece,Equipment,Chemelle Midi Dress
9,01DMHRX35M2DPVYVQ1PNER4S4B,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
10,01DMHRX35M2DPVYVQ1PNER4S4B,01DMHCNT41E14QWP503V7CT9G6,accessory,Nina,Crystal Clutch


In [7]:
# Keep only the id plus the free-text columns, one row per product
# (when a product_id is repeated, the last occurrence wins).
full_datam = (
    full_data
    .loc[:, ['product_id', 'description', 'brand_category', 'details']]
    .drop_duplicates(subset='product_id', keep='last')
)

In [8]:
# Left join: every outfit row is kept; products missing from full_datam get NaN text columns.
outfit_full = pd.merge(outfit_df, full_datam, how='left', on=['product_id'])

In [10]:
# Inspect column dtypes and non-null counts of the merged frame
# (shows how many rows lack description / brand_category / details).
outfit_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5282 entries, 0 to 5281
Data columns (total 8 columns):
outfit_id            5282 non-null object
product_id           5282 non-null object
outfit_item_type     5282 non-null object
brand                5282 non-null object
product_full_name    5282 non-null object
description          5138 non-null object
brand_category       5250 non-null object
details              4657 non-null object
dtypes: object(8)
memory usage: 371.4+ KB


In [11]:
# One row per product; missing text becomes '' so the string helpers below never see NaN.
outfit_text = outfit_full.drop_duplicates(subset='product_id').reset_index(drop=True).fillna('')

# Text preprocessing, applied identically to each free-text column:
# lower-case -> lemmatize -> strip HTML tags -> clean punctuation -> strip non-word characters.
for text_column in ('product_full_name', 'brand_category', 'description', 'details'):
    cleaned_column = outfit_text[text_column].str.lower()
    for cleaning_step in (lem_sentences, cleanHtml, cleanPunc, keepAlpha):
        cleaned_column = cleaned_column.apply(cleaning_step)
    outfit_text[text_column] = cleaned_column

In [12]:
def text_preprocessing(text):
    """Normalise a free-text query and wrap it in a one-element ``pandas.Series``.

    The steps run in the same order that is applied to the catalogue text
    columns of ``outfit_text`` (lower-case, lemmatize, strip HTML, clean
    punctuation, strip non-word characters). Keeping the order identical
    matters: ``cleanPunc`` turns separators such as ``/`` into spaces, so
    running it after ``keepAlpha`` would glue words together ("a/b" -> "ab")
    that the catalogue side keeps apart ("a b").

    Parameters
    ----------
    text : str
        Raw query text.

    Returns
    -------
    pandas.Series
        Series holding the single cleaned string, ready for a vectorizer.
    """
    input_text = text.strip().lower()
    # Lemmatization reduces each word to its base form
    input_text = lem_sentences(input_text)
    # HTML tags carry no information for matching
    input_text = cleanHtml(input_text)
    # Punctuation first (splits on , ( ) /), same as the catalogue pipeline
    input_text = cleanPunc(input_text)
    # Then remove remaining special characters such as trademark signs
    input_text = keepAlpha(input_text)
    return pd.Series(input_text)

# Shared settings: drop English stop words, keep alphanumeric tokens of at
# least 3 characters, and record presence (0/1) rather than counts.
_count_vectorizer_options = {
    "stop_words": "english",
    "token_pattern": r'\b[a-zA-Z0-9]{3,}\b',
    "binary": True,
}
# `vectorizer` is fitted on a catalogue column, `vectorizer_input` on the query text.
vectorizer = CountVectorizer(**_count_vectorizer_options)
vectorizer_input = CountVectorizer(**_count_vectorizer_options)

def _fitted_feature_names(fitted_vectorizer):
    """Return the vocabulary of a fitted vectorizer across scikit-learn versions."""
    # get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out().
    if hasattr(fitted_vectorizer, 'get_feature_names_out'):
        return fitted_vectorizer.get_feature_names_out()
    return fitted_vectorizer.get_feature_names()


def similarity_column(text,outfit_text,column):
    """Score every row of ``outfit_text`` by cosine similarity to a text query.

    Parameters
    ----------
    text : str
        Raw query text; it is cleaned with ``text_preprocessing``.
    outfit_text : pandas.DataFrame
        Frame whose ``column`` holds already-cleaned text.
    column : str
        Name of the text column to compare the query against.

    Returns
    -------
    pandas.DataFrame
        A copy of ``outfit_text`` with an extra ``similarity_score`` column;
        the frame passed in is not modified.

    Notes
    -----
    The module-level ``vectorizer`` and ``vectorizer_input`` are re-fitted on
    every call. Any error raised by ``fit_transform`` (for example when the
    query or the column yields no usable tokens) propagates to the caller.
    """
    # Vectorize the cleaned query on its own vocabulary
    input_text = text_preprocessing(text)
    input_vec = vectorizer_input.fit_transform(input_text)
    input_df = pd.DataFrame(input_vec.toarray(), columns=_fitted_feature_names(vectorizer_input))

    # Vectorize the catalogue column the query is compared against
    X = vectorizer.fit_transform(outfit_text[column].values)
    vectorized_df = pd.DataFrame(X.toarray(), columns=_fitted_feature_names(vectorizer))

    # Align both vocabularies; words missing on one side become 0.
    # The query ends up as the last row of the combined frame.
    compare_df = pd.concat([vectorized_df, input_df], sort=True).fillna(0).reset_index(drop=True)

    # One vectorised call: every catalogue row against the query row
    catalogue_rows = compare_df.iloc[:-1].values
    query_row = compare_df.iloc[[-1]].values
    scores = cosine_similarity(catalogue_rows, query_row).ravel()

    # Work on a copy so the new column is not added to the caller's frame
    outfit_score = outfit_text.copy()
    outfit_score['similarity_score'] = scores
    return outfit_score

In [25]:
def text_based_recommendation(product_name,brand,brand_category,description,details,outfit_text,outfit_df):
    """Recommend one outfit for the catalogue product that best matches a text query.

    Parameters
    ----------
    product_name, brand_category, description, details : str
        Free-text fields; pass ``''`` for fields that are not supplied. Only
        the first non-empty field, in this order, is used for matching, so
        e.g. a product name takes precedence over a description.
    brand : str
        Optional brand. When non-empty it is fuzzy-matched against the brands
        in ``outfit_text`` (tolerating typos) and the search is restricted to
        products of the matched brand.
    outfit_text : pandas.DataFrame
        Per-product frame with cleaned text columns, ``brand`` and ``product_id``.
    outfit_df : pandas.DataFrame
        Outfit combinations with ``outfit_id`` and ``product_id`` columns.

    Returns
    -------
    pandas.DataFrame
        All rows of ``outfit_df`` for one outfit containing the best-matching
        product. If the product is in several outfits, one is chosen at random.

    Raises
    ------
    ValueError
        If all four text fields are empty, so there is nothing to match on.
    """
    # Optionally narrow the search scope to a single (fuzzy-matched) brand
    if brand != '':
        brand_match = process.extractOne(brand, outfit_text['brand'].unique())[0]
        candidates = outfit_text[outfit_text['brand'] == brand_match]
    else:
        candidates = outfit_text

    # Priority order of the text fields and the column each is compared with
    query_priority = (
        (product_name, 'product_full_name'),
        (brand_category, 'brand_category'),
        (description, 'description'),
        (details, 'details'),
    )
    for query_text, column in query_priority:
        if query_text != '':
            break
    else:
        raise ValueError(
            'At least one of product_name, brand_category, description or details must be non-empty.'
        )

    # Product with the largest cosine similarity (first one on ties)
    outfit_score = similarity_column(query_text, candidates, column)
    top_score = outfit_score['similarity_score'].max()
    product_id = outfit_score[outfit_score['similarity_score'] == top_score]['product_id'].values[0]

    # Same outfit selection as the id-based recommendation
    outfit_recommend_group = outfit_df.loc[outfit_df['product_id'] == product_id]['outfit_id'].unique()
    outfit_recommend = np.random.choice(outfit_recommend_group, 1, replace=False)[0]
    return outfit_df.loc[outfit_df['outfit_id'] == outfit_recommend]
        

In [30]:
# Query by product name only; every other text field is left empty
product_name='Slim Knit Skirt'
brand=''
brand_category=''
description=''
details=''
text_based_recommendation(product_name,brand,brand_category,description,details,outfit_text,outfit_df)

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
4,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
5,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
6,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
7,01DMHCX50CFX5YNG99F3Y65GQW,01DMHCNT41E14QWP503V7CT9G6,accessory,Nina,Crystal Clutch


In [31]:
# Query by description only; with no product name or brand category given,
# the description is the field used for matching
product_name=''
brand=''
brand_category=''
description='Sexy silky, a-line mini skirt zipper Benson skirt'
details=''
text_based_recommendation(product_name,brand,brand_category,description,details,outfit_text,outfit_df)

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
14,01DQ63P636Q4BQVCKT6Z4S41G5,01DPCRZWX4S2Z8Q5HYDFM4HNEG,shoe,J.Crew,Pointed-toe flats in suede
15,01DQ63P636Q4BQVCKT6Z4S41G5,01DPET2NWSA221STZF740BZ9SW,top,Veronica Beard,Ashlynn Blouse
16,01DQ63P636Q4BQVCKT6Z4S41G5,01DPKMGJ33SDFXM7XHGPQJWQ12,bottom,Reformation,Benson Skirt
