# TF-IDF

In [1]:
import pandas as pd
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

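Import the product file and create the "all" column, which combines the "brand_category", "name" and "description" text of each product.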

In [2]:
# Load the pre-processed product catalogue.
product = pd.read_csv("processed_product.csv")

# Build the free-text "all" field that is vectorized later.
# DataFrame.replace without regex=True matches WHOLE cell values, so a cell
# that consists solely of a placeholder ("unknown_token" / "unknown") becomes
# an empty string while ordinary text is left untouched.
product['all'] = (
    product[['brand_category', 'name', 'description']]
    .astype(str)
    .replace(["unknown_token", "unknown"], "")
    .agg(" ".join, axis=1)
)

# Collapse whitespace runs left behind by emptied fields. This must go through
# the .str accessor with regex=True: Series.replace("\s\s", ...) only matches
# cells whose entire value is the literal text "\s\s", so it never fired.
product['all'] = product['all'].str.replace(r"\s+", " ", regex=True).str.strip()

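Import the outfit file, lower-case the product names, merge the numbered accessory types into a single "accessory" type, and drop rows whose product name is invalid. (The "all" column, combining the "name", "brand_category" and "description" information, is created on the product dataframe above.)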

In [3]:
# Load the curated outfit combinations.
outfit = pd.read_csv('outfit_combinations.csv')

# Product names are lower-cased before they are vectorized.
outfit['product_full_name'] = outfit['product_full_name'].str.lower()

# Merge accessory1/accessory2/accessory3 into one "accessory" type.
# The character class keeps both \b anchors applied to every alternative
# (in "\ba1|a2|a3\b" they only bind to the first and last one), and na=False
# keeps the mask boolean when an item type is missing.
outfit.loc[
    outfit['outfit_item_type'].str.contains(r"\baccessory[123]\b", na=False),
    'outfit_item_type',
] = "accessory"

# Drop spreadsheet-error names, then renumber the rows so the frame's index
# matches the row positions of the TF-IDF matrix built from it.
outfit = outfit[outfit['product_full_name'] != "#name?"].reset_index(drop=True)

Vectorize "all" values in "product" dataframe and "product_full_name" values in "outfit" dataframe


In [4]:
def _feature_columns(fitted_vectorizer):
    """Return the fitted vectorizer's terms ordered by their column index.

    Reads the fitted ``vocabulary_`` mapping (term -> column index) instead of
    calling ``get_feature_names()``, which was removed in scikit-learn 1.2.
    """
    vocabulary = fitted_vectorizer.vocabulary_
    return sorted(vocabulary, key=vocabulary.get)


# TF-IDF representation of the combined product text ("all" column).
vectorizer_info = TfidfVectorizer(max_features=1000)
info_vectorizer = vectorizer_info.fit_transform(product["all"].values)
info_tf_idf = pd.DataFrame(info_vectorizer.toarray(), columns=_feature_columns(vectorizer_info))

# TF-IDF representation of the outfit product names; rows follow the row
# order of the `outfit` dataframe.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(outfit.product_full_name.values)
tf_idf = pd.DataFrame(X.toarray(), columns=_feature_columns(vectorizer))

**Logic of Recommending Outfit:**

Calculate the cosine similarity scores between query and all products and find the most similar product, which has highest score.

1) If the product with the highest score has no category value, the query can hardly be assigned to a clothing category, so we cannot give any outfit recommendation.

2) Else: if the max score is above the good threshold, we check whether the most similar product is in the outfit dataframe. If the product already exists in an outfit, the function returns that outfit from the outfit table.

3) Else: if the query matches a similar product in the outfit table based on cosine similarity, the function returns that product's outfit combination from the outfit table.

4) Else: if the query matches two further products in the product table based on cosine similarity, each from a different category than the most similar product and from each other, the function returns these two products together with the most similar product as the outfit.

Good Threshold for all cosine similarity measurements is 0.4.

In [5]:
_NO_OUTFIT_MESSAGE = "Sorry, we cannot recommend an appropriate outfit. Maybe you can try another input."


def _vocabulary_columns(fitted_vectorizer):
    """Return the fitted vectorizer's terms ordered by their column index."""
    vocabulary = fitted_vectorizer.vocabulary_
    return sorted(vocabulary, key=vocabulary.get)


def _similarity_to_query(corpus_tf_idf, query_tf_idf):
    """Cosine similarity of every corpus row against the single query row.

    The two frames are aligned on the union of their term columns (missing
    terms count as 0) and compared in one vectorized call.
    """
    combined = pd.concat([corpus_tf_idf, query_tf_idf], sort=True).fillna(0).values
    return cosine_similarity(combined[:-1], combined[-1:]).ravel()


def _print_outfit(items, headline):
    """Print `headline` followed by type, brand and name of every outfit item."""
    print(headline)
    for item_type, brand, name in zip(items['outfit_item_type'], items['brand'], items['product_full_name']):
        print(item_type, ":\n\tbrand: ", brand, "\n\tproduct: ", name)


def recommend_outfit(query, threshold=0.4):
    """Print an outfit recommendation for a free-text product description.

    Steps, in order:
      1. Score the query against every product; if the best match has the
         category "UNKNOWN_TOKEN", give up.
      2. If the best match scores above `threshold` and its product id occurs
         in the outfit table, print the outfit it belongs to.
      3. Otherwise score the query against the outfit product names and, if
         the best score is above `threshold`, print that product's outfit.
      4. Otherwise assemble an outfit from the best match plus the best
         products of two other distinct categories, provided both of those
         score above `threshold`.

    Parameters
    ----------
    query : str
        Description of the item to build an outfit around.
    threshold : float, default 0.4
        Minimum cosine similarity for a match to count as good.

    Returns
    -------
    None
        The recommendation (or an apology) is printed.

    Notes
    -----
    Reads the module-level `product`, `outfit`, `info_tf_idf` and `tf_idf`.
    The query is vectorized by a TfidfVectorizer fitted on the query alone.
    NOTE(review): transforming with the corpus vectorizers would apply the
    corpus idf weights, but it changes the scores, so the threshold would
    need re-tuning.
    """
    # Normalize the query: lower-case, drop punctuation and stopwords.
    # The corpus file id is lower-case "english"; "English" only resolves on
    # case-insensitive filesystems. Load it once instead of once per token.
    english_stopwords = set(stopwords.words("english"))
    tokens = [word.lower() for word in nltk.word_tokenize(query)]
    kept = [tok for tok in tokens if tok not in string.punctuation and tok not in english_stopwords]
    query = " ".join(kept)

    vectorizer_q = TfidfVectorizer(max_features=1000)
    try:
        query_matrix = vectorizer_q.fit_transform(pd.Series(query))
    except ValueError:
        # Nothing usable is left in the query (empty vocabulary).
        print(_NO_OUTFIT_MESSAGE)
        return
    query_tf_idf = pd.DataFrame(query_matrix.toarray(), columns=_vocabulary_columns(vectorizer_q))

    # Rank all products by similarity to the query, best first.
    product_score = product.copy()
    product_score['similarity_score'] = _similarity_to_query(info_tf_idf, query_tf_idf)
    product_score = product_score.sort_values("similarity_score", ascending=False)

    best_row = product_score.iloc[0]
    productid = best_row['product_id']
    category = best_row['product_category']

    if category == "UNKNOWN_TOKEN":
        print(_NO_OUTFIT_MESSAGE)
        return

    if (productid in outfit.product_id.values) and (best_row['similarity_score'] > threshold):
        # The best-matching product is already part of a curated outfit.
        best_outfit = outfit.loc[outfit['product_id'] == productid, 'outfit_id'].iloc[0]
        output = outfit[outfit['outfit_id'] == best_outfit].drop(columns=['outfit_id']).reset_index(drop=True)
        _print_outfit(output, "Wow! You get a perfect outfit!")
        return

    # Score the query against the outfit product names (rows are positional).
    outfit_score = outfit.copy()
    outfit_score['similarity_score'] = _similarity_to_query(tf_idf, query_tf_idf)

    if outfit_score['similarity_score'].max() > threshold:
        # First row holding the maximum score.
        best_position = outfit_score['similarity_score'].values.argmax()
        best_outfit = outfit_score['outfit_id'].iloc[best_position]
        output = outfit[outfit['outfit_id'] == best_outfit].drop(columns=['outfit_id']).reset_index(drop=True)
        _print_outfit(output, "Here is the best outfit we can recommend for you. Hope you like it~")
        return

    # Fall back to building an outfit from the product table: best products of
    # two further categories, each different from `category` and each other.
    known = product_score[product_score['product_category'] != "UNKNOWN_TOKEN"]
    second_pool = known[known['product_category'] != category]
    if second_pool.empty:
        print(_NO_OUTFIT_MESSAGE)
        return
    second_row = second_pool.iloc[0]

    third_pool = second_pool[second_pool['product_category'] != second_row['product_category']]
    if third_pool.empty:
        print(_NO_OUTFIT_MESSAGE)
        return
    third_row = third_pool.iloc[0]

    if second_row['similarity_score'] > threshold and third_row['similarity_score'] > threshold:
        chosen_ids = [productid, second_row['product_id'], third_row['product_id']]
        output = product_score[product_score['product_id'].isin(chosen_ids)]\
            .rename(columns={'product_category': 'outfit_item_type', 'name': 'product_full_name'})
        _print_outfit(output, "Here is the best outfit we can recommend for you. Hope you like it~")
    else:
        print(_NO_OUTFIT_MESSAGE)

**Example 1**

In [6]:
recommend_outfit("slim fitting, straight leg pant with a center back zipper and slightly cropped leg")

Here is the best outfit we can recommend for you. Hope you like it~
top :
	brand:  Jacquemus 
	product:  double-layer paneled blouse
shoe :
	brand:  COACH 
	product:  heather c-chain leopard-print calf hair & leather sandals
bottom :
	brand:  Prada 
	product:  cropped wool straight-leg pants
accessory :
	brand:  Chylak 
	product:  croc-effect leather belt bag
accessory :
	brand:  LE 17 SEPTEMBRE 
	product:  cotton-blend twill trench coat


**Example 2**

In [7]:
recommend_outfit("pink shirt crewneck")

Here is the best outfit we can recommend for you. Hope you like it~
bottom :
	brand:  Goldsign 
	product:  the high rise slim
shoe :
	brand:  GOLDEN GOOSE 
	product:  superstar low top sneaker
top :
	brand:  MADEWELL 
	product:  whisper cotton ribbed crewneck t-shirt


**Example 3**

In [8]:
recommend_outfit("high rise straight leg jeans")

Here is the best outfit we can recommend for you. Hope you like it~
bottom :
	brand:  Re/done 
	product:  high-rise straight-leg jeans
shoe :
	brand:  Alexandre Birman 
	product:  clarita bow-embellished suede sandals
top :
	brand:  Ulla Johnson 
	product:  harper cotton eyelet blouse
