# Fashion Recommender

Front End

Sep 2021 - new and updated version


The Flow

- Front End
- Calls API - Middle layer (Reads and processes data file )
- Recommendation Algorithm
- Color Algorithm

In [50]:
color_choice = "#00000"
fabric_choice = "linen"
number_of_rec = 1

recommendationEngine(color_choice, fabric_choice)

Unnamed: 0,title,desc,score,url,keywords,color
0,saree maruti,Women's Linen Saree With Blouse Piece,0.55,./images/download_filename_1.jpg,linen blouse maruti,red
0,saree maruti,Women's Linen Saree With Blouse Piece,0.573,./images/download_filename_1.jpg,linen blouse maruti,black


In [34]:
def recommendationEngine(color_choice, fabric_choice, N=1):
    #color_name = find_name(color_choice)
    color_name = "red"
    parseDataFile()
    createModel()
    search_term = color_name + " " + fabric_choice
    #getRecommendations(search_term, N=5)
    df1 = getRecommendations(search_term, N)
    df1['color'] = color_name
    complementary = complementaryColor(color_name)
    search_term = complementary + " " + fabric_choice
    df2 = getRecommendations(search_term, N)
    df2['color'] = complementary
    frames = [df1, df2]
    result = pd.concat(frames)
    return result
    

# Build the intermediate layer

In [10]:
# path to all files
DATA_PATH = "./data/saree_data.csv"
CLEAN_PATH = "./data/data_parsed_new.csv"
TFIDF_ENCODING_PATH = "./model/data_tfidf_encodings.pkl"
TFIDF_MODEL_PATH = "./model/data_tfidf.pkl"

In [4]:
# Load EDA
import numpy as np
import pandas as pd 
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel

# other packages
import nltk
import string
import ast
import re
import unidecode
import unicodedata

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from collections import Counter

import pickle 
import unidecode, ast

In [47]:
import pandas as pd

COLUMN_NAME = ['title', 'desc', 'keywords', 'url','score']

# Load Our Dataset
def loadData(fileName): 
    df = pd.read_csv(fileName)
    return df 

#save the file
def saveData(df, fileName): 
    df.to_csv(fileName, index=False)  
    

def parseDataFile():
    
    # parses the data into words
    rec_df = loadData(DATA_PATH)
    rec_df[COLUMN_NAME[1]] = rec_df['ingredients']
    
    # change the way the sentence is arranged in the data
    rec_df['ingredients'] = rec_df['ingredients'].map(str) + ',' + rec_df['recipe_name'].map(str)
    rec_df['ingredients'] = rec_df['ingredients'].str.split()

    rec_df['ingredients_parsed'] = rec_df['ingredients'].apply(lambda x: ingredient_parser(x))

    df = rec_df[['recipe_name', 'desc', 'ingredients_parsed', 'ingredients', 'recipe_urls']]
    df = rec_df.dropna()
    # delete the ingredients column
    df.drop(['ingredients'], axis=1,inplace=True)
    # rename all the columns 
    df.rename(columns={'recipe_name': COLUMN_NAME[0], 'ingredients_parsed': COLUMN_NAME[2], 'recipe_urls': COLUMN_NAME[3]}, inplace=True)

    saveData(df,CLEAN_PATH)

def createModel():
   
    # load in parsed recipe dataset     
    df_rec = loadData(CLEAN_PATH)
    df_rec[COLUMN_NAME[2]] = df_rec.desc.values.astype('U')

    # TF-IDF feature extractor 
    tfidf = TfidfVectorizer()
    tfidf.fit(df_rec[COLUMN_NAME[2]])
    tfidf_recipe = tfidf.transform(df_rec[COLUMN_NAME[2]])

    # ------
    #Printing the feature names
    #print(tfidf.get_feature_names())
    #matrix = tfidf_recipe.todense()
    #tfidf_list = matrix.tolist()
    #tfidf_df = pd.DataFrame(tfidf_list, columns = vectorizer.get_feature_names())
    #print(tfidf_df)
    # ------
    
    # save the tfidf model and encodings 
    with open(TFIDF_MODEL_PATH, "wb") as f:
        pickle.dump(tfidf, f)

    with open(TFIDF_ENCODING_PATH, "wb") as f:
        pickle.dump(tfidf_recipe, f)


def getRecommendations(ingredients, N):
    """
    The reccomendation system takes in a list of ingredients and returns a list of top 5 
    recipes based of of cosine similarity. 
    :param ingredients: a list of ingredients
    :param N: the number of reccomendations returned 
    :return: top 5 reccomendations for cooking recipes
    """
    # load in tdidf model and encodings 
    with open(TFIDF_ENCODING_PATH, 'rb') as f:
        tfidf_encodings = pickle.load(f)

    with open(TFIDF_MODEL_PATH, "rb") as f:
        tfidf = pickle.load(f)

    # parse the ingredients using the ingredient_parser 
    try: 
        ingredients_parsed = ingredient_parser(ingredients)
    except:
        ingredients_parsed = ingredient_parser([ingredients])
    
    # use our pretrained tfidf model to encode our input ingredients
    ingredients_tfidf = tfidf.transform([ingredients_parsed])

    # calculate cosine similarity between actual recipe ingreds and test ingreds
    cos_sim = map(lambda x: cosine_similarity(ingredients_tfidf, x), tfidf_encodings)
    scores = list(cos_sim)

    # Filter top N recommendations 
    filtered_recommendations = filterRecommendations(N, scores)
    return filtered_recommendations


# Top-N recomendations order by score
def filterRecommendations(N, scores):
    # load in recipe dataset 
    df_rec = loadData(CLEAN_PATH)
    # order the scores with and filter to get the highest N scores
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:N]
    # create dataframe to load in recommendations 
    # added "dtype=" to fix a pandas dataframe error
    #recommendation = pd.DataFrame(columns = ['recipe', 'desc', 'ingredients', 'score', 'url'], dtype=object)
    recommendation = pd.DataFrame(columns = [COLUMN_NAME[0], COLUMN_NAME[1], COLUMN_NAME[4], COLUMN_NAME[3]], dtype=object)
    #print (recommendation)
    count = 0
    for i in top:
        #recommendation.at[count, 'url'] = df_rec['recipe_urls'][i]
        recommendation.at[count, COLUMN_NAME[3]] = "./images/download_filename_1.jpg"
        recommendation.at[count, COLUMN_NAME[0]] = title_parser(df_rec[COLUMN_NAME[0]][i])
        recommendation.at[count, COLUMN_NAME[1]] = title_parser(df_rec[COLUMN_NAME[1]][i])
        recommendation.at[count, COLUMN_NAME[2]] = df_rec[COLUMN_NAME[2]][i]
        recommendation.at[count, COLUMN_NAME[4]] = "{:.3f}".format(float(scores[i])) #error here?
        count += 1
    return recommendation

# this is the parser algorithm 

In [49]:
#Initialising stopwords for english
stop_words = set(stopwords.words('english'))

# neaten the ingredients being outputted
# this is not used anymore 
def ingredient_parser_final(ingredient):
    
    if isinstance(ingredient, list):
        ingredients = ingredient
    else:
        ingredients = ast.literal_eval(ingredient)
    
    ingredients = ','.join(ingredients)
    ingredients = unidecode.unidecode(ingredients)
    return ingredients

def title_parser(title):
    title = unidecode.unidecode(title)
    return title 

def ingredient_parser(ingreds):
    
    #showStatus("ingredient parser")
    words_to_remove = ['(',')','.','\'','saree', 'matching', 'ba', 'gld', 'without', 'women', 'woman','shubh','self','fresh', 'trendz','oil', 'a', 'and',  'or',  'large', 'extra',  'free', 'small', 'from', 'higher', 'for', 'finely', 'freshly', 'to', 'organic', 'the', 'plain', 'plus' ]
    # The ingredient list is now a string so we need to turn it back into a list. We use ast.literal_eval
    if isinstance(ingreds, list):
        ingredients = ingreds
    else:
        ingredients = ast.literal_eval(ingreds)
    # We first get rid of all the punctuation. We make use of str.maketrans. It takes three input 
    # arguments 'x', 'y', 'z'. 'x' and 'y' must be equal-length strings and characters in 'x'
    # are replaced by characters in 'y'. 'z' is a string (string.punctuation here) where each character
    #  in the string is mapped to None. 
    translator = str.maketrans('', '', string.punctuation)
    lemmatizer = WordNetLemmatizer()
    ingred_list = []
    for i in ingredients:
        i.translate(translator)
        # We split up with hyphens as well as spaces
        items = re.split(' |-', i)
        # Get rid of words containing non alphabet letters
        items = [word for word in items if word.isalpha()]
        # Turn everything to lowercase
        items = [word.lower() for word in items]
      
        # remove stop words
        items = [word for word in items if word not in stop_words]
        
        # remove accents
        items = [unidecode.unidecode(word) for word in items] #''.join((c for c in unicodedata.normalize('NFD', items) if unicodedata.category(c) != 'Mn'))
        items = [unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore') for word in items]
        
        # Lemmatize words so we can compare words to measuring words
        items = [lemmatizer.lemmatize(word) for word in items]
        # Get rid of common easy words
        items = [word for word in items if word not in words_to_remove]
        # remove all square brackets
        items = [remove_between_square_brackets(word) for word in items]
        # remove all special characters
        items = [remove_special_characters(word) for word in items]
        if items:
            ingred_list.append(' '.join(items)) 
    ingred_list = " ".join(ingred_list)
    return ingred_list

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text



# Other algorithms for testing


In [36]:
from sklearn.feature_extraction.text import CountVectorizer
df_rec = loadData(CLEAN_PATH)
news_articles_temp = df_rec.copy()
headline_vectorizer = CountVectorizer()
headline_features   = headline_vectorizer.fit_transform(news_articles_temp[COLUMN_NAME[2]])
headline_features

<1000x1132 sparse matrix of type '<class 'numpy.int64'>'
	with 7668 stored elements in Compressed Sparse Row format>

In [40]:
# bag of words model
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

def bag_of_words_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(headline_features,headline_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'desc':news_articles_temp['desc'][indices].values,
               'headline':news_articles_temp['title'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried details","="*30)
    print('headline : ',news_articles_temp['title'][indices[0]])
    print("\n","="*25,"Recommended : ","="*23)
    #print(df.head(5))
    return df.iloc[1:,]
    #return df.iloc[1:,]

bag_of_words_based_model(1, 3) # Change the row index for any other queried article

headline :  saree trilok fab



Unnamed: 0,desc,headline,Euclidean similarity with the queried article
1,Silk Saree with Blouse Piece,saree satrani,1.732051
2,Women's kanchipuram Silk Saree With Blouse Piece,saree varni fab,1.732051


In [41]:
# using TF-IDF method
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(news_articles_temp[COLUMN_NAME[2]])

In [42]:
def tfidf_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(tfidf_headline_features,tfidf_headline_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'desc':news_articles_temp['desc'][indices].values,
               'headline':news_articles_temp['title'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles_temp['title'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    
    #return df.iloc[1:,1]
    return df.iloc[1:,]
tfidf_based_model(1, 2)

headline :  saree trilok fab



Unnamed: 0,desc,headline,Euclidean similarity with the queried article
1,Traditional Readymade Saree Blouse RED,blouse fab 2 fashion,1.164623


In [13]:
# color functionality 

import re
re_color = re.compile('#([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})')
from math import sqrt

def color_to_rgb(color):
    return tuple(int(x, 16) / 255.0 for x in re_color.match(color).groups())

def similarity(color1, color2):
    """Computes the pearson correlation coefficient for two colors. The result
    will be between 1.0 (very similar) and -1.0 (no similarity)."""
    c1 = color_to_rgb(color1)
    c2 = color_to_rgb(color2)

    s1 = sum(c1)
    s2 = sum(c2)
    sp1 = sum(map(lambda c: pow(c, 2), c1))
    sp2 = sum(map(lambda c: pow(c, 2), c2))
    sp = sum(map(lambda x: x[0] * x[1], zip(c1, c2)))

    try:
            computed = (sp - (s1 * s2 / 3.0)) / sqrt((sp1 - pow(s1, 2) / 3.0) * (sp2 - pow(s2, 2) / 3.0))
    except:
            computed = 0
    
    return computed

color_names = {
    '#000000': 'black',
    '#ffffff': 'white',
    '#808080': 'dark gray',
    '#b0b0b0': 'light gray',
    '#ff0000': 'red',
    '#800000': 'dark red',
    '#00ff00': 'green',
    '#008000': 'dark green',
    '#0000ff': 'blue',
    '#000080': 'dark blue',
    '#ffff00': 'yellow',
    '#808000': 'olive',
    '#00ffff': 'cyan',
    '#ff00ff': 'magenta',
    '#800080': 'purple'
    }

def find_name(color):
    sim = [(similarity(color, c), name) for c, name in color_names.items()]
    return max(sim, key=lambda x: x[0])[1]



import random 

def complementaryColor(color_choice):
    random_choice = ['red','blue','black','white','green','olive']
    return random.choice(random_choice)

# end color functionality