In [24]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

Для создания рекомендательной модели используем набор данных для рекомендательной системы парфюма, содержащий информацию о названии, изготовителе, составе запаха, url-ссылку на изображение упаковки, а также текстовое описание, на основе которого, мы и будем составлять соотвествующие рекомендации:

Загрузим датасет:

In [6]:
df_perf_all = pd.read_csv('data/perfume.csv', sep=",", encoding = 'ansi')
df_perf_all.head()


Unnamed: 0,Name,Brand,Description,Notes,Image URL
0,Tihota Eau de Parfum,Indult,"Rapa Nui for sugar, Tihota is, quite simply, ...","Vanilla bean, musks",https://static.luckyscent.com/images/products/...
1,Sola Parfum,Di Ser,A tribute to the expanse of space extending f...,"Lavender, Yuzu, Lemongrass, Magnolia, Geraniu...",https://static.luckyscent.com/images/products/...
2,Kagiroi Parfum,Di Ser,An aromatic ode to the ancient beauty of Japa...,"Green yuzu, green shikuwasa, sansho seed, cor...",https://static.luckyscent.com/images/products/...
3,Velvet Fantasy Eau de Parfum,Montale,Velvet Fantasy is a solar fragrance where cit...,"tangerine, pink pepper, black coffee, leat...",https://static.luckyscent.com/images/products/...
4,A Blvd. Called Sunset Eau de Parfum,A Lab on Fire,There's no way A Lab On Fire could relocate t...,"Bergamot, almond, violet, jasmine, leather, s...",https://static.luckyscent.com/images/products/...


In [7]:
df_perf_all.shape

(2191, 5)

Убедимся, что в нашем рабочем датафрейме не будет записей с отсутсвующим текстовым описанием: 

In [8]:
df_perf_with_description = df_perf_all[df_perf_all['Description'].notnull()]
df_perf_with_description = df_perf_with_description[~df_perf_with_description['Description'].str.isspace()]

In [9]:
name = df_perf_with_description['Name'].values
name[0:6]

array(['Tihota Eau de Parfum', 'Sola Parfum', 'Kagiroi Parfum',
       'Velvet Fantasy Eau de Parfum',
       'A Blvd. Called Sunset Eau de Parfum',
       'Freckled and Beautiful Eau de Parfum'], dtype=object)

In [11]:
notes = df_perf_with_description['Notes'].values
notes[0:5]

array([' Vanilla bean, musks',
       ' Lavender, Yuzu, Lemongrass, Magnolia, Geranium, Jasmine, Frankincense, Myrrh',
       ' Green yuzu, green shikuwasa, sansho seed, coriander, ylang-ylang, shiso, rosewood, vetiver, hinoki, cypriol, patchouli, agarwood',
       ' tangerine,  pink pepper,  black coffee,  leather,  violet,  jasmine,  lily of the valley,  heliotrope powder,  vanilla,  amber, sandalwood,  toffee,  musk,  oakmoss',
       ' Bergamot, almond, violet, jasmine, leather, sandalwood, vanilla, tonka'],
      dtype=object)

In [13]:
descr = df_perf_with_description['Description'].values
descr[0:3]

array([" Rapa Nui for sugar, Tihota is, quite simply, The One. The One that will call to you every moment you're not smelling it, The One that you've only had hints of in other vanilla perfumes, The One that lasts and lasts, The One that has perfectly captured the essence of the pure nature of fresh vanilla beans and has harnessed it beyond your wildest dreams. It inspires obsessive devotion with its fragrance of smooth, vanilla bean pods dipped in honeyed water and left to steep. The result is the pure magic, an unbridled vanilla, sweet, raw and achingly desirable. Of course, Tihota isn't the first perfume to focus on the dark sweetness of vanilla... perfumery is filled with vanillas, but this smells like the dream of a master perfumer who was obsessed with vanilla and was finally given free rein to worship the note with no apologies. It is, without a sliver of a doubt, the finest pure vanilla we've ever had the honor of carrying, and it's so beautiful it pains us to put the bottle do

Векторизуем описания с помощью Tf-Idf Vectorizer

In [16]:
tfidfv = TfidfVectorizer()
descr_matrix = tfidfv.fit_transform(descr)
descr_matrix

<2191x18753 sparse matrix of type '<class 'numpy.float64'>'
	with 216102 stored elements in Compressed Sparse Row format>

И с помощью CountVectorizer:

In [42]:
countv = CountVectorizer()
descr_matrix_co = countv.fit_transform(descr)
descr_matrix_co

<2191x18753 sparse matrix of type '<class 'numpy.int64'>'
	with 216102 stored elements in Compressed Sparse Row format>

In [30]:
class SimpleKNNRecommender:
    
    def __init__(self, X_matrix, X_names, X_notes, X_descr):
        """
        Входные параметры:
        X_matrix - обучающая выборка (матрица объект-признак)
        X_ids - массив идентификаторов объектов
        X_title - массив названий объектов
        X_overview - массив описаний объектов
        """
        #Сохраняем параметры в переменных объекта
        self._X_matrix = X_matrix
        self.df = pd.DataFrame(
            {'Perfume Name': pd.Series(X_names, dtype='str'),
            'Notes': pd.Series(X_notes, dtype='str'),
            'Description': pd.Series(X_descr, dtype='str'),
            'Dist': pd.Series([], dtype='float')})
            
            
    def recommend_for_single_object(self, K: int, \
                X_matrix_object, cos_flag = True, manh_flag = False):
        """
        Метод формирования рекомендаций для одного объекта.
        Входные параметры:
        K - количество рекомендуемых соседей 
        X_matrix_object - строка матрицы объект-признак, соответствующая объекту
        cos_flag - флаг вычисления косинусного расстояния
        manh_flag - флаг вычисления манхэттэнского расстояния
        Возвращаемое значение: K найденных соседей
        """
        
        scale = 1000000
        # Вычисляем косинусную близость
        if cos_flag:
            dist = cosine_similarity(self._X_matrix, X_matrix_object)
            self.df['Dist'] = dist * scale
            res = self.df.sort_values(by='Dist', ascending=False)
            # Не учитываем рекомендации с единичным расстоянием,
            # так как это искомый объект
            res = res[res['Dist'] < scale]
        
        else:
            if manh_flag:
                dist = manhattan_distances(self._X_matrix, X_matrix_object)
            else:
                dist = euclidean_distances(self._X_matrix, X_matrix_object)
            self.df['Dist'] = dist * scale
            res = self.df.sort_values(by='Dist', ascending=True)
            # Не учитываем рекомендации с единичным расстоянием,
            # так как это искомый объект
            res = res[res['Dist'] > 0.0]            
        
        # Оставляем К первых рекомендаций
        res = res.head(K)
        return res

Выберем тестовый образец, на основе которого мы будем давать рекомендации:

In [35]:
test_perfume_name = 1000
name[test_perfume_name]

'California Snow Eau de Parfum'

Зададим его матрицу: 

In [37]:
test_perfume_matrix = descr_matrix[test_perfume_name]
test_perfume_matrix

<1x18753 sparse matrix of type '<class 'numpy.float64'>'
	with 146 stored elements in Compressed Sparse Row format>

In [39]:
skr1 = SimpleKNNRecommender(descr_matrix, name, notes, descr)

In [40]:
test = df_perf_with_description.iloc[test_perfume_name]
test

Name                               California Snow Eau de Parfum
Brand                                              A Lab on Fire
Description     California Snow crackles with the electricity...
Notes           Sage, tea, chamomile, coumarin, narcissus, ro...
Image URL      https://static.luckyscent.com/images/products/...
Name: 1000, dtype: object

Делаем рекомендацию на основании описания векторизованного Tf-Idf и косинусного  расстояния:

In [41]:
rec1 = skr1.recommend_for_single_object(15, test_perfume_matrix)
rec1

Unnamed: 0,Perfume Name,Notes,Description,Dist
1000,California Snow Eau de Parfum,"Sage, tea, chamomile, coumarin, narcissus, ro...",California Snow crackles with the electricity...,1000000.0
1900,1 Parfum Extrait,"Fresh tangerine blossom, cassis, neroli bigar...","One of the world's most exclusive fragrances,...",201793.045093
1895,Arso Eau de Parfum,"Leather, incense, pine resin, cedar leaves","The sharp, evocative scent of wood smoke - t...",201572.053201
1921,De Bachmakov Eau de Parfum,"Cedar wood, bergamot, shiso leaves, nutmeg, c...",In a tribute to his Russian ancestry and to c...,183499.803113
1220,Clementine California Cologne Absolue,"Clementine, mandarin, juniper berries, star a...","In terms of sheer, natural joy, how many sens...",181620.621872
845,Woody Mood Eau de Parfum,"bergamot, ginger, clary sage, saffron, sequoi...","With Woody Mood, perfumer Betrand Duchaufour ...",174702.171909
1496,Gypsy Water Hair Perfume,"Bergamot, lemon, pepper, juniper berries, inc...",BYREDO presents a hair perfume collection tha...,174182.876741
1640,Gypsy Water Eau de Parfum,"Bergamot, lemon, pepper, juniper berries, inc...",This entrancing shape-shifter is one of the m...,174153.486327
469,Oliver Eau de Parfum,"bitter orange, mojito, grapefruit, bergamot, ...",Oliver is an original take on the aromatic he...,172561.033154
1151,Sole di Positano Eau de Parfum,"Calabrian bergamot, bitter orange, mandarin, ...",There is nothing like the thrill of a good ci...,171114.096042


In [43]:
test_perfume_matrix_co = descr_matrix_co[test_perfume_name]
test_perfume_matrix_co

<1x18753 sparse matrix of type '<class 'numpy.int64'>'
	with 146 stored elements in Compressed Sparse Row format>

In [44]:
skr2 = SimpleKNNRecommender(descr_matrix_co, name, notes, descr)

Делаем рекомендации по описаниям векторизованным CountVectorizer и на основе Евклидова расстояния:

In [46]:
rec2 = skr2.recommend_for_single_object(15, test_perfume_matrix_co, cos_flag = False)
rec2

Unnamed: 0,Perfume Name,Notes,Description,Dist
1895,Arso Eau de Parfum,"Leather, incense, pine resin, cedar leaves","The sharp, evocative scent of wood smoke - t...",18466190.0
182,L'Etre Aime - Homme Eau de Parfum,"bergamot, lavender, ginger, basil, cardamom, ...","Yvon Mouchel, dedicated the 7th addition to t...",19026300.0
1664,Hinoki Eau de Toilette,"Cypress, turpentine, camphor, cedar, thyme, p...",Hinoki takes its name from the hinoki cypress...,19209370.0
1437,Ambre Gris Eau de Parfum,"Davana, rose, geranium, sandalwood, cedar, am...","Ambergris, the legendarily rare ingredient pr...",19287300.0
820,And The World Is Yours Extrait de Parfum,"Neroli, cumin, orange blossom absolute, rose,...",Few photographs capture the sumptuousness of ...,19313210.0
1170,Brooklyn Eau de Parfum,"Bergamot, squeezed lemon, orange juice, incen...","The beating creative heart of the East Coast,...",19339080.0
20,Gris Charnel Eau de Parfum,"Fig, Black tea, Cardamom essence, Absolute of...","Quai Saint-Bernard, near the Jardin des Plant...",19364920.0
565,Oeilleres Parfum Extrait,"Eucalyptus, broom, chamomile, lavender, cumin...",“I want to echo the flowers I photograph witho...,19416490.0
2071,Tubereuse Couture Eau de Parfum,"kalamanzi oil, green jasmine shoots, ylang-yl...","Created in the spirit of High Fashion, Tubere...",19442220.0
609,Arancia di Sicilia Eau de Parfum,Blood orange (brown extraction & sfuma torchi...,In perfumery the essential oil from bitter or...,19442220.0
