In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('hebe_dane.csv')

In [3]:
df

Unnamed: 0,ProductName,Ingredients,Rate,Price,Reviews,BrandName,PriceRange
0,LaQ Kozioł,"Aqua, Sodium Coco-Sulfate, Coco-Glucoside, Coc...",0.0,19.99,0,LaQ,medium
1,Chi Infra,"Aqua,Water,Eau, Sodium C14-16 Olefin Sulfonate...",5.0,49.99,1,Chi,high
2,Barwa Naturalna,"Aqua,Sodium Laureth Sulfate,Cocamidopropyl Bet...",4.4,6.99,80,Barwa,low
3,Alfaparf Semi Di Lino Reconstruction,"Aqua,Water, Disodium Laureth Sulfosuccinate, S...",0.0,53.51,0,Alfaparf,high
4,Head&Shoulders Classic Clean,"Aqua,Sodium Laureth Sulfate,Sodium Lauryl Sulf...",1.5,13.99,2,brak,low
...,...,...,...,...,...,...,...
877,Hebe Cosmetics Volumizing Shampoo,"Aqua, Sodium Coco-Sulfate, Decyl Glucoside, Co...",4.3,14.99,18,Hebe Cosmetics,low
878,Hebe Cosmetics Smoothing Shampoo,"Aqua, Sodium Coco-Sulfate, Decyl Glucoside, Co...",4.7,14.99,9,Hebe Cosmetics,low
879,Radical,"Aqua ,Water,, Sodium Laureth Sulfate, Cocamido...",3.7,12.99,3,Radical,low
880,Green Garden,"ulfate, ocamidopropyl Betaine, Coamide Dea, So...",0.0,29.00,0,Green Garden,medium


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 882 entries, 0 to 881
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductName  882 non-null    object 
 1   Ingredients  882 non-null    object 
 2   Rate         882 non-null    float64
 3   Price        882 non-null    float64
 4   Reviews      882 non-null    int64  
 5   BrandName    882 non-null    object 
 6   PriceRange   882 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 48.4+ KB


In [5]:
ingredient_idx = {}
corpus = []
idx = 0

# tokenizacja tekstu za pomocą pętli
for i in range(len(df)):    
    ingredients = df['Ingredients'][i]
    ingredients_lower = ingredients.lower()
    tokens = ingredients_lower.split(', ')
    corpus.append(tokens)
    for ingredient in tokens:
        if ingredient not in ingredient_idx:
            ingredient_idx[ingredient] = idx
            idx += 1
            
# sprawdzanie wyniku
print("The index for Aqua is", ingredient_idx['aqua'])
print("The index for Sodium Coco-Sulfate is", ingredient_idx['sodium coco-sulfate'])

The index for Aqua is 0
The index for Sodium Coco-Sulfate is 1


In [6]:
# uzyskanie ilości zmiennych i tokenów 
M = len(df)
N = len(ingredient_idx)
# stworzenie macierzy z zerami
A = np.zeros((M,N))

In [7]:
M

882

In [8]:
N

2903

In [9]:
# definiowanie funkcji oh_encoder do zliczania tokenów dla każdego wiersza
def oh_encoder(tokens):
    x = np.zeros(N)
    for ingredient in tokens:
        idx = ingredient_idx[ingredient]
        x[idx] = 1
    return x

In [10]:
# Tworzenie macierzy (1: składnik występuje, 0: składnik nie występuje)
i = 0
for tokens in corpus:
    A[i, :] = oh_encoder(tokens)
    i +=1

In [11]:
A

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [12]:
from sklearn.manifold import TSNE

In [13]:
# Zmniejszenie wymiaru danych za pomocą t-SNE
model = TSNE(n_components=2,learning_rate=200,random_state=123)
tsne_features =model.fit_transform(A)

df['X'] = tsne_features[:,0]
df['Y'] = tsne_features[:,1]

In [14]:
from bokeh.io import show, output_notebook, push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool

In [15]:
# Stworzenie wykresu punktowego
output_notebook()
source = ColumnDataSource(df)
plot = figure(x_axis_label = 'T-SNE 1', 
              y_axis_label = 'T-SNE 2', 
              width = 1000, height = 900)
plot.circle(x = 'X', 
    y = 'Y', 
    source = source, 
    size = 10, color = '#FF7373', alpha = .8)


hover = HoverTool(tooltips = [('Item', '@ProductName'),
                              ('Brand', '@BrandName'),
                              ('Price', 'zł @Price'),
                              ('Rate', '@Rate')])
plot.add_tools(hover)

In [17]:
show(plot)

In [18]:
# Sprawdzenie składników oraz danych dla dwóch podobnych produktów
cosmetic_1 = df[df['ProductName'] == "Dessange Professional Hair Luxury Reveil Color"]
cosmetic_2 = df[df['ProductName'] == "Matrix Total Results Dark Envy"]

display(cosmetic_1)
print(cosmetic_1.Ingredients.values)
display(cosmetic_2)
print(cosmetic_2.Ingredients.values)

Unnamed: 0,ProductName,Ingredients,Rate,Price,Reviews,BrandName,PriceRange,X,Y
782,Dessange Professional Hair Luxury Reveil Color,"Aqua,Water, Sodium Laureth Sulfate, Dimethico...",4.4,34.99,5,brak,high,75.485809,16.763231


[' Aqua,Water, Sodium Laureth Sulfate, Dimethicone, Coco-Betaine, Sodium Chloride, Glycol Distearate, Cocamide Mipa, Fumaric Acid, Rosa Canina Flower Extract, Carbomer, Alpha-Isomethyl Ionone, Sodium Benzoate, Sodium Hydroxide, Salicylic Acid, Coumarin, Citric Acid, Guar Hydroxypropyltrimonium Chloride, Caprylic,Capric Glycerides, Hexylene Glycol, Ethylhexyl Salicylate, Benzyl Salicylate, Benzyl Benzoate, Benzyl Alcohol, Parfum,Fragrance ,F,I,L, C231998,1,']


Unnamed: 0,ProductName,Ingredients,Rate,Price,Reviews,BrandName,PriceRange,X,Y
483,Matrix Total Results Dark Envy,"AQUA , WATER , EAU, SODIUM LAURETH SULFATE, DI...",0.0,50.0,0,Matrix,high,83.756294,14.701017


['AQUA , WATER , EAU, SODIUM LAURETH SULFATE, DIMETHICONE, COCO-BETAINE, SODIUM CHLORIDE, GLYCOL DISTEARATE, PARFUM , FRAGRANCE, COCAMIDE MIPA, SODIUM BENZOATE, SODIUM HYDROXIDE, CITRIC ACID, HEXYLENE GLYCOL, CI 60730 , ACID VIOLET 43, SALICYLIC ACID, GUAR HYDROXYPROPYLTRIMONIUM CHLORIDE, CI 19140 , ACID YELLOW 23, CARBOMER, BENZYL SALICYLATE, LINALOOL, BENZYL ALCOHOL, LIMONENE, COUMARIN, HYDROXYCITRONELLAL, GERANIOL, FUMARIC ACID,']
