In [15]:
import pandas as pd 
import numpy as np
%pip install gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
df= pd.read_csv("shirinliklar.csv")
df.head(5)

Unnamed: 0,#,name,type,description
0,1,Shokoladli tort,tort,Yumshoq biskvit asosida tayyorlangan shokoladl...
1,2,Qora shokoladli tort,tort,Qora shokolad qo‘shilgan to‘yimli tort bayraml...
2,3,Qulupnayli tort,tort,Yangi qulupnay va vanilli krem bilan tayyorlan...
3,4,Vanilli tort,tort,Vanil aromatli biskvit va sutli krem bilan tay...
4,5,Karamelli tort,tort,Karamel sousi va qaymoqli krem bilan bezatilga...


In [17]:
import re

In [18]:
def clean_text_uz(text):
    text= text.lower()

    text = text.replace("’", "").replace("‘", "").replace("'", "").replace("ʻ", "")

    text= re.sub(r"[^a-z0-9\s]", " ", text)
    text= re.sub(r"\s+", " ", text).strip()

    return text

In [19]:
df['full_text'] = df['name'] + ' ' + df['type'] + ' ' + df['description']
df['clean_text'] = df['full_text'].apply(clean_text_uz)

In [20]:
uz_stopwords= {'tort',
    'va', 'bilan', 'uchun', 'ham', 'bu', 'shu', 'juda', ' eng', 'tort', 'mazali', 'shirin', 'bo‘ladi', 'qiladi', 'tayyorlanadi'}

def tokenize_and_remove_stopwords(text):
    tokens= simple_preprocess(text, min_len=2)
    tokens= [t for t in tokens if t not in uz_stopwords]
    return tokens

df['tokens']= df['clean_text'].apply(tokenize_and_remove_stopwords)

In [21]:
w2v_model= Word2Vec(
    sentences=df['tokens'],
    vector_size= 100,
    window= 5,
    min_count= 2,
    workers= 4,
    sg= 1
)

In [22]:
w2v_model.wv.most_similar('shokolad')

[('kruassan', 0.42651882767677307),
 ('shirinlik', 0.4211154580116272),
 ('asalli', 0.3699701130390167),
 ('qatlamli', 0.3285506069660187),
 ('kokos', 0.3227229118347168),
 ('donut', 0.31213313341140747),
 ('kakao', 0.30850720405578613),
 ('ichiga', 0.3071442246437073),
 ('pahlava', 0.30684158205986023),
 ('yumshoq', 0.3055775463581085)]

In [23]:
w2v_model.wv['shokolad']

array([ 9.8011503e-03, -2.6168087e-03,  8.5570207e-03,  9.2596458e-03,
       -3.1976465e-03, -4.0121106e-03,  6.9147320e-03,  1.3117241e-03,
       -8.3320299e-03, -8.0592530e-03, -1.8803857e-03, -3.3819561e-03,
        5.4687522e-03, -8.4143848e-04,  4.3500171e-04,  1.6145108e-03,
        5.7624658e-03, -1.7701430e-03, -7.7718734e-03, -1.7109396e-02,
        1.5152640e-03,  6.5498119e-03,  2.6404448e-03, -1.2262376e-02,
       -7.9362523e-03,  6.4635742e-03, -5.5073099e-03,  5.9814230e-03,
       -8.6057410e-03,  8.7287268e-03, -4.3389718e-03,  2.5411460e-03,
        3.6043706e-03, -9.5374864e-03, -5.6031388e-03,  1.9091691e-03,
       -4.9470314e-03, -3.5738205e-03, -8.3200196e-03, -7.2743429e-04,
        9.5298905e-03, -4.9584010e-03, -2.3738013e-03,  3.2271880e-03,
        1.1244250e-02, -6.9871168e-03, -7.8964001e-03, -5.2782260e-03,
        1.0734234e-02,  2.0114898e-03, -8.3796130e-03, -3.1353289e-03,
        8.3815781e-03, -7.0857182e-03, -5.5870879e-03,  1.0422828e-02,
      

In [24]:
def get_doc_vector(tokens, model):
    vectors= [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if len(vectors) >0 else np.zeros(model.vector_size)
df['product_vector']= df['tokens'].apply(lambda x: get_doc_vector(x, w2v_model))

In [25]:
df['product_vector']

0      [-0.0011448539, 0.0039829235, 0.00083606504, -...
1      [0.0023240433, 0.0039808867, -0.00066508475, 0...
2      [0.0028895538, 0.0035467145, -0.00034196823, 0...
3      [0.001369861, -0.0007391483, -0.001231387, -6....
4      [-0.0013315496, 0.0010548654, 0.0018181236, -0...
                             ...                        
160    [0.001485656, 0.0020385135, -0.00079634454, 0....
161    [-0.00040657778, 0.002751394, 0.004316095, 0.0...
162    [0.0023433692, 0.0039184354, -0.0027896168, 0....
163    [0.00077292713, 0.0034660194, 0.0037788264, 0....
164    [0.0017312397, 0.003361733, 0.0046142573, 0.00...
Name: product_vector, Length: 165, dtype: object

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend(product_name, df, n=3):
    idx= df[df['name']== product_name].index[0]
    query_vec= df.loc[idx, 'product_vector']

    similarities= cosine_similarity(
        [query_vec],
        list(df['product_vector'])
    )[0]

    df['score']= similarities
    return df.sort_values('score', ascending= False).iloc[1:n+1][['name', 'score']]


In [35]:
recommend('Shokoladli tort', df)

Unnamed: 0,name,score
120,Shokoladli pirojniy,0.897729
140,Shokoladli tort,0.879747
105,Shokoladli tartaletka,0.848269
