# Import libraries and data

In [1]:
import pandas as pd
import numpy as np
import multiprocessing
import warnings

from ast import literal_eval
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

warnings.filterwarnings('ignore')

In [2]:
wine_clusters = pd.read_csv("wine_clusters.csv", converters={"embeddings": literal_eval})
wine_clusters.head()

Unnamed: 0,wine,year,winery,winery_norm,price,bottle,type_wine,type_wine_details,type_agriculture,region,...,parker_score,parker_score_num,penin_score,image,url,tasting_notes,notes_norm_removed_new_reduction_dropped,embeddings,cluster11,cluster13
0,Ultreia Saint Jacques 2021,2021.0,Raúl Pérez Viticultor,raul perez viticultor,16.2,0.75,Red,Red,Conventional,Castilla y León,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/ultreia-sai...,Red fruit / Mineral / Voluminous / Medium-bodi...,red_fruit mineral black_fruit floral,"[0.010368325747549534, 0.054386626929044724, -...",1,1
1,Blanc Pescador,,Perelada,perelada,7.8,0.75,Sparkling,Frizzante white,Conventional,Wines without GI,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/blanc-pesca...,Light / Pleasant / Sparkling / Aromatic / Frui...,pleasant aromatic,"[0.010398883372545242, 0.0519464947283268, -0....",4,2
2,Bach Extrísimo Blanco Semidulce 2021,2021.0,Masia Bach,masia bach,9.4,0.75,White,White,Conventional,Cataluña,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/bach-extris...,Good entry / Structured / Balanced / Good acid...,structured balanced good_acidity persistent ar...,"[0.006561089772731066, 0.03554246947169304, -0...",7,0
3,José Pariente Verdejo 2022,2022.0,Bodegas José Pariente,jose pariente,16.2,0.75,White,White,Conventional,Castilla y León,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/jose-parien...,Rich / Unctuous / Pleasant bitterness / White ...,rich unctuous white_fruit stone_fruit herbs,"[0.008630958385765553, 0.04478497430682182, -0...",5,3
4,Coto de Imaz Reserva 2018,2018.0,El Coto de Rioja,el coto de rioja,13.0,0.75,Red,Red,Conventional,La Rioja,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/coto-de-ima...,Fleshy / Powerful / Ripe tannins / Velvety / R...,powerful tannins black_fruit spices complex,"[0.009208849631249905, 0.05225313454866409, -0...",5,3


#### Embeddings format
The format of the embeddings changed into strings with extra characters, probably because of saving and reading the DataFrame as a CSV. Source for solving it: https://stackoverflow.com/questions/62308832/pandas-save-and-open-then-values-changed-to-be-string-problem

In [None]:
# wine_clusters["embeddings"][0]

# Input: a wine from the dataset
We will start by recommending a wine that is similar to a wine from the dataset. This would be the case if the recommender was for the online store: when the webpage displays the information about a particular wine, it would display also a list of the 5 most similar wines. <br>
For obtaining the most similar wines, we will try to use the cosine similarity. Resources:
- https://naomy-gomes.medium.com/the-cosine-similarity-and-its-use-in-recommendation-systems-cb2ebd811ce1
- https://ambarishg.github.io/posts/recommender-career-spacy/

In [3]:
wine_name = "Ultreia Saint Jacques 2021"

In [36]:
def find_similar_wines(wine_name, num_similar=5):
    # Get the embedding and the cluster of the input wine
    embedding = np.array(wine_clusters.loc[wine_clusters["wine"] == wine_name, "embeddings"].values[0])
    cluster = wine_clusters.loc[wine_clusters["wine"] == wine_name, "cluster13"].values[0]

    # Calculate cosine similarity between the input wine and all wines in the same cluster
    cluster_wines = wine_clusters[wine_clusters["cluster13"] == cluster]
    cosine_similarities = cosine_similarity(np.vstack(cluster_wines["embeddings"]), embedding.reshape(1, -1)) # vstack and reshape turn into 2D arrays to ensure compatibility with cosine_similarity

    # Get the indices of the most similar wines in the cluster
    similar_wine_indices = np.argsort(cosine_similarities.flatten())[::-1][:num_similar + 1]  # +1 to exclude the input wine itself
    similar_wine_indices = similar_wine_indices[similar_wine_indices != np.where(wine_clusters["wine"] == wine_name)[0][0]]

    # Get the names of the most similar wines
    similar_wines = cluster_wines.iloc[similar_wine_indices]["wine"].tolist()

    return similar_wines

In [37]:
find_similar_wines(wine_name)

['Sumarroca Rosat 2021',
 'Finca Cucó Negre 2022',
 'Les Sorts Rosat 2019',
 'José Aristegui Xeo 2019 (0.37 L)',
 'Cillar Tempranillo 2022']

It's interesting because we got 2 rosé wines and one sparkling, when we have input a red wine, although it is true that it is a young wine. 

In [38]:
find_similar_wines("Las Moradas de San Martín La Sabina 2015")

['Las Moradas de San Martín La Sabina 2015',
 'Cepa Gavilán 2020',
 'Ponte da Boga Capricho de Sousón 2018',
 'Antima 2015',
 'Brúixola 2016',
 'Gruñón 2017']

# Filters

In [None]:
filters = {"type": "Red", 
           "region": "All",
           "grapes": "All",
           "price": [5.0, 40.0],
           "year": [2017, 2023],
           "reviews": [4.0, 5.0]}

In [None]:
def find_similar_wines(wine_name, filters, num_similar=5):
    # Get the embedding and the cluster of the input wine
    embedding = np.array(wine_clusters.loc[wine_clusters["wine"] == wine_name, "embeddings"].values[0])
    cluster = wine_clusters.loc[wine_clusters["wine"] == wine_name, "clusters"].values[0]

    # Obtain the wines from the same cluster and apply user filters
    cluster_wines = wine_clusters[wine_clusters["clusters"] == cluster]

    type_mask = cluster_wines["type_wine"] == filters["type"]
    if filters["region"] == "All":
        region_mask = np.ones(len(cluster_wines), dtype=bool)  # Select all wines (no filtering by region)
    else:
        region_mask = cluster_wines["region_gi"] == filters["region"]
    if filters["grapes"] == ["All"]:
        grapes_mask = np.ones(len(cluster_wines), dtype=bool)  # Select all wines (no filtering by grapes)
    else:
        if "All" in filters["grapes"]:
            filters["grapes"].remove("All")
        grapes_mask = cluster_wines["grapes"].apply(lambda x: any(grape in x for grape in filters["grapes"]))
    price_mask = cluster_wines["price"].between(*filters["price"])
    year_mask = cluster_wines["year"].between(*filters["year"])
    reviews_mask = cluster_wines["customer_reviews"].between(*filters["reviews"])

    filtered_wines = cluster_wines[type_mask & region_mask & grapes_mask & price_mask & year_mask & reviews_mask]

    # Check if the filtered_wines dataframe is empty
    if filtered_wines.empty:
        print("No wines found matching the specified criteria.")
        return []
    else:
        # Calculate cosine similarity between the input wine and all wines in the same cluster
        cosine_similarities = cosine_similarity(np.vstack(filtered_wines["embeddings"]), embedding.reshape(1, -1)) # vstack and reshape turn into 2D arrays to ensure compatibility with cosine_similarity

        # Get the indices of the most similar wines in the cluster
        similar_wine_indices = np.argsort(cosine_similarities.flatten())[:num_similar + 1]  # +1 to include the input wine itself

        # Get the index of the input wine
        input_wine_index = wine_clusters.index[wine_clusters["wine"] == wine_name].tolist()[0]

        # Exclude the input wine from the list of similar wine indices
        similar_wine_indices = similar_wine_indices[similar_wine_indices != input_wine_index]

        # Get the names of the most similar wines (excluding the input wine)
        similar_wines = filtered_wines.iloc[similar_wine_indices]["wine"].tolist()

    return similar_wines

In [None]:
wine_name = "Ultreia Saint Jacques 2021"
recommendations = find_similar_wines(wine_name, filters)
recommendations

# User input

In [5]:
# Train the Word2Vec model
cores = multiprocessing.cpu_count()

notes = wine_clusters["notes_norm_removed_new_reduction_dropped"].str.split()
w2v_model = Word2Vec(min_count=70,
                 window=2,
                 vector_size=300,
                 sample=6e-5, 
                 alpha=0.03, 
                 min_alpha=0.0007, 
                 negative=20,
                 workers=cores-1)
w2v_model.build_vocab(notes, progress_per=10000)
w2v_model.train(notes, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

# Train the TF-IDF model
def custom_tokenizer(text):
    return text.split()

tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)
tfidf_matrix = tfidf_vectorizer.fit_transform(wine_clusters["notes_norm_removed_new_reduction_dropped"])

In [6]:
# Function to obtain the embedding from the text input

def text_to_vector(text_input, w2v_model, tfidf_vectorizer, tfidf_matrix):    
    # Tokenize the text input
    words = text_input.split()

    # Convert words to Word2Vec vectors
    word_vectors = []
    for word in words:
        if word in w2v_model.wv:
            word_vector = w2v_model.wv[word]
            word_vectors.append(word_vector)

    # Calculate TF-IDF scores for the words in the input
    tfidf_scores = tfidf_vectorizer.transform([text_input])

    # Calculate the TF-IDF weighted average vector of the input text
    tfidf_avg_vector = np.zeros(w2v_model.vector_size)
    for word, tfidf_score in zip(words, tfidf_scores.toarray()[0]):
        if word in w2v_model.wv:
            word_vector = w2v_model.wv[word]
            tfidf_avg_vector += word_vector * tfidf_score

    if len(word_vectors) > 0:
        tfidf_avg_vector /= len(word_vectors)
        
    # Reshape the vector to be a 2D array as expected by Kmeans
    tfidf_avg_vector = tfidf_avg_vector.reshape(1, -1)

    return tfidf_avg_vector

In [7]:
def text_to_vector(text_input, w2v_model, tfidf_vectorizer, tfidf_matrix):
    # Tokenize the text input using the custom tokenizer
    words = custom_tokenizer(text_input)
     
    # Print the words to check if they are correct
    print("Words:", words)

    # Convert words to Word2Vec vectors
    word_vectors = []
    valid_words = []
    for word in words:
        if word in w2v_model.wv:
            word_vector = w2v_model.wv[word]
            word_vectors.append(word_vector)
            valid_words.append(word)

    # Check if there are valid words in the input
    if len(valid_words) == 0:
        print("No valid words in the input text.")
        return None

    # Get the indices of valid words in the TF-IDF vectorizer's vocabulary
    word_indices = [tfidf_vectorizer.vocabulary_.get(word, -1) for word in valid_words]

    # Check if there are valid word indices
    if all(index == -1 for index in word_indices):
        print("No valid words in the TF-IDF vectorizer's vocabulary.")
        return None

    # Get the TF-IDF scores for the valid words from the TF-IDF matrix
    tfidf_scores = tfidf_matrix[:, word_indices].toarray()

    # Print the TF-IDF scores to check if they are correct
    print("TF-IDF scores:", tfidf_scores)

    # Calculate the TF-IDF weighted average vector of the input text
    tfidf_avg_vector = np.zeros(w2v_model.vector_size)
    for word, tfidf_score in zip(words, tfidf_scores[0]):
        if word in w2v_model.wv:
            word_vector = w2v_model.wv[word]
            tfidf_avg_vector += word_vector * tfidf_score

    if len(word_vectors) > 0:
        tfidf_avg_vector /= len(word_vectors)

    # Reshape the vector to be a 2D array as expected by Kmeans
    tfidf_avg_vector = tfidf_avg_vector.reshape(1, -1)

    return tfidf_avg_vector

In [8]:
embeddings_array = np.array(wine_clusters["embeddings"].tolist())

In [9]:
kmeans = KMeans(n_clusters=13, random_state=42)
kmeans.fit(embeddings_array)

In [10]:
descriptors = "elegant fresh red_fruit"

In [13]:
def recommend_from_descriptors(descriptors, num_similar=5):
    # Get the embedding and the cluster of the input 
    embedding = text_to_vector(descriptors, w2v_model, tfidf_vectorizer, tfidf_matrix)
    cluster = kmeans.predict(embedding)[0]

    # Calculate cosine similarity between the input and all wines in the same cluster
    cluster_wines = wine_clusters[wine_clusters["cluster13"] == cluster]
    cosine_similarities = cosine_similarity(np.vstack(cluster_wines["embeddings"]), embedding.reshape(1, -1)) # reshape and vstack turn into 2D arrays to ensure compatibility with cosine_similarity

    # Get the indices of the most similar wines in the cluster
    similar_wine_indices = np.argsort(cosine_similarities.flatten())[::-1][:num_similar]

    # Get the names of the most similar wines
    similar_wines = cluster_wines.iloc[similar_wine_indices]["wine"].tolist()

    return similar_wines

In [14]:
recommend_from_descriptors(descriptors)

Words: ['elegant', 'fresh', 'red_fruit']
TF-IDF scores: [[0.         0.         0.45912398]
 [0.         0.         0.        ]
 [0.         0.         0.        ]
 ...
 [0.         0.26008836 0.        ]
 [0.         0.         0.33722114]
 [0.29476285 0.         0.25238703]]


['Alta Alella Mirgin Exeo Paratge Qualificat Vallcirera 2016',
 'Gran Feudo Reserva 2016',
 'Nona 2020',
 'Zurbano 2017',
 'Faustino & Eneko 2015']

This function recommends all the time the same wine. <br>
We will try returning a random wine from the same cluster and adding some print statements.

In [19]:
def recommend_from_descriptors(descriptors, num_similar=5):
    # Get the embedding and the cluster of the input
    embedding = text_to_vector(descriptors, w2v_model, tfidf_vectorizer, tfidf_matrix)
    cluster = kmeans.predict(embedding)[0]
    print(embedding)
    print(cluster)
    
    if np.all(embedding == 0):  # Check if the embedding is all zeros
        print("not able to recommend")
        similar_wines = []
        
    else:
        # Filter the dataframe to select only the predicted cluster
        filtered_df = wine_clusters[wine_clusters["cluster13"] == cluster]

        # Get five random wines from the filtered dataframe
        similar_wines = filtered_df.sample(5, ignore_index=True)

    return similar_wines

In [29]:
recommendations = recommend_from_descriptors("fresh citrus_fruit aromatic")
recommendations

Words: ['fresh', 'citrus_fruit', 'aromatic']
TF-IDF scores: [[0.         0.         0.        ]
 [0.         0.         0.54048216]
 [0.         0.         0.29581195]
 ...
 [0.26008836 0.         0.31378501]
 [0.         0.         0.        ]
 [0.         0.         0.        ]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

[]

We see here that the problem is that the embedding is consisted in all zeros, even when the TF-IDF scores are not zero. Check the Word2Vec model.

In [27]:
# Get the unique words from the "notes_norm_removed_reduced250" column
all_words = set(" ".join(wine_clusters["notes_norm_removed_new_reduction_dropped"]).split())

# Get the vocabulary of the Word2Vec model
word2vec_vocabulary = set(w2v_model.wv.index_to_key)

# Find the words that are present in the Word2Vec vocabulary
words_in_word2vec = all_words.intersection(word2vec_vocabulary)

# Find the words that are not present in the Word2Vec vocabulary
words_not_in_word2vec = all_words.difference(word2vec_vocabulary)

print("Words in Word2Vec vocabulary:", len(words_in_word2vec))
print("Words not in Word2Vec vocabulary:", len(words_not_in_word2vec))
print("Example words not in Word2Vec vocabulary:", list(words_not_in_word2vec)[:10])

Words in Word2Vec vocabulary: 38
Words not in Word2Vec vocabulary: 0
Example words not in Word2Vec vocabulary: []


In [30]:
# Check if the Word2Vec model is loaded correctly
print(w2v_model)

# Check if the words have valid embeddings
words_to_check = ['white_fruit', 'vanilla', 'aromatic']
for word in words_to_check:
    if word in w2v_model.wv:
        print(f"{word}: Valid embedding")
    else:
        print(f"{word}: No valid embedding")

Word2Vec<vocab=38, vector_size=300, alpha=0.03>
white_fruit: Valid embedding
vanilla: No valid embedding
aromatic: Valid embedding


In [31]:
# Get the vocabulary of the Word2Vec model
word2vec_vocabulary = list(set(w2v_model.wv.index_to_key))
word2vec_vocabulary

['black_fruit',
 'powerful',
 'good_acidity',
 'balanced',
 'tannins',
 'mineral',
 'structured',
 'rounded',
 'persistent',
 'long',
 'fresh',
 'spices',
 'smooth',
 'wood',
 'complex',
 'bottle_aging',
 'unctuous',
 'flavoursome',
 'silky',
 'white_fruit',
 'crunchy',
 'green_fruit',
 'full',
 'dried_cooked_fruits',
 'red_fruit',
 'ageing',
 'toasty',
 'elegant',
 'finish',
 'herbs',
 'citrus_fruit',
 'stone_fruit',
 'rich',
 'tropical_fruit',
 'pleasant',
 'balsamic',
 'aromatic',
 'floral']

In [21]:
recommendations = recommend_from_descriptors("black_fruit wood elegant")
recommendations

Words: ['black_fruit', 'wood', 'elegant']
TF-IDF scores: [[0.43374017 0.         0.        ]
 [0.         0.         0.        ]
 [0.         0.30025796 0.        ]
 ...
 [0.         0.         0.        ]
 [0.         0.         0.        ]
 [0.23843319 0.26985023 0.29476285]]
[[-1.98754606e-03  1.93368681e-02 -4.48273883e-04  6.05238539e-03
  -9.24150956e-03 -2.34348004e-02  7.43698080e-03  3.59559804e-02
  -7.25593853e-03 -1.15863557e-02  1.98364928e-02 -1.54213731e-02
  -4.17114329e-03  1.79845194e-02  3.11838339e-03 -1.16926183e-02
   2.44133969e-02 -8.03449564e-03 -4.92432031e-03 -2.26513123e-02
   3.00263924e-03 -1.36291335e-02  2.90777100e-03  2.48178268e-02
  -1.16762072e-02  3.82274886e-03 -1.30791080e-02  1.24545644e-02
  -1.16715841e-02 -1.80802457e-02  2.38514676e-03  1.55853185e-04
   7.91664422e-03 -1.35898404e-02 -2.07441801e-04  1.65397537e-02
   1.36854475e-02 -2.43981679e-02 -2.15001063e-03  4.57096255e-03
  -9.94991946e-03 -6.23338545e-04  5.43977196e-03 -1.51624903

Unnamed: 0,wine,year,winery,winery_norm,price,bottle,type_wine,type_wine_details,type_agriculture,region,...,parker_score,parker_score_num,penin_score,image,url,tasting_notes,notes_norm_removed_new_reduction_dropped,embeddings,cluster11,cluster13
0,Vol d'Ànima de Raimat Negre 2020,2020.0,Raimat,raimat,12.4,0.75,Red,Red,Conventional,Cataluña,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/vol-d-anima...,Pleasant / Sweet entry / Toasty notes / Fruit ...,pleasant toasty tannins powerful structured lo...,"[0.005344940349459648, 0.02864959090948105, -0...",9,12
1,Pedalier Albariño 2017,2017.0,Family Owned Wineries,family owned wineries,15.5,0.75,White,White,Conventional,Galicia,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/pedalier-al...,Oily / Flavoursome / Complex / Powerful / With...,flavoursome complex powerful structured good_a...,"[0.005740273743867874, 0.03046875074505806, -0...",9,12
2,Can Axartell Ventum 2016,2016.0,Can Axartell,can axartell,26.4,0.75,Red,Red,Organic,Islas Baleares,...,,,92.0,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/can-axartel...,Good entry / Slightly sweet / Mediterranean ch...,structured good_acidity powerful balanced pers...,"[0.005594116169959307, 0.03010796196758747, -0...",9,12
3,Fuentes del Silencio Las Quintas 2016,2016.0,Fuentes del Silencio,fuentes del silencio,32.75,0.75,Red,Red,Conventional,Castilla y León,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/las-quintas/,Good entry / Delicate / Fresh / Unctuous / Ric...,fresh unctuous rich wood spices tannins finish...,"[0.0061105284839868546, 0.032187044620513916, ...",9,12
4,Matías i Torres Negramoll 2017,2017.0,Bodega Juan Matías Torres,juan matias torres,30.5,0.75,Red,Red,Conventional,Islas Canarias,...,92+,92.5,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/matias-i-to...,Good entry / Fresh / Good acidity / Elegant / ...,fresh good_acidity elegant ageing mineral tann...,"[0.006441925652325153, 0.03407750651240349, -0...",9,12


In [22]:
recommendations = recommend_from_descriptors("red_fruit wood elegant")
recommendations

Words: ['red_fruit', 'wood', 'elegant']
TF-IDF scores: [[0.45912398 0.         0.        ]
 [0.         0.         0.        ]
 [0.         0.30025796 0.        ]
 ...
 [0.         0.         0.        ]
 [0.33722114 0.         0.        ]
 [0.25238703 0.26985023 0.29476285]]
[[-0.00122639  0.0196837  -0.00031803  0.0066918  -0.01013533 -0.02462507
   0.00802772  0.03783645 -0.00825816 -0.01188541  0.02061255 -0.01653929
  -0.00476387  0.02004684  0.00257003 -0.01234266  0.02577547 -0.00783025
  -0.00482067 -0.02508275  0.00319828 -0.01438537  0.00305097  0.02593158
  -0.01159178  0.00409407 -0.01445297  0.01286441 -0.01269818 -0.01923772
   0.00268578  0.00097657  0.00883876 -0.01410944 -0.0008096   0.01712601
   0.01368631 -0.02603811 -0.00200575  0.00404297 -0.01050961 -0.00133932
   0.00571393 -0.01629919  0.01673717  0.01434836 -0.01618533  0.00893993
   0.00035375  0.00946145 -0.00500585  0.00546077 -0.01586562  0.01521798
   0.00360235  0.00490484 -0.00023744  0.00179167 -0.0005

Unnamed: 0,wine,year,winery,winery_norm,price,bottle,type_wine,type_wine_details,type_agriculture,region,...,parker_score,parker_score_num,penin_score,image,url,tasting_notes,notes_norm_removed_new_reduction_dropped,embeddings,cluster11,cluster13
0,Contino Gran Reserva 2017,2017.0,Viñedos del Contino,vinedos del contino,60.8,0.75,Red,Red,Conventional,La Rioja,...,94.0,94.0,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/contino-gra...,Balanced / Powerful / Silky / Elegant / Well-i...,balanced powerful silky elegant wood flavourso...,"[0.006224039476364851, 0.0334448516368866, -0....",9,12
1,Frore de Carme 2017,2017.0,Adega Familiar Eladio Piñeiro,adega familiar eladio pineiro,37.4,0.75,White,White,Conventional,Galicia,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/frore-de-ca...,Flavoursome / Glyceric / With body / Well-inte...,flavoursome good_acidity balsamic citrus_fruit...,"[0.005299676209688187, 0.02821599878370762, -0...",9,12
2,Aires de Garbet 2018,2018.0,Perelada,perelada,44.8,0.75,Red,Red,Conventional,Cataluña,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/aires-de-ga...,Fresh / Balanced / With personality / Ripe tan...,fresh balanced tannins silky finish long persi...,"[0.006629740819334984, 0.035058118402957916, -...",9,12
3,Can Axartell Corum Blanco 2021,2021.0,Can Axartell,can axartell,27.2,0.75,White,White,Organic,Islas Baleares,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/can-axartel...,Full / Silky / Good acidity / Complex / Intens...,full silky good_acidity complex long fresh aro...,"[0.006149639841169119, 0.033375464379787445, -...",9,12
4,Cuevas de Arom Tuca Negra 2017,2017.0,Cuevas de Arom,cuevas de arom,84.2,0.75,Red,Red,Conventional,Aragón,...,,,,https://cdn.vinissimus.com/img/unsafe/p500x/pl...,https://www.vinissimus.com/en/wine/tuca-negra/,Powerful / Fresh / Mineral notes / Complex / F...,powerful fresh mineral complex tannins good_ac...,"[0.006628081668168306, 0.03598056361079216, -0...",9,12


In [32]:
recommendations = recommend_from_descriptors("white_fruit silky spices")
recommendations

Words: ['white_fruit', 'silky', 'spices']
TF-IDF scores: [[0.         0.         0.        ]
 [0.         0.         0.        ]
 [0.         0.         0.        ]
 ...
 [0.         0.5151798  0.        ]
 [0.         0.         0.        ]
 [0.         0.         0.26465022]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

[]

In [33]:
recommendations = recommend_from_descriptors("fresh citrus_fruit aromatic")
recommendations

Words: ['fresh', 'citrus_fruit', 'aromatic']
TF-IDF scores: [[0.         0.         0.        ]
 [0.         0.         0.54048216]
 [0.         0.         0.29581195]
 ...
 [0.26008836 0.         0.31378501]
 [0.         0.         0.        ]
 [0.         0.         0.        ]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

[]

Even when using words that we have checked that are present in the Word2Vec vocabulary, some combinations return not valid embeddings. This is probably because of the limited size of the training data. Training the model on a larger corpus would allow it to learn a wider range of contexts in which the words are used and therefore obtain better word representations.