In [None]:
## Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.lm.preprocessing import flatten
from nltk.util import ngrams
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud
import unicodedata
import stop_words
import spacy
from spacy.lang.en import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import gensim
from gensim import corpora
from nltk.stem.porter import PorterStemmer
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
# Tokenizer data used by nltk.word_tokenize. Recent NLTK releases look up the
# 'punkt_tab' resource instead of 'punkt', so fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')
import re

In [None]:
# Import the data and clean/preprocess the text data.
# Rows with any missing value are dropped so every remaining note is a string.
df = pd.read_csv('wine-raitngs.csv').dropna()

# Decompose accented characters (NFKD) and discard whatever is not ASCII.
df['notes'] = df['notes'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore'))
df['notes'] = df['notes'].str.lower()

# Remove punctuation, then digits (raw strings avoid invalid-escape warnings).
df['notes'] = df['notes'].str.replace(r'[^\w\s]', '', regex=True)
df['notes'] = df['notes'].str.replace(r'\d+', '', regex=True)

# Read the stop-word set through the fully qualified module path rather than
# through the `stop_words` name: that name is rebound to the set on this very
# line, so `stop_words.STOP_WORDS` would fail the second time the cell runs.
stop_words = spacy.lang.en.stop_words.STOP_WORDS
df['notes'] = df['notes'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

df.head(5)

In [None]:
# Porter stemmer applied to every token
ps = PorterStemmer()

# tokenize each cleaned note and reduce every token to its stem;
# the result is one list of stems per document
processed_text = [
    [ps.stem(word=token) for token in nltk.word_tokenize(note)]
    for note in df['notes']
]

# vocabulary: maps each distinct stem to an integer id
dictionary = corpora.Dictionary(processed_text)

# bag-of-words (id, count) representation of every document
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in processed_text]

# fit an LDA model with 50 topics on the bag-of-words corpus
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=50,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Create vectors and calculate similarities
# Take the vector length from the model so it cannot drift from the number of
# topics the model was trained with.
num_topics = lda_model.num_topics

topic_vecs = []

for bow in doc_term_matrix:
    # get_document_topics returns (topic_id, probability) pairs. Place each
    # probability by its topic id instead of by list position, so a topic that
    # is missing from the returned list cannot shift or truncate the vector.
    doc_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    topic_vec = [0.0] * num_topics
    for topic_id, prob in doc_topics:
        topic_vec[topic_id] = prob
    topic_vecs.append(topic_vec)

# Pairwise cosine similarity between the documents' topic distributions
doc_sim = cosine_similarity(topic_vecs)

doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df.head()


def wine_recommender(wine_name, wines, doc_sims, top_n=5, higher_is_better=True):
    """Return the names of the wines that score closest to ``wine_name``.

    Parameters
    ----------
    wine_name : str
        Exact name to look up in ``wines``; the first match is used.
    wines : array-like of str
        Wine names, positionally aligned with the rows and columns of
        ``doc_sims``.
    doc_sims : pandas.DataFrame or array-like
        Square matrix of pairwise document scores.
    top_n : int, default 5
        Maximum number of recommendations to return.
    higher_is_better : bool, default True
        ``True`` when larger scores mean "more alike" (e.g. cosine
        similarity); ``False`` when smaller scores do (e.g. distances).

    Returns
    -------
    numpy.ndarray or str
        Up to ``top_n`` wine names, best match first, never including the
        queried wine itself. If ``wine_name`` is not in ``wines`` a
        "not found" message string is returned instead.
    """
    # Accept lists / Series as well as arrays
    wines = np.asarray(wines)

    # Find wine ID
    matches = np.flatnonzero(wines == wine_name)

    # Check if wine exists
    if matches.size == 0:
        return f"Wine '{wine_name}' not found."
    query_idx = matches[0]

    # Get the row of scores for the queried wine
    if hasattr(doc_sims, "iloc"):
        scores = doc_sims.iloc[query_idx].to_numpy()
    else:
        scores = np.asarray(doc_sims)[query_idx]

    # Rank best-first; a stable sort keeps tied wines in their original order
    sort_keys = -scores if higher_is_better else scores
    ranked = np.argsort(sort_keys, kind="stable")

    # Drop the queried wine explicitly: with ties (e.g. duplicate notes) it is
    # not guaranteed to sit at rank 0, so slicing off the first entry could
    # discard a real match and recommend the wine to itself.
    ranked = ranked[ranked != query_idx]

    # Return the top wines
    return wines[ranked[:max(int(top_n), 0)]]


# Recommendations based on cosine similarity of the LDA topic vectors
wine_names = df["name"].values

w1 = wine_recommender("14 Hands Hot to Trot Red Blend 2012", wine_names, doc_sim_df)
w2 = wine_recommender("A to Z Pinot Gris 2003", wine_names, doc_sim_df)

for recommendations in (w1, w2):
    print(recommendations)

In [None]:
from sklearn.metrics import pairwise_distances

# Pairwise Euclidean distances between the LDA topic vectors
# (0 for identical documents, larger = less alike).
dist_euc = pairwise_distances(topic_vecs, metric = 'euclidean')

# wine_recommender ranks by descending score, so raw distances would surface
# the most *dissimilar* wines. Map each distance d to the similarity
# 1 / (1 + d): it lies in (0, 1], equals 1 for identical documents and
# shrinks as the distance grows.
doc_sim_euc = 1.0 / (1.0 + dist_euc)
doc_sim_df_euc = pd.DataFrame(doc_sim_euc)


w3 = wine_recommender("14 Hands Hot to Trot Red Blend 2012",
                  df["name"].values,
                  doc_sim_df_euc)


w4 = wine_recommender("A to Z Pinot Gris 2003",
                  df["name"].values,
                  doc_sim_df_euc)


print(w3)
print(w4)

In [None]:
# Train Word2Vec embeddings on the stemmed token lists
model = gensim.models.Word2Vec(
    sentences=processed_text,
    vector_size=50,  # smaller size for smaller data
    window=6,
    min_count=2,
    workers=4,
)

def document_vectorizer(corpus, model, num_features):
  """Turn each tokenized document into the mean of its word vectors.

  Parameters
  ----------
  corpus : iterable of list of str
      Tokenized documents.
  model : object exposing ``wv``
      ``model.wv.index_to_key`` lists the known words and ``model.wv[word]``
      returns that word's vector.
  num_features : int
      Length of each word vector; used for the all-zero fallback row.

  Returns
  -------
  numpy.ndarray
      One float64 row per document, shape ``(len(corpus), num_features)``.
      A document with no in-vocabulary words gets a row of zeros.
  """
  vocabulary = set(model.wv.index_to_key)

  def average_word_vectors(words, model, vocabulary, num_features):
    # Only words the model knows contribute to the average
    known_vectors = [model.wv[word] for word in words if word in vocabulary]
    if not known_vectors:
      return np.zeros((num_features,), dtype="float64")
    # Divide once, after summing every vector. Dividing inside the word loop
    # would rescale the running sum on each step and not yield the mean.
    return np.mean(known_vectors, axis=0, dtype="float64")

  features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
  if not features:
    # Keep a 2-D shape even when there are no documents
    return np.empty((0, num_features), dtype="float64")
  return np.array(features)

# One averaged Word2Vec feature row per document
avg_wv_features = document_vectorizer(corpus=processed_text, model=model, num_features=50)

# Pairwise cosine similarity between the averaged document vectors
doc_sim_wv = pd.DataFrame(cosine_similarity(avg_wv_features))

wine_names = df["name"].values

w5 = wine_recommender("14 Hands Hot to Trot Red Blend 2012", wine_names, doc_sim_wv)
w6 = wine_recommender("A to Z Pinot Gris 2003", wine_names, doc_sim_wv)

for recommendations in (w5, w6):
    print(recommendations)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances


# Pairwise Euclidean distances between the averaged Word2Vec document vectors
# (0 for identical documents, larger = less alike).
dist_euc_wv = pairwise_distances(avg_wv_features, metric = 'euclidean')

# wine_recommender ranks by descending score, so raw distances would surface
# the most *dissimilar* wines. Map each distance d to the similarity
# 1 / (1 + d), which is 1 for identical documents and shrinks with distance.
doc_sim_euc_wv = pd.DataFrame(1.0 / (1.0 + dist_euc_wv))

w7 = wine_recommender("14 Hands Hot to Trot Red Blend 2012",
                  df["name"].values,
                   doc_sim_euc_wv)


w8 = wine_recommender("A to Z Pinot Gris 2003",
                  df["name"].values,
                   doc_sim_euc_wv)


print(w7)
print(w8)