In [None]:
!pip install stop-words
!pip install pyLDAvis
!pip install bertopic

In [None]:
## Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.lm.preprocessing import flatten
from nltk.util import ngrams
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud
import unicodedata
import stop_words
import spacy
from spacy.lang.en import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import gensim
from gensim.models import LsiModel
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
# word_tokenize relies on the Punkt sentence models. Recent NLTK releases load
# them from the 'punkt_tab' resource rather than the pickled 'punkt' one, so
# both are requested to keep the notebook working across NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Import the data and clean/preprocess the text data.
#
# Steps applied to the 'notes' column, in order:
#   1. NFKD-normalise and drop every character that is not ASCII
#   2. lower-case
#   3. strip punctuation (anything that is neither a word character nor whitespace)
#   4. strip digits
#   5. remove spaCy's English stop words
# The cleaned text overwrites df['notes'].

# Imported here under its own name so this cell can be re-run: the previous
# `stop_words = stop_words.STOP_WORDS` replaced the module with a set, which
# made a second execution fail with AttributeError.
from spacy.lang.en.stop_words import STOP_WORDS

# Kept bound to the stop-word set, as before, for any later code that uses it.
stop_words = STOP_WORDS


def _to_ascii(text):
    """Decompose accented characters (NFKD) and discard whatever is not ASCII."""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')


def _drop_stop_words(text):
    """Return `text` with every whitespace-separated stop word removed."""
    return ' '.join(word for word in text.split() if word not in stop_words)


# NOTE(review): 'wine-raitngs.csv' looks like a misspelling of "ratings"; the
# path is left untouched because it has to match the file on disk - confirm.
# dropna() removes every row that has a missing value in any column.
df = pd.read_csv('wine-raitngs.csv').dropna()

df['notes'] = (
    df['notes']
    .apply(_to_ascii)
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    .str.replace(r'\d+', '', regex=True)
    .apply(_drop_stop_words)
)

# Only the last expression of a cell is displayed, so a single preview is kept.
df.head(5)

In [None]:
# Red wine subset: fit an LDA topic model and show it with pyLDAvis.
df1 = df.loc[df['variety'] == 'Red Wine']

# One list of tokens per tasting note.
texts1 = [word_tokenize(note) for note in df1['notes']]

# Token <-> id mapping, and the bag-of-words corpus expressed with those ids.
dictionary1 = corpora.Dictionary(texts1)
corpus1 = [dictionary1.doc2bow(tokens) for tokens in texts1]

# Five topics, fixed random_state so repeated runs use the same seed.
lda_model1 = gensim.models.ldamodel.LdaModel(
    corpus=corpus1,        # bag-of-words documents
    id2word=dictionary1,   # id -> token lookup
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

vis = pyLDAvis.gensim_models.prepare(lda_model1, corpus1, dictionary1, n_jobs=1)
pyLDAvis.display(vis)

# Uncomment to write the visualization to disk:
#pyLDAvis.save_html(vis, 'LDA_Visualization_1.html')

In [None]:
# White wine data split topic modeling and visualization
df2 = df[df['variety'] == 'White Wine']

# One list of tokens per tasting note.
texts2 = [word_tokenize(text) for text in df2['notes']]

# Build the vocabulary from the white-wine notes themselves. Reusing the
# red-wine `dictionary1` here had two problems:
#   * doc2bow() ignores tokens that are not in the dictionary, so words that
#     only occur in white-wine notes never reached the model;
#   * filter_extremes() prunes the dictionary in place and reassigns token ids,
#     which changed the id -> word mapping that lda_model1 refers to through
#     its id2word argument.
dictionary2 = corpora.Dictionary(texts2)

# Drop tokens found in fewer than 10 notes or in more than 90% of the notes.
dictionary2.filter_extremes(no_below=10, no_above=0.9)
corpus2 = [dictionary2.doc2bow(text) for text in texts2]

lda_model2 = gensim.models.ldamodel.LdaModel(corpus = corpus2, #TDM
                                           id2word = dictionary2, #Dictionary
                                           num_topics = 5,
                                           random_state = 100,
                                           update_every = 1,
                                           chunksize = 100,
                                           passes = 10,
                                           alpha = 'auto',
                                           per_word_topics = True)


vis1 = pyLDAvis.gensim_models.prepare(lda_model2, corpus2, dictionary2, n_jobs = 1)
pyLDAvis.display(vis1)

#pyLDAvis.save_html(vis1, 'LDA_Visualization_2.html') ##saves the file

In [None]:
# Bert Topic Modeling
from bertopic import BERTopic
import plotly

# fit_transform is documented as taking a list of strings. df['notes'] is a
# Series whose index has gaps after the earlier dropna(), so code that looks
# documents up by position could hit a missing label; a plain list avoids that.
notes_docs = df['notes'].tolist()

# NOTE(review): no random seed is set for this model, so topics may differ
# between runs - confirm whether reproducibility is required here.
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(notes_docs)


topic_model.visualize_topics()

In [None]:
# Topic representation per wine variety. The documents must be the same ones,
# in the same order, that the model was fitted on; both arguments are passed
# as plain lists, which is the input type the BERTopic API documents.
topics_per_class = topic_model.topics_per_class(
    df['notes'].tolist(),
    classes=df['variety'].tolist(),
)


topic_model.visualize_topics_per_class(topics_per_class)