In [18]:
import pandas as pd

In [45]:
from nltk.corpus import stopwords;
import nltk;
from gensim.models import ldamodel
import gensim.corpora;
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer;
from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;

In [25]:
df = pd.read_csv("abcnews-date-text.csv")

In [27]:
df.shape[0] / 1000000

1.244184

In [47]:
df_2 = pd.read_csv("data/winemag-data/winemag-data_first150k.csv", index_col=0)

In [48]:
df_2.shape

(150930, 10)

In [49]:
df_2.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [39]:
df_2.variety.value_counts(1).round(4).head(20) * 100

Chardonnay                       9.60
Pinot Noir                       9.47
Cabernet Sauvignon               8.48
Red Blend                        6.67
Bordeaux-style Red Blend         4.87
Sauvignon Blanc                  4.19
Syrah                            3.86
Riesling                         3.66
Merlot                           3.36
Zinfandel                        2.52
Sangiovese                       2.22
Malbec                           2.13
White Blend                      1.87
Rosé                             1.87
Tempranillo                      1.69
Nebbiolo                         1.48
Portuguese Red                   1.47
Sparkling Blend                  1.33
Shiraz                           1.31
Corvina, Rondinella, Molinara    1.11
Name: variety, dtype: float64

In [51]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jameshelfrich/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [52]:
stops = set(stopwords.words("english"))

In [55]:
df_2.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [59]:
df_2["bow"] = df_2["description"].str.lower().str.split()

In [63]:
df_2["no_stops"] = df_2["bow"].apply(lambda x: [i for i in x if i not in stops])

In [68]:
df_2["no_stops_str"] = df_2["no_stops"].apply(lambda x: " ".join(x))

In [69]:
df_2["no_stops_str"]

0         tremendous 100% varietal wine hails oakville a...
1         ripe aromas fig, blackberry cassis softened sw...
2         mac watson honors memory wine made mother trem...
3         spent 20 months 30% new french oak, incorporat...
4         top wine la bégude, named highest point vineya...
                                ...                        
150925    many people feel fiano represents southern ita...
150926    offers intriguing nose ginger, lime floral ele...
150927    classic example comes cru vineyard called terr...
150928    perfect salmon shade, scents peaches, cherries...
150929    pinot grigios taste like this. rich pear-like ...
Name: no_stops_str, Length: 150930, dtype: object

In [53]:
len(stops)

179

In [None]:
nltk.corpus.

# NMF

In [95]:
train_str = df_2["no_stops_str"].copy()

In [71]:
vectorizer = CountVectorizer(analyzer='word', max_features=5000);
x_counts = vectorizer.fit_transform(train);

In [72]:
x_counts

<150930x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 3565510 stored elements in Compressed Sparse Row format>

In [73]:
transformer = TfidfTransformer(smooth_idf=False);
x_tfidf = transformer.fit_transform(x_counts);

In [74]:
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [76]:
num_topics = 7

In [77]:
#obtain a NMF model.
model = NMF(n_components=num_topics, init='nndsvd');

In [78]:
#fit the model
model.fit(xtfidf_norm)

NMF(init='nndsvd', n_components=7)

In [79]:
def get_nmf_topics(model, n_top_words):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {};
    for i in range(num_topics):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-20 - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words;
    
    return pd.DataFrame(word_dict);

In [80]:
get_nmf_topics(model, 20)



Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07
0,aromas,apple,wine,dry,fresh,sweet,drink
1,cherry,citrus,tannins,flavors,acidity,simple,now
2,berry,peach,ripe,pinot,light,soft,soft
3,palate,white,fruit,cherries,crisp,flavors,ready
4,finish,pear,wood,good,wine,sugary,ripe
5,black,green,fruits,cola,bright,like,fruity
6,spice,finish,rich,oak,fruity,vanilla,easy
7,plum,flavors,structure,cherry,red,jam,full
8,fruit,lemon,firm,noir,attractive,tastes,bodied
9,red,lime,years,silky,character,cherry,texture


# LDA

In [83]:
train = df_2["no_stops"].copy()

In [84]:
id2word = gensim.corpora.Dictionary(train);

In [85]:
corpus = [id2word.doc2bow(text) for text in train];

In [86]:
lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics);

In [87]:
def get_lda_topics(model, num_topics):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20);
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    return pd.DataFrame(word_dict);

In [88]:
get_lda_topics(lda, num_topics)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07
0,flavors,imported,wine,wine,flavors,cabernet,new
1,cherry,imports.,"wine,",aromas,wine,blend,oak
2,fruit,wine,fruit,fruit,aromas,finish.,pinot
3,wine,"nice,",ripe,offers,citrus,black,one
4,good,fruit,tannins,notes,finish.,wine,best
5,flavors.,ltd.,years.,spice,fruit,tannins,wine
6,aromas,still,acidity,mouth,crisp,notes,shows
7,palate,here.,rich,"fruit,",white,fruit,oak.
8,bit,drinkable,great,delivers,apple,merlot,vineyard
9,black,"cab,",wine.,bright,green,flavors,fruit


# BERT

In [92]:
from bertopic import BERTopic

In [None]:
from bertopic.vectorizers import ClassTfidfTransformer

In [103]:
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model, language="english", calculate_probabilities=True, verbose=True)

In [None]:
#instantiate BERTopic
# topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)

#generate the topics
topics, probs = topic_model.fit_transform(train_str)

Batches:   0%|          | 0/4717 [00:00<?, ?it/s]

In [None]:
topic_model.save("my_model", serialization="pickle")