# Topic Modeling with SciKit Learn

In this notebook we create a topic model from our corpus using the scikit-learn library. We'll save the resulting tables and then use another notebook to explore them.

# Set Up

## Imports

In [1]:
import os

import numpy as np
import pandas as pd

## Configuration

In [2]:
# Input corpus, output directory, and CSV delimiter used throughout the notebook.
corpus_file = './corpora/winereviews-sampled-tapi.csv'
db_dir = './db'
csv_sep = '|'

# Output files are prefixed with the part of the corpus file name before its first hyphen.
corpus_basename = corpus_file.rsplit('/', 1)[-1]
data_prefix = corpus_basename.split('-', 1)[0]

In [3]:
data_prefix

'winereviews'

## Parameters

In [4]:
# Tunable settings for vectorization and topic modeling.
n_terms = 4000 # Vocabulary size (passed to CountVectorizer as max_features)
ngram_range = (1,3) # Passed to CountVectorizer as ngram_range
use_tfidf = True # If True, LDA is fit on the TF-IDF matrix instead of the raw counts
n_topics = 40 # Number of topics (n_components for both LDA and NMF)
max_iter = 5 # Number of iterations for the LDA topic model

In [5]:
# Integer topic IDs 0 .. n_topics-1.
topic_cols = list(range(n_topics))

# Import Corpus Data

We import a corpus in our standard format

In [6]:
# Read the delimited corpus file and label its row index as the document ID.
corpus = pd.read_csv(corpus_file, sep=csv_sep).rename_axis(index='doc_id')

## Inspect contents

In [7]:
corpus.head()

Unnamed: 0_level_0,doc_key,doc_title,doc_label,doc_province,doc_points,doc_price,doc_content,doc_original,doc_variety,doc_taster,doc_place
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,62484,Bridlewood 2010 Pinot Noir (Monterey County) ...,US,California,86,20.0,"This is a sound Pinot Noir, rich in raspberry ...","This is a sound Pinot Noir, rich in raspberry ...",Pinot Noir,,US California Monterey County Central Coast
1,25964,J Vineyards & Winery 2012 Bow Tie Vineyard Pin...,US,California,92,75.0,"Bow Tie is planted to a variety of clones, inc...","Bow Tie is planted to a variety of clones, inc...",Pinot Noir,Virginie Boone,US California Russian River Valley Sonoma
2,15168,Pascal Jolivet 2010 Attitude Sauvignon Blanc (...,France,Loire Valley,87,,"More fragrant than herbaceous, this is a round...","More fragrant than herbaceous, this is a round...",Sauvignon Blanc,Roger Voss,France Loire Valley Val de Loire
3,78759,Grati 2013 Villa di Vetrice (Chianti Rufina) ...,Italy,Tuscany,88,19.0,This vibrant wine opens with aromas that recal...,This vibrant wine opens with aromas that recal...,Red Blend,Kerin O’Keefe,Italy Tuscany Chianti Rufina
4,10070,Tiefenbrunner 2015 Pinot Grigio (Alto Adige) ...,Italy,Northeastern Italy,88,16.0,This 100% Pinot Grigio opens with aromas of le...,This 100% Pinot Grigio opens with aromas of le...,Pinot Grigio,Kerin O’Keefe,Italy Northeastern Italy Alto Adige


In [8]:
corpus.sample(5)

Unnamed: 0_level_0,doc_key,doc_title,doc_label,doc_province,doc_points,doc_price,doc_content,doc_original,doc_variety,doc_taster,doc_place
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9048,110148,Antica Enotria 2011 Vriccio Primitivo (Puglia)...,Italy,Southern Italy,86,20.0,Notes of cherry and raspberry open the nose of...,Notes of cherry and raspberry open the nose of...,Primitivo,,Italy Southern Italy Puglia
8362,126774,Hermann Moser 2014 Per Due Grüner Veltliner (K...,Austria,Kremstal,90,13.0,"A wonderful combination of fresh, green pears,...","A wonderful combination of fresh, green pears,...",Grüner Veltliner,Anne Krebiehl MW,Austria Kremstal
5986,9350,Viu Manent 2012 La Capilla Single Vineyard Cab...,Chile,Colchagua Valley,89,25.0,Brambly cassis and raspberry aromas are warm a...,Brambly cassis and raspberry aromas are warm a...,Cabernet Sauvignon,Michael Schachner,Chile Colchagua Valley
8432,52316,Nicosia 2010 Fondo Filara Catarratto (Sicilia)...,Italy,Sicily & Sardinia,87,15.0,"This fresh, Catarratto-based white wine opens ...","This fresh, Catarratto-based white wine opens ...",Catarratto,,Italy Sicily & Sardinia Sicilia
7273,44440,MacLaren 2013 Moaveni Vineyard Syrah (Bennett ...,US,California,90,45.0,An exciting varietal wine from an appellation ...,An exciting varietal wine from an appellation ...,Syrah,Virginie Boone,US California Bennett Valley Sonoma


In [9]:
corpus.shape

(10000, 11)

In [10]:
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   doc_key       10000 non-null  int64  
 1   doc_title     10000 non-null  object 
 2   doc_label     9994 non-null   object 
 3   doc_province  9994 non-null   object 
 4   doc_points    10000 non-null  int64  
 5   doc_price     9314 non-null   float64
 6   doc_content   10000 non-null  object 
 7   doc_original  10000 non-null  object 
 8   doc_variety   10000 non-null  object 
 9   doc_taster    7949 non-null   object 
 10  doc_place     10000 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 859.5+ KB


# Create Bag-of-Words 

i.e., a __Count Vector Space__

We use Scikit Learn's CountVectorizer to convert our F1 corpus of paragraphs into a document-term vector space of word counts.

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [12]:
# Vectorizer restricted to the n_terms most frequent n-grams, with English stop words removed.
count_engine = CountVectorizer(
    max_features=n_terms,
    stop_words='english',
    ngram_range=ngram_range,
)

# Sparse document-term count matrix for the review texts.
count_model = count_engine.fit_transform(corpus.doc_content)

## Get Generated VOCAB

In [13]:
# Vocabulary learned by the vectorizer, one row per term.
# get_feature_names() was removed from newer scikit-learn releases in favor of
# get_feature_names_out(); prefer the new method and fall back for older installs.
if hasattr(count_engine, 'get_feature_names_out'):
    vocab_terms = count_engine.get_feature_names_out()
else:
    vocab_terms = count_engine.get_feature_names()

# For convenience, the term strings themselves serve as the index (no separate term_id).
VOCAB = pd.DataFrame(vocab_terms, columns=['term_str'])
VOCAB = VOCAB.set_index('term_str')

## Get Generated Bag-of-Words

We do this just to show what the count vectorizer produced. `DTM` stands for document-term matrix. We convert this sparse matrix into a "thin" dataframe that keeps only terms with counts for each document. 

In [14]:
# Dense document-by-term count table: rows are doc_ids, columns are term strings.
DTM = pd.DataFrame(
    count_model.toarray(),
    index=corpus.index,
    columns=VOCAB.index,
)

# Long ("thin") form: one row per (doc_id, term_str) pair, keeping only nonzero counts.
BOW = DTM.stack().to_frame('n').loc[lambda counts: counts.n != 0]

In [15]:
DTM.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 4000 entries, 000 to zippy
dtypes: int64(4000)
memory usage: 305.2 MB


In [16]:
BOW.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 268148 entries, (0, 'acidity') to (9999, 'wine shows')
Columns: 1 entries, n to n
dtypes: int64(1)
memory usage: 3.2+ MB


## Compute TF-IDF

In [17]:
# Fit the TF-IDF weighting on the count matrix, then re-weight those same counts.
tfidf_engine = TfidfTransformer().fit(count_model)
tfidf_model = tfidf_engine.transform(count_model)

In [18]:
# Dense document-by-term table of TF-IDF weights, labelled like DTM
# (corpus index as rows, VOCAB terms as columns).
TFIDF = pd.DataFrame(tfidf_model.toarray(), index=corpus.index, columns=VOCAB.index)

In [19]:
# Attach the TF-IDF weight of each (doc_id, term_str) pair. The assignment aligns on
# BOW's index, so only the pairs already kept in BOW receive a value.
BOW['tfidf'] = TFIDF.stack()

In [20]:
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,n,tfidf
doc_id,term_str,Unnamed: 2_level_1,Unnamed: 3_level_1
0,acidity,1,0.108765
0,coastal,1,0.318555
0,flavors,1,0.082603
0,jam,1,0.231052
0,lots,1,0.250917
...,...,...,...
9999,tar,1,0.234509
9999,violets,1,0.252271
9999,weight,1,0.186367
9999,wine,1,0.070603


## Add Features to VOCAB

In [21]:
# Per-term features:
#   ngram      - True when the term contains a space (i.e. it has more than one word)
#   n          - total count of the term over all documents
#   tfidf_mean - mean TF-IDF weight of the term over all documents
VOCAB = VOCAB.assign(
    ngram=VOCAB.index.str.contains(' '),
    n=DTM.sum(),
    tfidf_mean=TFIDF.mean(),
)

In [22]:
VOCAB.sort_values('n', ascending=False)

Unnamed: 0_level_0,ngram,n,tfidf_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
wine,False,6219,0.040113
flavors,False,4837,0.032686
fruit,False,3795,0.029686
aromas,False,3071,0.024335
palate,False,2971,0.023535
...,...,...,...
tannins lend,True,12,0.000286
cement,False,12,0.000288
finish brisk,True,12,0.000328
mixture,False,12,0.000350


In [23]:
VOCAB[VOCAB.ngram==True].sort_values('n', ascending=False)

Unnamed: 0_level_0,ngram,n,tfidf_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
black cherry,True,606,0.007612
fruit flavors,True,575,0.008326
cabernet sauvignon,True,414,0.005714
ready drink,True,311,0.005603
palate offers,True,301,0.004329
...,...,...,...
crisp tart,True,12,0.000358
stewed plum,True,12,0.000287
balance wine,True,12,0.000330
pine needle,True,12,0.000302


In [24]:
VOCAB[VOCAB.ngram==True].sort_values('tfidf_mean', ascending=False)

Unnamed: 0_level_0,ngram,n,tfidf_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fruit flavors,True,575,0.008326
black cherry,True,606,0.007612
cabernet sauvignon,True,414,0.005714
ready drink,True,311,0.005603
medium bodied,True,285,0.004896
...,...,...,...
juicy wild cherry,True,13,0.000261
mature black cherry,True,13,0.000258
raspberry compote,True,12,0.000258
spice whiff,True,13,0.000255


# Generate Topic Models

We run Scikit Learn's [LatentDirichletAllocation algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation) and extract the THETA and PHI tables.

In [39]:
from sklearn.decomposition import NMF, LatentDirichletAllocation as LDA

## Using LDA

In [26]:
# Configure (but do not fit) the LDA estimator; it is fitted when THETA is built.
# NOTE(review): learning_offset looks like an online-learning setting; it may have no
# effect with the default learning method - confirm against the scikit-learn docs.
lda = LDA(
    n_components=n_topics,
    max_iter=max_iter,
    learning_offset=50.,
    random_state=0,
)

### THETA

In [27]:
# Fit LDA on either the TF-IDF or the raw count matrix (chosen by use_tfidf) and keep
# the resulting document-topic weights, one row per document and one column per topic.
lda_input = tfidf_model if use_tfidf else count_model
THETA = pd.DataFrame(lda.fit_transform(lda_input), index=corpus.index)
THETA = THETA.rename_axis(index='doc_id', columns='topic_id')

In [28]:
THETA.sample(20).style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
4506,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.044625,0.043002,0.003774,0.003774,0.639202,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.137321,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774,0.003774
1670,0.050526,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.356414,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.093514,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418,0.340494,0.004418,0.004418,0.004418,0.004418,0.004418,0.004418
2437,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.063319,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.711925,0.057092,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531,0.004531
7462,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.269358,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.456831,0.108774,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446,0.00446
7394,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.192879,0.004116,0.004116,0.004116,0.004116,0.345129,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.309704,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116,0.004116
8655,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.325409,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308,0.205736,0.004308,0.004308,0.004308,0.004308,0.004308,0.309468,0.004308,0.004308,0.004308,0.004308,0.004308,0.004308
3252,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.373313,0.004419,0.004419,0.458764,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419,0.004419
7215,0.32835,0.003614,0.045805,0.003614,0.178715,0.003614,0.069083,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.08743,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.003614,0.167756,0.003614
8303,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.574899,0.004838,0.180897,0.004838,0.065199,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838,0.004838
78,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.659765,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.160764,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723,0.004723


In [29]:
# THETA.sum(1).sum()

### PHI

In [30]:
# Topic-term weights from the fitted LDA model: one row per topic, one column per term.
PHI = (
    pd.DataFrame(lda.components_, columns=VOCAB.index)
    .rename_axis(index='topic_id', columns='term_str')
)

In [31]:
PHI.T.head().style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
000,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.848294,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.35003,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,4.85113,0.025,0.025,0.025,0.025,0.025
000 cases,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.068952,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,4.218056,0.025,0.025,0.025,0.025,0.025
10,7.746042,0.300106,0.025,0.025411,0.970494,0.337143,0.025,2.135264,0.206167,0.025,0.025005,0.025,0.200152,0.130099,5.300514,0.683757,0.025,0.025004,1.661724,0.598947,0.383469,0.32561,0.025,0.025,1.015342,2.68889,1.600118,1.021419,0.420111,0.041081,0.025,0.531789,0.025,0.025,0.040424,0.025,0.025,0.025,0.025,0.025
10 cabernet,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.443486,0.025,0.025,0.025,0.025,0.025,2.784984,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025
10 merlot,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,2.703099,0.025,0.025,0.025,0.263419,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.271572,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025


### Create Topic Glosses

In [32]:
# Number of highest-weighted terms kept per topic when building the topic glosses.
n_top_words = 7

In [33]:
# Topic glosses: for every topic, its n_top_words heaviest terms by PHI weight.
# The result has one row per topic_id and one column per rank (0 = heaviest term).
# drop() is called with the `columns=` keyword because recent pandas releases no
# longer accept the axis as a positional argument.
TOPICS = (
    PHI.stack()
    .to_frame('weight')
    .groupby('topic_id')
    .apply(
        lambda x: x.weight.sort_values(ascending=False)
        .head(n_top_words)
        .reset_index()
        .drop(columns='topic_id')
        .term_str
    )
)

In [34]:
TOPICS

term_str,0,1,2,3,4,5,6
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,cabernet,cabernet sauvignon,sauvignon,merlot,franc,blend,cabernet franc
1,cherry,sirah,petite sirah,petite,tannins,wine,flavors
2,aged wine,wood aged,wood aged wine,lemony,jerky,white stone,stone fruit
3,new wood,lacks,plum fruits,wood flavors,green fruits,cabernet sauvignon blend,sauvignon blend
4,years,hard,tannins,cherry,hard tannins,offers dried,palate offers dried
5,term,term aging,long term,pretty good,round wine,anjou,fruit core
6,rose petal,petal,offers juicy,palate offers juicy,muted,noticeable,uncomplicated
7,fruit,wine,spice,flavors,dark,blackberry,black
8,passion fruit,passion,flavors,fruit,finish,wine,lime acidity
9,50,grounds,grapy,coffee grounds,drinks,finish good,good


In [35]:
# Human-readable label per topic: "<topic_id> term, term, ...".
# Only the ranked term columns are joined, so re-running this cell after 'topwords'
# or 'doc_weight_sum' exist neither fails nor folds the previous label into the new one.
topic_term_cols = TOPICS.drop(columns=['topwords', 'doc_weight_sum'], errors='ignore')
TOPICS['topwords'] = topic_term_cols.apply(lambda x: str(x.name) + ' ' + ', '.join(x), axis=1)

### Add Doc Weights

In [36]:
# Total weight of each topic summed over all documents (column sums of THETA).
TOPICS['doc_weight_sum'] = THETA.sum(axis=0)

## Using NMF

In [48]:
# Configure (but do not fit) the NMF estimator with the same number of topics as LDA.
# NOTE(review): newer scikit-learn releases appear to have replaced `alpha` with
# `alpha_W` / `alpha_H`, which are scaled differently - confirm before upgrading.
nmf_engine = NMF(
    n_components=n_topics,
    init='nndsvd',
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)

In [63]:
# Fit NMF on the TF-IDF matrix and keep the document-topic weights,
# one row per document and one column per topic.
doc_topic_nmf = nmf_engine.fit_transform(tfidf_model)
THETA_NMF = pd.DataFrame(doc_topic_nmf, index=corpus.index).rename_axis(columns='topic_id')

In [64]:
THETA_NMF

topic_id,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.004795,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.030172,0.000000,0.000000,0.0,0.000000,0.000000,0.0
1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.003034,0.0,0.000000,0.000000,...,0.0,0.000728,0.000000,0.000000,0.012577,0.000000,0.0,0.000000,0.000000,0.0
2,0.0,0.000000,0.006951,0.000000,0.000000,0.000000,0.011587,0.0,0.000000,0.069036,...,0.0,0.001641,0.029044,0.001721,0.000000,0.000000,0.0,0.000000,0.003968,0.0
3,0.0,0.012151,0.000000,0.098693,0.000000,0.000000,0.000000,0.0,0.000000,0.001281,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
4,0.0,0.018638,0.050656,0.032773,0.000000,0.000000,0.005610,0.0,0.000000,0.000000,...,0.0,0.000000,0.019610,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.010256,0.000000,0.000000,0.005637,0.000000,0.008245,0.0,0.000396,0.039170,...,0.0,0.021381,0.027275,0.023808,0.000000,0.017378,0.0,0.000000,0.000000,0.0
9996,0.0,0.008959,0.000000,0.000000,0.000000,0.020932,0.032915,0.0,0.000000,0.000000,...,0.0,0.006693,0.005327,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
9997,0.0,0.000000,0.000000,0.000000,0.050167,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.003275,0.000000,0.000000,0.020139,0.000000,0.0,0.000000,0.000000,0.0
9998,0.0,0.025419,0.000265,0.005557,0.004323,0.000000,0.000000,0.0,0.000000,0.033460,...,0.0,0.000000,0.025494,0.020414,0.000000,0.000000,0.0,0.000000,0.000000,0.0


In [65]:
# Topic-term weights from the fitted NMF model: one row per topic, one column per term.
# The estimator is named `nmf_engine`; the previous reference to `nmf` only worked
# through leftover kernel state and raises NameError on a fresh run.
PHI_NMF = pd.DataFrame(nmf_engine.components_, columns=VOCAB.index)

In [66]:
# Label both axes so that stacking / grouping by 'topic_id' and 'term_str' works downstream.
PHI_NMF = PHI_NMF.rename_axis(index='topic_id', columns='term_str')

In [67]:
PHI_NMF.T.head().style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
000,0.021179,0.004845,0.0,0.0,0.0,0.0,0.0,0.031007,0.014939,0.0,0.0,0.0,0.0,0.0,0.044432,0.020039,0.0,0.0,0.006786,0.0,0.005343,0.0,0.002481,0.0,0.0,0.0,0.009684,0.001487,0.0,0.020873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000 cases,0.013892,0.0,0.0,0.0,0.0,0.0,0.0,0.00851,0.020116,0.0,0.0,0.0,0.0,0.0,0.04412,0.021054,0.0,0.0,0.014918,0.0,0.000195,0.0,0.002692,0.0,0.0,0.0,0.0,0.002532,0.0,0.018692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.028341,0.005959,0.030085,0.0,0.0,0.0,0.0,0.301782,0.0,0.0,0.0,0.125,0.0,0.038117,0.050237,0.104588,0.0,0.0,0.007658,0.002621,0.0,0.0,0.0,0.049366,0.0,0.0,0.0,0.020636,0.0,0.058076,0.0,0.0,0.017124,0.0,0.0,0.002794,0.066896,0.0,0.284645
10 cabernet,0.0,0.0,0.005685,0.0,0.0,0.0,0.0,0.004044,0.127351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008705,0.0,0.0,0.020427,0.0,0.0,0.0,0.023757,0.0,0.0
10 merlot,0.0,0.0,0.003792,1.4e-05,0.0,0.0,0.0,0.0,0.076373,0.0,0.0,0.0,0.002586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001801,0.0,0.0,0.001186,0.0,0.0,0.01253,0.0,0.0,0.000617,0.020565,0.0,0.0,0.009227,0.0,0.002417


In [60]:
# NMF topic glosses: for every topic, its n_top_words heaviest terms by PHI_NMF weight.
# The result has one row per topic_id and one column per rank (0 = heaviest term).
# drop() is called with the `columns=` keyword because recent pandas releases no
# longer accept the axis as a positional argument.
TOPICS_NMF = (
    PHI_NMF.stack()
    .to_frame('weight')
    .groupby('topic_id')
    .apply(
        lambda x: x.weight.sort_values(ascending=False)
        .head(n_top_words)
        .reset_index()
        .drop(columns='topic_id')
        .term_str
    )
)

In [61]:
TOPICS_NMF

term_str,0,1,2,3,4,5,6
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,flavors,fruit flavors,good,orange,bit,feels,cherry flavors
1,wine,character,age,years,young,great,structure
2,black,black cherry,black fruit,black currant,currant,licorice,black plum
3,white,peach,white peach,stone,lemon,citrus,lime
4,fruit,fruit flavors,black fruit,stone,stone fruit,red fruit,cherry fruit
5,red,red fruit,red cherry,red berry,red currant,currant,red fruits
6,acidity,bright,balanced,bright acidity,juicy,vibrant,crisp acidity
7,blackberry,chocolate,cassis,tannic,good,big,jam
8,cabernet,sauvignon,cabernet sauvignon,merlot,franc,cabernet franc,verdot
9,finish,long,finish drink,herbal,flavors finish,feels,note


In [68]:
# Human-readable label per NMF topic: "<topic_id> term, term, ...".
# Only the ranked term columns are joined, so re-running this cell after 'topwords'
# or 'doc_weight_sum' exist neither fails nor folds the previous label into the new one.
nmf_term_cols = TOPICS_NMF.drop(columns=['topwords', 'doc_weight_sum'], errors='ignore')
TOPICS_NMF['topwords'] = nmf_term_cols.apply(lambda x: str(x.name) + ' ' + ', '.join(x), axis=1)

### Add Doc Weights

In [69]:
# Total weight of each NMF topic summed over all documents (column sums of THETA_NMF).
TOPICS_NMF['doc_weight_sum'] = THETA_NMF.sum(axis=0)

In [72]:
TOPICS_NMF[['topwords']]

term_str,topwords
topic_id,Unnamed: 1_level_1
0,"0 flavors, fruit flavors, good, orange, bit, f..."
1,"1 wine, character, age, years, young, great, s..."
2,"2 black, black cherry, black fruit, black curr..."
3,"3 white, peach, white peach, stone, lemon, cit..."
4,"4 fruit, fruit flavors, black fruit, stone, st..."
5,"5 red, red fruit, red cherry, red berry, red c..."
6,"6 acidity, bright, balanced, bright acidity, j..."
7,"7 blackberry, chocolate, cassis, tannic, good,..."
8,"8 cabernet, sauvignon, cabernet sauvignon, mer..."
9,"9 finish, long, finish drink, herbal, flavors ..."


# Save the Model

## Keep Corpus Label Info

In [37]:
# Keep every corpus column except the document key and the two text columns.
# drop() replaces the earlier set-based column selection: pandas >= 2.0 rejects a set
# as an indexer, and a set gives no stable column order. errors='ignore' keeps the old
# tolerance for any of these columns being absent.
LABELS = corpus.drop(columns=['doc_key', 'doc_content', 'doc_original'], errors='ignore')

## Save each dataframe

This could of course be generalized as a function or class method.

In [77]:
# Make sure the output directory exists; to_csv does not create missing directories.
os.makedirs(db_dir, exist_ok=True)

# Every table is written to "<db_dir>/<data_prefix>-<NAME>.csv" with its index included.
tables_to_save = {
    'LABELS': LABELS,
    'VOCAB': VOCAB,
    'BOW': BOW,
    'TOPICS': TOPICS,
    'THETA': THETA,
    'PHI': PHI,
    'TOPICS_NMF': TOPICS_NMF,
    'THETA_NMF': THETA_NMF,
    'PHI_NMF': PHI_NMF,
}
for table_name, table in tables_to_save.items():
    table.to_csv(f"{db_dir}/{data_prefix}-{table_name}.csv", index=True)