# Topic Modeling with SciKit Learn

In this notebook we create a topic model from our corpus using SciKit Learn's library. We'll save our results and then use another notebook to explore the results.

# Set Up

## Imports

In [103]:
from pathlib import Path

import numpy as np
import pandas as pd

## Configuration

In [104]:
# Input corpus and output locations.
corpus_file = './corpora/winereviews-tapi.csv'
db_dir = './db'
csv_sep = '|'

# Prefix for every saved table: the corpus file name up to its first hyphen.
corpus_basename = corpus_file.rsplit('/', 1)[-1]
data_prefix = corpus_basename.partition('-')[0]

In [105]:
# Show the prefix derived from the corpus file name; it is used in the saved file names.
data_prefix

'winereviews'

## Parameters

In [106]:
n_terms = 4000 # Vocabulary size (passed to CountVectorizer as max_features)
ngram_range = (1,4) # (min, max) n-gram length passed to CountVectorizer
use_tfidf = True # If True, LDA is fit on the TF-IDF matrix; otherwise on the raw counts
n_topics = 20 # Number of topics
max_iter = 5 # Number of iterations for topic model

In [107]:
# Integer topic labels 0 .. n_topics - 1.
topic_cols = list(range(n_topics))

# Import Corpus Data

We import a corpus in our standard format

In [108]:
# Read the delimited corpus file and label the row index as doc_id.
corpus = pd.read_csv(corpus_file, sep=csv_sep).rename_axis('doc_id')

## Inspect contents

In [109]:
# Preview the first rows of the corpus.
corpus.head()

Unnamed: 0_level_0,doc_key,doc_title,doc_label,doc_province,doc_points,doc_price,doc_content,doc_original,doc_variety,doc_taster,doc_place
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,86023,Lange 2011 Three Hills Cuvée Pinot Noir (Willa...,US,Oregon,88,40.0,"A tart, astringent Pinot, it needs a bit more ...","A tart, astringent Pinot, it needs a bit more ...",Pinot Noir,Paul Gregutt,US Oregon Willamette Valley Willamette Valley
1,45852,Finca Casa Lo Alto 2008 Reserva Red (Utiel-Req...,Spain,Levante,84,35.0,The cola and licorice aromas are candied and e...,The cola and licorice aromas are candied and e...,Red Blend,Michael Schachner,Spain Levante Utiel-Requena
2,32297,Plantagenet 2004 Omrah Cabernet Sauvignon (Wes...,Australia,Western Australia,88,15.0,"A good value, this starts off a little shaky t...","A good value, this starts off a little shaky t...",Cabernet Sauvignon,Joe Czerwinski,Australia Western Australia Western Australia
3,43293,Bougrier 2012 Rosé d'Anjou (Rosé) by Roger Voss,France,Loire Valley,84,13.0,"Typical, light and sweet rosé, fruity with bri...","Typical, light and sweet rosé, fruity with bri...",Rosé,Roger Voss,France Loire Valley Rosé d'Anjou
4,118523,Bolla 2007 Le Poiane (Valpolicella Classico S...,Italy,Veneto,87,14.0,If you aren't familiar with Ripasso (a hybrid ...,If you aren't familiar with Ripasso (a hybrid ...,"Corvina, Rondinella, Molinara",,Italy Veneto Valpolicella Classico Superiore R...


In [110]:
# Five randomly chosen documents; no random_state is given, so the rows differ between runs.
corpus.sample(5)

Unnamed: 0_level_0,doc_key,doc_title,doc_label,doc_province,doc_points,doc_price,doc_content,doc_original,doc_variety,doc_taster,doc_place
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1985,94180,Carmel Road 2012 Liberated Riesling (Arroyo Se...,US,California,85,16.0,This Riesling is distinctly off-dry. It's rich...,This Riesling is distinctly off-dry. It's rich...,Riesling,,US California Arroyo Seco Central Coast
4790,22401,Barnard Griffin 2012 Syrah (Columbia Valley (W...,US,Washington,87,17.0,"Streaks of citrus, berry and cracker are integ...","Streaks of citrus, berry and cracker are integ...",Syrah,Paul Gregutt,US Washington Columbia Valley (WA) Columbia Va...
3174,75139,Cutruzzola 2011 Riven Rock Vineyard Riesling (...,US,California,87,22.0,"This polished, un-oaked Riesling shows the var...","This polished, un-oaked Riesling shows the var...",Riesling,,US California San Luis Obispo County Central C...
225,39815,Leyendas de Familia 2009 Magia Negra Ensamblaj...,Chile,Maule Valley,90,69.0,"Herbal, olive-infused aromas of berry fruits a...","Herbal, olive-infused aromas of berry fruits a...",Bordeaux-style Red Blend,Michael Schachner,Chile Maule Valley
5883,74073,Lamiable NV Grand Cru Brut (Champagne) Grand ...,France,Champagne,91,,Lamiable is a small house in Tours-sur-Marne a...,Lamiable is a small house in Tours-sur-Marne a...,Champagne Blend,Roger Voss,France Champagne Champagne


In [111]:
# (number of documents, number of columns)
corpus.shape

(10000, 11)

In [112]:
# Column dtypes and non-null counts, to spot missing values.
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   doc_key       10000 non-null  int64  
 1   doc_title     10000 non-null  object 
 2   doc_label     9993 non-null   object 
 3   doc_province  9993 non-null   object 
 4   doc_points    10000 non-null  int64  
 5   doc_price     9301 non-null   float64
 6   doc_content   10000 non-null  object 
 7   doc_original  10000 non-null  object 
 8   doc_variety   10000 non-null  object 
 9   doc_taster    7945 non-null   object 
 10  doc_place     10000 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 859.5+ KB


# Create Bag-of-Words 

i.e., a __Count Vector Space__

We use Scikit Learn's CountVectorizer to convert our F1 corpus of paragraphs into a document-term vector space of word counts.

In [113]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [114]:
# Configure the vectorizer: English stop words removed, n-grams within ngram_range,
# vocabulary capped at n_terms features.
count_engine = CountVectorizer(
    ngram_range=ngram_range,
    stop_words='english',
    max_features=n_terms,
)
# Learn the vocabulary and count terms per document in one pass.
count_model = count_engine.fit_transform(corpus['doc_content'])

## Get Generated VOCAB

In [115]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(count_engine, 'get_feature_names_out'):
    vocab_terms = count_engine.get_feature_names_out()
else:
    vocab_terms = count_engine.get_feature_names()

# One row per vocabulary term, indexed by the term string itself.
VOCAB = pd.DataFrame(vocab_terms, columns=['term_str'])
VOCAB = VOCAB.set_index('term_str')
# VOCAB.index.name = 'term_id' # For convenience, we'll use strings for IDs

## Get Generated Bag-of-Words

We do this just to show what the count vectorizer produced. `DTM` stands for document-term matrix. We convert this sparse matrix into a "thin" dataframe that keeps only terms with counts for each document. 

In [116]:
# Dense document-term matrix: one row per document, one column per vocabulary term.
DTM = pd.DataFrame(
    count_model.toarray(),
    index=corpus.index,
    columns=VOCAB.index,
)
# Long ("thin") form: one row per (doc_id, term_str) pair, keeping only non-zero counts.
term_counts = DTM.stack().to_frame('n')
BOW = term_counts[term_counts.n != 0]

In [117]:
# Summary only (verbose=False skips the per-column listing); shows the dense matrix's memory use.
DTM.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 4000 entries, 000 to zippy acidity
dtypes: int64(4000)
memory usage: 305.2 MB


In [118]:
# Summary of the thin table, for comparison with the dense DTM.
BOW.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 267755 entries, (0, 'accented') to (9999, 'wine')
Columns: 1 entries, n to n
dtypes: int64(1)
memory usage: 3.2+ MB


## Compute TF-IDF

In [119]:
# Fit the TF-IDF weighting on the count matrix and transform it in the same call.
tfidf_engine = TfidfTransformer()
tfidf_model = tfidf_engine.fit_transform(count_model)

In [120]:
# Dense TF-IDF matrix with the same row and column labels as DTM.
TFIDF = pd.DataFrame(
    tfidf_model.toarray(),
    columns=VOCAB.index,
    index=corpus.index,
)

In [121]:
# Attach TF-IDF weights to BOW. The stacked TFIDF Series is indexed by (doc_id, term_str),
# so the assignment aligns on those labels and only pairs already in BOW receive a value.
BOW['tfidf'] = TFIDF.stack()

In [122]:
# Display the bag-of-words table (pandas truncates the middle rows).
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,n,tfidf
doc_id,term_str,Unnamed: 2_level_1,Unnamed: 3_level_1
0,accented,1,0.212521
0,astringent,1,0.219259
0,barrel,1,0.183510
0,bit,1,0.164003
0,bottle,1,0.198999
...,...,...,...
9999,showing,1,0.160109
9999,stone,1,0.161591
9999,stone fruit,1,0.177589
9999,texture,1,0.118262


## Add Features to VOCAB

In [123]:
# Number of whitespace-separated tokens in each term. Computed from the index with
# vectorized string methods, so no placeholder column or row-wise apply is needed.
VOCAB['ngram_len'] = VOCAB.index.str.split().str.len()
# Total count of each term over the corpus (column sums of the document-term matrix).
VOCAB['n'] = DTM.sum()
# Mean TF-IDF weight of each term over all documents.
VOCAB['tfidf_mean'] = TFIDF.mean()

In [124]:
# How many vocabulary terms there are of each n-gram length.
VOCAB.ngram_len.value_counts()

1    2099
2    1722
3     172
4       7
Name: ngram_len, dtype: int64

In [125]:
# VOCAB[VOCAB.ngram_len == VOCAB.ngram_len.max()].sort_values('n', ascending=False)

In [126]:
# VOCAB.sort_values('n', ascending=False)

In [127]:
# VOCAB[VOCAB.ngram_len > 1].sort_values('n', ascending=False)

In [128]:
# VOCAB[VOCAB.ngram_len > 1].sort_values('tfidf_mean', ascending=False)

# Generate Topic Models

We run Scikit Learn's [LatentDirichletAllocation algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation) and extract the THETA and PHI tables.

In [129]:
from sklearn.decomposition import NMF, LatentDirichletAllocation as LDA

## Using LDA

In [130]:
# LDA estimator: n_topics components, max_iter passes, fixed random_state.
lda = LDA(
    n_components=n_topics,
    max_iter=max_iter,
    learning_offset=50.,
    random_state=0,
)

### THETA

In [131]:
# Pick the input matrix once (TF-IDF or raw counts), then fit LDA on it and keep the
# resulting document-topic weights, one row per document and one column per topic.
lda_input = tfidf_model if use_tfidf else count_model
THETA = pd.DataFrame(lda.fit_transform(lda_input), index=corpus.index)
THETA = THETA.rename_axis(index='doc_id', columns='topic_id')

In [132]:
# Random sample of 20 documents, shaded by topic weight (unseeded, so it varies per run).
THETA.sample(20).style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3438,0.009994,0.009994,0.009994,0.009994,0.38524,0.009994,0.009994,0.009994,0.009994,0.009994,0.009994,0.009994,0.009994,0.009994,0.009994,0.009994,0.434867,0.009994,0.009994,0.009994
2021,0.007467,0.007467,0.007467,0.007467,0.007467,0.007467,0.007467,0.200129,0.549294,0.123641,0.007467,0.007467,0.007467,0.007467,0.007467,0.007467,0.007467,0.007467,0.007467,0.007467
4632,0.008986,0.008986,0.008986,0.008986,0.256378,0.008986,0.008986,0.008986,0.008986,0.008986,0.008986,0.415015,0.008986,0.008986,0.008986,0.008986,0.175845,0.008986,0.008986,0.008986
8593,0.156376,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.007251,0.713115,0.007251,0.007251
7610,0.007887,0.102788,0.007887,0.007887,0.007887,0.007887,0.007887,0.007887,0.755243,0.007887,0.007887,0.007887,0.007887,0.007887,0.007887,0.007887,0.007887,0.007887,0.007887,0.007887
8118,0.329383,0.007566,0.007566,0.007566,0.007566,0.007566,0.007566,0.007566,0.032136,0.007566,0.007566,0.509852,0.007566,0.007566,0.007566,0.007566,0.007566,0.007566,0.007566,0.007566
7953,0.008202,0.008202,0.052318,0.008202,0.008202,0.008202,0.008202,0.008202,0.008202,0.52503,0.008202,0.008202,0.008202,0.236868,0.054556,0.008202,0.008202,0.008202,0.008202,0.008202
2799,0.007641,0.007641,0.007641,0.007641,0.007641,0.007641,0.007641,0.007641,0.007641,0.007641,0.007641,0.007641,0.007641,0.007641,0.186692,0.007641,0.007641,0.007641,0.457134,0.226285
9479,0.008787,0.008787,0.008787,0.008787,0.833052,0.008787,0.008787,0.008787,0.008787,0.008787,0.008787,0.008787,0.008787,0.008787,0.008787,0.008787,0.008787,0.008787,0.008787,0.008787
7151,0.008912,0.008912,0.008912,0.008912,0.008912,0.008912,0.008912,0.008912,0.008912,0.008912,0.008912,0.008912,0.008912,0.830669,0.008912,0.008912,0.008912,0.008912,0.008912,0.008912


In [133]:
# THETA.sum(1).sum()

### PHI

In [134]:
# Topic-term weights from the fitted LDA model: one row per topic, one column per term.
PHI = pd.DataFrame(
    lda.components_,
    columns=VOCAB.index,
).rename_axis(index='topic_id', columns='term_str')

In [135]:
# Transposed so terms are rows; show the first five terms shaded by weight per topic.
PHI.T.head().style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
000,0.05,0.05,0.05,0.276793,0.05,0.05,0.05,0.05,0.05,0.05,0.513132,0.222212,0.05,2.728725,0.990855,0.588163,0.05,0.05,0.05,0.05
000 cases,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.051734,0.07287,0.05,2.707337,1.02852,0.05,0.05,0.05,0.05,0.05
10,0.184771,0.05,0.278381,1.001127,0.05,1.137785,0.676016,0.05,3.057713,0.276093,0.143498,0.09121,2.089341,8.492685,0.568426,0.194201,0.310856,3.552122,2.586569,0.698969
10 merlot,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,1.027632,1.342182,0.05,0.05,0.05,1.023215,0.05,0.05
10 years,0.05,0.05,0.05,0.05,0.050007,0.05,0.05,0.05,2.506586,0.05,0.05,0.05,0.05,0.72404,0.05,0.05,0.050577,0.495653,3.008859,0.380836


### Create Topic Glosses

In [136]:
# Number of highest-weighted terms kept per topic when building the glosses.
n_top_words = 7

In [137]:
# Top n_top_words terms per topic: one row per topic, one column per rank (0 = heaviest).
# `drop` is given its axis by keyword because the positional form `.drop('topic_id', 1)`
# raises a TypeError from pandas 2.0 onwards.
TOPICS = (
    PHI.stack()
    .to_frame('weight')
    .groupby('topic_id')
    .apply(
        lambda topic: topic.weight
        .sort_values(ascending=False)
        .head(n_top_words)
        .reset_index()
        .drop(columns='topic_id')
        .term_str
    )
)

In [138]:
# Display the ranked top terms for each LDA topic.
TOPICS

term_str,0,1,2,3,4,5,6
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,cherry,fruit,wine,aromas,flavors,bright,notes
1,cool climate,climate,italy,cool,lip,smacking,informal
2,flavors,wine,dr,fruit,dr pepper,lacking,blueberry
3,wine,tannins,drink,fruits,acidity,ripe,rich
4,white,palate,peach,apple,citrus,acidity,flavors
5,wine,acidity,drink,crisp,character,ripe,texture
6,pinot,cherry,pinot noir,noir,raspberry,red,flavors
7,wine fine,barrel,cherry crushed,fruit,time,crushed raspberry,red
8,black,wine,fruit,vineyard,cherry,tannins,flavors
9,finish,aromas,flavors,palate,fruit,green,nose


In [139]:
# Gloss per topic: "<topic_id> term, term, ...". Only the leading ranked-term columns are
# joined, so the cell can be re-run after topwords / doc_weight_sum have been added
# without pulling those columns into the string (or failing on the float column).
TOPICS['topwords'] = TOPICS.iloc[:, :n_top_words].apply(
    lambda x: str(x.name) + ' ' + ', '.join(x),
    axis=1,
)

### Add Doc Weights

In [140]:
# Total weight of each topic over all documents (column sums of THETA), aligned on topic_id.
TOPICS['doc_weight_sum'] = THETA.sum()

## Using NMF

In [141]:
# NMF estimator with the same number of components as the LDA model, 'nndsvd'
# initialisation and a fixed random_state.
# NOTE(review): `alpha` appears to have been deprecated in scikit-learn 1.0 and removed in
# 1.2 in favour of alpha_W / alpha_H, which are scaled differently. Confirm against the
# installed version before re-running, and re-tune rather than copying the value across.
nmf_engine = NMF(n_components=n_topics, init='nndsvd', random_state=1, alpha=.1, l1_ratio=.5)

### THETA

In [142]:
# Document-topic weights from NMF fitted on the TF-IDF matrix.
THETA_NMF = pd.DataFrame(
    nmf_engine.fit_transform(tfidf_model),
    index=corpus.index,
).rename_axis(columns='topic_id')

In [143]:
# Random sample of 20 documents, shaded by NMF topic weight (unseeded, so it varies per run).
THETA_NMF.sample(20).style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
8791,0.05511,0.0,0.0,0.0,0.0,0.001323,0.0,0.01258,0.0,0.0,0.009889,0.0,0.0,0.0,0.0,0.0,0.00328,0.0,0.0,0.019248
5295,0.015473,0.003566,0.0,0.0,0.044617,0.0,0.001208,0.010448,0.0,0.0,0.0,0.0,0.0,0.0,0.001462,0.026603,0.01146,0.0,0.001414,0.0
9667,0.0,0.0,0.0,0.007282,0.013872,0.0,0.000999,0.000872,1.9e-05,0.00854,0.022645,0.0,0.0,0.0,0.0,0.0,0.033549,0.0,0.0,0.000208
2735,0.010685,0.019591,0.0,0.001386,0.0,0.000325,0.0,0.002151,0.016381,0.002913,0.0,0.0,0.0,0.0,0.0,0.00903,0.0,0.000354,0.0,0.0
8775,0.001132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007396,0.0,0.042825,0.000852,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6592,0.002261,0.003317,0.0,0.007582,0.0,0.0,0.00107,0.001602,0.0,0.016645,0.0,0.0,0.023116,0.045301,0.0,0.0,0.0,0.0,0.0,0.0
4077,0.005574,0.0018,0.0,0.001143,0.0,0.00349,0.022386,0.000784,0.015588,0.000211,0.0,0.001555,0.0,0.0,0.0,0.0,0.0,0.0,0.008975,0.0
8995,0.008285,0.020099,0.0,0.0,0.0,0.0,0.0,0.0,0.015091,0.0,0.0,0.0,0.0,0.0,0.033026,0.0,0.0,0.057079,0.0,0.0
6347,0.01797,0.0,0.0,0.008283,0.0,0.0,0.009181,0.034713,0.0,0.000465,0.0,0.001838,0.0,0.0,0.0,0.0,0.006241,0.0,0.0,0.0
4871,0.0,0.012213,0.0,0.012664,0.0,0.057171,0.0,0.0,0.0135,0.033022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024064


### PHI

In [144]:
# Topic-term weights from the fitted NMF model: one row per topic, one column per term.
PHI_NMF = pd.DataFrame(nmf_engine.components_, columns=VOCAB.index)

In [145]:
# Label both axes so stacked and grouped results carry meaningful level names.
PHI_NMF = PHI_NMF.rename_axis(index='topic_id', columns='term_str')

In [146]:
# Transposed so terms are rows; show the first five terms shaded by weight per topic.
PHI_NMF.T.head().style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000 cases,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.043333,0.0,0.0,0.055016,0.160916,0.0,0.0,0.022803,0.0,0.0,0.0,0.0,0.017203,0.0,0.0,0.007809,0.024557,0.0,0.0,0.0
10 merlot,0.0,0.0,0.0,0.004434,0.036964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 years,0.023723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04473,0.0,0.0,0.0


### Topics

In [147]:
# Top n_top_words terms per NMF topic: one row per topic, one column per rank (0 = heaviest).
# `drop` is given its axis by keyword because the positional form `.drop('topic_id', 1)`
# raises a TypeError from pandas 2.0 onwards.
TOPICS_NMF = (
    PHI_NMF.stack()
    .to_frame('weight')
    .groupby('topic_id')
    .apply(
        lambda topic: topic.weight
        .sort_values(ascending=False)
        .head(n_top_words)
        .reset_index()
        .drop(columns='topic_id')
        .term_str
    )
)

In [148]:
# Display the ranked top terms for each NMF topic.
TOPICS_NMF

term_str,0,1,2,3,4,5,6
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,wine,fruits,ripe,rich,drink,tannins,wood
1,crisp,light,acidity,fresh,bright,wine,fruity
2,lemon,lime,lemon lime,grapefruit,zest,riesling,orange
3,tannins,alongside,palate,aromas,cherry,offers,palate offers
4,cabernet,sauvignon,cabernet sauvignon,blend,merlot,franc,cabernet franc
5,red,red berry,red cherry,red fruit,red fruits,berry,red currant
6,berry,finish,plum,flavors,aromas,herbal,feels
7,black,black cherry,cherry,pepper,black pepper,blackberry,currant
8,fruit,fruit flavors,flavors,tropical fruit,black fruit,tropical,aromas
9,sweet,flavors,vanilla,like,pineapple,honey,soft


In [149]:
# Gloss per topic: "<topic_id> term, term, ...". Only the leading ranked-term columns are
# joined, so the cell can be re-run after topwords / doc_weight_sum have been added
# without pulling those columns into the string (or failing on the float column).
TOPICS_NMF['topwords'] = TOPICS_NMF.iloc[:, :n_top_words].apply(
    lambda x: str(x.name) + ' ' + ', '.join(x),
    axis=1,
)

### Add Doc Weights

In [150]:
# Total weight of each topic over all documents (column sums of THETA_NMF), aligned on topic_id.
TOPICS_NMF['doc_weight_sum'] = THETA_NMF.sum()

In [151]:
# Show only the summary columns that follow the n_top_words ranked-term columns.
# The offset comes from n_top_words rather than a literal 7, so it stays correct if that
# parameter changes.
TOPICS_NMF.iloc[:, n_top_words:]

term_str,topwords,doc_weight_sum
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"0 wine, fruits, ripe, rich, drink, tannins, wood",89.181919
1,"1 crisp, light, acidity, fresh, bright, wine, ...",65.148885
2,"2 lemon, lime, lemon lime, grapefruit, zest, r...",39.623773
3,"3 tannins, alongside, palate, aromas, cherry, ...",67.308991
4,"4 cabernet, sauvignon, cabernet sauvignon, ble...",44.589535
5,"5 red, red berry, red cherry, red fruit, red f...",40.4883
6,"6 berry, finish, plum, flavors, aromas, herbal...",70.376952
7,"7 black, black cherry, cherry, pepper, black p...",57.334993
8,"8 fruit, fruit flavors, flavors, tropical frui...",55.332295
9,"9 sweet, flavors, vanilla, like, pineapple, ho...",60.523884


# Save the Model

## Keep Corpus Label Info

This is effectively the LIB table.

In [152]:
# Keep every corpus column except the key and the two text columns.
# Uses drop(columns=...) instead of indexing with a set: a set indexer raises a TypeError
# from pandas 2.0 onwards, and before that it returned the columns in arbitrary order.
# errors='ignore' keeps the original tolerance for a corpus that lacks one of these columns.
LABELS = corpus.drop(columns=['doc_key', 'doc_content', 'doc_original'], errors='ignore')

## Save each dataframe

This could of course be generalized as a function or class method.

In [153]:
# Persist every table as "<db_dir>/<data_prefix>-<TABLE>.csv", keeping the index.
# to_csv does not create missing directories, so make sure db_dir exists first.
Path(db_dir).mkdir(parents=True, exist_ok=True)

tables_to_save = {
    'LABELS': LABELS,
    'VOCAB': VOCAB,
    'BOW': BOW,
    'TOPICS': TOPICS,
    'THETA': THETA,
    'PHI': PHI,
    'TOPICS_NMF': TOPICS_NMF,
    'THETA_NMF': THETA_NMF,
    'PHI_NMF': PHI_NMF,
}
for table_name, table in tables_to_save.items():
    table.to_csv(f"{db_dir}/{data_prefix}-{table_name}.csv", index=True)