# Topic Modeling with SciKit Learn

In this notebook we create a topic model from our corpus using the scikit-learn library. We save the results so that another notebook can be used to explore them.

# Set Up

## Imports

In [1]:
import pandas as pd
import numpy as np
from lib import tapi

## Configuration

In [2]:
# Show the corpus names known to the project-local `tapi` helper.
tapi.list_corpora()

['airbnb',
 'anphoblacht',
 'arxiv',
 'covid19',
 'jstor_hyperparameter',
 'novels',
 'okcupid',
 'tamilnet',
 'winereviews',
 'yelp',
 'zuboff']

In [3]:
# Name of the corpus to model (presumably one of the names listed by tapi.list_corpora() -- confirm).
# data_prefix = 'winereviews'
data_prefix = 'tamilnet'

In [4]:
# Edition object for the chosen corpus. The tables built below are attached to it
# as attributes (db.VOCAB, db.BOW, ...) and written out later with db.save_tables().
db = tapi.Edition(data_prefix)

## Parameters

In [5]:
# Tunable parameters for vectorization and topic modeling.
n_terms = 4000 # Vocabulary size (passed to CountVectorizer as max_features)
ngram_range = (1,4) # ngram min and max lengths (passed to CountVectorizer)
n_topics = 20 # Number of topics (n_components for both LDA and NMF)
max_iter = 5 # Number of iterations for the LDA topic model

In [6]:
# Integer topic labels 0 .. n_topics-1.
topic_cols = list(range(n_topics))

## Create Tables Object

These tables constitute a "digital critical edition."

# Import Corpus Data

We import a corpus in our standard format.

In [7]:
# Load the corpus table for this edition (one row per document, indexed by doc_id).
corpus = db.get_corpus()

## Inspect contents

In [8]:
corpus.head(10)

Unnamed: 0_level_0,doc_key,doc_title,doc_uri,doc_label,doc_ord,doc_content,doc_original,doc_year,doc_date,doc_citation
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,19631,Speaker urged to ensure protection to parliame...,https://www.tamilnet.com/art.html?catid=13&art...,2007,2007,anura bandaranaike minister for national heri...,<h1>Speaker urged to ensure protection to parl...,,,
1,16455,Youth shot at Valaichenai Church,https://www.tamilnet.com/art.html?catid=13&art...,2006,2006,two motorbike riding gunmen shot and wounded ...,<h1>Youth shot at Valaichenai Church </h1> [Ta...,,,
2,13890,Token fast held in Trinco in support of aid deal,https://www.tamilnet.com/art.html?catid=13&art...,2005,2005,one-day token fast was held in trincomalee to...,<h1>Token fast held in Trinco in support of ai...,,,
3,15106,"World concerned over escalation of violence, S...",https://www.tamilnet.com/art.html?catid=13&art...,2005,2005,the global community is gravely concerned ove...,<h1>World concerned over escalation of violenc...,,,
4,26524,United Sri Lanka is the steadfast aim of UNP ...,https://www.tamilnet.com/art.html?catid=13&art...,2009,2009,general secretary of united national party un...,<h1>United Sri Lanka is the steadfast aim of U...,,,
5,1428,UN official calls for end to war,https://www.tamilnet.com/art.html?catid=13&art...,1998,1998,the un special representative on children in ...,<h1>UN official calls for end to war </h1> [Ta...,,,
6,14822,Paramilitary cadres abduct 3 youths in Batticaloa,https://www.tamilnet.com/art.html?catid=13&art...,2005,2005,armed men believed to be the cadres of the pa...,<h1>Paramilitary cadres abduct 3 youths in Bat...,,,
7,4198,Soldier killed in firefight,https://www.tamilnet.com/art.html?catid=13&art...,1999,1999,a soldiers was killed when members of the lib...,<h1>Soldier killed in firefight </h1> [TamilNe...,,,
8,31955,Badurdeen squad threatens Tamil newspaper edit...,https://www.tamilnet.com/art.html?catid=13&art...,2014,2014,a squad led by upfa provincial councillor fro...,<h1>Badurdeen squad threatens Tamil newspaper ...,,,
9,8432,Tamil villages without post boxes,https://www.tamilnet.com/art.html?catid=13&art...,2003,2003,several tamil villages in the trincomalee dis...,<h1>Tamil villages without post boxes </h1> [T...,,,


In [9]:
corpus.shape

(10000, 10)

In [10]:
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   doc_key       10000 non-null  int64  
 1   doc_title     10000 non-null  object 
 2   doc_uri       10000 non-null  object 
 3   doc_label     10000 non-null  int64  
 4   doc_ord       10000 non-null  int64  
 5   doc_content   10000 non-null  object 
 6   doc_original  10000 non-null  object 
 7   doc_year      0 non-null      float64
 8   doc_date      0 non-null      float64
 9   doc_citation  0 non-null      float64
dtypes: float64(3), int64(3), object(4)
memory usage: 781.4+ KB


# Create Bag-of-Words 

i.e., a __Count Vector Space__

We use Scikit Learn's CountVectorizer to convert our F1 corpus of paragraphs into a document-term vector space of word counts.

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
# Configure the vectorizer: cap the vocabulary at n_terms features, drop English
# stop words, and count n-grams whose length falls within ngram_range.
count_engine = CountVectorizer(
    ngram_range=ngram_range,
    stop_words='english',
    max_features=n_terms,
)
# Learn the vocabulary and build the sparse document-term count matrix.
count_model = count_engine.fit_transform(corpus['doc_content'])

In [None]:
# corpus.doc_content

## Get Generated VOCAB

In [None]:
# Build the VOCAB table from the vectorizer's learned vocabulary, indexed by term string.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back only on installs that predate it.
try:
    term_strs = count_engine.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0
    term_strs = count_engine.get_feature_names()
db.VOCAB = pd.DataFrame({'term_str': list(term_strs)}).set_index('term_str')
db.VOCAB['ngram_len'] = None # Placeholder; the real value is computed once term features are added
# For convenience, the term strings themselves (not integer term_ids) serve as IDs.

In [None]:
# Peek at ten vocabulary entries; the fixed seed makes the displayed sample reproducible across runs.
db.VOCAB.sample(10, random_state=0)

## Get Generated BOW

We do this just to show what the count vectorizer produced. `DTM` stands for document-term matrix. We convert this sparse matrix into a "thin" dataframe that keeps only terms with counts for each document. 

In [None]:
# Dense document-term matrix: rows are documents, columns are vocabulary terms.
db.DTM = pd.DataFrame(count_model.toarray(), columns=db.VOCAB.index, index=corpus.index)
# "Thin" bag of words: one row per (document, term) pair, keeping only non-zero counts.
term_counts = db.DTM.stack()
db.BOW = term_counts[term_counts != 0].to_frame('n')

In [None]:
# db.BOW.head(10)

In [None]:
db.DTM.info(verbose=False)

In [None]:
db.BOW.info(verbose=False)

## Compute TF-IDF

In [None]:
# Re-weight the raw counts as TF-IDF: fit the IDF weights on the count matrix,
# then transform that same matrix.
tfidf_engine = TfidfTransformer()
tfidf_model = tfidf_engine.fit(count_model).transform(count_model)

In [None]:
# Dense TF-IDF table carrying the same row (document) and column (term) labels as db.DTM.
db.TFIDF = pd.DataFrame(
    tfidf_model.toarray(),
    columns=db.VOCAB.index,
    index=corpus.index,
)

In [None]:
# Attach the TF-IDF weight of each (document, term) pair. Column assignment aligns
# on the BOW index, so only pairs already present in BOW receive a value.
db.BOW['tfidf'] = db.TFIDF.stack()

In [None]:
db.BOW

## Add Features to VOCAB

In [None]:
# Number of whitespace-separated tokens in each term (1 = unigram, 2 = bigram, ...).
db.VOCAB['ngram_len'] = [len(term.split()) for term in db.VOCAB.index]
# Corpus-wide count of each term (column sums of the document-term matrix).
db.VOCAB['n'] = db.DTM.sum()
# Mean TF-IDF weight of each term over all documents.
db.VOCAB['tfidf_mean'] = db.TFIDF.mean()

In [None]:
db.VOCAB

In [None]:
db.VOCAB.ngram_len.value_counts().plot.bar()

# Generate Topic Models

We run Scikit Learn's [LatentDirichletAllocation algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation) and extract the THETA and PHI tables.

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA, NMF

## Using LDA

In [None]:
# LDA estimator with a fixed seed for reproducible topics.
# NOTE(review): learning_offset is documented as an online-learning parameter; with the
# default learning method it may have no effect -- confirm against the installed version.
lda_engine = LDA(
    n_components=n_topics,
    max_iter=max_iter,
    learning_offset=50.,
    random_state=0,
)

### THETA

In [None]:
# Fit LDA on the raw counts; THETA holds one row per document and one column per topic.
doc_topic_weights = lda_engine.fit_transform(count_model)
db.THETA = pd.DataFrame(doc_topic_weights, index=corpus.index)
db.THETA.columns.name = 'topic_id'
db.THETA.index.name = 'doc_id'

In [None]:
# Show the topic weights of 20 documents (topics as rows); the fixed seed keeps the displayed sample stable across runs.
db.THETA.sample(20, random_state=0).T.style.background_gradient()

### PHI

In [None]:
# Topic-term weights of the fitted LDA model: one row per topic, one column per vocabulary term.
topic_term_weights = lda_engine.components_
db.PHI = pd.DataFrame(topic_term_weights, columns=db.VOCAB.index)
db.PHI.columns.name = 'term_str'
db.PHI.index.name = 'topic_id'

In [None]:
db.PHI.T.head().style.background_gradient()

### Create Topic Glosses

In [None]:
# Number of highest-weighted terms kept per topic to form its gloss.
n_top_words = 7

In [None]:
# For every topic, rank the terms by weight and keep the n_top_words strongest.
# Result: one row per topic_id, with columns 0 .. n_top_words-1 holding the term
# strings in rank order.
# The axis of DataFrame.drop must be given by keyword: the positional form
# drop('topic_id', 1) raises TypeError on pandas >= 2.0.
db.TOPICS = (
    db.PHI.stack()
    .to_frame('weight')
    .groupby('topic_id')
    .apply(lambda x: x.weight.sort_values(ascending=False)
               .head(n_top_words)
               .reset_index()
               .drop(columns='topic_id')
               .term_str)
)

In [None]:
db.TOPICS

In [None]:
# Gloss for each topic: "<topic_id> term1, term2, ...".
# Only the first n_top_words columns (the ranked terms) are joined, so the cell can be
# re-run after 'topwords' and the numeric 'doc_weight_sum' column have been added.
db.TOPICS['topwords'] = db.TOPICS.iloc[:, :n_top_words].apply(
    lambda x: str(x.name) + ' ' + ', '.join(x), axis=1)

In [None]:
db.TOPICS

### Add Doc Weights

In [None]:
# Total weight each topic receives over all documents (column sums of THETA, aligned on topic_id).
db.TOPICS['doc_weight_sum'] = db.THETA.sum()

In [None]:
# Show gloss and total document weight per topic, heaviest first. Columns are selected
# by name rather than by the hard-coded offset 7, which only matched n_top_words == 7.
db.TOPICS[['topwords', 'doc_weight_sum']].sort_values('doc_weight_sum', ascending=False).style.bar()

## Using NMF

In [None]:
# NMF estimator with mixed L1/L2 regularization and a fixed seed.
# The single `alpha` argument was removed in scikit-learn 1.2 in favour of alpha_W and
# alpha_H, whose penalty terms are additionally multiplied by n_features (for W) and
# n_samples (for H). Dividing by those counts keeps the regularization strength that
# alpha=.1 applied under the old API.
nmf_alpha = .1
n_docs, n_feats = tfidf_model.shape
try:
    nmf_engine = NMF(n_components=n_topics, init='nndsvd', random_state=1,
                     alpha_W=nmf_alpha / n_feats, alpha_H=nmf_alpha / n_docs, l1_ratio=.5)
except TypeError:  # scikit-learn < 1.0 has no alpha_W / alpha_H
    nmf_engine = NMF(n_components=n_topics, init='nndsvd', random_state=1, alpha=nmf_alpha, l1_ratio=.5)

### THETA

In [None]:
# Fit NMF on the TF-IDF matrix; THETA_NMF holds one row per document and one column per topic.
doc_topic_weights_nmf = nmf_engine.fit_transform(tfidf_model)
db.THETA_NMF = pd.DataFrame(doc_topic_weights_nmf, index=corpus.index)
db.THETA_NMF.columns.name = 'topic_id'

In [None]:
# Show the NMF topic weights of 20 documents; the fixed seed keeps the displayed sample stable across runs.
db.THETA_NMF.sample(20, random_state=0).style.background_gradient()

### PHI

In [None]:
# Topic-term weights of the fitted NMF model: one row per topic, one column per vocabulary term.
db.PHI_NMF = pd.DataFrame(nmf_engine.components_, columns=db.VOCAB.index)

In [None]:
# Label both axes so that stack()/groupby can refer to them by name.
db.PHI_NMF.columns.name = 'term_str'
db.PHI_NMF.index.name = 'topic_id'

In [None]:
db.PHI_NMF.T.head().style.background_gradient()

### Topics

In [None]:
# For every NMF topic, rank the terms by weight and keep the n_top_words strongest.
# Result: one row per topic_id, with columns 0 .. n_top_words-1 holding the term
# strings in rank order.
# The axis of DataFrame.drop must be given by keyword: the positional form
# drop('topic_id', 1) raises TypeError on pandas >= 2.0.
db.TOPICS_NMF = (
    db.PHI_NMF.stack()
    .to_frame('weight')
    .groupby('topic_id')
    .apply(lambda x:
           x.weight.sort_values(ascending=False)
               .head(n_top_words)
               .reset_index()
               .drop(columns='topic_id')
               .term_str)
)

In [None]:
db.TOPICS_NMF

In [None]:
# Gloss for each NMF topic: "<topic_id> term1, term2, ...".
# Only the first n_top_words columns (the ranked terms) are joined, so the cell can be
# re-run after 'topwords' and the numeric 'doc_weight_sum' column have been added.
db.TOPICS_NMF['topwords'] = db.TOPICS_NMF.iloc[:, :n_top_words].apply(
    lambda x: str(x.name) + ' ' + ', '.join(x), axis=1)

### Add Doc Weights

In [None]:
# Total weight each NMF topic receives over all documents (column sums of THETA_NMF, aligned on topic_id).
db.TOPICS_NMF['doc_weight_sum'] = db.THETA_NMF.sum()

In [None]:
# Show gloss and total document weight per NMF topic, heaviest first. Columns are selected
# by name rather than by the hard-coded offset 7, which only matched n_top_words == 7.
db.TOPICS_NMF[['topwords', 'doc_weight_sum']].sort_values('doc_weight_sum', ascending=False).style.bar()

# Save the Model

## Keep Corpus Label Info

This is effectively the LIB table.

In [None]:
# Keep only the document metadata: remove the key column and the two text columns.
# Indexing a DataFrame with a set raises TypeError on pandas >= 2.0 and yields an
# arbitrary column order on older versions; drop(columns=...) preserves the corpus
# column order. errors='ignore' keeps the original tolerance for absent columns.
db.LABELS = corpus.drop(columns=['doc_key', 'doc_content', 'doc_original'], errors='ignore')

## Save Tables

In [None]:
# Persist the tables attached to db.
# NOTE(review): presumably written as CSV files under ./db/ prefixed with data_prefix -- confirm in lib.tapi.
db.save_tables()

In [None]:
# See if it worked: list the CSV files in ./db whose names start with the corpus prefix.

!ls -l ./db/{data_prefix}*.csv

# Visualize with LDAViz

In [None]:
# Render the topic-model visualization via the edition helper.
# NOTE(review): presumably wraps pyLDAvis using the saved LDA tables -- confirm in lib.tapi.
db.pyldaviz()