# LDA : Latent Dirichlet Allocation

## References

* ratsgo'blog
  - [Topic Modeling, LDA](https://ratsgo.github.io/from%20frequency%20to%20semantics/2017/06/01/LDA/)
  - [Topic Modeling, LDA 구현](https://ratsgo.github.io/from%20frequency%20to%20semantics/2017/07/09/lda/)
  - [밑바닥부터 시작하는 데이터 사이언스 / 예시코드](https://github.com/Insight-book/data-science-from-scratch/blob/master/code-python3/natural_language_processing.py)
  
* etc
  - [Topic Modeling with Scikit Learn](https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730)
  - [Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation](https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py)


$$
DT  = DP dot PT
$$




__DT : Documents-Term matrix__

| DT | t1 | t2 | t3 | t4 |
|----|----|----|----|----|
| D1 | 0  | 0  | 0  | 0  |
| D2 | 0  | 0  | 0  | 0  |
| D3 | 0  | 0  | 0  | 0  |



__DP : Documents-Topic matrix__


| DP | Topic1 | Topic2 |
|----|--------|--------|
| D1 |   0     |   0     |
| D2 |   0     |   0     |
| D3 |   0     |   0     |


__PT : Topic-term matrix__

|  PT    | t1 | t2 | t3 | t4 |
|:------:|:--:|:--:|:--:|:--:|
| Topic1 |  0  | 0   | 0   |   0 |
| Topic2 | 0   |  0  |  0  | 0   |

## DATA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_colwidth", 200)

### Get Data

In [2]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data[:200]
len(documents)

200

### Data cleansing

In [3]:
news_df = pd.DataFrame( {'document': documents } )

# remain only alphabets
news_df['clean_doc'] = news_df['document'].str.replace( "[^a-zA-Z#]", " " )
# removing short words
news_df['clean_doc'] = news_df['clean_doc'].apply( lambda x: ' '.join( [ w for w in x.split() if len(w) > 3 ] ) )
# make all text lowercase
news_df['clean_doc'] = news_df['clean_doc'].apply( lambda x: x.lower() )
# make words list
news_df['list_doc'] = news_df['clean_doc'].apply( lambda x: x.split() )
# make doc size
news_df['doc_size'] = news_df['list_doc'].apply( lambda x: len(x) )
# make unique words size
news_df['doc_uniq_words_size'] = news_df['list_doc'].apply( lambda x: len( set(x) ) )

## Set Variables and Functions

In [4]:
from collections import Counter

# 총 문서
doc = news_df['list_doc']
# 총 문서 수 
D = len( doc )
# 총 토픽 수
K = number_of_topics = 20

# 문서마다 각 토픽에 해당하는 단어의 수. shape=( D, K )
doc_topic_counts = [ Counter() for _ in  doc ]
# 각 단어가 각 토픽에 할당되는 횟수. shape=( K, words_dic )
topic_word_counts = [ Counter() for _ in range(K) ]
# 각 토픽에 할당되는 총 단어수
topic_counts = [ 0 for _ in range(K) ]
# 각 문서에 포함되는 총 단어수
doc_lengths = news_df['doc_size']
# 단어의 종류수
news_df['doc_uniq_words_size']
voca = set( sum( doc, [] ) )
V = len( voca )

In [5]:
def p_topic_given_document( topic, d, alpha=0.1 ):
    """
    문서 d의 모든 단어 가운데 topic에 속하는 단어의 비율 ( with add-alpha smoothing )
    """
    return ( doc_topic_counts[d][topic] + alpha ) / ( news_df['doc_size'][d] + K * alpha )

def p_word_given_topic( word, topic, beta=0.1 ):
    """
    토픽에 속한 단어 가운데 word의 비율
    """
    return ( ( topic_word_counts[topic][word] + beta ) ) / ( topic_counts[topic] + V * beta )

def topic_weight( d, word, k ):
    """
    문서와 문서의 단어가 주어지면 k번째 토픽의 weight를 반환
    """
    return p_word_given_topic( word, k ) * p_topic_given_document( k, d )



In [6]:
def choose_new_topic( d, word ):
    return sample_from( [ topic_weight(d, word, k) for k in range(K) ]  )

import random
random.seed(0)

def sample_from( weights ):
    """
    i를 weights[i] / sum(weights) 확률로 반환
    아래 식을 만족하는 가장 작은 i를 반환
    weights[0] + ... + weights[i] >= rnd
    """
    total = sum(weights)
    rnd = total * random.random()      # 0과 total 사이를 균일하게 선택
    for i, w in enumerate( weights ):
        rnd -= w
        if rnd <= 0 : return i


## Topic Modeling via LDA

In [7]:
# 각 단어를 임의의 토픽에 랜덤 배정
doc_topics = [ [ random.randrange( K ) for word in d ] for d in doc ]

# AB를 구하는데 필요한 숫자를 셈
for d in range( D ):
    for word, topic in zip( doc[d], doc_topics[d] ):
        doc_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1



In [8]:
# 깁스 셈플링

times = 5

for iter in range( times ):
    for d in range( D ):
        for i, (word, topic) in enumerate( zip( doc[d], doc_topics[d] ) ):
            # 샘플링 대상 word와 topic을 제외하고 세어봄
            doc_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[ topic ] -= 1
            doc_lengths[ d ] -= 1
            
            # 깁스 샘플링 대상 word와 topic을 제외한
            # 말뭉치 모든 word의 topic 정보를 토대로
            # 샘플링 대상 word의 새로운 topic을 선택
            new_topic = choose_new_topic( d, word )
            doc_topics[d][i] = new_topic
            
            # 샘프링 대상 word의 새로운 topic을 반영해 
            # 말뭉치 정보 업데이트
            doc_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[ new_topic ] += 1
            doc_lengths[ d ] += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyboardInterrupt: 

In [None]:
doc_topic_counts[0]

## Bonus

### Topic Modeling with Scikit learn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_features = 1000
no_topics = 20
no_top_words = 10

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# Run LDA
lda = LatentDirichletAllocation( n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

In [None]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print( "Topic %d:" % (topic_idx) )
        print(" ".join([feature_names[i] 
                        for i in topic.argsort()[:-no_top_words - 1:-1]]) )
        
display_topics(lda, tf_feature_names, no_top_words)