<a href="https://colab.research.google.com/github/m3yrin/NTM/blob/master/LDA_jp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gensim LDA model for Japanese articles
author : m3yrin

reference : http://tdual.hatenablog.com/entry/2018/04/09/133000

### Memo
* This notebook borrows heavily from tdual's LDA script (see the reference above).
* The Janome tokenizer is used instead of MeCab.


In [0]:
# Install the Janome tokenizer into the notebook environment.
!pip install janome

import os
# Download and unpack the dataset archive only when no 'text' entry exists in
# the working directory, so re-running this cell does not fetch it again.
# NOTE(review): the archive presumably unpacks into ./text (the path checked
# here) — confirm against the archive layout.
if not os.path.exists('text'):
    !wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
    !tar xvzf ldcc-20140209.tar.gz

In [0]:
from urllib import request 
import logging
from pathlib import Path
import numpy as np
import re
import janome
import random
from gensim import corpora, models

from janome.tokenizer import Tokenizer
from janome import analyzer
from janome.charfilter import *
from janome.tokenfilter import *

from tqdm import tqdm_notebook as tqdm

In [0]:
# https://ohke.hateblo.jp/entry/2017/11/02/230000
class NumericReplaceFilter(TokenFilter):
    """Token filter that collapses every numeral token into a single '0' token.

    A token counts as a numeral when the first two fields of its
    comma-separated part-of-speech string are '名詞' and '数'. All other
    tokens pass through untouched.
    """

    # Attribute overwrites applied, in this order, to each numeral token.
    _REPLACEMENTS = (
        ('surface', '0'),
        ('base_form', '0'),
        ('reading', 'ゼロ'),
        ('phonetic', 'ゼロ'),
    )

    def apply(self, tokens):
        """Yield each token of ``tokens``, rewriting numerals in place first."""
        for tok in tokens:
            pos_fields = tok.part_of_speech.split(',')
            is_numeral = pos_fields[0] == '名詞' and pos_fields[1] == '数'
            if is_numeral:
                for attr_name, value in self._REPLACEMENTS:
                    setattr(tok, attr_name, value)
            yield tok

            
class docTokenizer:
    """Janome-based tokenizer that turns raw text into a list of base-form words.

    The text is normalised and stripped of URLs / ``*.jp`` host names / a
    caller-supplied pattern, tokenised with Janome, filtered by part of speech,
    lower-cased, and finally filtered against a stopword collection.
    """

    def __init__(self, stopwords, parser=None, include_pos=None, exclude_posdetail=None, exclude_reg=None):
        """Build the Janome analysis pipeline.

        Args:
            stopwords: collection of base forms to drop (checked with ``in``).
            parser: unused; kept so existing callers keep working.
            include_pos: top-level part-of-speech tags to keep.
                Defaults to ["名詞", "動詞", "形容詞"].
            exclude_posdetail: part-of-speech sub-category tags (second field of
                Janome's POS string) to drop. Defaults to ["接尾", "数"].
            exclude_reg: regular expression whose matches are deleted from the
                text before tokenisation. Defaults to a pattern that does not
                match ordinary text.
        """
        self.stopwords = stopwords
        self.include_pos = include_pos if include_pos else  ["名詞", "動詞", "形容詞"]
        self.exclude_posdetail = exclude_posdetail if exclude_posdetail else ["接尾", "数"]
        self.exclude_reg = exclude_reg if exclude_reg else r"$^"  # no matching reg
        
        self.char_filters = [
                        UnicodeNormalizeCharFilter(), 
                        RegexReplaceCharFilter(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", u''), #url
                        RegexReplaceCharFilter(r"\"?([-a-zA-Z0-9.`?{}]+\.jp)\"?", u''), #*.jp
                        RegexReplaceCharFilter(self.exclude_reg, u'')
                       ]
        
        # POSStopFilter matches its tags as prefixes of the full POS string, so
        # it only takes effect for values such as "名詞,数". It is kept for
        # callers that pass such prefixes; bare sub-category tags ("接尾", "数")
        # are enforced explicitly in tokenize().
        self.token_filters = [
                         NumericReplaceFilter(),
                         POSKeepFilter(self.include_pos),
                         POSStopFilter(self.exclude_posdetail), 
                         LowerCaseFilter()
                        ]
        
        self.analyzer = analyzer.Analyzer(self.char_filters, Tokenizer(), self.token_filters)
        
    def tokenize(self, text):
        """Return the base forms of the tokens of ``text`` that survive filtering.

        Tokens are dropped when their POS sub-category (second POS field) is in
        ``self.exclude_posdetail`` or when their base form is in
        ``self.stopwords``.

        Args:
            text: raw document text.

        Returns:
            list of base-form strings, in document order.
        """
        res = []
        for token in self.analyzer.analyze(text):
            # Read the token attributes directly instead of parsing str(token):
            # splitting the string form on commas/tabs mis-aligns the fields
            # whenever the surface itself contains a comma or a tab.
            pos_fields = token.part_of_speech.split(",")
            if len(pos_fields) > 1 and pos_fields[1] in self.exclude_posdetail:
                continue

            base_form = token.base_form
            if base_form not in self.stopwords:
                res.append(base_form)

        return res
        


### Hyper-parameters

In [0]:
# Number of articles to use, applied as a slice end over the shuffled list.
# NOTE(review): -1 looks intended to mean "all articles" — confirm the loading
# cell treats it that way (a plain [:-1] slice would drop the last article).
num_articles = -1
# Number of latent topics for the LDA model.
topic_num = 20
# Number of training passes over the corpus.
# NOTE(review): confirm the LDA cell actually consumes this value.
passes = 50

### Load stopwords

In [0]:
def _fetch_stopwords(url):
    """Download a one-word-per-line UTF-8 list and return its stripped lines.

    The response is used as a context manager so the HTTP connection is closed
    as soon as the body has been read.
    """
    with request.urlopen(url) as response:
        return [line.decode("utf-8").strip() for line in response]


# SlothLib Japanese and English stopword lists.
stopwords = _fetch_stopwords("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt")
stopwords += _fetch_stopwords("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/English.txt")

# Extra symbols and fragments to drop; '0' is the placeholder that
# NumericReplaceFilter substitutes for every numeral.
stopwords += ['*', '&', '[', ']', ')', '(', '-',':','.','/','0', '...?', '——', '!【', '"', ')、', ')。', ')」']

print("# Stopword : ", len(stopwords))

### Load articles

In [0]:
# Collect every file found directly inside each category directory.
doc_path = "./text/"
doc_dir = Path(doc_path)
dirs = [i for i in doc_dir.iterdir() if i.is_dir()]
articles = [a for categ in dirs for a in categ.iterdir()]
# Shuffle so that any prefix of the list is a random sample across categories.
random.shuffle(articles)

# Only truncate for a non-negative limit. A negative value (the -1 default) or
# None means "use every article"; slicing with [:-1] would instead silently
# drop the last article.
if num_articles is not None and num_articles >= 0:
    articles = articles[:num_articles]

In [0]:
# Tokenizer that additionally strips "<digit>年/月/日" date fragments.
tokenizer = docTokenizer(stopwords = stopwords, exclude_reg=r"\d(年|月|日)")

docs = []
for a in tqdm(articles):
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # locale encoding, so the Japanese text decodes the same way everywhere.
    with a.open(encoding="utf-8") as f:
        
        # discard first two lines.
        f.readline()
        f.readline()
        docs.append(tokenizer.tokenize(f.read()))

### Build dict

In [0]:
# Vocabulary: build it from the tokenised documents, prune it with
# filter_extremes(no_below=5, no_above=0.2), then re-pack the ids.
d = corpora.Dictionary(docs)
d.filter_extremes(no_below=5, no_above=0.2)
d.compactify()

# Bag-of-words representation of every document.
corpus = list(map(d.doc2bow, docs))

# Hold out the first 10% of the documents for testing; train on the rest.
test_size = int(len(corpus) * 0.1)
test_corpus, train_corpus = corpus[:test_size], corpus[test_size:]

### Build LDA

In [0]:
# Emit gensim's training progress (INFO level) as bare messages.
logging.basicConfig(format='%(message)s', level=logging.INFO)

# Train the LDA model on the training split. The number of passes comes from
# the `passes` hyper-parameter instead of a hard-coded value, so changing the
# hyper-parameter cell actually affects training.
lda = models.LdaModel(
    corpus=train_corpus,
    id2word=d,
    num_topics=topic_num,
    passes=passes,
    update_every=5,
)

### Results

In [0]:
def _count_tokens(bow_corpus):
    """Sum the per-term counts over every document of a bag-of-words corpus."""
    return sum(freq for bow in bow_corpus for _, freq in bow)


def _perplexity(bow_corpus):
    """Perplexity computed as 2 ** (-bound), with the bound from lda.log_perplexity."""
    return np.exp2(-lda.log_perplexity(bow_corpus))


# Training split.
N = _count_tokens(train_corpus)
print("# of words in train corpas: ",N)
perplexity = _perplexity(train_corpus)
print("perplexity(train):", perplexity)

print("==============================")

# Held-out split.
N = _count_tokens(test_corpus)
print("# of words in test corpas: ",N)
perplexity = _perplexity(test_corpus)
print("perplexity(test):", perplexity)

In [0]:
def get_topic_words(topic_id):
    """Return the words of topic ``topic_id`` as reported by ``lda.get_topic_terms``.

    Each entry returned by the model is indexed at position 0 to obtain a term
    id, which is then looked up in the dictionary ``d``.
    """
    return [d[entry[0]] for entry in lda.get_topic_terms(topic_id)]

# Print one line per topic, numbering topics from 1.
for topic_index in range(topic_num):
    topic_label = topic_index + 1
    joined_words = ' '.join(get_topic_words(topic_index))
    print('Topic {}: {}'.format(topic_label, joined_words))