# 課題のソースコード

 * nltk
 * gensim
 * pyLDAvis

In [1]:
!pip install nltk
!pip install gensim
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import nltk
# Fetch the NLTK data packages used by the cells below.
nltk.download("stopwords")  # English stop-word list used to build en_stop
nltk.download("wordnet")  # WordNet data behind the wn.morphy lemmatization
nltk.download("punkt")  # NOTE(review): not referenced by the visible cells — confirm it is needed

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## データロード

In [3]:
# Download the Brown corpus and bind its reader to the name `corpus`.
nltk.download('brown')
from nltk.corpus import brown as corpus

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [4]:
# Inspect the data: first 300 tokens of the first document, 25 tokens per line.
first_tokens = corpus.words(corpus.fileids()[0])[:300]
for position, token in enumerate(first_tokens, start=1):
    print(token, end=" ")
    if position % 25 == 0:
        print(" ")

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .  
The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise  
and thanks of the City of Atlanta '' for the manner in which the election was conducted . The September-October term jury had been charged  
by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan  
Allen Jr. . `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in  
the election , the number of voters and the size of this city '' . The jury said it did find that many of Georgia's  
registration and election laws `` are outmoded or inadequate and often ambiguous '' . It recommended that Fulton legislators act `

In [5]:
# Total number of documents (file ids) in the corpus
len(corpus.fileids())

500

In [6]:
# Train on every document: one token sequence per file id.
docs = list(map(corpus.words, corpus.fileids()))

# Check the documents: show the first five and the total count.
print(docs[:5])
print("num of docs:", len(docs))

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...], ['Austin', ',', 'Texas', '--', 'Committee', 'approval', ...], ['Several', 'defendants', 'in', 'the', 'Summerdale', ...], ['Oslo', 'The', 'most', 'positive', 'element', 'to', ...], ['East', 'Providence', 'should', 'organize', 'its', ...]]
num of docs: 500


## 前処理 

In [7]:
# Build the stop-word list: punctuation-like tokens, a few hand-picked extra
# words, then NLTK's English stop words (in that order).
punctuation_tokens = ["``","/",",.",".,",";","--",":",")","(",'"','&',"'",'),',',"','-','.,','.,"','.-',"?",">","<","''", ""]
extra_stopwords = ['$', '$10', 'one', 'would', 'could', 'say', '!', 'make', 'new']
en_stop = punctuation_tokens + extra_stopwords + nltk.corpus.stopwords.words('english')

In [8]:
#前処理関数の作成

from nltk.corpus import wordnet as wn #lemmatize関数のためのimport
import re

def preprocess_word(word, stopwordset):
    """Normalize a single token, or return None when it should be dropped.

    The token is lower-cased (example: Python => python). A bare "," or ".",
    or any token found in ``stopwordset``, is dropped. Otherwise the token is
    lemmatized with ``wn.morphy`` (example: cooked => cook): when no lemma is
    found the lower-cased token is returned as is, and a lemma that is itself
    in ``stopwordset`` is dropped.
    """
    lowered = word.lower()

    # Drop bare punctuation and stop words (example: the => None).
    if lowered in [",", "."] or lowered in stopwordset:
        return None

    lemma = wn.morphy(lowered)
    if lemma is None:
        return lowered

    # The lemma can be a stop word even when the surface form was not.
    return None if lemma in stopwordset else lemma

def cleaning_text(text):
    """Return ``text`` with markup-like noise and digits removed.

    Removed, in this order: "@" characters, literal <b> / </b> tags,
    parenthesized spans, angle-bracketed spans, and digits.
    """
    removal_patterns = (
        '@',
        '<b>|</b>',
        r'\(.*?\)',
        r'\<.*?\>',
        '[0-9]*',
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

def preprocess_document(document):
    """Clean and normalize one tokenized document.

    Every token is passed through ``cleaning_text`` and then
    ``preprocess_word`` (using the module-level ``en_stop`` stop words);
    tokens for which ``preprocess_word`` returns None are dropped.

    Args:
        document: iterable of string tokens.

    Returns:
        list of the surviving, normalized tokens in their original order.
    """
    # en_stop is a list, so every membership test scans it linearly; build a
    # set once per document so each stop-word lookup is constant time.
    stopwords = frozenset(en_stop)
    normalized = (preprocess_word(cleaning_text(token), stopwords) for token in document)
    return [token for token in normalized if token is not None]

def preprocess_documents(documents):
    """Apply ``preprocess_document`` to each document and return the results as a list."""
    return list(map(preprocess_document, documents))

In [9]:
# Result of the preprocessing, shown on the first document.
# before
print(docs[0][:25])

# after — only the first document is preprocessed here; running the whole
# corpus through the pipeline just to display 25 tokens is wasted work.
print(preprocess_document(docs[0])[:25])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
['fulton', 'county', 'grand', 'jury', 'friday', 'investigation', "atlanta's", 'recent', 'primary', 'election', 'produce', 'evidence', 'irregularity', 'take', 'place', 'jury', 'term-end', 'presentment', 'city', 'executive', 'committee', 'over-all', 'charge', 'election', 'deserve']


## LDA準備

In [10]:
import gensim
from gensim import corpora

In [11]:
# Convert the documents into the data structures gensim's LDA reads.

# Preprocess the corpus once and reuse the result: the dictionary and the
# bag-of-words corpus both need the same token lists, so running the
# pipeline separately for each only doubles the cost.
docs_preprocessed = preprocess_documents(docs)

# Build the dictionary
dictionary = corpora.Dictionary(docs_preprocessed)
# Build the corpus: one bag-of-words per document
corpus_ = [dictionary.doc2bow(doc) for doc in docs_preprocessed]

In [12]:
# Dictionary is gensim's dictionary class; its token2id attribute stores the
# mapping from each token to its dictionary id.
# The mapping can be very large, so report its size and a small sample
# instead of dumping every entry into the cell output.
print("vocabulary size:", len(dictionary.token2id))
print(list(dictionary.token2id.items())[:20])



In [13]:
# Check the corpus: each document is a list of (token id, occurrence count) pairs.
print(corpus_[0][:10]) 

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 4), (5, 3), (6, 2), (7, 2), (8, 1), (9, 2)]


In [14]:
# Lower-cased tokens of the first sentence of the first document.
first_sentence = [token.lower() for token in corpus.sents(corpus.fileids()[0])[0]]

# before
print(first_sentence)

# after: the same tokens converted by the dictionary into a bag-of-words
print(dictionary.doc2bow(first_sentence))

['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
[(46, 1), (127, 1), (174, 1), (192, 1), (225, 1), (228, 1), (251, 1), (290, 1), (305, 1), (409, 1), (430, 1), (463, 1)]


## LDA学習

In [15]:
def _train_lda(num_topics, random_state=0):
    """Train an LDA model on ``corpus_`` with the settings shared by all runs.

    Args:
        num_topics: number of topics to fit.
        random_state: seed handed to gensim so that re-running the notebook
            reproduces the same topics.

    Returns:
        The trained gensim LdaModel.
    """
    return gensim.models.ldamodel.LdaModel(
        corpus=corpus_,
        num_topics=num_topics,
        id2word=dictionary,
        alpha=0.1,                 # optional: LDA hyperparameter alpha
        eta=0.1,                   # optional: LDA hyperparameter beta
        minimum_probability=0.02,  # optional: lower bound on the probabilities kept in the results
        random_state=random_state,
    )


# Same settings, three different topic counts, so the results can be compared.
ldamodel_5 = _train_lda(5)
ldamodel_10 = _train_lda(10)
ldamodel_20 = _train_lda(20)




## パラメータの確認

In [16]:
def print_list(list_topic):
    """Print every element of ``list_topic`` on its own line."""
    for entry in list_topic:
        print(entry)

# 10-word summary of the topics of each of the three models.
topics_5, topics_10, topics_20 = (
    model.print_topics(num_words=10)
    for model in (ldamodel_5, ldamodel_10, ldamodel_20)
)

In [17]:
# Topics of the 5-topic model, one per line.
print_list(topics_5)

(0, '0.004*"state" + 0.004*"time" + 0.003*"af" + 0.003*"two" + 0.003*"know" + 0.003*"take" + 0.003*"first" + 0.003*"go" + 0.003*"get" + 0.002*"may"')
(1, '0.003*"man" + 0.003*"come" + 0.003*"also" + 0.003*"go" + 0.003*"take" + 0.003*"two" + 0.003*"may" + 0.003*"know" + 0.003*"even" + 0.003*"back"')
(2, '0.004*"like" + 0.003*"know" + 0.003*"may" + 0.003*"state" + 0.003*"people" + 0.003*"get" + 0.002*"use" + 0.002*"go" + 0.002*"take" + 0.002*"even"')
(3, '0.003*"first" + 0.003*"know" + 0.003*"come" + 0.003*"take" + 0.003*"get" + 0.003*"see" + 0.003*"time" + 0.002*"use" + 0.002*"man" + 0.002*"like"')
(4, '0.004*"time" + 0.003*"two" + 0.003*"state" + 0.003*"first" + 0.003*"go" + 0.003*"af" + 0.003*"use" + 0.003*"may" + 0.003*"take" + 0.002*"come"')


In [18]:
# Topics of the 10-topic model, one per line.
print_list(topics_10)

(0, '0.003*"come" + 0.003*"time" + 0.003*"know" + 0.003*"state" + 0.003*"even" + 0.002*"like" + 0.002*"go" + 0.002*"may" + 0.002*"get" + 0.002*"see"')
(1, '0.003*"like" + 0.003*"take" + 0.003*"time" + 0.003*"go" + 0.003*"also" + 0.002*"man" + 0.002*"get" + 0.002*"may" + 0.002*"must" + 0.002*"state"')
(2, '0.003*"know" + 0.003*"go" + 0.003*"use" + 0.003*"two" + 0.003*"get" + 0.003*"af" + 0.003*"time" + 0.003*"come" + 0.003*"first" + 0.002*"may"')
(3, '0.003*"get" + 0.003*"time" + 0.003*"know" + 0.003*"state" + 0.003*"come" + 0.003*"two" + 0.002*"go" + 0.002*"years" + 0.002*"also" + 0.002*"first"')
(4, '0.004*"like" + 0.003*"time" + 0.003*"go" + 0.003*"man" + 0.003*"know" + 0.003*"even" + 0.003*"come" + 0.003*"take" + 0.002*"state" + 0.002*"first"')
(5, '0.004*"time" + 0.003*"may" + 0.003*"take" + 0.003*"get" + 0.003*"man" + 0.003*"use" + 0.003*"come" + 0.002*"two" + 0.002*"first" + 0.002*"know"')
(6, '0.003*"time" + 0.003*"may" + 0.003*"man" + 0.003*"first" + 0.003*"like" + 0.003*"know"

In [19]:
# Topics of the 20-topic model, one per line.
print_list(topics_20)

(0, '0.003*"take" + 0.003*"may" + 0.002*"go" + 0.002*"first" + 0.002*"two" + 0.002*"like" + 0.002*"come" + 0.002*"state" + 0.002*"know" + 0.002*"good"')
(1, '0.003*"take" + 0.003*"get" + 0.003*"man" + 0.003*"know" + 0.003*"even" + 0.003*"af" + 0.002*"time" + 0.002*"like" + 0.002*"first" + 0.002*"two"')
(2, '0.002*"come" + 0.002*"two" + 0.002*"like" + 0.002*"time" + 0.002*"man" + 0.002*"go" + 0.002*"first" + 0.002*"take" + 0.002*"back" + 0.002*"also"')
(3, '0.004*"know" + 0.003*"go" + 0.003*"get" + 0.003*"come" + 0.003*"time" + 0.003*"back" + 0.003*"like" + 0.003*"may" + 0.003*"man" + 0.003*"even"')
(4, '0.004*"first" + 0.003*"af" + 0.003*"two" + 0.003*"state" + 0.002*"time" + 0.002*"years" + 0.002*"use" + 0.002*"take" + 0.002*"also" + 0.002*"may"')
(5, '0.003*"come" + 0.003*"get" + 0.003*"go" + 0.002*"use" + 0.002*"man" + 0.002*"two" + 0.002*"take" + 0.002*"even" + 0.002*"time" + 0.002*"may"')
(6, '0.004*"may" + 0.003*"state" + 0.003*"time" + 0.002*"know" + 0.002*"even" + 0.002*"take" 

In [20]:
# Topic mixture of a few sample documents under the 5-topic model: a list of
# (topic id, probability) pairs for the document, restricted to those whose
# probability exceeds minimum_probability.
document_ids = [0,40,80,120,160,200,240,280,320,360]
print("確認するドキュメントの数:", len(document_ids))
for doc_index, bow in enumerate(corpus_):
    if doc_index not in document_ids:
        continue
    print("document ID "+str(doc_index)+":", ldamodel_5.get_document_topics(bow), sep="")

確認するドキュメントの数: 10
document ID 0:[(2, 0.032222636), (4, 0.9363249)]
document ID 40:[(2, 0.9988142)]
document ID 80:[(0, 0.70649606), (1, 0.13671999), (4, 0.15659589)]
document ID 120:[(3, 0.30743355), (4, 0.6922727)]
document ID 160:[(0, 0.21682177), (1, 0.6802151), (2, 0.026390156), (3, 0.07647381)]
document ID 200:[(0, 0.7805245), (1, 0.09182017), (2, 0.02555416), (4, 0.09495838)]
document ID 240:[(0, 0.9719475)]
document ID 280:[(2, 0.05266885), (4, 0.9404074)]
document ID 320:[(0, 0.7012101), (1, 0.1810445), (2, 0.035607535), (4, 0.080489725)]
document ID 360:[(1, 0.0901554), (2, 0.24294002), (4, 0.66671073)]


In [21]:
# Topic mixture of the same sample documents under the 10-topic model.
for doc_index, bow in enumerate(corpus_):
    if doc_index not in document_ids:
        continue
    print("document ID "+str(doc_index)+":", ldamodel_10.get_document_topics(bow), sep="")

document ID 0:[(2, 0.06948695), (3, 0.75293076), (9, 0.17693569)]
document ID 40:[(1, 0.29385358), (3, 0.089208946), (4, 0.5232602), (5, 0.0775637)]
document ID 80:[(0, 0.069337375), (2, 0.034352537), (5, 0.8421937)]
document ID 120:[(7, 0.5431463), (9, 0.45607245)]
document ID 160:[(2, 0.031315748), (4, 0.057530507), (7, 0.026786821), (9, 0.8714915)]
document ID 200:[(4, 0.45787886), (6, 0.4991071), (7, 0.036711942)]
document ID 240:[(3, 0.95837617), (4, 0.040812876)]
document ID 280:[(0, 0.3617489), (5, 0.63575745)]
document ID 320:[(5, 0.9757926)]
document ID 360:[(6, 0.8498049), (8, 0.14942181)]


In [22]:
# Topic mixture of the same sample documents under the 20-topic model.
for doc_index, bow in enumerate(corpus_):
    if doc_index not in document_ids:
        continue
    print("document ID "+str(doc_index)+":", ldamodel_20.get_document_topics(bow), sep="")

document ID 0:[(3, 0.06220797), (6, 0.14741124), (8, 0.32400182), (10, 0.12673184), (11, 0.3331611)]
document ID 40:[(11, 0.998149)]
document ID 80:[(3, 0.99819696)]
document ID 120:[(1, 0.77013993), (13, 0.110029206), (14, 0.115397856)]
document ID 160:[(3, 0.9981194)]
document ID 200:[(1, 0.05340549), (3, 0.39055747), (4, 0.10744337), (9, 0.046238337), (14, 0.26372755), (18, 0.125656)]
document ID 240:[(11, 0.7297109), (14, 0.23992988), (18, 0.023588955)]
document ID 280:[(3, 0.26375404), (6, 0.2899253), (11, 0.15120223), (15, 0.28500488)]
document ID 320:[(4, 0.15210852), (6, 0.25676557), (8, 0.10162215), (11, 0.41956407), (15, 0.0503635)]
document ID 360:[(1, 0.1676752), (16, 0.81366056)]


In [23]:
# Categories of each document, in the same order as corpus.fileids().
categories = list(map(corpus.categories, corpus.fileids()))

In [24]:
n = 0

# Category of the n-th document
print(categories[n])

# Raw text of the n-th document
print(" ".join(docs[n]))

['news']


In [25]:
# Representative keywords of each topic, for every trained model.
for model in (ldamodel_5, ldamodel_10, ldamodel_20):
    print("トピックごとの代表キーワードを10個示す")
    # Take the topic count from the model itself so this loop cannot drift out
    # of sync with the num_topics the model was trained with.
    for i in range(model.num_topics):
        # get_topic_terms yields (token id, probability) pairs; map the ids back to tokens.
        items = [dictionary[t[0]] for t in model.get_topic_terms(i, topn = 10)]
        print("topic", i, ":", items)


トピックごとの代表キーワードを10個示す
topic 0 : ['state', 'time', 'af', 'two', 'know', 'take', 'first', 'go', 'get', 'may']
topic 1 : ['man', 'come', 'also', 'go', 'take', 'two', 'may', 'know', 'even', 'back']
topic 2 : ['like', 'know', 'may', 'state', 'people', 'get', 'use', 'go', 'take', 'even']
topic 3 : ['first', 'know', 'come', 'take', 'get', 'see', 'time', 'use', 'man', 'like']
topic 4 : ['time', 'two', 'state', 'first', 'go', 'af', 'use', 'may', 'take', 'come']
トピックごとの代表キーワードを10個示す
topic 0 : ['come', 'time', 'know', 'state', 'even', 'like', 'go', 'may', 'get', 'see']
topic 1 : ['like', 'take', 'time', 'go', 'also', 'man', 'get', 'may', 'must', 'state']
topic 2 : ['know', 'go', 'use', 'two', 'get', 'af', 'time', 'come', 'first', 'may']
topic 3 : ['get', 'time', 'know', 'state', 'come', 'two', 'go', 'years', 'also', 'first']
topic 4 : ['like', 'time', 'go', 'man', 'know', 'even', 'come', 'take', 'state', 'first']
topic 5 : ['time', 'may', 'take', 'get', 'man', 'use', 'come', 'two', 'first', 'know'

## 可視化

In [26]:
# The gensim adapter is imported as pyLDAvis.gensim_models here.
# NOTE(review): the previously used import was `pyLDAvis.gensim` — presumably the module name in older pyLDAvis releases; confirm.
import pyLDAvis.gensim_models
# Switch pyLDAvis to its notebook display mode.
pyLDAvis.enable_notebook()

In [27]:
# Note: gensim assigns its K topics the ids 0..K-1, whereas pyLDAvis assigns them the ids 1..K.

# Prepare and display the visualization of the 5-topic model.
# NOTE(review): sort_topics=False presumably keeps gensim's topic order — confirm.
lda_display_5 = pyLDAvis.gensim_models.prepare(ldamodel_5, corpus_, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_5)

  and should_run_async(code)


In [28]:
# Prepare and display the visualization of the 10-topic model.
lda_display_10 = pyLDAvis.gensim_models.prepare(ldamodel_10, corpus_, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_10)

  and should_run_async(code)


In [29]:
# Prepare and display the visualization of the 20-topic model.
lda_display_20 = pyLDAvis.gensim_models.prepare(ldamodel_20, corpus_, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_20)

  and should_run_async(code)


In [30]:
# Save the 5-topic visualization to vis_5.html.
pyLDAvis.save_html(lda_display_5,'vis_5.html')

  and should_run_async(code)


In [31]:
# Save the 10-topic visualization to vis_10.html.
pyLDAvis.save_html(lda_display_10,'vis_10.html')

  and should_run_async(code)


In [32]:
# Save the 20-topic visualization to vis_20.html.
pyLDAvis.save_html(lda_display_20,'vis_20.html')

  and should_run_async(code)
