<a href="https://colab.research.google.com/github/miskamvedebel/miskamvedebel/blob/master/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import pandas as pd
import numpy as np
from keras_preprocessing.text import Tokenizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
data = pd.read_excel('/content/drive/My Drive/Colab Notebooks/data/descriptions.xlsx')


In [0]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [0]:
exc = data.full_description.tolist()
pattern = '[0-9]'
exc = [re.sub(pattern, '', txt) for txt in exc]
exc = [wordpunct_tokenize(sent) for sent in exc]
exc_no_stop = []
for sent in exc:
  exc_no_stop.append(' '.join(lemmatizer.lemmatize(word, pos='v') for word in sent if word not in stop_words))

In [0]:
filters = '!"#$%&()*+,-./:;’‘\'–<=>?@[\\]^_`{|}~\t\n'
tokenizer = Tokenizer(filters=filters, char_level=False, num_words=None, lower=True, split=' ')
tokenizer.fit_on_texts(exc_no_stop)
sequences = tokenizer.texts_to_sequences(exc_no_stop)

In [0]:
#setting up number of topics
n_topics = 6
# dirichlet distribution parameter
alpha = 1
# hyperparameter
eta = 0.001
# gibbs sampling
iterations = 4
#length of vocab
N = len(tokenizer.index_word)

In [0]:
'''
LDA assumes the following generative process for each document w in a corpus D:

Choose N ∼ Poisson(ξ).
Choose θ ∼ Dir(α).
For each of the N words wn:
(a) Choose a topic zn ∼ Multinomial(θ).
(b) Choose a word wn from p(wn | zn ,β), a multinomial probability conditioned on the topic zn.
'''
# Randomly assign topics to each word
# Generate word-topic count

word_topic = np.zeros(shape=(N, n_topics))
topic_assignment = [np.zeros(shape=(len(d))) for d in sequences]
document_topic = np.zeros(shape=(len(sequences), n_topics))

In [0]:
topics = set(np.arange(n_topics))

for d, document in enumerate(sequences):
  for t, token in enumerate(document):
    topic = np.random.choice(n_topics, 1)
    topic_assignment[d][t] = topic
    word_topic[token-1][topic] += 1
  
  unique, counts = np.unique(topic_assignment[d], return_counts=True)
  if unique.shape[0] != n_topics:
        missing = list(topics.difference(unique))
        for ms in missing:
          counts = np.insert(counts, ms, 0)

  document_topic[d, :] = counts

for it in range(iterations):
  for di, document in enumerate(sequences):
    for ti, token in enumerate(document):
      
      t0 = topic_assignment[di][ti]

      denom_a = np.sum(document_topic[di, :]) + n_topics * alpha
      denom_b = np.sum(word_topic, axis=0) + N * eta

      p_z = (word_topic[token-1, :] + eta) / denom_b * (document_topic[di, :] + alpha) / denom_a
      t1 = np.random.choice(n_topics, 1, p=p_z/np.sum(p_z))

      topic_assignment[di][ti] = t1
      word_topic[token-1][t1] += 1

      unique, counts = np.unique(topic_assignment[di], return_counts=True)
      if unique.shape[0] != n_topics:
        missing = list(topics.difference(unique))
        for ms in missing:
          counts = np.insert(counts, ms, 0) 
      
      document_topic[di, :] = counts


In [0]:
resuls = pd.DataFrame(word_topic, index=tokenizer.index_word.values())

In [50]:
pd.set_option('display.max_rows', 1000)
resuls[0].sort_values(ascending=False)[:10]

nbsp    2800.0
you     2742.0
day     1918.0
time    1697.0
the     1674.0
take    1590.0
one     1545.0
tour    1479.0
get     1362.0
and     1348.0
Name: 0, dtype: float64

In [51]:
resuls[1].sort_values(ascending=False)[:10]

you      2588.0
nbsp     2332.0
take     1735.0
tour     1714.0
day      1673.0
one      1609.0
time     1603.0
visit    1520.0
get      1452.0
the      1395.0
Name: 1, dtype: float64

In [52]:
resuls[2].sort_values(ascending=False)[:10]

you      2797.0
nbsp     2107.0
day      2085.0
time     1801.0
the      1576.0
take     1517.0
one      1495.0
water    1481.0
lunch    1385.0
tour     1367.0
Name: 2, dtype: float64

In [53]:
resuls[3].sort_values(ascending=False)[:10]

you      2838.0
nbsp     2305.0
take     1832.0
day      1678.0
time     1617.0
the      1563.0
tour     1520.0
one      1461.0
get      1379.0
water    1318.0
Name: 3, dtype: float64

**Gensim**

In [0]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [0]:
exc = data.full_description.tolist()
exc = [wordpunct_tokenize(sent) for sent in exc]
exc_no_stop = []
for sent in exc:
  temp = [lemmatizer.lemmatize(word) for word in sent if word not in stop_words]
  exc_no_stop.append(temp)

In [0]:
text = []
for seq in sequences:
  text.append([tokenizer.index_word[tok] for tok in seq])

In [0]:
dictionary = Dictionary(text)
corpus = [dictionary.doc2bow(txt) for txt in text]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5)

In [63]:
lda.print_topics()

[(0,
  '0.022*"nbsp" + 0.008*"you" + 0.007*"park" + 0.007*"take" + 0.007*"time" + 0.006*"day" + 0.006*"one" + 0.006*"and" + 0.005*"water" + 0.005*"the"'),
 (1,
  '0.010*"you" + 0.009*"time" + 0.007*"the" + 0.007*"one" + 0.006*"get" + 0.006*"explore" + 0.006*"take" + 0.006*"day" + 0.006*"visit" + 0.006*"nbsp"'),
 (2,
  '0.011*"you" + 0.011*"beach" + 0.009*"day" + 0.008*"water" + 0.008*"swim" + 0.007*"lunch" + 0.007*"take" + 0.006*"back" + 0.006*"nbsp" + 0.006*"cruise"'),
 (3,
  '0.014*"you" + 0.007*"tour" + 0.007*"day" + 0.007*"nbsp" + 0.006*"take" + 0.006*"the" + 0.006*"see" + 0.006*"get" + 0.005*"island" + 0.005*"one"'),
 (4,
  '0.010*"tour" + 0.009*"nbsp" + 0.007*"one" + 0.007*"dive" + 0.007*"you" + 0.007*"local" + 0.007*"visit" + 0.006*"time" + 0.005*"day" + 0.005*"take"')]