In [1]:
import jieba as jb, matplotlib as mpl, matplotlib.pyplot as plt, numpy as np, os, pandas as pd, re
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [2]:
# Tokens that sent_token() discards before topic modelling. Each entry appears
# exactly once; the list type is kept so existing callers are unaffected.
stopwords = [
    # ASCII and full-width punctuation
    ',', '?', '、', '。', '“', '”', '《', '》', '！', '，', '：', '；', '？',
    '（', '）', ':',
    # greetings and forum slang/filler
    'hi', 'auntie', 'ok', '向左走', '向右走', '大家', '利申',
    # high-frequency Cantonese/Chinese function words
    '雖然', '但係', '乜', '一齊', '可以', '應該', '好多', '已經', '因為', '邊個',
    '好似', '而家', '一定', '之前', '即刻', '好過', '仲有', '如果', '其實', '一半',
    '有人', '個人', '一次', '無人', '好好', '根本', '一樣', '成日', '問題', '不過',
    '有時', '之後', '沒有', '所以', '不如', '個個', '無法',
]

In [3]:
# Matches an http(s) link made of ASCII URL characters only (RFC 3986 unreserved
# and reserved characters plus '%'), so CJK text adjacent to a link is preserved.
_URL_PATTERN = re.compile(r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+")


def sent_token(sent, StopWords=True, RemoveHttp=True):
  """Segment a sentence into a list of tokens with jieba.

  Args:
    sent: Text to segment.
    StopWords: When true, drop every token found in the module-level
      ``stopwords`` list.
    RemoveHttp: When true, strip http/https links from ``sent`` before
      segmenting, wherever they occur in the text.

  Returns:
    list[str]: Tokens in the order jieba produced them. Empty input yields an
    empty list.
  """
  if RemoveHttp:
    sent = _URL_PATTERN.sub('', sent)
  # Materialise the generator directly; joining on '/' and splitting again
  # would corrupt any token that itself contains a slash.
  words = list(jb.cut(sent))
  if StopWords:
    words = [w for w in words if w not in stopwords]
  return words

In [4]:
def tfidfvectorizer(words_list, max_features=1000, n_top_words=50, n_components=10, return_model=False):
  """Fit an LDA topic model on pre-tokenised documents.

  Despite the function name, documents are vectorised with raw term counts
  (``CountVectorizer``), not TF-IDF weights.

  Args:
    words_list: Iterable of token lists, one per document. Tokens are joined
      with single spaces before being handed to the vectorizer.
    max_features: Vocabulary size cap passed to ``CountVectorizer``.
    n_top_words: Number of highest-weighted terms to report per topic.
    n_components: Number of LDA topics.
    return_model: When true, return the fitted objects instead of the
      per-topic term lists.

  Returns:
    If ``return_model`` is true, the tuple
    ``(lda, document_term_matrix, vectorizer)``. Otherwise a list of
    ``[topic_index, [term, ...]]`` entries, terms ordered from highest to
    lowest weight within each topic.
  """
  sents = [' '.join(words) for words in words_list]
  vtr = CountVectorizer(max_df=0.85, min_df=2,
                        max_features=max_features)
  vtr_sents = vtr.fit_transform(sents)

  lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                  learning_method='online',
                                  learning_offset=50,
                                  random_state=12345)
  lda.fit(vtr_sents)
  if return_model:
    return lda, vtr_sents, vtr

  # get_feature_names() was removed in scikit-learn 1.2; prefer its
  # replacement and fall back only on releases that predate it.
  if hasattr(vtr, 'get_feature_names_out'):
    raw_names = vtr.get_feature_names_out()
  else:
    raw_names = vtr.get_feature_names()
  # Normalise to plain str so the result matches what the old API returned.
  vtr_feature_names = [str(name) for name in raw_names]

  # argsort is ascending, so the reversed tail slice yields the n_top_words
  # largest weights in descending order.
  return [
    [topic_idx, [vtr_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]]
    for topic_idx, topic in enumerate(lda.components_)
  ]

In [10]:
def data_processing(folder='./input/'):
  """Tokenise every response found under ``folder`` and fit the topic model.

  Each file under ``folder`` (searched recursively) is read with
  ``pandas.read_csv``. Every string value in its ``response`` column is split
  on ``'|'`` and each piece is tokenised with ``sent_token``. Non-string
  values (e.g. NaN for empty cells) are skipped.

  Args:
    folder: Root directory to scan. Defaults to ``'./input/'``.

  Returns:
    The ``(lda, document_term_matrix, vectorizer)`` tuple produced by
    ``tfidfvectorizer(..., return_model=True)``.
  """
  data_files = []
  for subdir, dirs, files in os.walk(folder):
    # Sort in place so os.walk descends in a stable order; document order
    # influences the online LDA fit, so this keeps runs reproducible.
    dirs.sort()
    # os.walk yields sub-directory paths without a trailing separator, so the
    # path must be joined rather than concatenated.
    data_files.extend(os.path.join(subdir, name) for name in sorted(files))

  words_list = []
  for path in data_files:
    df = pd.read_csv(path)
    for sent in df['response'].values:
      if not isinstance(sent, str):
        continue
      words_list.extend(sent_token(piece) for piece in sent.split('|'))

  return tfidfvectorizer(words_list, return_model=True)

In [11]:
# Run the full pipeline and unpack the tuple returned by data_processing():
# the fitted LDA model, the document-term count matrix, and the fitted vectorizer.
sklearn_lda, sklearn_wordvec, sklearn_wordvecmodel = data_processing()

In [12]:
# Prepare the pyLDAvis visualisation from the fitted model, the document-term
# matrix and the vectorizer (mds='tsne' selects t-SNE for the topic layout),
# then write it to ldavis_golden.html.
pyLDA = pyLDAvis.sklearn.prepare(sklearn_lda, sklearn_wordvec, sklearn_wordvecmodel, mds='tsne')
pyLDAvis.save_html(pyLDA, 'ldavis_golden.html')

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
