In [70]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
import pandas as pd
from pprint import pprint
import string
import os
import re
import nltk
import  sys

from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [60]:
# Make the project's `src` package importable from this notebook.
# The directory one level above the current working directory is put at the
# front of sys.path; the membership check keeps re-running this cell from
# inserting the same entry again.
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

# These imports must come after the sys.path update above.
from src.loader import SlackDataLoader
from src.utils import get_all_channels_messages, preprocess_text


In [72]:
def build_model(corpus, word_to_id, num_topics=5, passes=10, random_state=100):
    """Train a gensim LDA topic model on a bag-of-words corpus.

    Args:
        corpus: Bag-of-words documents (one list of ``(token_id, count)``
            pairs per document), as produced by ``Dictionary.doc2bow``.
        word_to_id: The gensim dictionary for ``corpus``; it is handed to the
            model as ``id2word``.
        num_topics: Number of latent topics to fit. Defaults to 5, the value
            that used to be hard-coded.
        passes: Number of training passes over the corpus. Defaults to 10.
        random_state: Seed that makes training reproducible. Defaults to 100.

    Returns:
        The trained ``gensim.models.ldamodel.LdaModel``.
    """
    return gensim.models.ldamodel.LdaModel(
        corpus,
        id2word=word_to_id,
        num_topics=num_topics,
        random_state=random_state,
        update_every=1,
        chunksize=100,
        passes=passes,
        # 'auto' lets gensim learn the document-topic prior from the corpus.
        alpha='auto',
        per_word_topics=True,
    )

def show_topics(lda_model):
    """Pretty-print the topics of a trained LDA model.

    The topics are requested with ``formatted=False`` so each topic comes
    back as ``(topic_id, [(word, weight), ...])`` rather than a single string.

    Args:
        lda_model: Any object exposing ``show_topics(formatted=...)``.
    """
    topic_terms = lda_model.show_topics(formatted=False)
    pprint(topic_terms)

def get_top_topics(df):
    """Fit an LDA model on the messages in ``df`` and prepare its visualisation.

    Steps, in order: build the dictionary and corpus with ``prepare_data``,
    train the model with ``build_model``, print its topics with
    ``show_topics``, switch pyLDAvis to notebook rendering, and build the
    pyLDAvis data for the model.

    Args:
        df: DataFrame of messages; it is passed straight to ``prepare_data``.

    Returns:
        The object produced by ``gensimvis.prepare``; leaving it as the last
        expression of a cell renders the interactive topic view.
    """
    # Only the dictionary and the bag-of-words corpus are needed from here on.
    _, _, dictionary, bow_corpus = prepare_data(df)
    model = build_model(bow_corpus, dictionary)

    # Print the word/weight pairs of each fitted topic.
    show_topics(model)

    # Notebook rendering has to be enabled for the returned object to display.
    pyLDAvis.enable_notebook()
    return gensimvis.prepare(model, bow_corpus, dictionary)

def prepare_data(df):
    """Preprocess message text and build the gensim dictionary and corpus.

    The caller's DataFrame is not modified: the cleaned text lives in a new
    frame, so running this function again on the same input starts from the
    raw text instead of text that was already preprocessed.

    Args:
        df: DataFrame with a ``text`` column. Every value is passed through
            ``preprocess_text``, whose result is split on whitespace.

    Returns:
        tuple: ``(df, word_list, word_to_id, corpus)`` where ``df`` is the new
        frame holding the preprocessed text, ``word_list`` is one token list
        per message, ``word_to_id`` is the ``corpora.Dictionary`` built from
        those tokens, and ``corpus`` is the bag-of-words form of each message.
    """
    # assign() returns a new frame, leaving the input DataFrame untouched.
    df = df.assign(text=df['text'].apply(preprocess_text))
    word_list = [sentence.split() for sentence in df['text']]

    # The dictionary gives every unique token an integer id.
    word_to_id = corpora.Dictionary(word_list)
    corpus = [word_to_id.doc2bow(tokens) for tokens in word_list]

    return df, word_list, word_to_id, corpus



In [73]:
# Resolve the anonymized Slack export (one directory above the working
# directory) and ask the loader for its channels.
slack_data_path = os.path.abspath('../anonymized')
slack_loader = SlackDataLoader(slack_data_path)
all_channels = slack_loader.get_channels()

# Top topics from all channels (5 LDA topics, 10 highest-weighted words each)

In [78]:
# Collect the (text, timestamp) pairs of every channel.
channel_messages = get_all_channels_messages(all_channels)

# Turn each pair into a record so the DataFrame gets 'text' and 'ts' columns.
all_messages = [{'text': text, 'ts': ts} for text, ts in channel_messages]
df = pd.DataFrame(all_messages)

# Last expression of the cell: prints the topics and renders the pyLDAvis view.
get_top_topics(df)

[(0,
  [('am', 0.033568554),
   ('think', 0.030391995),
   ('has', 0.021866497),
   ('install', 0.021292524),
   ("it's", 0.018556295),
   ("don't", 0.013578344),
   ('by', 0.013546538),
   ('been', 0.010689203),
   ('run', 0.009971262),
   ('pip', 0.009913878)]),
 (1,
  [('as', 0.03709628),
   ('thank', 0.024389667),
   ('any', 0.024331743),
   ('meeting', 0.022429442),
   ('our', 0.022358933),
   ("i'm", 0.021140285),
   ('good', 0.019104075),
   ('thanks', 0.01603836),
   ('after', 0.015720481),
   ('still', 0.013312635)]),
 (2,
  [('=', 0.03262919),
   ('me', 0.032134045),
   ('data', 0.028673014),
   ('working', 0.022080671),
   ('using', 0.02201523),
   ('yes', 0.020403963),
   ('same', 0.020128626),
   ('now', 0.01893851),
   ('work', 0.016991204),
   ('about', 0.01636062)]),
 (3,
  [('the', 0.08505558),
   ('i', 0.046231154),
   ('to', 0.040731296),
   ('you', 0.030280609),
   ('it', 0.024946295),
   ('in', 0.024838135),
   ('is', 0.024705432),
   ('and', 0.023794437),
   ('we'