# Topic Modeling

For more details on how topic modeling works, [see here](https://topix.io/tutorial/tutorial.html)

### Execute this cell to install the required Python module

After you've installed this once, you can delete this cell.

In [None]:
!pip install pyldavis

### Import dependencies

In [None]:
import pandas as pd
import numpy as np
#from sklearn.datasets import fetch_20newsgroups

# Module to visualize topics. pyLDAvis 3.4.0 renamed `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; when the old module is missing, import the new one and
# expose it under the old name so calls to `pyLDAvis.sklearn.prepare(...)`
# elsewhere in the notebook keep working.
import pyLDAvis
try:
    import pyLDAvis.sklearn
except ImportError:
    import pyLDAvis.lda_model
    pyLDAvis.sklearn = pyLDAvis.lda_model
pyLDAvis.enable_notebook()

# Suppress every warning for the rest of the session
# (note: this also hides deprecation notices).
import warnings
warnings.filterwarnings('ignore')

### Load the reviews data (`Reviews_ASCII.csv`)

In [None]:
# Alternative data sources kept for reference:
#   news = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
#   df = pd.DataFrame({"body": news.data})
#   df = pd.read_csv('all_listings.csv')

# Read the reviews CSV once; `df` and `sydney_listings` are two names for the
# same DataFrame object (no copy is made).
df = sydney_listings = pd.read_csv('Reviews_ASCII.csv')

# Preview the first ten rows.
df.head(10)

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
df.shape

### Preprocess text

In [None]:
#from utils import clean_text
#df['body'] = df['body'].apply(lambda x: clean_text(x))

### Generate feature vectors

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# LDA is a probabilistic model of word counts, so it is fitted on raw term
# counts rather than on re-weighted features.
tf_vectorizer = CountVectorizer(stop_words='english')

# Missing comments become empty documents. The previous `.values.astype('U')`
# turned NaN into the literal token "nan" and built a fixed-width unicode
# array padded to the longest comment. The row count is unchanged, so `tf`
# stays aligned with `df`.
comment_texts = df['comments'].fillna('').astype(str)
tf = tf_vectorizer.fit_transform(comment_texts)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on older releases that lack it.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

### Fit feature vectors to the LDA topic model

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics to extract.
no_topics = 20

# Configure the estimator, then fit it on the term-count matrix. `fit` returns
# the estimator itself, so `lda4` ends up bound to the fitted model.
lda4 = LatentDirichletAllocation(
    n_components=no_topics,
    random_state=4,
    evaluate_every=1,
)
lda4 = lda4.fit(tf)

### Display top words for each topic

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, iterated as one weight
            vector per topic.
        feature_names: Sequence mapping a term index to its string.
        no_top_words: How many top terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic: {topic_idx}")
        # Term indices ordered by descending weight, cut to the requested count.
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in top_indices]
        print(" ".join(top_terms))

# Print the five highest-weighted terms of each fitted topic.
no_top_words = 5
display_topics(model=lda4, feature_names=tf_feature_names, no_top_words=no_top_words)

### Visualizing our topics in 2-dimensional space

How to interpret this visualization:
1. Each bubble represents a topic
2. Larger bubbles are topics that are more frequent in the corpus
3. Topics closer together are more similar
4. When you click on a topic, its most relevant terms are listed on the right: the red bar shows each term's estimated frequency within the selected topic, and the blue bar shows the term's overall frequency across the whole corpus
5. When you hover over a word in the chart on the right, the bubbles will adjust according to how relevant that term is to each topic


In [None]:
# Build the pyLDAvis visualization from the fitted model, the term-count
# matrix, and the vectorizer that produced that matrix.
pyLDAvis.sklearn.prepare(lda4, tf, tf_vectorizer)

### Create Document - Topic Matrix

In [None]:
# Per-document topic weights: one row per document, one column per topic.
lda_output = lda4.transform(tf)

# column names
topicnames = [f"Topic{i}" for i in range(no_topics)]

# index names, sized from the matrix itself so the labels always match its rows
docnames = [f"Doc{i}" for i in range(lda_output.shape[0])]

# Make the pandas dataframe (weights rounded to 2 decimals for display)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document from the unrounded weights: after
# rounding, near-equal topics tie and argmax would simply pick the lowest index.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return the CSS text colour for a cell: green above 0.1, black otherwise."""
    if val > .1:
        return 'color: green'
    return 'color: black'

def make_bold(val):
    """Return the CSS font weight for a cell: 700 above 0.1, 400 otherwise."""
    weight = 400
    if val > .1:
        weight = 700
    return f'font-weight: {weight}'

# Apply Style
# Style only the first 15 documents as a preview.
df_document_topics = df_document_topic.head(15).style

# `Styler.applymap` was renamed to `Styler.map` in pandas 2.1; use whichever
# name this pandas version provides.
styler_method = 'map' if hasattr(df_document_topics, 'map') else 'applymap'

for css_rule in (color_green, make_bold):
    # The 0.1 threshold is only meaningful for topic weights, so the integer
    # `dominant_topic` column is left unstyled.
    df_document_topics = getattr(df_document_topics, styler_method)(css_rule, subset=topicnames)

df_document_topics