# Topic Modeling

For more details on how topic modeling works, [see here](https://topix.io/tutorial/tutorial.html)

### Execute this cell to install the required Python module

After you've installed this once, you can delete this cell.

In [1]:
# `%pip` installs into the environment of the running kernel; a bare `!pip`
# can resolve to a different Python installation.
# pyLDAvis 3.4 renamed the `pyLDAvis.sklearn` module to `pyLDAvis.lda_model`,
# so stay below 3.4 to keep the `import pyLDAvis.sklearn` in this notebook working.
%pip install "pyLDAvis<3.4"



You should consider upgrading via the 'python -m pip install --upgrade pip' command.


### Import dependencies

In [18]:
# Data handling
import pandas as pd
import numpy as np
#from sklearn.datasets import fetch_20newsgroups

# module to visualize topics
import pyLDAvis.sklearn
# Let pyLDAvis visualizations display directly as notebook cell output
pyLDAvis.enable_notebook()

# NOTE(review): this silences every warning raised after this point,
# including deprecation notices from the libraries used below.
import warnings
warnings.filterwarnings('ignore')

### Load Sydney Airbnb listings data

In [19]:
# Read the Sydney listings export. `df` and `sydney_listings` are two names
# for the same DataFrame object (no copy is made).
df = sydney_listings = pd.read_csv('sydney-2016-01-03-listings.csv')

# Preview the first ten rows
df.head(10)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,9814837,https://www.airbnb.com/rooms/9814837,20160103104543,2016-01-03,Near Auburn Station,"Share 1 bathroom, 2 toilets with family. Walk ...",,"Share 1 bathroom, 2 toilets with family. Walk ...",none,,...,,f,,,f,strict,f,f,3,
1,4343438,https://www.airbnb.com/rooms/4343438,20160103104543,2016-01-03,Nikita & Rusty Home,We welcome you to share 1 bedroom in our 3 bed...,Your room is a spacious master bedroom includi...,We welcome you to share 1 bedroom in our 3 bed...,none,"*Olympic park, ANZ Stadium & Allphones arena w...",...,9.0,f,,,f,moderate,f,f,2,1.12
2,1295993,https://www.airbnb.com/rooms/1295993,20160103104543,2016-01-03,Large & modern room- 25 mins to CBD,I'm going to Europe for 3 months so the big be...,Welcome to your very own immaculate and modern...,I'm going to Europe for 3 months so the big be...,none,,...,9.0,f,,,f,flexible,f,f,2,0.24
3,9651483,https://www.airbnb.com/rooms/9651483,20160103104543,2016-01-03,Entire apartment 2mins from station,"Hi, this space is available from 22/12 to 26/1...",,"Hi, this space is available from 22/12 to 26/1...",none,,...,,f,,,f,strict,f,f,1,
4,1477506,https://www.airbnb.com/rooms/1477506,20160103104543,2016-01-03,Sydney Great Home @ Olympic Park,2 rooms with a good quality queen bed to be re...,A great place to stay if you come to Sydney fo...,2 rooms with a good quality queen bed to be re...,none,,...,9.0,f,,,f,strict,t,t,2,1.38
5,5398392,https://www.airbnb.com/rooms/5398392,20160103104543,2016-01-03,Modern Room near SydOlympic Park,Our 1 bedroom is located in a quiet and safe n...,1 x bedroom in a modern 2 bedroom contemporary...,Our 1 bedroom is located in a quiet and safe n...,none,20kms to the city center of Sydney. Close acce...,...,10.0,f,,,f,strict,f,f,1,3.1
6,9656968,https://www.airbnb.com/rooms/9656968,20160103104543,2016-01-03,Great location two bed apartment.,This two bedroom modern apartment is located m...,This is a tidy two bedroom apartment with two ...,This two bedroom modern apartment is located m...,none,Wentworth point has a real community feel.If y...,...,10.0,f,,,f,flexible,f,f,1,1.0
7,1169841,https://www.airbnb.com/rooms/1169841,20160103104543,2016-01-03,"Breakers Cove, waterfront view","Breakers Cove is a lovely 2 bedroom, 2 bathroo...","Breakers Cove is a lovely 2 bedroom, 2 bathroo...","Breakers Cove is a lovely 2 bedroom, 2 bathroo...",none,Located approximately 20 minutes to Sydney CBD...,...,9.0,f,,,f,flexible,f,f,8,0.21
8,1171089,https://www.airbnb.com/rooms/1171089,20160103104543,2016-01-03,"Australia Towers 9.06, Best Choice!",Australia Towers 9.06 is a brand new spacious ...,"Our apartment is sophisticated, elegant and ge...",Australia Towers 9.06 is a brand new spacious ...,none,Guests will enjoy an abundance of world class ...,...,9.0,f,,,f,strict,f,f,8,0.43
9,6986347,https://www.airbnb.com/rooms/6986347,20160103104543,2016-01-03,AT 9.01 Executive 1 bedroom,Australia Towers 9.01 is a beautiful 1 bedroom...,Australia Towers 9.01 is a beautiful 1 bedroom...,Australia Towers 9.01 is a beautiful 1 bedroom...,none,Guests will enjoy an abundance of world class ...,...,8.0,f,,,f,strict,f,f,8,1.46


In [20]:
# (number of rows, number of columns) of the loaded table
df.shape

(16149, 92)

### Preprocess text

In [21]:
# Text cleaning is currently disabled: both statements below are commented out,
# so this cell does nothing.
# NOTE(review): they target a 'body' column, while the vectorizer cell reads
# df['description'] -- adjust the column name before re-enabling.
#from utils import clean_text
#df['body'] = df['body'].apply(lambda x: clean_text(x))

### Generate feature vectors

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# LDA is a probabilistic model over word counts, so it is fed raw term counts
# (CountVectorizer) rather than re-weighted features.
tf_vectorizer = CountVectorizer(stop_words='english')

# Missing descriptions become empty documents. Casting NaN straight to text
# would turn it into the literal word "nan" and add it to the vocabulary.
descriptions = df['description'].fillna('').astype(str)
tf = tf_vectorizer.fit_transform(descriptions)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); support both so the cell runs on either side.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

### Fit feature vectors to the LDA topic model

In [23]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics the model is asked to find
no_topics = 20

lda_estimator = LatentDirichletAllocation(
    n_components=no_topics,
    random_state=4,    # fixed seed
    evaluate_every=1,  # perplexity evaluation interval, in iterations
)
# fit() hands back the fitted estimator, which is kept under the name `lda4`
lda4 = lda_estimator.fit(tf)

### Display top words for each topic

In [24]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic's index followed by its highest-weighted terms.

    Args:
        model: Object exposing ``components_``; each row is treated as the
            per-term weights of one topic.
        feature_names: Sequence mapping a term index to its term string.
        no_top_words: How many top terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Sort ascending, flip to descending, keep the leading entries.
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in top_indices]
        print(f"Topic: {topic_idx}")
        print(" ".join(top_terms))

# Print the ten highest-weighted terms of every topic in the fitted model
no_top_words = 10
display_topics(model=lda4, feature_names=tf_feature_names, no_top_words=no_top_words)

Topic: 0
balmoral parramatta enmore corso river marrickville lively crows 2br dulwich
Topic: 1
rockdale highway ma die workout villas livingroom cola manor coca
Topic: 2
mosman let toaster pad kettle fridge kitchenette erskineville housemates microwave
Topic: 3
large house bedroom home living area bedrooms dining kitchen outdoor
Topic: 4
period semi designer year executive booking finishes expansive new annandale
Topic: 5
house friendly share room center hidden neighbours neighborhood work live
Topic: 6
star casino thai japanese italian pizza commercial matter factory indian
Topic: 7
queenscliff shelly balgowlah dam roseville steyne stn stockland een killara
Topic: 8
room bed bedroom apartment kitchen bathroom large queen tv balcony
Topic: 9
dee toilets sorry eastwood coffees german mary pedestrian carlingford regards
Topic: 10
newtown house redfern west terrace uni travellers university backyard uts
Topic: 11
beach bondi apartment walk sydney restaurants manly cafes city away
Topic: 1

### Visualizing our topics in 2-dimensional space

How to interpret this visualization:
1. Each bubble represents a topic
2. Larger bubbles correspond to topics that are more prevalent in the corpus
3. Topics closer together are more similar
4. When you click on a topic, its most relevant terms are listed on the right: the red bar shows each term's estimated frequency within the selected topic, and the blue bar shows that term's overall frequency across the whole corpus
5. When you hover over a word in the chart on the right, the bubbles will adjust according to how relevant that term is to each topic


In [25]:
# Build the pyLDAvis view from the fitted model, the document-term count
# matrix and the vectorizer; as the cell's last expression it is displayed.
pyLDAvis.sklearn.prepare(lda4, tf, tf_vectorizer)

### Create Document - Topic Matrix

In [26]:
# Per-document topic weights: one row per document, one column per topic
lda_output = lda4.transform(tf)

# column names
topicnames = [f"Topic{i}" for i in range(no_topics)]

# index names (one per row of the transformed matrix)
docnames = [f"Doc{i}" for i in range(lda_output.shape[0])]

# Make the pandas dataframe (weights rounded to two decimals for display)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document. Take the argmax on the unrounded
# weights: after rounding, two topics within 0.005 of each other can tie and
# argmax would silently pick whichever column comes first.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return the CSS colour rule for one cell: green above 0.1, black otherwise."""
    shade = 'black'
    if val > .1:
        shade = 'green'
    return 'color: {col}'.format(col=shade)

def make_bold(val):
    """Return the CSS font-weight rule for one cell: 700 above 0.1, 400 otherwise."""
    thickness = 400
    if val > .1:
        thickness = 700
    return 'font-weight: {weight}'.format(weight=thickness)

# Apply Style
# Only the topic-weight columns are styled: `dominant_topic` holds an integer
# label, not a weight, so comparing it against the 0.1 cutoff is meaningless.
styler = df_document_topic.head(15).style

# pandas 2.1 renamed Styler.applymap to Styler.map; use whichever is available.
cellwise = 'map' if hasattr(styler, 'map') else 'applymap'
for css_rule in (color_green, make_bold):
    styler = getattr(styler, cellwise)(css_rule, subset=topicnames)

df_document_topics = styler
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic
Doc0,0.0,0.0,0.0,0.5,0.0,0.07,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31,3
Doc1,0.0,0.0,0.16,0.0,0.0,0.04,0.0,0.0,0.2,0.0,0.14,0.08,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.33,19
Doc2,0.0,0.0,0.02,0.03,0.0,0.0,0.0,0.0,0.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,8
Doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19,0.0,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.43,19
Doc4,0.0,0.0,0.0,0.04,0.0,0.0,0.01,0.0,0.2,0.0,0.0,0.65,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.06,11
Doc5,0.0,0.0,0.0,0.0,0.0,0.39,0.0,0.0,0.46,0.0,0.0,0.06,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.05,8
Doc6,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.79,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.13,8
Doc7,0.0,0.0,0.0,0.67,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.03,0.12,0.0,0.0,0.0,0.0,0.0,0.0,3
Doc8,0.0,0.0,0.0,0.13,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.37,0.0,0.0,0.0,0.0,0.0,0.0,8
Doc9,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.54,0.0,0.0,0.0,0.0,0.02,0.0,13
