# Notebook for topic modeling 

# 0. Imports

In [3]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
#!pip install nltk # can install on terminal or by uncommenting this line
#import nltk; nltk.download('punkt'); nltk.download('stopwords')
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

## lda
#!pip install gensim # can install by uncommenting this line
from gensim import corpora
import gensim

## visualizing LDA--likely need to install
#!pip install pyLDAvis # can install by uncommenting this line
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random
import string; punctstr = string.punctuation # english punctuation marks

# 0. Load data

In [4]:
ab = pd.read_csv("../../public_data/airbnb_text.zip")
ab.head()


Unnamed: 0,id,name,name_upper,neighbourhood_group,price
0,2539,Clean & quiet apt home by the park,CLEAN & QUIET APT HOME BY THE PARK,Brooklyn,149
1,2595,Skylit Midtown Castle,SKYLIT MIDTOWN CASTLE,Manhattan,225
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,150
3,3831,Cozy Entire Floor of Brownstone,COZY ENTIRE FLOOR OF BROWNSTONE,Brooklyn,89
4,5022,Entire Apt: Spacious Studio/Loft by central park,ENTIRE APT: SPACIOUS STUDIO/LOFT BY CENTRAL PARK,Manhattan,80


# 1. Preprocess documents

In this case, each name/name_upper, or listing title, we're treating as a document

## 1.1 Load stopwords list and augment with our own custom ones

In [5]:
list_stopwords = stopwords.words("english")
custom_words_toadd = ['apartment', 'new york', 'nyc',
                      'bronx', 'brooklyn',
                     'manhattan', 'queens', 
                      'staten island']
list_stopwords_new = list_stopwords + custom_words_toadd


## 1.2 Remove stopwords from lowercase version of corpus


In [6]:
## convert to lowercase and a list
corpus_lower = ab.name.str.lower().to_list()

## use wordpunct tokenize and filter out with one
example_listing = corpus_lower[3]
wordpunct_tokenize(example_listing)
nostop_listing = [word 
                for word in wordpunct_tokenize(example_listing) 
                if word not in list_stopwords_new]


['cozy', 'entire', 'floor', 'of', 'brownstone']

## 1.3 stem and remove non-alpha

Other contexts we may want to leave digits in

In [7]:
## initialize stemmer
porter = PorterStemmer()

## apply to one by iterating
## over the tokens in the list
example_listing_preprocess = [porter.stem(token) 
                            for token in nostop_listing 
                            if token.isalpha() and 
                            len(token) > 2]


## 1.4 Activity 1

The above example performed preprocessing on a single Airbnb listing. We want to generalize this preprocessing across all listings.

- Embed step two (remove stopwords) and step three (stem) into one or two functions that take in a raw string (eg the raw text of an Airbnb review) and return a preprocessed string 
- Apply the function iteratively to preprocess all the texts in `corpus_lower`. Output could either be a list where each list element is a string of a list (e.g., `cozy brownstone apt`), or a list of lists where each element is a tokenized string (e.g., `['cozy', 'brownstone', 'apt'])`

Output is flexible: it could be a list of lists containing tokenized/stemmed text or a list of strings.

In [8]:
def process_step1(one_str):   
    try:
        nostop_listing1 = [word for word in wordpunct_tokenize(one_str)
                          if word not in list_stopwords_new]
        clean_listing = [porter.stem(word) for word in nostop_listing1
                        if word.isalpha() 
                        and len(word) > 3]
        clean_listing_str = " ".join(clean_listing)
        return(clean_listing_str)
    except:
        return("")

cleaned_listings = [process_step1(one_listing) for one_listing in 
                   corpus_lower]


In [9]:
ab['proc_name'] = cleaned_listings
ab[['name', 'proc_name']].head(10)

Unnamed: 0,name,proc_name
0,Clean & quiet apt home by the park,clean quiet home park
1,Skylit Midtown Castle,skylit midtown castl
2,THE VILLAGE OF HARLEM....NEW YORK !,villag harlem york
3,Cozy Entire Floor of Brownstone,cozi entir floor brownston
4,Entire Apt: Spacious Studio/Loft by central park,entir spaciou studio loft central park
5,Large Cozy 1 BR Apartment In Midtown East,larg cozi midtown east
6,BlissArtsSpace!,blissartsspac
7,Large Furnished Room Near B'way,larg furnish room near
8,Cozy Clean Guest Room - Family Apt,cozi clean guest room famili
9,Cute & Cozy Lower East Side 1 bdrm,cute cozi lower east side bdrm


# 2. Create a document-term matrix and do some basic diagnostics (more manual approach)

Here we'll create a DTM first using the raw documents; in the activity, you'll create one using the preprocessed docs
that you created in activity 1

## 2.1 Define the dtm function and select data to transform into a document-term matrix

In [10]:
## function provided
def create_dtm(list_of_strings, metadata):
    """ 
    Function to create dense document-term matrix (DTM) from a list of strings and provided metadata. 
    A sparse DTM is a list of term_index/doc_index tuples: if a given term occurs in a given doc at least once, 
        then this count is listed as a tuple; if not, that term/doc pair is omitted. 
    In a dense DTM, each row is one text (e.g., an Airbnb listing), each column is a term, and 
        each cell indicates the frequency of that word in that text. 
    
    Parameters:
        list_of_strings (Series): each row contains a preprocessed string (need not be tokenized)
        metadata (DataFrame): contains document-level covariates
    
    Returns:
        Dense DTM with metadata on left and then one column per word in lexicon
    """
    
    # initialize a sklearn tokenizer; this helps us tokenize the preprocessed string input
    vectorizer = CountVectorizer(lowercase = True) 
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    print('Sparse matrix form:\n', dtm_sparse[:3]) # take a look at sparse representation
    print()
    
    # switch the dataframe from the sparse representation to the normal dense representation (so we can treat it as regular dataframe)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names_out ())
    print('Dense matrix form:\n', dtm_dense_named.head()) # take a look at dense representation
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1) # add back document-level covariates

    return(dtm_dense_named_withid)

In [11]:
## filter out na's
## for shorter runtime, random sampling of 1000
## get metadata for those
## and also renaming price col since it's likely to be corpus word
ab_small = ab.loc[~ab.name.isnull(),
           ['id', 'neighbourhood_group', 'price',
            'name']].copy().rename(columns = {'price':
            'price_rawdata'}).sample(n = 1000, random_state = 9899)

ab_small['name_lower'] = ab_small['name'].str.lower()
ab_small.head()

Unnamed: 0,id,neighbourhood_group,price_rawdata,name,name_lower
22540,18227529,Manhattan,140,Live in New York Near Central Park and Columbi...,live in new york near central park and columbi...
47531,35794273,Brooklyn,130,Hope Garden,hope garden
33906,26858196,Brooklyn,75,AWESOME 2 BEDS - QUEEN + SOFA - NEXT TO METRO,awesome 2 beds - queen + sofa - next to metro
12047,9369514,Brooklyn,25,STARTUP CHEAP PLACE BROOKLYN,startup cheap place brooklyn
2908,1669149,Manhattan,250,Beautiful Modern Midtown Apartment,beautiful modern midtown apartment


## 2.2 Execute the dtm function to create the document-term matrix

In [12]:
## example application on raw lowercase texts; 
dtm_nopre = create_dtm(list_of_strings= ab_small.name_lower,
                metadata = 
                ab_small[['id', 'neighbourhood_group', 'price_rawdata']])

dtm_nopre.head()

Sparse matrix form:
   (0, 546)	1
  (0, 488)	1
  (0, 622)	1
  (0, 963)	1
  (0, 619)	1
  (0, 255)	1
  (0, 653)	1
  (0, 124)	1
  (0, 280)	1
  (1, 478)	1
  (1, 424)	1
  (2, 157)	1
  (2, 184)	1
  (2, 704)	1
  (2, 785)	1
  (2, 624)	1
  (2, 870)	1
  (2, 584)	1

Dense matrix form:
    10  100  1000  1000sq  10292  10ft  10min  11  110th  116  ...  交通便利  \
0   0    0     0       0      0     0      0   0      0    0  ...     0   
1   0    0     0       0      0     0      0   0      0    0  ...     0   
2   0    0     0       0      0     0      0   0      0    0  ...     0   
3   0    0     0       0      0     0      0   0      0    0  ...     0   
4   0    0     0       0      0     0      0   0      0    0  ...     0   

   位于北上远离开辆  家庭式獨立衛生間套房g  温馨小筑  简单的四房一厅两卫生间  纽约之家  走路四分钟到地铁站  건물  따뜻한  작은  
0         0            0     0            0     0          0   0    0   0  
1         0            0     0            0     0          0   0    0   0  
2         0            0     0            0  

Unnamed: 0,index,id,neighbourhood_group,price_rawdata,10,100,1000,1000sq,10292,10ft,...,交通便利,位于北上远离开辆,家庭式獨立衛生間套房g,温馨小筑,简单的四房一厅两卫生间,纽约之家,走路四分钟到地铁站,건물,따뜻한,작은
0,22540,18227529,Manhattan,140,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,47531,35794273,Brooklyn,130,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33906,26858196,Brooklyn,75,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,12047,9369514,Brooklyn,25,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2908,1669149,Manhattan,250,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2.3 Use that matrix/column sums to get basic summary stats of top words

In [13]:
## summing each col
top_terms = dtm_nopre[dtm_nopre.columns[4:]].sum(axis = 0)

## sorting from most frequent to least frequent
top_terms.sort_values(ascending = False)

in            357
room          204
bedroom       164
private       146
cozy          134
             ... 
fitness         1
fl              1
flatbushbk      1
flatiron        1
작은              1
Length: 979, dtype: int64

## 2.4 Activity 2: repeat the above but using the preprocessed text data

- Stick with the same random sample of 1000 `ab_small`
- Apply the preprocessing steps from activity 1 to create a new column in `ab_small` with the 
preprocessed text (if you got stuck on that, try just removing stopwords)
- Use the `create_dtm` function to create a document-term matrix from the preprocessed data
-  Take the sum of each of the term columns to find the top words 

In [14]:
ab_small['processed'] = [process_step1(one_list)
                         for one_list in ab_small.name_lower.to_list()]

In [15]:
ab_small["processed"] = ab_small.name_lower.apply(process_step1)

In [16]:
create_dtm()

TypeError: create_dtm() missing 2 required positional arguments: 'list_of_strings' and 'metadata'

In [17]:
metadata_list = ['id', 'neighbourhood_group', 'price_rawdata']

dtm_pre = create_dtm(list_of_strings= ab_small.processed,
                metadata = ab_small[metadata_list])

dtm_pre_termsonly = dtm_pre[[col for col in dtm_pre.columns
                             if col not in metadata_list and
                             col != "index"]]

dtm_pre_termsonly.sum().sort_values(ascending = False)

Sparse matrix form:
   (0, 318)	1
  (0, 632)	1
  (0, 372)	1
  (0, 99)	1
  (0, 392)	1
  (0, 122)	1
  (1, 274)	1
  (1, 235)	1
  (2, 31)	1
  (2, 43)	1
  (2, 433)	1
  (2, 496)	1
  (2, 376)	1
  (2, 347)	1

Dense matrix form:
    abod  access  across  ador  afford  airbnb  airi  airport  amaz  amazingli  \
0     0       0       0     0       0       0     0        0     0          0   
1     0       0       0     0       0       0     0        0     0          0   
2     0       0       0     0       0       0     0        0     0          0   
3     0       0       0     0       0       0     0        0     0          0   
4     0       0       0     0       0       0     0        0     0          0   

   ...  york  yorkvil  young  交通便利  位于北上远离开辆  家庭式獨立衛生間套房g  温馨小筑  简单的四房一厅两卫生间  \
0  ...     1        0      0     0         0            0     0            0   
1  ...     0        0      0     0         0            0     0            0   
2  ...     0        0      0     0         0        

room         208
bedroom      167
privat       146
cozi         134
studio        94
            ... 
industri       1
indoor         1
independ       1
incred         1
走路四分钟到地铁站      1
Length: 642, dtype: int64

In [18]:
dtm_nopre.shape, dtm_pre.shape

((1000, 983), (1000, 646))

# 3. Use gensim to more automatically preprocess/estimate a topic model

## 3.1 Creating the objects to feed the LDA modeling function

Different outputs described below: 
- Tokenized and preprocessed text 
- Dictionary 
- Corpus 

In [19]:
## Step 1: re-tokenize and store in list
## here, i'm doing with the raw random sample of text
## in activity, you should do with the preprocessed texts

text_raw_tokens = [wordpunct_tokenize(one_text) for one_text in 
                  ab_small.name_lower]


## Step 2: use gensim create dictionary - gets all unique words across documents
text_raw_dict = corpora.Dictionary(text_raw_tokens)
raw_len = len(text_raw_dict) # get length for comparison below

### explore first few keys and values
### see that key is just an arbitrary counter; value is the word itself
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]}


## Step 3: filter out very rare and very common words
## here, i'm using the threshold that a word needs to appear in at least
## 5% of docs but not more than 95%
## this is an integer count of docs so i round
lower_bound = round(ab_small.shape[0]*0.05)
upper_bound = round(ab_small.shape[0]*0.95)

### apply filtering to dictionary
text_raw_dict.filter_extremes(no_below = lower_bound,
                             no_above = upper_bound)
print(f'Filtering out very rare and very common words reduced the \
length of dictionary from {str(raw_len)} to {str(len(text_raw_dict))}.')
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]} # show first five entries after filtering


## Step 4: apply dictionary to TOKENIZED texts
## this creates a mapping between each word 
## in a specific listing and the key in the dictionary.
## for words that remain in the filtered dictionary,
## output is a list where len(list) == n documents
## and each element in the list is a list of tuples
## containing the mappings
corpus_fromdict = [text_raw_dict.doc2bow(one_text) 
                   for one_text in text_raw_tokens]

### can apply doc2bow(one_text, return_missing = True) to print words
### eliminated from the listing bc they're not in filtered dictionary.
### but feeding that one with missing values to
### the lda function can cause errors
corpus_fromdict_showmiss = [text_raw_dict.doc2bow(one_text, return_missing = True)
                            for one_text in text_raw_tokens]
print('Sample of documents represented in dictionary format (with omitted words noted):')
corpus_fromdict_showmiss[:10]

{0: '.', 1: 'and', 2: 'central', 3: 'columbia', 4: 'in'}

Filtering out very rare and very common words reduced the length of dictionary from 1054 to 28.


{0: '.', 1: 'and', 2: 'in', 3: 'near', 4: 'park'}

Sample of documents represented in dictionary format (with omitted words noted):


[([(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
  {'central': 1, 'columbia': 1, 'live': 1, 'new': 1, 'u': 1, 'york': 1}),
 ([], {'garden': 1, 'hope': 1}),
 ([(5, 2), (6, 1), (7, 1)],
  {'+': 1,
   'awesome': 1,
   'beds': 1,
   'metro': 1,
   'next': 1,
   'queen': 1,
   'sofa': 1}),
 ([(8, 1)], {'cheap': 1, 'place': 1, 'startup': 1}),
 ([(9, 1)], {'beautiful': 1, 'midtown': 1, 'modern': 1}),
 ([(1, 1), (5, 1), (6, 1), (7, 1), (10, 1), (11, 1), (12, 1)],
  {':)': 1, 'bright': 1, 'ride': 1, 'short': 1}),
 ([(5, 1), (13, 1)],
  {'2beds2baths': 1,
   'central': 1,
   'grand': 1,
   'nations': 1,
   'new': 1,
   'united': 1}),
 ([(5, 1), (13, 1), (14, 1), (15, 1)],
  {'couple': 1, 'family': 1, 'friendly': 1, 'very': 1}),
 ([(2, 1), (12, 1)], {'beautiful': 1, 'duplex': 1, 'modern': 1}),
 ([(1, 1), (12, 1), (16, 1), (17, 1)],
  {'bathroom': 1, 'kitchen': 1, 'shared': 1})]

##  3.2 Estimating the model

In [20]:
## Step 5: we're finally ready to estimate the model!
## full documentation here - https://radimrehurek.com/gensim/models/ldamodel.html
## here, we're feed the lda function (1) the corpus we created from the dictionary
## (2) a parameter we decide on for the number of topics,
## (3) the dictionary itself,
## (4) parameter for number of passes through training data
## (5) parameter that returns, for each word remaining in dict, the 
## topic probabilities
## see documentation for many other arguments you can vary
ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict, 
                                         num_topics = 5, 
                                         id2word=text_raw_dict, 
                                         passes=6, 
                                         alpha = 'auto',
                                         per_word_topics = True, 
                                         random_state = 91988)


## 3.3  Seeing what topics the estimated model discovers

In [21]:
## Post-model 1: explore corpus-wide summary of topics
### getting the topics and top words; can retrieve diff top words
topics = ldamod.print_topics(num_words = 10)
for topic in topics:
    print(topic)


(0, '0.267*"in" + 0.150*"apartment" + 0.103*"the" + 0.084*"of" + 0.077*"room" + 0.072*"brooklyn" + 0.051*"apt" + 0.038*"park" + 0.033*"sunny" + 0.026*"&"')
(1, '0.123*"in" + 0.119*"bedroom" + 0.111*"spacious" + 0.106*"," + 0.106*"east" + 0.091*"1" + 0.058*"apartment" + 0.058*"sunny" + 0.051*"and" + 0.029*"-"')
(2, '0.234*"cozy" + 0.210*"," + 0.090*"room" + 0.078*"/" + 0.071*"!" + 0.063*"&" + 0.059*"park" + 0.043*"in" + 0.030*"brooklyn" + 0.022*"and"')
(3, '0.174*"private" + 0.135*"room" + 0.109*"studio" + 0.090*"in" + 0.065*"." + 0.063*"with" + 0.056*"!" + 0.052*"to" + 0.046*"/" + 0.046*"manhattan"')
(4, '0.179*"-" + 0.104*"bedroom" + 0.095*"." + 0.081*"2" + 0.078*"near" + 0.078*"apt" + 0.057*"to" + 0.048*"in" + 0.044*"manhattan" + 0.036*"park"')


In [22]:
## Post-model 2: explore topics associated with each document
### for each item in our original dictionary, get list of topic probabilities
l=[ldamod.get_document_topics(item) for item in corpus_fromdict]
### print result
text_raw_tokens[0:5]
l[0:5]

[['live',
  'in',
  'new',
  'york',
  'near',
  'central',
  'park',
  'and',
  'columbia',
  'u',
  '.'],
 ['hope', 'garden'],
 ['awesome',
  '2',
  'beds',
  '-',
  'queen',
  '+',
  'sofa',
  '-',
  'next',
  'to',
  'metro'],
 ['startup', 'cheap', 'place', 'brooklyn'],
 ['beautiful', 'modern', 'midtown', 'apartment']]

[[(0, 0.028244894),
  (1, 0.027645271),
  (2, 0.024302833),
  (3, 0.031619065),
  (4, 0.88818794)],
 [(0, 0.20073183),
  (1, 0.19741756),
  (2, 0.17480811),
  (3, 0.22501582),
  (4, 0.20202667)],
 [(0, 0.033632565),
  (1, 0.03311372),
  (2, 0.029271966),
  (3, 0.03784261),
  (4, 0.8661391)],
 [(0, 0.6425431),
  (1, 0.08820588),
  (2, 0.07810777),
  (3, 0.100679845),
  (4, 0.090463474)],
 [(0, 0.6432312),
  (1, 0.0884367),
  (2, 0.077929564),
  (3, 0.10032383),
  (4, 0.09007865)]]

### Visualizing 

In [23]:
lda_display = gensimvis.prepare(ldamod, corpus_fromdict, text_raw_dict)
pyLDAvis.display(lda_display)

## 3.4 Activity 3

- Preprocess the texts if you haven't already
- Run the topic model with preprocessed texts
- Play around with other parameters like `n_topics` to find a configuration that produces useful topics

If you get stuck on the preprocessing part, you can use below function and example code for applying it. Then continue as above (start with tokenizing).

In [24]:
ab_small = ab_small[ab_small.processed != ""].copy()

tokenized_text = [wordpunct_tokenize(one_text) 
                for one_text in 
                ab_small.processed]
tokenized_text

[['live', 'york', 'near', 'central', 'park', 'columbia'],
 ['hope', 'garden'],
 ['awesom', 'bed', 'queen', 'sofa', 'next', 'metro'],
 ['startup', 'cheap', 'place'],
 ['beauti', 'modern', 'midtown'],
 ['bright', 'cozi', 'room', 'short', 'ride'],
 ['grand', 'central', 'unit', 'nation'],
 ['sunni', 'coupl', 'famili', 'friendli'],
 ['beauti', 'room', 'modern', 'duplex'],
 ['privat', 'room', 'share', 'bathroom', 'kitchen'],
 ['bedroom', 'within', 'full'],
 ['modern', 'walk', 'grand', 'central'],
 ['modern', 'luxuri', 'condo', 'midtown', 'west'],
 ['cozi', 'studio', 'live', 'room', 'share', 'littl', 'itali'],
 ['privat', 'studio', 'roof', 'privat', 'bath', 'deck'],
 ['charm', 'lower', 'east', 'side'],
 ['master', 'suit', 'privat', 'bath', 'balconi'],
 ['cozi', 'master', 'bedroom', 'minut'],
 ['doubl', 'bed', 'york', 'chinatown'],
 ['fantast', 'studio', 'core'],
 ['cozi', 'room', 'near'],
 ['sonder', 'stock', 'exchang', 'tranquil', 'loung'],
 ['drench', 'artsi', 'bedroom'],
 ['modern', 'train

In [25]:
## preprocess and estimate topicmod
### create dictionary
text_proc_dict = corpora.Dictionary(tokenized_text)
### filter dictionary- using 2% as bounds
text_proc_dict.filter_extremes(no_below = round(ab_small.shape[0]*0.02),
                             no_above = round(ab_small.shape[0]*0.98))

### create corpus from dictionary
corpus_fromdict_proc = [text_proc_dict.doc2bow(one_text) 
                       for one_text in tokenized_text]
# corpus_fromdict_proc

In [26]:
%%time
### estimate model
n_topics = 5
ldamod_proc = gensim.models.ldamodel.LdaModel(corpus_fromdict_proc, 
                                              num_topics = n_topics, 
                                              id2word=text_proc_dict, 
                                              passes=6, alpha = 'auto',
                                              per_word_topics = True, 
                                              random_state = 91988)

### print topics and words
topics = ldamod_proc.print_topics(num_words = 15)
for topic in topics:
    print(topic)
    

(0, '0.250*"room" + 0.196*"privat" + 0.083*"cozi" + 0.066*"bedroom" + 0.065*"home" + 0.060*"spaciou" + 0.050*"view" + 0.043*"larg" + 0.033*"citi" + 0.030*"beauti" + 0.025*"great" + 0.021*"bushwick" + 0.016*"near" + 0.011*"harlem" + 0.008*"subway"')
(1, '0.105*"modern" + 0.095*"loft" + 0.093*"brownston" + 0.078*"quiet" + 0.075*"suit" + 0.075*"garden" + 0.061*"harlem" + 0.060*"beauti" + 0.057*"bushwick" + 0.035*"williamsburg" + 0.033*"huge" + 0.028*"near" + 0.027*"cozi" + 0.027*"bright" + 0.020*"clean"')
(2, '0.113*"locat" + 0.091*"train" + 0.089*"close" + 0.064*"prime" + 0.064*"time" + 0.061*"midtown" + 0.056*"west" + 0.045*"cozi" + 0.045*"subway" + 0.041*"room" + 0.034*"modern" + 0.032*"privat" + 0.032*"bedroom" + 0.032*"clean" + 0.031*"spaciou"')
(3, '0.199*"studio" + 0.133*"park" + 0.103*"sunni" + 0.087*"luxuri" + 0.079*"central" + 0.078*"heart" + 0.039*"spaciou" + 0.035*"cozi" + 0.031*"room" + 0.022*"near" + 0.018*"williamsburg" + 0.017*"midtown" + 0.015*"prime" + 0.014*"clean" + 0.

In [27]:
### visualize
lda_display_proc = gensimvis.prepare(ldamod_proc, corpus_fromdict_proc, text_proc_dict)
pyLDAvis.display(lda_display_proc)

# Additional summaries of topics and documents 

What if we want to find which topics are associated with higher listing prices?

In [30]:
## get topic probabilities by doc and find mean listing by topic
### get document topics - list of list tuples
topic_probs_bydoc = [ldamod_proc.get_document_topics(item) for item in corpus_fromdict_proc]

## each document has a list containing topic, probability
## tuples- example w/ first document
one_list_tup = topic_probs_bydoc[0]
one_list_tup

## create a long form dataframe by flattening the list
topic_probs_bydoc_long = pd.DataFrame([t for lst in topic_probs_bydoc for t in lst],
                                     columns = ['topic', 'probability'])
topic_probs_bydoc
topic_probs_bydoc_long

## add id var- we're repeating each id in the original data k times
## for the number of topics
topic_probs_bydoc_long['doc_id'] = list(np.concatenate([[one_id] * 
                                    n_topics for one_id in ab_small.id]).flat)

## pivot to wide format
topic_probs_bydoc_wide = pd.pivot_table(topic_probs_bydoc_long, index = ['doc_id'],
                        columns = ['topic']).reset_index().reset_index(drop = True)
topic_probs_bydoc_wide.columns = ['doc_id'] + ["topic_" + str(i) for i in np.arange(0, n_topics)]

## merge with original data using doc id
topic_wmeta = pd.merge(topic_probs_bydoc_wide,
                      ab_small,
                      left_on = 'doc_id',
                      right_on = 'id')

## create indicator for listing's top topic
topic_wmeta['toptopic'] = topic_wmeta[[col for col in topic_wmeta.columns if 
                                    "topic_" in col]].idxmax(axis=1)
topic_wmeta.sample(n = 5, random_state = 555)

## group by topic and find mean price
topic_wmeta.groupby('toptopic').agg({'price_rawdata': np.mean})

## group by borough and topic -- higher price for some also reflects
## diff borough composition
topic_wmeta.groupby(['toptopic', 
                    'neighbourhood_group']).agg({'price_rawdata': np.mean})

[(0, 0.06157212),
 (1, 0.038794667),
 (2, 0.03606015),
 (3, 0.8142818),
 (4, 0.049291242)]

[[(0, 0.06157212),
  (1, 0.038794667),
  (2, 0.03606015),
  (3, 0.8142818),
  (4, 0.049291242)],
 [(0, 0.12431624),
  (1, 0.60358834),
  (2, 0.07382608),
  (3, 0.09948662),
  (4, 0.09878271)],
 [(0, 0.26131567),
  (1, 0.16661663),
  (2, 0.1552243),
  (3, 0.20917507),
  (4, 0.20766832)],
 [(0, 0.26131567),
  (1, 0.16661663),
  (2, 0.1552243),
  (3, 0.20917507),
  (4, 0.20766832)],
 [(0, 0.06149219),
  (1, 0.59042984),
  (2, 0.2507266),
  (3, 0.04892302),
  (4, 0.048428357)],
 [(0, 0.4192461),
  (1, 0.038782917),
  (2, 0.036086235),
  (3, 0.048817653),
  (4, 0.45706713)],
 [(0, 0.124671176),
  (1, 0.07927052),
  (2, 0.07388918),
  (3, 0.6232315),
  (4, 0.098937675)],
 [(0, 0.1247401),
  (1, 0.07924468),
  (2, 0.0739322),
  (3, 0.6229743),
  (4, 0.09910873)],
 [(0, 0.38467944),
  (1, 0.482011),
  (2, 0.036098614),
  (3, 0.04867705),
  (4, 0.048533898)],
 [(0, 0.76928353),
  (1, 0.05198501),
  (2, 0.048465464),
  (3, 0.065386355),
  (4, 0.06487959)],
 [(0, 0.12702179),
  (1, 0.07924656),
 

Unnamed: 0,topic,probability
0,0,0.061572
1,1,0.038795
2,2,0.036060
3,3,0.814282
4,4,0.049291
...,...,...
4970,0,0.825773
4971,1,0.038834
4972,2,0.036093
4973,3,0.049393


Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,id,neighbourhood_group,price_rawdata,name,name_lower,processed,toptopic
487,19691179,0.233709,0.038699,0.036067,0.048757,0.642768,19691179,Manhattan,125,Sun-filled Lower East Side Guest Room,sun-filled lower east side guest room,fill lower east side guest room,topic_4
635,24712037,0.124288,0.079332,0.073826,0.623778,0.098776,24712037,Brooklyn,198,Stylish Design Apartment in the Heart of Brooklyn,stylish design apartment in the heart of brooklyn,stylish design heart,topic_3
713,28165121,0.052221,0.030948,0.028838,0.849538,0.038456,28165121,Brooklyn,70,"Large, Sunny Retreat- B/Q subway, Prospect Park!","large, sunny retreat- b/q subway, prospect park!",larg sunni retreat subway prospect park,topic_3
262,10105477,0.124287,0.079327,0.597313,0.100301,0.098772,10105477,Manhattan,97,Midtown Manhattan,midtown manhattan,midtown,topic_2
941,35009512,0.082248,0.052133,0.048509,0.065579,0.75153,35009512,Brooklyn,123,"One bedroom in Williamsburg, Brooklyn","one bedroom in williamsburg, brooklyn",bedroom williamsburg,topic_4


  topic_wmeta.groupby('toptopic').agg({'price_rawdata': np.mean})


Unnamed: 0_level_0,price_rawdata
toptopic,Unnamed: 1_level_1
topic_0,132.556373
topic_1,197.752381
topic_2,137.083333
topic_3,172.546961
topic_4,166.322581


  'neighbourhood_group']).agg({'price_rawdata': np.mean})


Unnamed: 0_level_0,Unnamed: 1_level_0,price_rawdata
toptopic,neighbourhood_group,Unnamed: 2_level_1
topic_0,Bronx,49.2
topic_0,Brooklyn,121.724138
topic_0,Manhattan,183.126866
topic_0,Queens,87.094118
topic_0,Staten Island,93.8
topic_1,Bronx,121.666667
topic_1,Brooklyn,142.508772
topic_1,Manhattan,291.325
topic_1,Queens,124.6
topic_2,Brooklyn,131.787879
