In [172]:
# Run in terminal or command prompt:
#python3 -m spacy download en

# Packages
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pprint import pprint

# Import stopwords and other word packages
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models # as gensimvis  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.corpus import stopwords

from scipy.sparse import csr_matrix, hstack, coo_matrix


import string
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mattparker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mattparker/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mattparker/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [173]:
airbnb = pd.read_csv('../data/airbnb_gentrification.csv')

In [174]:
airbnb.head(1)

Unnamed: 0,listing_id,comments_concatenated,name,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,listing_url,description,neighborhood_overview,host_since,host_listings_count,property_type,accommodates,bathrooms_text,bedrooms,beds,amenities,minimum_nights_avg_ntm,maximum_nights_avg_ntm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,zip_code,GEOID,house_price_2021-01-31,house_pct_change,rentals_2021-01-31,rental_price_pct_change,new_restaurants,available_beer,str_permits_2020,str_permits_growth,crimes,total_pop_2010,total_pop_2019,total_pop_change,total_pop_pct_change,pop_over25_2010,pop_over25_2019,pop_over25_change,pop_over25_pcg_change,total_households_2010,total_households_2019,total_households_change,total_households_pct_change,white_pct_2010,white_pct_2019,white_value_change,white_pct_change,bach_pct_2010,bach_pct_2019,bach_value_change,bach_pct_change,rent_pct_2010,rent_pct_2019,rent_value_change,renter_pct_change,median_hhi_2010,median_hhi_2019,median_hhi_value_change,median_hhi_pct_change,poverty_pct_2010,poverty_pct_2019,poverty_value_change,poverty_pct_change,gentrifying
0,6422,I can't say enough about how wonderful it was ...,Nashville Charm,12172,36.17315,-86.73581,40,30,674,4.69,1,267,https://www.airbnb.com/rooms/6422,30 day or more rental during COVID. Show COVID...,Historic East Nashville is home to many new an...,2009-04-03,0.0,Private room in house,2,1 private bath,2.0,3.0,"[""Hair dryer"", ""Bathtub"", ""Lock on bedroom doo...",30.0,365.0,99.0,10.0,10.0,10.0,10.0,10.0,10.0,37206.0,47037010000.0,412476.0,38.31,,,1.0,2.0,114.0,114.0,1165.0,2544.0,2100.0,-444.0,-0.174528,1703.0,1639.0,-64.0,-0.037581,1140.0,926.0,-214.0,-0.187719,0.657626,0.940952,0.283327,0.430833,0.408691,0.585723,0.177032,0.43317,0.320175,0.240821,-0.079355,-0.247848,46000.0,91643.0,45643.0,0.992239,10.6,10.2,-0.4,-0.037736,False


In [175]:
airbnb.shape

(5205, 76)

In [176]:
airbnb = airbnb[airbnb['comments_concatenated'].notna()]

In [177]:
airbnb_nlp = airbnb[['listing_id', 'comments_concatenated', 'gentrifying']]

In [178]:
airbnb_nlp.comments_concatenated.isna().sum()

0

## Train Test Split (skip unless using a predictive model)

In [179]:
# create two lists with training and testing apns
train_listings, test_listings = tts(airbnb_nlp['listing_id'].to_list(), 
                                    random_state = 42, 
                                    stratify=airbnb_nlp['gentrifying'])

In [180]:
len(train_listings)

3903

In [181]:
# Create train and test dataframes from the lists of apns
airbnb_train = airbnb_nlp[airbnb_nlp['listing_id'].isin(train_listings)].sort_values('listing_id')
airbnb_test = airbnb_nlp[airbnb_nlp['listing_id'].isin(test_listings)].sort_values('listing_id')

In [182]:
# Create the y_train and y_test dataframes from the lists of apns
y_train = airbnb_nlp['gentrifying']
y_test = airbnb_nlp['gentrifying']

In [183]:
print(airbnb_train.gentrifying.value_counts(normalize=True))
print(airbnb_test.gentrifying.value_counts(normalize=True))

False    0.755829
True     0.244171
Name: gentrifying, dtype: float64
False    0.756341
True     0.243659
Name: gentrifying, dtype: float64


## Gensim Blog - Machine Learning Plus
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [184]:
# Convert to list
data = airbnb_nlp.comments_concatenated.values.tolist()

In [185]:
# Clean review break symbols
data = [re.sub("\\r\\n", "", comment) for comment in data]

In [186]:
# Tokenize each sentence to words, removing uneeded words/characters
def sent_to_words(sentences):
    for sentence in tqdm(sentences):
        tagged_words = nltk.tag.pos_tag(sentence.split()) 
        no_names = [word for word,tag in tagged_words if tag != 'NNP' and tag != 'NNPS'] # Remove proper nouns
        yield(gensim.utils.simple_preprocess(str(no_names), deacc=True)) #Clean and remove punctuation

data_words = list(sent_to_words(data))

#print(data_words[0:1])

HBox(children=(FloatProgress(value=0.0, max=5204.0), HTML(value='')))




In [187]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words,
                               min_count=100,
                               threshold=10)#,  # higher threshold fewer phrases.
                               #connector_words=phrases.ENGLISH_CONNECTOR_WORDS) ***I think I need to download this.
trigram = gensim.models.Phrases(bigram[data_words], threshold=50)#, connector_words=phrases.ENGLISH_CONNECTOR_WORDS)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Test trigram on first review
#print(trigram_mod[bigram_mod[data_words[0]]])

In [188]:
# Build list of stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'stay', 'place', 'location', 'home', 'house', 'host', 'great', 'clean', 'nice',
                  'perfect', 'comfortable', 'room', 'definitely', 'recommend', 'space', 'need', 'us'])

In [189]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#print(data_lemmatized[:1])

In [190]:
# Only use if skipping lemmatization
data_lemmatized = data_words_bigrams.copy()

In [191]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in tqdm(texts)]

# View
#print(corpus[:1])

HBox(children=(FloatProgress(value=0.0, max=5204.0), HTML(value='')))




In [192]:
# How to view a single word within the corpus
id2word[500]

'conversation'

In [193]:
# Human readable format of corpus (term-frequency)
#[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

In [194]:
# Build LDA model
lda_model = LdaMulticore(corpus=corpus,        # Replace with gensim.models.ldamodel.LdaModel()
                       id2word=id2word,
                       num_topics=10, #number of topics to identify
                       random_state=100,
                       #update_every=1,                          #Add back in with LdaModel
                       chunksize=100, #number of documents to pass per chunk
                       passes=10, #number of training passes
                       #alpha='auto',                            #Add back in with LdaModel
                       per_word_topics=True)

In [195]:
# Print the top 10 Keywords in each grouped Topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.012*"would" + 0.011*"building" + 0.007*"parking" + 0.007*"downtown" + '
  '0.007*"night" + 0.007*"good" + 0.007*"walk" + 0.007*"condo" + 0.007*"get" + '
  '0.006*"close"'),
 (1,
  '0.002*"golfers" + 0.001*"aaron" + 0.000*"assignments" + 0.000*"zac" + '
  '0.000*"critiquing" + 0.000*"brunches" + 0.000*"luxurios" + '
  '0.000*"unbothered" + 0.000*"eminties" + 0.000*"respectable"'),
 (2,
  '0.019*"downtown" + 0.016*"us" + 0.015*"would" + 0.012*"quiet" + '
  '0.011*"easy" + 0.009*"bed" + 0.009*"close" + 0.008*"neighborhood" + '
  '0.008*"needed" + 0.008*"time"'),
 (3,
  '0.017*"downtown" + 0.016*"would" + 0.014*"beds" + 0.014*"group" + '
  '0.013*"us" + 0.010*"spacious" + 0.010*"close" + 0.010*"everything" + '
  '0.010*"time" + 0.009*"quick"'),
 (4,
  '0.036*"apartment" + 0.019*"downtown" + 0.017*"everything" + 0.016*"close" + '
  '0.016*"easy" + 0.016*"would" + 0.014*"walk" + 0.011*"quick" + '
  '0.010*"restaurants" + 0.010*"super"'),
 (5,
  '0.018*"cute" + 0.017*"downtown" + 0.

In [196]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score - Likely more helpful. Takes a while to run.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.094146690972579

Coherence Score:  0.31943750522169523


## Visual for Viewing each Topic

In [197]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics=False)
pyLDAvis.save_html(vis, 'lda.html')

### Skipping step 17 about finding the best number of topics - Tim recommends 8-12

## Find dominant Topic in each Review

In [198]:
bow = corpora.Dictionary(data_lemmatized)

In [199]:
#Michael's Original
contents = []

for idx, doc in tqdm(enumerate(data_lemmatized)):
    bow = id2word.doc2bow(doc)
    topics = lda_model.get_document_topics(bow)
    for res in topics:
        topic, pct = res
        contents.append({'index': idx, 'topic': topic, 'percent': pct})

topics = pd.DataFrame(contents)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [219]:
topics.head(10)

Unnamed: 0,index,topic,percent
0,0,2,0.182863
1,0,5,0.011289
2,0,6,0.014829
3,0,7,0.720171
4,0,8,0.037668
5,0,9,0.012545
6,1,0,0.038508
7,1,2,0.071354
8,1,7,0.724228
9,1,8,0.151443


In [235]:
idx = topics.groupby(['topic'])['percent'].transform(max) == topics['percent']
topic_listing = topics[idx]
topic_listing

Unnamed: 0,index,topic,percent
399,66,2,0.97903
14669,2794,8,0.966653
15827,3052,3,0.995849
19559,3939,7,0.959073
21844,4516,5,0.983919
22431,4668,9,0.981243
22687,4731,6,0.918147
23365,4914,1,0.100006
23441,4929,4,0.995752
24058,5078,0,0.997336


In [244]:
idx = topics.groupby(['index'])['percent'].transform(max) == topics['percent']
hot_topics = topics[idx].set_index('index')
hot_topics.head()

Unnamed: 0_level_0,topic,percent
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7,0.720171
1,7,0.724228
2,7,0.347194
3,7,0.612693
4,7,0.572819


In [245]:
hot_topics.value_counts('topic').sort_index()

topic
0     831
1      21
2     558
3    1489
4     685
5     616
6     100
7     205
8     122
9     757
dtype: int64

In [246]:
airbnb_topics = hot_topics.join(airbnb)

In [247]:
airbnb_topics.head()

Unnamed: 0,topic,percent,listing_id,comments_concatenated,name,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,listing_url,description,neighborhood_overview,host_since,host_listings_count,property_type,accommodates,bathrooms_text,bedrooms,beds,amenities,minimum_nights_avg_ntm,maximum_nights_avg_ntm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,zip_code,GEOID,house_price_2021-01-31,house_pct_change,rentals_2021-01-31,rental_price_pct_change,new_restaurants,available_beer,str_permits_2020,str_permits_growth,crimes,total_pop_2010,total_pop_2019,total_pop_change,total_pop_pct_change,pop_over25_2010,pop_over25_2019,pop_over25_change,pop_over25_pcg_change,total_households_2010,total_households_2019,total_households_change,total_households_pct_change,white_pct_2010,white_pct_2019,white_value_change,white_pct_change,bach_pct_2010,bach_pct_2019,bach_value_change,bach_pct_change,rent_pct_2010,rent_pct_2019,rent_value_change,renter_pct_change,median_hhi_2010,median_hhi_2019,median_hhi_value_change,median_hhi_pct_change,poverty_pct_2010,poverty_pct_2019,poverty_value_change,poverty_pct_change,gentrifying
0,7,0.720171,6422.0,I can't say enough about how wonderful it was ...,Nashville Charm,12172.0,36.17315,-86.73581,40.0,30.0,674.0,4.69,1.0,267.0,https://www.airbnb.com/rooms/6422,30 day or more rental during COVID. Show COVID...,Historic East Nashville is home to many new an...,2009-04-03,0.0,Private room in house,2.0,1 private bath,2.0,3.0,"[""Hair dryer"", ""Bathtub"", ""Lock on bedroom doo...",30.0,365.0,99.0,10.0,10.0,10.0,10.0,10.0,10.0,37206.0,47037010000.0,412476.0,38.31,,,1.0,2.0,114.0,114.0,1165.0,2544.0,2100.0,-444.0,-0.174528,1703.0,1639.0,-64.0,-0.037581,1140.0,926.0,-214.0,-0.187719,0.657626,0.940952,0.283327,0.430833,0.408691,0.585723,0.177032,0.43317,0.320175,0.240821,-0.079355,-0.247848,46000.0,91643.0,45643.0,0.992239,10.6,10.2,-0.4,-0.037736,False
1,7,0.724228,25613.0,Nell and Michael made my stay pleasant and eas...,Room in Historic East Nashville Craftsman Home,95811.0,36.17826,-86.74162,75.0,2.0,510.0,3.87,1.0,63.0,https://www.airbnb.com/rooms/25613,Serve up some sweet sounds on the piano and wi...,This home is in a walkable neighborhood with i...,2010-03-18,1.0,Private room in house,2.0,1 private bath,1.0,1.0,"[""Hair dryer"", ""Carbon monoxide alarm"", ""Air c...",2.0,1125.0,97.0,10.0,10.0,10.0,10.0,10.0,10.0,37206.0,47037010000.0,412476.0,38.31,,,3.0,12.0,101.0,101.0,1048.0,2485.0,2525.0,40.0,0.016097,1895.0,1938.0,43.0,0.022691,1234.0,1092.0,-142.0,-0.115073,0.889738,0.930297,0.040559,0.045585,0.472296,0.742002,0.269707,0.571055,0.416532,0.274725,-0.141806,-0.340446,52372.0,116800.0,64428.0,1.230199,21.6,4.1,-17.5,-0.810185,True
2,7,0.347194,136015.0,"Very relaxing, beautiful property, and warm / ...",Apart. on the Ridge near Opryland,666322.0,36.19464,-86.67195,103.0,30.0,50.0,0.43,2.0,190.0,https://www.airbnb.com/rooms/136015,Fantastic View! Overlooks farms and is on the ...,Dead end street with great neighbors!,2011-06-05,4.0,Entire apartment,2.0,1 bath,1.0,1.0,"[""Fire extinguisher"", ""Wifi"", ""Dryer"", ""Washer...",30.0,1125.0,97.0,10.0,10.0,10.0,10.0,10.0,10.0,37214.0,47037020000.0,278031.0,65.43,1581.0,22.18,9.0,112.0,48.0,48.0,4429.0,5411.0,6024.0,613.0,0.113288,4451.0,4970.0,519.0,0.116603,2566.0,2714.0,148.0,0.057677,0.781186,0.760458,-0.020728,-0.026534,0.365311,0.430986,0.065675,0.179778,0.120811,0.179071,0.058261,0.48225,53590.0,68532.0,14942.0,0.278821,3.6,6.9,3.3,0.916667,False
3,7,0.612693,258817.0,Diana is such a gracious hostess. She has a b...,ButterflyRoom-private bath- open Jan 26,22296.0,36.16249,-86.58988,27.0,30.0,96.0,0.86,7.0,364.0,https://www.airbnb.com/rooms/258817,Morningstar House! Extended stay - 3+ months- ...,"I'm very near the Percy Priest Lake, natural ...",2009-06-19,16.0,Private room in house,2.0,1 private bath,1.0,2.0,"[""Hair dryer"", ""Air conditioning"", ""Smoke alar...",30.0,365.0,95.0,10.0,9.0,10.0,10.0,9.0,9.0,37076.0,47037020000.0,285278.0,51.53,,,1.0,4.0,10.0,10.0,3302.0,8025.0,9141.0,1116.0,0.139065,5408.0,6464.0,1056.0,0.195266,3467.0,4045.0,578.0,0.166715,0.828536,0.68242,-0.146116,-0.176354,0.483173,0.464882,-0.018291,-0.037855,0.471878,0.458344,-0.013534,-0.028681,64356.0,72235.0,7879.0,0.122428,3.4,7.1,3.7,1.088235,False
4,7,0.572819,289242.0,I had such a fantastic time at Diana's place. ...,"MorningstarHouse, monthly room- open Feb 2",22296.0,36.16102,-86.59207,24.0,30.0,76.0,0.68,7.0,364.0,https://www.airbnb.com/rooms/289242,Morningstar House! Extended stay - price reduc...,The Morningstar House is in a quiet cul-de-sac...,2009-06-19,16.0,Private room in house,1.0,1 shared bath,1.0,3.0,"[""Hair dryer"", ""Air conditioning"", ""Smoke alar...",30.0,365.0,94.0,9.0,8.0,10.0,9.0,9.0,9.0,37076.0,47037020000.0,285278.0,51.53,,,1.0,4.0,10.0,10.0,3302.0,8025.0,9141.0,1116.0,0.139065,5408.0,6464.0,1056.0,0.195266,3467.0,4045.0,578.0,0.166715,0.828536,0.68242,-0.146116,-0.176354,0.483173,0.464882,-0.018291,-0.037855,0.471878,0.458344,-0.013534,-0.028681,64356.0,72235.0,7879.0,0.122428,3.4,7.1,3.7,1.088235,False


In [250]:
airbnb_topics_df = airbnb_topics[['listing_id', 'topic']]

In [252]:
#airbnb_topics_df.to_csv('../data/airbnb_topics_df.csv')

In [248]:
airbnb_topics_negative = airbnb_topics[airbnb_topics['topic']==1]

In [249]:
airbnb_topics_negative.value_counts('gentrifying')

gentrifying
False    17
True      4
dtype: int64

In [70]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

TypeError: '<' not supported between instances of 'int' and 'tuple'

## Find most representative Reviews for each Topic

In [None]:
for i in range(6):
    for j in range(5):

        file = topics[topics.topic == i].sort_values('percent', ascending = False).head().iloc[j, 0]

        cmd = f'cp {file} representative_docs/topic_{i + 1}/.'

        os.system(cmd)

## Sample from Housing Project (this isn't working)

In [None]:
# This is just a previous code, for safekeeping.
# Create a tokenizing function that takes text and removes all numbers
#def tokenizer(text):
#    return [x for x in re.findall(r'[a-z]+', text.lower()) if len(x) > 1]

In [None]:
airbnb_train.head(1)

In [None]:
# using a list comprehension, iterate over the four permit type contents and vectorize them using the TfidfVectorizer
vectorizer_test1 = TfidfVectorizer(
    tokenizer = tokenizer, 
    stop_words = 'english', 
    #min_df=50, 
    #max_df=0.4, 
    #ngram_range=(1,3)
).fit(airbnb_train)

In [None]:
# Check to see the shape of the first sparse matrix generated
vectorizer_test1.transform(airbnb_train)

## Sample from Gensim Blog
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [None]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [None]:
test_comment = airbnb.comments_concatenated.iloc[2]

In [None]:
print('original document: ')
words = []
for word in test_comment.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(test_comment))

## Try again from Michael's Notebook

In [None]:
docs = airbnb.copy()['comments_concatenated'].iloc[0:3]

In [None]:
docs[0]

In [None]:
# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in tqdm(range(len(docs))):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 2] for doc in docs]

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

## Rest of Michael's walkthrough below

In [None]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [None]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [None]:
# Set training parameters.
num_topics = 6
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)