# Topic Modeling

Topic modeling on Denver Airbnb review data through 2018.

### Import Data and Clean

In [None]:
# Colab-only: mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

# Load the raw reviews export from the mounted Drive folder; later cells rely on
# its `date` and `comments` columns.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/reviews.csv')
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,923880,28484508,2015-03-24,11595335,Lorraine,Jasmin was the perfect hostess.\r<br/>The flat...
1,11156,19220,2009-12-05,52946,Jeff,Colleen was friendly and very helpful regardin...
2,11156,32592,2010-03-31,99382,Michael,"Great place, centrally located, easy walk to t..."
3,11156,42280,2010-05-14,105797,Marina,Colleen is very friendly and helpful. The apar...
4,11156,140942,2010-11-17,259213,Sigrid,Dear Colleen!\r<br/>My friend Diemut from Germ...


In [None]:
# Count missing values in each column.
df.isna().sum()

listing_id        0
id                0
date              0
reviewer_id       0
reviewer_name     1
comments         47
dtype: int64

In [None]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Collect the index labels of reviews whose text consists only of whitespace.
# The check reads the `comments` column by name, so it keeps working when columns
# are added, removed or reordered (positional tuple unpacking of itertuples() raises
# as soon as the number of columns changes).
is_blank = df['comments'].map(
    lambda rv: isinstance(rv, str) and rv.isspace()  # non-strings (e.g. NaN) are never blank
).astype(bool)  # keep a boolean mask even when the frame is empty
blanks = df.index[is_blank.to_numpy()].tolist()

print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


In [None]:
# Remove the whitespace-only reviews found above (row labels), modifying df in place.
df.drop(index=blanks, inplace=True)

In [None]:
#split into training and testing sets
df['date'] = pd.to_datetime(df['date'])

# First calendar year held out for testing; everything before it is training data.
# The previous filters (`== 2022` for train, `>= 2022` for test) put every training
# review into the test set as well, so the two sets were not independent.
TEST_YEAR = 2022

review_year = df['date'].dt.year
df_train = df[review_year < TEST_YEAR].copy()
df_test = df[review_year >= TEST_YEAR].copy()

# Guard against the split overlapping again if the filters are edited later.
assert df_train.index.intersection(df_test.index).empty, "train/test sets overlap"

In [None]:
# Report how many reviews landed in each split.
for label, frame in (("training data size: ", df_train), ("testing data size: ", df_test)):
    print(label, len(frame))

training data size:  47999
testing data size:  47999


### NLP Pre-processing

In [None]:
import spacy
# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')

# Run every training comment through the pipeline and keep the parsed documents.
texts_train = list(df_train['comments'])
spacy_docs = [parsed for parsed in nlp.pipe(texts_train)]

In [None]:
#lemmatize words and remove stop words
def _is_content_token(token):
    """Return True for tokens longer than two characters that are not stop words."""
    return len(token.orth_) > 2 and not token.is_stop

docs = [
    [token.lemma_.lower() for token in parsed if _is_content_token(token)]
    for parsed in spacy_docs
]

In [None]:
#include bi-grams
import re
from gensim.models import Phrases

# Fit the phrase model on the lemmatized training documents.
bigram = Phrases(docs, min_count=10)
tokens = []

for doc_tokens in docs:
    # Detected bigrams are the tokens joined with "_". Collect them first, then add
    # them to the document itself and to the running list of all bigrams found.
    joined = [candidate for candidate in bigram[doc_tokens] if '_' in candidate]
    doc_tokens.extend(joined)
    tokens.extend(joined)



In [None]:
#create dictionary
from gensim.corpora import Dictionary

# Build the token vocabulary from the training documents.
dictionary = Dictionary(docs)
print('Number of unique words in original documents:', len(dictionary))

# Prune the vocabulary in place. Per the gensim docs, no_below=5 drops tokens that
# appear in fewer than 5 documents and no_above=0.1 drops tokens that appear in
# more than 10% of documents.
dictionary.filter_extremes(no_below=5, no_above=0.1)
print('Number of unique words after removing rare and common words:', len(dictionary))

# docs[2] is the third document (0-based indexing).
print("Example representation of document 3:", dictionary.doc2bow(docs[2]))

Number of unique words in original documents: 29386
Number of unique words after removing rare and common words: 6515
Example representation of document 3: [(40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2)]


In [None]:
#create corpus
# One bag-of-words representation per training document, in the same order as `docs`.
corpus = list(map(dictionary.doc2bow, docs))

### Run LDA model

In [None]:
#run LDA model
from gensim.models import LdaModel

# Training settings gathered in one place so they are easy to tune.
lda_settings = dict(
    num_topics=10,  # number of topics
    chunksize=500,
    passes=3,
    random_state=617,
)
model = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

#print top words in each topic
for topic_id, top_words in model.print_topics():
    print(topic_id, ":", top_words, '\n')

0 : 0.021*"beautiful" + 0.017*"quiet" + 0.016*"private" + 0.013*"love" + 0.012*"day" + 0.012*"pool" + 0.012*"area" + 0.011*"relax" + 0.011*"peaceful" + 0.011*"incredible" 

1 : 0.040*"helpful" + 0.036*"communication" + 0.030*"friendly" + 0.027*"thank" + 0.026*"check" + 0.024*"definitely" + 0.020*"responsive" + 0.020*"super" + 0.019*"quick" + 0.017*"respond" 

2 : 0.026*"room" + 0.024*"kitchen" + 0.020*"space" + 0.019*"tidy" + 0.018*"transport" + 0.016*"area" + 0.015*"public" + 0.014*"bedroom" + 0.013*"clean_tidy" + 0.013*"unit" 

3 : 0.027*"sydney" + 0.027*"restaurant" + 0.025*"station" + 0.022*"minute" + 0.022*"train" + 0.021*"shop" + 0.019*"city" + 0.016*"cafe" + 0.015*"away" + 0.015*"bus" 

4 : 0.058*"highly" + 0.048*"highly_recommend" + 0.044*"amazing" + 0.042*"beautiful" + 0.038*"house" + 0.027*"fantastic" + 0.022*"excellent" + 0.022*"wonderful" + 0.021*"communication" + 0.020*"enjoy" 

5 : 0.044*"check" + 0.041*"bed" + 0.023*"comfy" + 0.016*"communicate" + 0.011*"night" + 0.011*"

#### Topics:
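The topics look like the list below. Labels were chosen by reading each topic's top words; only five of the ten fitted topics are labelled here, and the labels should be re-checked against the printed topics whenever the model is refit: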

1. (0) Host Communication
2. (1) Recommend to others
3. (2) Good Feelings
4. (3) Location
5. (4) Accommodations

In [None]:
#Add topic number to original df
MISSING_CONFIDENCE = -1  # stored when the model's output leaves a topic out for a document


def _score_documents(lda_model, lda_dictionary, token_docs, missing=MISSING_CONFIDENCE):
    """Return the dominant topic and the per-topic confidences for each document.

    Args:
        lda_model: Fitted LDA model; indexing it with a bag-of-words vector yields
            (topic_id, probability) pairs.
        lda_dictionary: Dictionary used to turn a token list into a bag-of-words vector.
        token_docs: Iterable of token lists, one per document.
        missing: Value recorded for topics that the model does not report for a document.

    Returns:
        (dominant, per_topic): `dominant` is a list with one topic id per document
        (as a float, matching the previously saved CSV format); `per_topic` maps every
        topic id in range(lda_model.num_topics) to a list of confidences, one per document.
    """
    n_topics = lda_model.num_topics
    dominant = []
    per_topic = {topic_id: [] for topic_id in range(n_topics)}

    for doc_tokens in token_docs:
        # The model is queried exactly once per document, in order. gensim leaves out
        # topics whose probability falls below the model's minimum_probability, which
        # is why a sentinel is needed.
        dist = {int(topic_id): float(prob)
                for topic_id, prob in lda_model[lda_dictionary.doc2bow(doc_tokens)]}

        if dist:
            # Highest probability wins; ties go to the larger topic id.
            best_topic = max(dist.items(), key=lambda pair: (pair[1], pair[0]))[0]
        else:
            best_topic = missing
        dominant.append(float(best_topic))

        for topic_id in range(n_topics):
            per_topic[topic_id].append(dist.get(topic_id, missing))

    return dominant, per_topic


topic_nums, topic_confidence = _score_documents(model, dictionary, docs)

df_train['topic'] = topic_nums
# Column order is kept as topics 1..N-1 followed by topic 0, so files written from
# this frame keep the same layout as before.
for topic_id in list(range(1, model.num_topics)) + [0]:
    df_train['topic{}confidence'.format(topic_id)] = topic_confidence[topic_id]

In [None]:
# Preview the first five labelled training rows.
df_train.head(5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,topic,topic1confidence,topic2confidence,topic3confidence,topic4confidence,topic5confidence,topic6confidence,topic7confidence,topic8confidence,topic9confidence,topic0confidence
46,923880,556793261850234844,2022-02-07,95481448,Caitlin,Jasmin’s apartment was a beautiful place to st...,8.0,0.189672,0.185179,0.010004,0.157591,0.010007,0.01,0.010005,0.407538,0.010002,0.010002
47,923880,580705049161873973,2022-03-12,13940471,Bianca,"Thanks to Jasmin for the impeccable, comfy and...",8.0,0.07331,0.048643,0.109167,-1.0,0.070282,-1.0,-1.0,0.520549,0.072043,0.097182
48,923880,603915027858519984,2022-04-13,67605513,Karoline,We had a wonderful stay at Jasmin's place! It ...,4.0,0.101424,0.108639,-1.0,0.356611,-1.0,-1.0,0.152313,0.245294,-1.0,-1.0
62,752978,531430309149660981,2022-01-03,2366718,Vanessa,Tracey’s lovely house was perfect for a week a...,5.0,-1.0,-1.0,0.122689,0.219793,0.269743,-1.0,-1.0,0.175416,0.064602,0.131086
263,934543,539349240023165073,2022-01-13,77414976,Nelson,Great location next to a local cafe and short ...,8.0,0.110669,0.276696,0.010003,0.119041,-1.0,-1.0,-1.0,0.433589,0.01,0.010002


pandas.core.frame.DataFrame

In [None]:
#save df as csv

# Run once the number of topics is finalized; writes the labelled training reviews
# to 'topic_model_2022.csv' (a relative path).
df_train.to_csv('topic_model_2022.csv')

# Classification

I want to predict the topic of reviews left in 2019 in Denver, trained on the reviews through 2018.


In [None]:
import numpy as np
import pandas as pd

# Reload labelled reviews for the classification step; this rebinds `df_train`.
# NOTE(review): the topic-modeling section writes 'topic_model_2022.csv', but this
# cell reads 'inter.csv' — confirm which file is the intended input.
df_train = pd.read_csv('inter.csv')

In [None]:
# Preview the first five rows of the reloaded frame.
df_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,topic
0,61,752978,734922531,2021-02-27,672738,Harriet,Amazing location and a super comfortable house...,4.0
1,128,934398,762678899,2021-05-25,43301148,Seunghye,Good location. close to the station. It was a ...,2.0
2,252,934543,729324743,2021-02-06,30188327,Jane,"Fabulous location, comfortable and well appoin...",3.0
3,253,934543,736870932,2021-03-06,3091628,Hope,"We had a wonderful weekend away, Linda was so ...",2.0
4,254,934543,742846221,2021-03-27,84849356,Karen,Lovely accommodation in a great location. Clea...,4.0


In [None]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

# The bare name `spacy` is needed for spacy.load below. The `from spacy... import`
# lines above do not bind it, so import it here to keep this section runnable on a
# fresh kernel without relying on an import from an earlier section.
import spacy

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = STOP_WORDS  # imported above from spacy.lang.en.stop_words

# Blank English pipeline (tokenizer only; no tagger, parser or NER components)
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize one review into lower-cased lemmas for the TF-IDF vectorizer.

    Args:
        sentence: Text of a single review.

    Returns:
        list[str]: Lower-cased lemma of every token that is longer than two
        characters and is not a stop word, in document order.
    """
    # nlp(...) returns ONE parsed document; iterating over it yields its tokens.
    # The previous version treated each token as a document and tried to iterate
    # over it, which fails because a spaCy token is not iterable.
    parsed = nlp(sentence)

    # Same token filter as the topic-modeling preprocessing.
    return [t.lemma_.lower() for t in parsed if len(t.orth_) > 2 and not t.is_stop]


# Custom text-cleaning transformer
class predictors(TransformerMixin):
    """Pipeline step that applies clean_text to every incoming document."""

    def get_params(self, deep=True):
        # This step exposes no parameters.
        return dict()

    def fit(self, X, y=None, **fit_params):
        # Nothing is learned from the data; return the instance unchanged.
        return self

    def transform(self, X, **transform_params):
        # Clean each document, preserving order.
        return list(map(clean_text, X))

# Basic function to clean the text
def clean_text(text):
    """Trim leading/trailing whitespace from `text` and lower-case the result."""
    trimmed = text.strip()
    return trimmed.lower()

#assign TF_IDF vectorizer
# Documents are split into terms by spacy_tokenizer (defined above) rather than by
# the vectorizer's built-in tokenization.
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; labels are the values of the `topic` column.
X = df_train['comments']
ylabels = df_train['topic']

# Hold out 30% of the labelled reviews for evaluation (fixed seed for repeatability).
split = train_test_split(X, ylabels, test_size=0.3, random_state=617)
X_train, X_test, y_train, y_test = split

In [None]:
# Keep the full labelled set and the unlabelled df_test reviews under their own names.
# Rebinding X_train / y_train / X_test here discarded the train_test_split result
# while leaving y_test untouched, so the accuracy cell would compare predictions for
# df_test against labels that belong to different reviews.
X_all = df_train['comments']
y_all = df_train['topic']
X_new = df_test['comments']

In [None]:
# Number of missing comments in the training features.
X_train.isna().sum()

0

### SVM

In [None]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): the section heading calls this model an SVM, but the estimator built
# here is a random forest — confirm which one is intended.
# random_state pins the forest's randomness so reruns produce the same model
# (617 is the seed already used for the LDA model and train_test_split).
classifier = RandomForestClassifier(n_estimators = 1000,
                                    max_depth = 5,
                                    random_state = 617)

# Pipeline: clean text -> TF-IDF features -> classifier
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

ValueError: ignored

In [None]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
# Label the score with the estimator that actually sits in the pipeline; the
# hard-coded "SVM" label did not match the classifier the pipeline was built with.
model_name = type(pipe.named_steps['classifier']).__name__
print(model_name, "Accuracy:", metrics.accuracy_score(y_test, predicted))

SVM Accuracy: 0.8547089695914761


In [None]:
#test on new data
# Draw one review from the test features by position (iloc), so the lookup does not
# depend on the index labels and the example is not taken from df_train.
x = np.random.randint(0, len(X_test))
sample_pred = X_test.iloc[x]
print(sample_pred)

# The pipeline expects an iterable of documents. Passing the bare string made the
# cleaning step iterate over its characters, so the two "guesses" were really the
# predictions for the first two characters. Wrap the text in a list and rank the
# classes by their score for this single review instead.
if hasattr(pipe, 'predict_proba'):
    scores = pipe.predict_proba([sample_pred])[0]
else:
    # Fallback for classifiers without probability estimates (e.g. some SVMs).
    scores = pipe.decision_function([sample_pred])[0]
ranked_topics = pipe.named_steps['classifier'].classes_[np.argsort(scores)[::-1]]

print("First guess:", ranked_topics[0])
print("Second guess:", ranked_topics[1])

Very warm & welcoming from the moment we got there. Equipt with all basics essentials one would need.
First guess: 4.0
Second guess: 2.0


We get much better accuracy with the SVM model.

In this random example, we see that the model predicted topic 4 (accommodation). We can also ask to see the next most likely topic.