In [1]:
import numpy as np
import pandas as pd
import emot

from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import pipeline
from scipy.special import softmax
from bertopic import BERTopic

# Column display limit. Per the pandas options docs, 0 asks pandas to auto-detect the terminal width.
# NOTE(review): a notebook is not a terminal, so auto-detection may not apply here — confirm this
# gives the intended display (None means "unlimited").
pd.set_option('display.max_columns', 0)

  from .autonotebook import tqdm as notebook_tqdm


## Topic Modelling & Sentiment Analysis

In this notebook, we will investigate some of the common topics that are mentioned in user comments.

### Import Dataset

We first import the datasets into the notebook

In [2]:
# Load the processed listings, calendar and reviews tables, parsing their date columns on read
df_listings = pd.read_csv(
    '../data/processed/listings_processed.csv',
    parse_dates = ['host_since', 'first_review', 'last_review'],
)
df_calendar = pd.read_csv(
    '../data/processed/calendar_processed.csv',
    parse_dates = ['date'],
)
df_reviews = pd.read_csv(
    '../data/processed/reviews_processed.csv',
    parse_dates = ['date'],
)

In [3]:
# Initial check: display the reviews frame (pandas abbreviates the repr to the first and last rows)
df_reviews

Unnamed: 0,listing_id,date,reviewer_id,reviewer_name,comments
0,18270,2011-03-17,184985,Matthew,"Great place, great location, great host."
1,13188,2010-02-21,34595,Rebecca,"We had a wonderful time! The place is cozy, wa..."
2,13188,2010-08-27,199181,Lillian,What a fabulous home away from home! The apar...
3,18270,2011-04-06,99094,Jessica,I had a lovely and comfortable stay at Ran's....
4,18270,2011-05-02,434670,Catherine,"Nice room, great location and friendly host!"
...,...,...,...,...,...
272596,1167658394866031187,2024-06-01,75090412,Georgina,"Loved the place. Easy to check-in, Driana was ..."
272597,1167658394866031187,2024-06-08,115858985,Jason,Driana and her husband were the ideal hosts! ...
272598,1168245100183046449,2024-06-11,62180956,Nathaniel,Javier was extremely hospitable to my family. ...
272599,1168449186334658361,2024-06-09,42273592,Agi,Wir waren 2 Nächte in das Haus. Wir haben uns ...


In [4]:
# Summary statistics restricted to object-dtype (text) columns: count, unique, most frequent value and its frequency
df_reviews.describe(include = ['O'])

Unnamed: 0,reviewer_name,comments
count,272600,272560
unique,43121,265705
top,David,.
freq,2053,244


In [5]:
# Count the null values in each column
df_reviews.isna().sum()

listing_id        0
date              0
reviewer_id       0
reviewer_name     1
comments         41
dtype: int64

### Initial Remarks

We first note the following remarks
- Some comments are in different languages 

- Some entries have no comments

- There can be multiple comments for a listing


### Processing Null Entries

We need to handle some of the null comments before diving further into our analysis.

Since comments that are NaN do not have any meaning, we will drop them from our dataset.

In [6]:
# Keep only the rows that carry a comment, renumber the index, then re-check the null counts
df_reviews = df_reviews[df_reviews['comments'].notna()].reset_index(drop = True)
df_reviews.isna().sum()

listing_id       0
date             0
reviewer_id      0
reviewer_name    1
comments         0
dtype: int64

For the lone NaN entry in `reviewer_name`, we will just fill the value with `unknown`

In [7]:
# Substitute the placeholder 'unknown' wherever reviewer_name is missing, then re-check the null counts
df_reviews['reviewer_name'] = df_reviews['reviewer_name'].fillna('unknown')
df_reviews.isna().sum()

listing_id       0
date             0
reviewer_id      0
reviewer_name    0
comments         0
dtype: int64

### Convert to Lowercase

We first normalize our text by converting the comments into lowercase

In [8]:
# Normalise both free-text columns to lowercase
for text_col in ('comments', 'reviewer_name'):
    df_reviews.loc[:, text_col] = df_reviews[text_col].str.lower()
df_reviews

Unnamed: 0,listing_id,date,reviewer_id,reviewer_name,comments
0,18270,2011-03-17,184985,matthew,"great place, great location, great host."
1,13188,2010-02-21,34595,rebecca,"we had a wonderful time! the place is cozy, wa..."
2,13188,2010-08-27,199181,lillian,what a fabulous home away from home! the apar...
3,18270,2011-04-06,99094,jessica,i had a lovely and comfortable stay at ran's....
4,18270,2011-05-02,434670,catherine,"nice room, great location and friendly host!"
...,...,...,...,...,...
272555,1167658394866031187,2024-06-01,75090412,georgina,"loved the place. easy to check-in, driana was ..."
272556,1167658394866031187,2024-06-08,115858985,jason,driana and her husband were the ideal hosts! ...
272557,1168245100183046449,2024-06-11,62180956,nathaniel,javier was extremely hospitable to my family. ...
272558,1168449186334658361,2024-06-09,42273592,agi,wir waren 2 nächte in das haus. wir haben uns ...


### Translating Comments from Different Languages

We note that some of the comments are in different languages. Since these comments can carry significant meaning, we want to keep as much information as possible in our dataset.

What we could do to extract information from these comments is to translate them to English using a language model, noting that some information can be lost or mistranslated in the process.

We can do so via some of the Huggingface models, which can be imported via the `transformers` library

In [62]:
# NOTE: translation is currently disabled — the draft below is kept commented out for reference.
# It refers to LanguageDetectorBuilder, Language and GoogleTranslator, none of which appear in
# the notebook's import cell; they would have to be imported before this can be re-enabled.
# Intended behaviour of the draft: detect the language of a string, return it unchanged when no
# language is detected or it is English, otherwise return its English translation.
# # Translator class
# class Translator:

#     def __init__(self) -> None:
#         self.lang_detector = LanguageDetectorBuilder.from_all_spoken_languages().build()
#         self.lang_translator = GoogleTranslator(source = 'auto', target = 'en')

#     def translate(self, s):
#         lang = self.lang_detector.detect_language_of(s)
#         if not lang or lang == Language.ENGLISH: return s
#         translation = self.lang_translator.translate(s)
#         return translation
    
#     def translate_batch(self, lst):
#         return [self.translate(item) for item in lst]

### Handling Emoticons

We note that some of our comments have emoticons such as `:-)` and `:)`. 

Since these emoticons will be removed when we strip punctuation from the comments, we need a way to preserve their meaning in the text.

One way to do that is to translate the emoticon into plain text, and perform sentiment analysis on that

In [9]:
# Replace emoticons with meaning
def replace_emoticons(s, emot_obj = None):
    """Replace every emoticon found in ``s`` with its lower-cased textual meaning.

    Parameters
    ----------
    s : str
        The text to process.
    emot_obj : object, optional
        Detector exposing ``emoticons(text)`` that returns a dict with the keys
        ``'flag'``, ``'value'`` (the emoticons found) and ``'mean'`` (their meanings).
        When omitted, an ``emot.core.emot()`` instance is created on first use and
        reused by later calls.

    Returns
    -------
    str
        ``s`` with all detected emoticons substituted, or ``s`` unchanged when the
        detector reports none.
    """
    if emot_obj is None:
        # Build the detector once and keep it on the function instead of once per call
        emot_obj = getattr(replace_emoticons, '_detector', None)
        if emot_obj is None:
            emot_obj = replace_emoticons._detector = emot.core.emot()

    res = emot_obj.emoticons(s)
    if not res['flag']:
        return s

    # Accumulate the substitutions on one string: replacing on the untouched input
    # each time would keep only the last emoticon's replacement.
    new = s
    for emoticon, meaning in zip(res['value'], res['mean']):
        new = new.replace(emoticon, meaning.lower())
    return new

In [13]:
# Preview the replacement on a few comments that contain a ':)' smiley, rather than building and
# rendering a Python list for every review (the full column is processed in the next cell anyway)
df_reviews.loc[df_reviews['comments'].str.contains(':)', regex = False, na = False), 'comments'].head(10).apply(replace_emoticons)

In [91]:
# Rewrite every comment with its emoticons spelled out as text
df_reviews.loc[:, 'comments'] = df_reviews['comments'].map(replace_emoticons)

: 

: 

### Remove Punctuations 

We will remove all "neutral" punctuations in the comments, except for the following
- `'` which represents a shorthand for some stopwords

- `-` which connects different terms together

We note that, for the exceptions above, the punctuation sits between two letters, so we can also remove any instance that is not enclosed between two letters.

In [60]:
# Remove punctuations
# Step 1: turn every "neutral" punctuation mark into a space. The apostrophe and the hyphen are
# deliberately absent from the character class; they are handled in step 2.
# The class opens with a single '[' — a doubled '[[' makes `re` emit a "possible nested set" FutureWarning.
df_reviews.loc[:, 'comments'] = df_reviews['comments'].str.replace(pat = r'[!"#$%&()*+,.\/:;<=>?@[\\\]^_`{|}~]', repl = ' ', regex = True)
# Step 2: drop runs of ' or - that are not enclosed between two letters. [^\W\d_] matches any
# Unicode letter, so apostrophes/hyphens next to accented letters in non-English comments are kept
# (an ASCII-only [a-zA-Z] test would strip them).
df_reviews.loc[:, 'comments'] = df_reviews['comments'].str.replace(pat = r"(?<![^\W\d_])[-']+|[-']+(?![^\W\d_])", repl = ' ', regex = True)
df_reviews

Unnamed: 0,listing_id,date,reviewer_id,reviewer_name,comments
0,18270,2011-03-17,184985,matthew,great place great location great host
1,13188,2010-02-21,34595,rebecca,we had a wonderful time the place is cozy wa...
2,13188,2010-08-27,199181,lillian,what a fabulous home away from home the apar...
3,18270,2011-04-06,99094,jessica,i had a lovely and comfortable stay at ran's ...
4,18270,2011-05-02,434670,catherine,nice room great location and friendly host
...,...,...,...,...,...
272555,1167658394866031187,2024-06-01,75090412,georgina,loved the place easy to check-in driana was ...
272556,1167658394866031187,2024-06-08,115858985,jason,driana and her husband were the ideal hosts ...
272557,1168245100183046449,2024-06-11,62180956,nathaniel,javier was extremely hospitable to my family ...
272558,1168449186334658361,2024-06-09,42273592,agi,wir waren 2 nächte in das haus wir haben uns ...


In [64]:
# Show the 20 comments that sort first, to inspect what the punctuation removal left behind
df_reviews['comments'].sort_values().head(20)

213797     
35275      
124384     
231809     
128763     
53199      
69219      
155278     
232224     
216260     
67667      
259638     
150044     
174123     
102610     
102606     
271067     
165829     
165836     
90203      
Name: comments, dtype: object

### Tokenize Comments

To conduct sentiment analysis, we want to first tokenize the comments. 

Note that since a comment can contain multiple sentences, we want to analyze the sentiments sentence-wise, and so we will perform the following steps

- Tokenize comments into different sentences

- For each comment, tokenize each sentence into words

In [None]:
# Split each comment into a list of word tokens (word_tokenize is applied to the whole comment)
df_reviews['comments_tokenized'] = df_reviews['comments'].map(word_tokenize)
df_reviews['comments_tokenized']

### Processing Host Names

We note that the host names mentioned in the review do not have significant impact on the comments. 

Therefore, we will try to replace all instances of the host's name with `host` to reduce the dimension of our word space

We can do so by

- Taking the host name associated with `listing_id`

- Search for host name in the comments and replace the name with `host`

In [None]:
# Scratch check: list the comments in sorted order
df_reviews['comments'].sort_values()

In [15]:
# NOTE: host-name replacement is disabled — the draft below is kept commented out for reference.
# NOTE(review), before re-enabling:
#   (1) the inner merge drops every review whose listing_id has no match in df_listings;
#   (2) str.replace does plain substring matching, so a short host name is also replaced
#       when it occurs inside a longer word;
#   (3) the last line refers to a column 'comm' — presumably a typo for 'comments'; confirm.
# # Merge host name to df_reviews by listing_id
# df_listings[['listing_id',  'host_name']]
# df_reviews = df_reviews.merge(df_listings[['listing_id', 'host_name']], on = 'listing_id', how = 'inner')
# df_reviews

# # Lowercase host_name and replace instance in comments with 'host'
# df_reviews['host_name'] = df_reviews['host_name'].str.lower()
# df_reviews['comments'] = df_reviews.apply(lambda row : str(row['comments']).replace(row['host_name'], 'host'), axis = 1)
# df_reviews['comm']

In [None]:
# Scratch check: list the comments in sorted order again. The host-name replacement cell above is
# fully commented out, so it has not modified the comments.
df_reviews['comments'].sort_values()

### Remove Stopwords

We will next remove all of the stopwords from the comments

In [None]:
# Stopwords
def remove_stopwords(tokens, stopwords):
    """Return, in their original order, the tokens that are not members of ``stopwords``.

    ``tokens`` is any iterable of tokens; ``stopwords`` is any container supporting ``in``.
    """
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# Build the stop-word lookup (the NLTK list plus the possessive "'s" token) and filter every token list.
# NOTE(review): stopwords.words() is called without a language argument — confirm that the list it
# returns is the intended one for these multilingual comments.
stopwords_set = {*stopwords.words(), "'s"}
df_reviews.loc[:, 'comments_tokenized'] = df_reviews['comments_tokenized'].apply(
    lambda tokens: remove_stopwords(tokens, stopwords_set)
)
df_reviews['comments_tokenized']

### Join Tokens into Sentences

Having done our required text preprocessing, we will join the tokens together to get a corpus for further analysis

In [None]:
# Join tokens

def join_tokens(tokens):
    """Concatenate ``tokens`` into a single string, separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

# Rebuild one cleaned string per review from its filtered tokens
df_reviews['comments_cleaned'] = df_reviews['comments_tokenized'].map(join_tokens)
df_reviews['comments_cleaned']

### Get Sentiments

We can now perform sentiment analysis on the comments

In [104]:
# sentiment function
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

def get_sentiment(text, model, config, tokenizer, max_length = 512):
    """Return the sentiment label that ``model`` assigns to ``text``.

    Parameters
    ----------
    text : str
        The text to classify.
    model : callable
        Invoked as ``model(**encoded)``; the scores are read from ``output[0][0]``.
    config : object
        Must expose ``id2label``, mapping a class index to its label.
    tokenizer : callable
        Invoked as ``tokenizer(text, return_tensors='pt', ...)``.
    max_length : int, default 512
        Token cap passed to the tokenizer together with ``truncation=True``, so that
        long reviews are shortened instead of failing inside the model.
        NOTE(review): 512 is assumed to match the model's maximum input length — confirm.

    Returns
    -------
    The label from ``config.id2label`` with the highest score, or ``None`` when
    ``text`` is not a non-blank string or when inference fails. (Returning the raw
    text on failure would mix review text into the label column.)
    """
    # Blank or non-string input carries nothing to classify
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        encoded = tokenizer(text, return_tensors = 'pt', padding = True, truncation = True, max_length = max_length)
        output = model(**encoded)
        scores = softmax(output[0][0].detach().numpy())
        return config.id2label[int(np.argmax(scores))]
    except Exception as exc:
        # Best effort: report the failure and keep going, without swallowing
        # KeyboardInterrupt/SystemExit the way a bare ``except`` would.
        import warnings
        warnings.warn(f'Sentiment inference failed ({exc!r}) for text starting with {text[:80]!r}')
        return None

def sentiment_analysis(series):
    """Classify every element of ``series`` with the model named by ``MODEL_NAME``.

    The tokenizer, config and model are loaded once per call, and ``get_sentiment``
    is then applied to each element. Returns the resulting Series.
    """
    # Load the three pretrained components for MODEL_NAME
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    config = AutoConfig.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

    def _label(text):
        return get_sentiment(text, model = model, config = config, tokenizer = tokenizer)

    return series.apply(_label)

In [None]:
# Run the sentiment model over the cleaned comments. The result is only displayed here;
# it is not assigned to a variable or stored as a column.
sentiment_analysis(df_reviews['comments_cleaned'])