In [None]:
import re
import string
import warnings

import gensim
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
# Fetch the NLTK corpora used by the WordNet lemmatizer, then load the spaCy
# English pipeline whose stop-word list is used when cleaning reviews.
for corpus_name in ('omw-1.4', 'wordnet'):
    nltk.download(corpus_name)
en = spacy.load('en_core_web_sm')

from langdetect import detect, detect_langs
from deep_translator import GoogleTranslator
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora
from gensim.models import CoherenceModel
from wordcloud import WordCloud, STOPWORDS


In [None]:
# Load the gzipped review dump.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its replacement: malformed rows are skipped and a
# warning is emitted for each, which is what the old flag did by default.
review_data = pd.read_csv('reviews.csv.gz', compression='gzip',
                          on_bad_lines='warn')

Translating reviews

In [None]:
#detect languages of each review
# Cast every comment to str first so the language detector only ever sees text.
# NOTE(review): missing comments presumably become the literal string 'nan'
# after this cast rather than staying NaN; confirm that is acceptable.
comments_as_text = review_data['comments'].astype(str)
review_data['comments'] = comments_as_text
def det(x):
    """Return the language code `detect` assigns to `x`, or 'Other' on failure.

    Parameters
    ----------
    x : str
        Review text to classify.

    Returns
    -------
    str
        Whatever `detect(x)` returns, or the sentinel 'Other' when `detect`
        raises any ordinary exception.
    """
    try:
        return detect(x)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running apply
        # instead of being recorded as an 'Other' language.
        return 'Other'

# Run the detector over every comment and store the result next to it.
detected_languages = review_data['comments'].apply(det)
review_data['Lang'] = detected_languages

In [None]:
#translate reviews not in English
def translation(rev, lan):
    """Return `rev` in English, translating it when `lan` is not 'en'.

    Parameters
    ----------
    rev : str
        Review text.
    lan : str
        Language label for `rev`; anything other than 'en' triggers a
        translation request.

    Returns
    -------
    str or float
        `rev` unchanged when `lan == 'en'`; otherwise the result of
        `GoogleTranslator(...).translate(rev)`, or NaN when the translation
        call raises.
    """
    if lan == 'en':
        return rev
    try:
        return GoogleTranslator(source='auto', target='en').translate(rev)
    except Exception:
        # Mark failed translations as missing so a later dropna() removes them.
        # Catch Exception (not a bare except) so the long-running apply can
        # still be interrupted from the keyboard.
        return np.nan
                
def _translate_row(row):
    """Translate one review row using its comment text and detected language."""
    return translation(row['comments'], row['Lang'])

review_data['comments_translated'] = review_data.apply(_translate_row, axis=1)

In [None]:
#drop NA's and reset index
# Remove every row with a missing value in any column, then renumber the rows
# from 0 so positional joins later in the notebook line up.
review_data = review_data.dropna().reset_index(drop=True)

Sentiment scores

In [None]:
#obtain sentiment scores for all reviews
analyser = SentimentIntensityAnalyzer()


def pos_score(x):
    """Return the 'pos' entry of the analyser's polarity scores for text `x`."""
    return analyser.polarity_scores(x)['pos']


def neg_score(x):
    """Return the 'neg' entry of the analyser's polarity scores for text `x`."""
    return analyser.polarity_scores(x)['neg']


def neu_score(x):
    """Return the 'neu' entry of the analyser's polarity scores for text `x`."""
    return analyser.polarity_scores(x)['neu']


def comp_score(x):
    """Return the 'compound' entry of the analyser's polarity scores for text `x`."""
    return analyser.polarity_scores(x)['compound']


# Maps each polarity_scores key to the review_data column that stores it.
# Insertion order is the order in which the columns are added.
_SCORE_COLUMNS = {
    'pos': 'positivity_score',
    'neg': 'negativity_score',
    'neu': 'neutral_score',
    'compound': 'compound_score',
}

# Score each review once and fan the four values out to their columns, instead
# of running the analyser four separate times per review via row-wise apply.
# NOTE(review): this scores the original `comments` text, not
# `comments_translated`; confirm that is intended, since the reviews were
# translated to English earlier in the notebook.
_polarity = pd.DataFrame(
    [analyser.polarity_scores(text) for text in review_data['comments']],
    index=review_data.index,
    columns=list(_SCORE_COLUMNS),
)
for _key, _column in _SCORE_COLUMNS.items():
    review_data[_column] = _polarity[_key]

In [None]:
#to pickle file
# Persist the translated, sentiment-scored reviews for later sessions.
reviews_pickle_path = "reviews_vader_translated.pkl"
review_data.to_pickle(reviews_pickle_path)

In [None]:
# Average the numeric columns per listing.
# `numeric_only=True` keeps the remaining text columns out of the mean;
# without it pandas >= 2.0 raises a TypeError instead of silently dropping
# them. The identifier columns are numeric, so they survive the mean and are
# removed explicitly afterwards.
df_cc = review_data.drop(columns=['reviewer_name', 'comments_translated'])
agg_review_scores = (
    df_cc.groupby('listing_id')
    .mean(numeric_only=True)
    .drop(columns=['id', 'reviewer_id'])
)

In [None]:
# Persist the per-listing average sentiment scores.
agg_scores_pickle_path = "agg_review_scores_translated.pkl"
agg_review_scores.to_pickle(agg_scores_pickle_path)

EDA Reviews

In [None]:
#completely positive reviews
# Count the reviews whose positivity score is exactly 1.
fully_positive = review_data['positivity_score'] == 1
int(fully_positive.sum())

In [None]:
#distribution sentiment scores
# Draw both distributions side by side on explicit axes.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
with warnings.catch_warnings():
    # Suppress warnings for this figure only (presumably seaborn's distplot
    # deprecation notice). A global filterwarnings('ignore') would also hide
    # every warning raised by later cells.
    warnings.simplefilter('ignore')
    for ax, column in zip(axes, ['positivity_score', 'negativity_score']):
        sns.distplot(review_data[column], ax=ax)
        ax.set_title(column)
    plt.show()


In [None]:
#distribution sentiment scores
# Draw both distributions side by side on explicit axes.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
with warnings.catch_warnings():
    # Suppress warnings for this figure only (presumably seaborn's distplot
    # deprecation notice). A global filterwarnings('ignore') would also hide
    # every warning raised by later cells.
    warnings.simplefilter('ignore')
    for ax, column in zip(axes, ['compound_score', 'neutral_score']):
        sns.distplot(review_data[column], ax=ax)
        ax.set_title(column)
    plt.show()

In [None]:
#additional columns for length and number of words
def len_rev(rev):
    """Return the length of `rev` as reported by `len` (characters for a str)."""
    return len(rev)

# Length of each translated review, computed straight from the column.
review_data['len_review'] = review_data['comments_translated'].apply(len_rev)

def words_no(rev):
    """Return how many whitespace-separated tokens `rev` contains."""
    return len(rev.split())

review_data['no_words'] = review_data.apply(lambda x: words_no(x.loc['comments_translated']), axis=1)
# Capture the summary while the helper columns still exist. describe() used to
# sit mid-cell, so its result was never shown and the cell produced no output.
review_summary = review_data.describe()
review_data = review_data.drop(columns=['no_words', 'len_review'])
# Leave the summary as the last expression so the notebook renders it.
review_summary

Review Recency

In [None]:
#add column for recency
from datetime import datetime
# Reference moment for recency; captured once so every row uses the same value.
today = datetime.today()


def date_time(date, reference=None):
    """Return the number of whole days between `date` and a reference moment.

    Parameters
    ----------
    date : str
        Date in '%Y-%m-%d' format.
    reference : datetime, optional
        Moment to measure from. Defaults to the module-level `today`, which is
        the original behaviour; pass a fixed value to make results
        reproducible across runs.

    Returns
    -------
    int
        `(reference - parsed_date).days`.

    Raises
    ------
    ValueError
        If `date` does not match '%Y-%m-%d'.
    """
    if reference is None:
        reference = today
    parsed = datetime.strptime(date, '%Y-%m-%d')
    return (reference - parsed).days

# Age of each review in days, derived from its date string.
review_data['days_ago'] = review_data['date'].apply(date_time)


In [None]:
#add column for new sentiment scores with weight for recency
# The most recent review gets weight 1; older reviews get proportionally less.
# The original divided by `df_rt['days_ago']`, but `df_rt` is never defined in
# this notebook, so a fresh-kernel run failed with NameError.
# NOTE(review): a review with days_ago == 0 would make this a division by
# zero; confirm the data cannot contain reviews dated today.
shortest_days = review_data['days_ago'].min()
review_data['recency_weight'] = shortest_days / review_data['days_ago']

# Total recency weight per listing. Only this one column is summed, rather
# than summing every column of the frame and discarding the rest.
tot_rw = (
    review_data.groupby('listing_id')['recency_weight']
    .sum()
    .rename('sum_rec_weight')
)

review_data = pd.merge(review_data, tot_rw, on='listing_id')

# Each weighted score is score * recency_weight / (sum of recency_weight per listing).
_weighted_targets = [
    ('negativity_score', 'weighted_neg_score'),
    ('positivity_score', 'weighted_pos_score'),
    ('neutral_score', 'weighted_neutral_score'),
    ('compound_score', 'weighted_comp_score'),
]
for _source, _target in _weighted_targets:
    review_data[_target] = (
        review_data[_source] * review_data['recency_weight'] / review_data['sum_rec_weight']
    )
review_data = review_data.drop(columns=['id', 'reviewer_id'])

In [None]:
#add weighted sentiment scores per listing
weighted_columns = ['weighted_neg_score', 'weighted_pos_score',
                    'weighted_neutral_score', 'weighted_comp_score']
# Select the weighted columns before aggregating so the text columns never
# reach mean(); pandas >= 2.0 raises a TypeError on them otherwise.
# NOTE(review): each weighted_* value is already divided by the listing's
# total recency weight, so the recency-weighted average per listing would be
# the sum of these values. mean() divides by the review count on top of that;
# confirm mean is the intended aggregate.
agg_weighted_rs = review_data.groupby('listing_id')[weighted_columns].mean()

agg_weighted_rs.to_pickle("agg_weighted_rs.pkl")


Topic Modelling

In [None]:
#cleaning reviews
#lower case, remove punctuation, remove stopwords, lemmatization

stop = set(en.Defaults.stop_words)
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    """Normalise one review into a space-separated string of lemmas.

    Steps, in order: strip markup tags, normalise the acute accent to an
    apostrophe, lowercase, drop stop words, drop punctuation characters,
    lemmatize each token, and drop stray 'u' tokens.

    Parameters
    ----------
    doc : str
        Raw review text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ('' when nothing survives).
    """
    # Replace tags with a space rather than nothing, so words separated only
    # by markup (e.g. "nice<br/>place") are not glued into one token.
    doc = re.sub('<[^>]+>', ' ', doc)
    doc = re.sub("´", "'", doc)
    stop_free = " ".join(word for word in doc.lower().split() if word not in stop)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    lemmas = (lemma.lemmatize(word) for word in punc_free.split())
    # Drop the stray token 'u' wherever it occurs. The earlier ' u ' regex
    # missed it at the start or end of the text and in consecutive runs.
    return " ".join(word for word in lemmas if word != 'u')

# Clean every translated review and split it into a list of tokens.
doc_clean = list(map(lambda text: clean(text).split(), review_data['comments_translated']))

In [None]:
#creating the term dictionary of the corpus and converting corpus to document term matrix with dictionary
# The dictionary assigns ids to tokens; each document is then converted to its
# bag-of-words form.
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

In [None]:
#creating LDA model and training on document term matrix
Lda = gensim.models.ldamodel.LdaModel
# Training settings gathered in one place; random_state pins the seed.
lda_settings = dict(
    num_topics=6,
    alpha=0.3,
    eta=0.1,
    id2word=dictionary,
    passes=2,
    random_state=1,
)
ldamodel = Lda(doc_term_matrix, **lda_settings)

In [None]:
#coherence scores for LDA Model
# Compute the 'u_mass' coherence of the trained model on the training corpus.
coherencemodel = CoherenceModel(
    model=ldamodel,
    corpus=doc_term_matrix,
    dictionary=dictionary,
    coherence='u_mass',
)
u_mass_coherence = coherencemodel.get_coherence()
print(u_mass_coherence)

In [None]:
#tune hyperparameters alpha and eta for each number of topics (change num_topics)

# Candidate values for the two priors; every (alpha, eta) pair is tried.
a = [0.1, 0.2, 0.3, 0.4, 'symmetric']
b = [0.01, 0.1, 0.2, 'symmetric']
num_topics = 9
coherence_values = {}
model_list = []
for A, B in [(alpha, eta) for alpha in a for eta in b]:
    model = gensim.models.ldamodel.LdaModel(
        corpus=doc_term_matrix,
        num_topics=num_topics,
        id2word=dictionary,
        passes=2,
        alpha=A,
        eta=B,
        random_state=1,
    )
    model_list.append(model)
    coherencemodel = CoherenceModel(
        model=model,
        corpus=doc_term_matrix,
        dictionary=dictionary,
        coherence='u_mass',
    )
    coh = coherencemodel.get_coherence()
    # Label has the form "<alpha>+<eta>:<coherence>" and doubles as the dict key.
    ab = f"{A}+{B}:{coh}"
    print(ab)
    coherence_values[ab] = coh

In [None]:
#get topic probabilities per topic
# Use the trained model's own topic count. The tuning cell's `num_topics` can
# differ from the model actually stored in `ldamodel`, which would add columns
# for topics the model does not have.
topics = ldamodel.num_topics
topic_probs = []
for bow in doc_term_matrix:
    # Infer the topic mixture once per document instead of once per topic.
    # The result is a list of (topic_id, probability) pairs that omits topics
    # below the model's minimum probability, so it must be looked up by topic
    # id. Indexing it by position attributes probabilities to the wrong topic
    # whenever an earlier topic is omitted.
    doc_topics = dict(ldamodel.get_document_topics(bow))
    topic_probs.append(
        {'topic_' + str(j): doc_topics.get(j, 0) for j in range(topics)}
    )

In [None]:
#get aggregated topic probabilities per listing
df_pr = pd.DataFrame(topic_probs)
# Remove topic columns left over from a previous run of this cell so that
# re-executing it does not fail on overlapping column names.
review_data = review_data.drop(columns=df_pr.columns, errors='ignore').join(df_pr)
# numeric_only keeps the text columns out of the mean; pandas >= 2.0 raises a
# TypeError on them otherwise.
agg_topics = review_data.groupby('listing_id').mean(numeric_only=True)
# 'id' and 'reviewer_id' are already dropped from review_data by the
# recency-weighting cell on a top-to-bottom run, so tolerate their absence
# instead of raising KeyError.
agg_topics = agg_topics.drop(columns=['id', 'reviewer_id'], errors='ignore')
#change value '9t' for num_topics
agg_topics.to_pickle("topic_modelling_values_9t.pkl")

In [None]:
#topics representative words
# Ask for up to 9 topics with their 10 highest-weighted words each; leaving
# the result as the last expression lets the notebook render it.
topic_display_options = dict(num_topics=9, num_words=10)
all_topics = ldamodel.print_topics(**topic_display_options)
all_topics

In [None]:
#wordcloud of Top N words in each topic
##code based on https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/#5.-Build-the-Topic-Model
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

# num_topics=-1 asks gensim for every topic in the model, so the figure is
# sized from the model instead of assuming exactly six topics.
topics = ldamodel.show_topics(num_topics=-1, formatted=False)

n_cols = 2
n_rows = max(1, -(-len(topics) // n_cols))  # ceiling division without importing math
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 10 * n_rows / 3),
                         sharex=True, sharey=True, squeeze=False)

for i, ax in enumerate(axes.flatten()):
    ax.axis('off')
    if i >= len(topics):
        # Odd number of topics: leave the spare panel blank.
        continue
    topic_id, topic_terms = topics[i]
    # Bind this panel's colour as a default argument so the callback does not
    # depend on the loop variable still holding the right value when called.
    # Wrap around if there are more topics than colours.
    topic_color = cols[i % len(cols)]
    cloud = WordCloud(stopwords=stop,
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, color=topic_color, **kwargs: color,
                      prefer_horizontal=1.0)
    topic_words = dict(topic_terms)
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))


plt.subplots_adjust(wspace=0, hspace=0)
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()