# AirBnB Munich - Analysis of Prices and Customer Reviews

### Import Libraries

In [None]:
# Read in necessary libraries
import pandas as pd
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)

import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.figure(figsize=(12,7))

import seaborn as sns
from scipy import stats

import nltk
from nltk.corpus import stopwords, words
from nltk.stem.wordnet import WordNetLemmatizer

import string
import spacy
from spacy.lang.en import English

import gensim
from gensim import corpora
from gensim.models import Phrases

import pyLDAvis.gensim

### Read and wrangle data

In [None]:
# data downloaded 31/07/19, week 31
# Folder that holds the three CSV files. Defined once so the notebook can be
# pointed at another location by editing a single line instead of three
# copies of the same absolute path.
DATA_DIR = "C:/Users/Marvin/Documents/Data Science/Udacity/Data_Scientist_Nanodegree/7_Introduction_to_Data_Science/Project_Blog_Post"

df_calendar = pd.read_csv(DATA_DIR + "/calendar.csv")
df_listings_summary = pd.read_csv(DATA_DIR + "/listings_summary.csv")
df_reviews = pd.read_csv(DATA_DIR + "/reviews.csv")

In [None]:
# clean the prices and return float values
def clean_price(x):
    """Convert a price string such as ``"$1,234.00"`` to a float.

    Thousands separators (",") are removed and anything in front of the
    number (currency symbol, spaces) is skipped, so both ``"$80.00"`` and
    ``"80.00"`` give ``80.0``.

    Parameters:
        x: the raw cell value; normally a string, but may be ``None`` or a
           float NaN when the cell was empty.

    Returns:
        The price as a float, or NaN when there is no number to parse.
    """
    if x is None:
        return float("nan")
    if not isinstance(x, str):
        # Already numeric (an empty CSV cell arrives as a float NaN).
        return float(x)

    text = x.strip().replace(",", "")
    # Skip the prefix instead of blindly dropping the first character, so a
    # value without a currency symbol does not lose its leading digit.
    start = 0
    while start < len(text) and not (text[start].isdigit() or text[start] in "+-."):
        start += 1
    if start == len(text):
        return float("nan")
    return float(text[start:])

# convert "available" column to booleans
def clean_boolean(x):
    """Return True only when *x* equals the flag ``"t"``; anything else gives False."""
    return bool(x == "t")

# convert and clean
df_calendar['price'] = df_calendar['price'].apply(clean_price)
df_calendar['adjusted_price'] = df_calendar['adjusted_price'].apply(clean_price)
df_calendar['available'] = df_calendar['available'].apply(clean_boolean)

# convert string to datetime
df_calendar['date'] = pd.to_datetime(df_calendar['date'])
df_reviews["date"] = pd.to_datetime(df_reviews['date'])

# create month and calendar-week columns
df_calendar['Calendar Month'] = df_calendar['date'].dt.month
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` yields the same ISO week number. The cast back to
# int64 keeps the dtype `dt.week` produced (assumes no missing dates).
df_calendar['Calendar Week'] = df_calendar['date'].dt.isocalendar().week.astype('int64')

df_calendar.rename(columns={"price": "Price/Night (€)"}, inplace=True)

### Price Analysis

In [None]:
# Histogram of nightly prices with values more than three standard deviations
# from the mean left out; the dashed line marks the median of ALL prices.
# NOTE(review): the title says June 2019, the data was downloaded 31/07/19 - confirm the range.
S = df_calendar['Price/Night (€)']
distance_from_mean = (S - S.mean()).abs()
is_outlier = distance_from_mean > 3 * S.std()
S[~is_outlier].hist(bins=100, color='#FF6666')
plt.axvline(x=S.median(), color='k', linestyle='--')
plt.xlabel('Price/Night (€)')
plt.ylabel('Frequency')
plt.title('Munich AirBnB Prices (June 2019 - July 2020)', fontsize=15)

In [None]:
# One price box per calendar week; outlier points are hidden (showfliers=False).
sns.set(rc={'figure.figsize': (11.7, 8.27)})
weekly_boxes = sns.boxplot(
    x="Calendar Week",
    y="Price/Night (€)",
    data=df_calendar,
    showfliers=False,
    color="seagreen",
)
weekly_boxes.set_title("Munich AirBnB Price Boxplots per CW", fontsize=15)

In [None]:
# Split the nightly prices into calendar weeks 38-40 and all other weeks.
calendar_week = df_calendar['Calendar Week']
nightly_price = df_calendar['Price/Night (€)']

in_festival_weeks = (calendar_week > 37) & (calendar_week < 41)
outside_festival_weeks = (calendar_week < 38) | (calendar_week > 40)

october_festival = np.array(nightly_price[in_festival_weeks])
rest = np.array(nightly_price[outside_festival_weeks])

In [None]:
# Welch's t-test (equal_var=False): are festival-week prices greater than the rest?
results = stats.ttest_ind(october_festival, rest, equal_var=False)
t_stat = results[0]
# ttest_ind reports a two-sided p-value; halving it gives the one-sided value,
# which is only meaningful together with the sign check on the t statistic.
p_value = results[1] / 2
alpha = 0.05

if t_stat > 0 and p_value < alpha:
    print("reject null hypothesis, AirBnB prices of {} are greater than during {}".format('October Festival','rest of the year'))
else:
    # A non-significant test never "accepts" the null hypothesis.
    print("fail to reject null hypothesis")
print("p-value: " + str(p_value))
print("t-stat: " + str(t_stat))

### Merge data

In [None]:
# join df_calendar and df_reviews to combine price and review information
# Mean nightly price per listing. Only the price column is aggregated, so the
# other calendar columns are not averaged just to be thrown away.
df_calendar = df_calendar.groupby('listing_id')['Price/Night (€)'].mean().reset_index()

# One row per review, carrying the listing's mean price and its summary data.
merged = pd.merge(df_calendar, df_reviews, how='inner', on='listing_id')
merged = pd.merge(merged, df_listings_summary, how='inner', left_on='listing_id', right_on='id')

# Word count of each review; a missing comment has 0 words (str(NaN) would
# otherwise be counted as the single word "nan").
merged["review_length"] = merged["comments"].apply(
    lambda x: len(x.split()) if isinstance(x, str) else 0
)

### Data Preprocessing for LDA

In [None]:
# Review texts as a plain list, one entry per row of `merged`, in row order.
documents = merged['comments'].tolist()

In [None]:
def hasNumbers(inputString):
    """Tell whether *inputString* contains at least one digit character."""
    for char in inputString:
        if char.isdigit():
            return True
    return False
  
# Download the NLTK resources used by the lookups below.
for resource in ('stopwords', 'words', 'wordnet'):
    nltk.download(resource)

# Lookup sets and the lemmatiser shared by `clean`.
lemma = WordNetLemmatizer()
exclude = set(string.punctuation)
word_set = set(words.words())
stop = set(stopwords.words('english'))

def clean(doc):
    """Reduce one review to a space-separated string of lemmatised English words.

    Every whitespace-separated token is lower-cased and stripped of
    punctuation BEFORE it is looked up in the vocabulary. Checking first
    would reject any word with attached punctuation ("location.", "clean,"),
    because those forms are never in ``word_set``.

    A token is dropped when it is a stop word, is not in ``word_set``,
    contains a digit, or is a single character.

    Parameters:
        doc: the review text; non-string values (e.g. NaN for a missing
             comment) are treated as empty.

    Returns:
        The surviving words, lemmatised and joined by single spaces.
    """
    if not isinstance(doc, str):
        # A missing comment carries no text; do not turn it into "nan".
        return ""

    kept = []
    for raw in doc.lower().split():
        word = "".join(ch for ch in raw if ch not in exclude)
        if len(word) <= 1 or hasNumbers(word):
            continue
        # Test the raw form too: contractions such as "don't" appear in the
        # stop list with their apostrophe.
        if raw in stop or word in stop or word not in word_set:
            continue
        kept.append(lemma.lemmatize(word))
    return " ".join(kept)

# Replace every review by its cleaned form (same order, same length).
documents = list(map(clean, documents))

In [None]:
# src: https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

'''
Data Preprocessing:
- Tokenization: Split text into words, lowercase and remove punctuation 
- stopwords removed
- words lemmatized: words in third person are changed to first person, verbs in past and future are changed into present

'''
# `tokenize` only uses the tokenizer and lexical attributes of the blank
# English pipeline, so no trained model is loaded here. The former
# `spacy.load("en")` call discarded its result and depended on the "en"
# shortcut link, which spaCy v3 no longer resolves.
parser = English()
def tokenize(text):
    """Split *text* with the module-level ``parser`` and return a list of token strings.

    Whitespace tokens are skipped, URL-like tokens become ``'URL'``, tokens
    starting with ``'@'`` become ``'SCREEN_NAME'``, and every other token is
    returned in lower case.
    """
    lda_tokens = []
    for token in parser(text):
        surface = token.orth_
        if surface.isspace():
            continue
        if token.like_url:
            lda_tokens.append('URL')
            continue
        lda_tokens.append('SCREEN_NAME' if surface.startswith('@') else token.lower_)
    return lda_tokens
  
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return ``wn.morphy(word)``, or *word* unchanged when morphy finds no base form."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

def get_lemma2(word):
    """Lemmatise *word* with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)
  

In [None]:
def prepare_text_for_lda(text):
    """Tokenise *text*, keep tokens longer than four characters and lemmatise them.

    Returns the lemmas as a list, in their original order.
    """
    return [get_lemma(token) for token in tokenize(text) if len(token) > 4]

# Token list for every cleaned review.
text_data = [prepare_text_for_lda(doc) for doc in documents]

# Phrase models: word pairs first, then phrases built on top of the
# bigram-merged corpus.
bigram = Phrases(text_data, min_count=20)
trigram = Phrases(bigram[text_data], min_count=20)

for doc_tokens in text_data:
    # Both passes run on the ORIGINAL tokens before anything is appended.
    # Running the trigram pass after the bigrams were appended would see
    # each bigram again and add it a second time.
    bigram_doc = bigram[doc_tokens]
    found_bigrams = [token for token in bigram_doc if '_' in token]
    # The trigram model was trained on bigram-merged text, so it is applied
    # to the same kind of input; only tokens joining three or more words
    # (two or more underscores) are new here.
    found_trigrams = [token for token in trigram[bigram_doc] if token.count('_') > 1]
    doc_tokens.extend(found_bigrams + found_trigrams)

# True for every document that still has at least one token.
keep_row = [bool(doc_tokens) for doc_tokens in text_data]

In [None]:
# Keep only the documents flagged in `keep_row`, and the matching rows of `merged`.
text_data = [tokens for tokens, keep in zip(text_data, keep_row) if keep]
merged = merged.loc[keep_row]

In [None]:
# Words stripped from every document before the dictionary is built.
remove = ["wonderful", "beautiful", "fantastic", "absolutely", "thoughtful", "getting", "extremely", "definitely", "willing", "super", "great", "really", "recommend", "would", "thank", "highly", "amaze", "perfect", "lovely", "welcome", "everything", "helpful", "ideal", "seine", "toller", "johannes", "nett", "excelente", "excellent", "value", "provide", "awesome", "strongly"]
remove = list(set(remove))

# One pass per document with O(1) membership tests, instead of repeated
# `in` / `list.remove` scans for every word. Slice assignment mutates each
# list in place, so `text_data` keeps referring to the same list objects.
unwanted = set(remove)
for text in text_data:
    text[:] = [token for token in text if token not in unwanted]

### Build LDA Model

In [None]:
# build the gensim dictionary (token <-> id mapping) from text_data
dictionary = corpora.Dictionary(text_data)

# filter out tokens that appear in less than 20 docs, in more than 0.8 docs (fraction), keep the first 100 000 most frequent tokens
dictionary.filter_extremes(no_below=20, no_above=0.8, keep_n=100000)

# for each doc create its bag of words: (token id, count) pairs
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
# `with` closes the file even if pickling fails; the bare open() call left the handle open.
with open('C:/Users/Marvin/Documents/Data Science/Udacity/Data_Scientist_Nanodegree/7_Introduction_to_Data_Science/Project_Blog_Post/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('C:/Users/Marvin/Documents/Data Science/Udacity/Data_Scientist_Nanodegree/7_Introduction_to_Data_Science/Project_Blog_Post/dictionary.gensim')

In [None]:
NUM_TOPICS = 10

# Train the LDA model (`passes` = number of training passes over the corpus)
# and store it on disk.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
)
ldamodel.save('C:/Users/Marvin/Documents/Data Science/Udacity/Data_Scientist_Nanodegree/7_Introduction_to_Data_Science/Project_Blog_Post/ldamodel.gensim')

# sort_topics=False keeps the pyLDAvis topic order equal to gensim's
# (gensim numbers topics from 0, pyLDAvis from 1).
lda_display = pyLDAvis.gensim.prepare(
    ldamodel,
    corpus,
    dictionary,
    sort_topics=False,
)
pyLDAvis.save_html(
    lda_display,
    'C:/Users/Marvin/Documents/Data Science/Udacity/Data_Scientist_Nanodegree/7_Introduction_to_Data_Science/Project_Blog_Post/lda.html',
)

### Use LDA Model for inference

In [None]:
# Number of documents to label. `labels` is only created in the next cell
# (one entry per element of `text_data`), so asking for len(labels) here
# raises NameError when the notebook is run top to bottom.
len(text_data)

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel.load('C:/Users/Marvin/Documents/Data Science/Udacity/Data_Scientist_Nanodegree/7_Introduction_to_Data_Science/Project_Blog_Post/ldamodel.gensim')
dictionary = gensim.corpora.Dictionary.load('C:/Users/Marvin/Documents/Data Science/Udacity/Data_Scientist_Nanodegree/7_Introduction_to_Data_Science/Project_Blog_Post/dictionary.gensim')

# Label each document with its most probable topic (1-based), but only when
# that topic's probability exceeds 0.5; otherwise the label is None.
labels = []
for doc in text_data:
    topics = ldamodel.get_document_topics(dictionary.doc2bow(doc))
    # max() returns the first of equally probable topics, like a strict ">" scan.
    best = max(topics, key=lambda pair: pair[1], default=None)
    if best is not None and best[1] > 0.5:
        labels.append(best[0] + 1)
    else:
        labels.append(None)

merged['Topics'] = labels

In [None]:
# Summary statistics of the nightly price for each topic label.
price_by_topic = merged.groupby(['Topics'])['Price/Night (€)']
grouped = price_by_topic.describe()

In [None]:
# Display the per-topic price statistics.
grouped

In [None]:
# Paired bars per topic: review count (left axis) and mean price (right axis).
x = np.arange(len(grouped))
w = 0.3
ax1 = plt.subplot(1, 1, 1)

# Tick labels are the topic numbers, placed midway between the two bars.
ax1.set_xticks(x + w / 2)
ax1.set_xticklabels(grouped.index.astype(int))

count = ax1.bar(x, grouped['count'], width=w, color='b', align='center')
ax1.set_ylabel('Reviews Count')
ax1.set_xlabel('Topics')

# Second y-axis sharing the same x-axis, for the mean price per topic.
ax2 = ax1.twinx()
ax2.grid(False)
price = ax2.bar(x + w, grouped['mean'], width=w, color='g', align='center')
ax2.set_ylabel('Average Prices')

ax2.legend([count, price], ['Reviews Count', 'Average Prices'])
ax2.set_title('Reviews Frequency vs Average Prices')
plt.show()

In [None]:
value = 'room_type'

# Share of each category over all reviews. It is the same for every topic,
# so it is computed once instead of on every loop pass.
baseline = merged[value].value_counts()
baseline_ratio = baseline / baseline.sum()

# Topic labels are 1-based; take the count from the model rather than a literal 10.
for i in range(1, ldamodel.num_topics + 1):
    topic_specific = merged[merged['Topics'] == i][value].value_counts()
    if topic_specific.empty:
        # No review carries this label, so there is nothing to plot.
        continue
    topic_specific_ratio = topic_specific / topic_specific.sum()

    # Bars show how much more (or less) frequent each category is within the
    # topic than across all reviews.
    plt.figure(figsize=(6, 5))
    topic_specific_ratio.subtract(baseline_ratio).dropna().sort_values(ascending=False)[:10].plot.bar()
    plt.xticks(rotation=90)
    plt.title('Topic ' + str(i))

### Plot Latitude and longitude on map

In [None]:
# Number of reviews per (latitude, longitude) pair, most reviewed first.
location_counts = merged.groupby(['latitude', 'longitude']).size()
ranked_locations = location_counts.sort_values(ascending=False).reset_index(name='count')
# NOTE(review): duplicates are dropped on longitude alone, so two locations
# sharing a longitude collapse into the more reviewed one - confirm intent.
lat_long_df = ranked_locations.drop_duplicates(subset='longitude')


In [None]:
from gmplot import gmplot

latitudes = lat_long_df['latitude']
longitudes = lat_long_df['longitude']
min_lat, max_lat = latitudes.min(), latitudes.max()
min_lon, max_lon = longitudes.min(), longitudes.max()

# Centre the map on the midpoint of the bounding box of all locations.
center_lat = min_lat + (max_lat - min_lat) / 2
center_lon = min_lon + (max_lon - min_lon) / 2

mymap = gmplot.GoogleMapPlotter(center_lat, center_lon, 16)
mymap.heatmap(latitudes, longitudes)

mymap.draw('my_gm_plot.html')

In [None]:
# The three neighbourhoods with the most reviews.
merged['neighbourhood'].value_counts().head(3)

In [None]:
# Keep the first 20 % of locations (the frame is ordered by review count).
lat_long_df = lat_long_df[:int(lat_long_df.shape[0]*0.2)]

# Flag every review whose coordinates match one of those locations. A set of
# (longitude, latitude) pairs gives O(1) lookups, replacing a scan of all
# locations for every single review.
top_locations = set(zip(lat_long_df['longitude'], lat_long_df['latitude']))
keep_row = [
    (x, y) in top_locations
    for x, y in zip(merged['longitude'], merged['latitude'])
]

In [None]:
# Keep the flagged reviews, then show the resulting size. Assigning `.shape`
# to `merged` would replace the DataFrame with a (rows, columns) tuple.
merged = merged[keep_row]
merged.shape