In [None]:
import numpy as np
import pandas as pd
import seaborn.apionly as sns
import matplotlib.pyplot as plt
import csv, os
from datetime import date, datetime as dt

# statistics package
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
import scipy.stats as sp

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

import sys
!conda install --yes --prefix {sys.prefix} plotly
import plotly.graph_objs as go

# !conda install --yes --prefix {sys.prefix} wordcloud
!conda install -c conda-forge wordcloud=1.2.1
import wordcloud

%matplotlib inline

In [None]:
# Directory that contains listings.csv. The AIRBNB_DATA_DIR environment
# variable overrides the location so the notebook can run on other machines;
# when it is unset the original hard-coded path is used, so existing setups
# keep working unchanged.
binpath = os.environ.get('AIRBNB_DATA_DIR', 'C:/Users/khanhngu/Downloads/singapore-airbnb')
csv_name = os.path.join(binpath, 'listings.csv')
# Raw listings table; every later cell works on this frame.
df = pd.read_csv(csv_name)

In [None]:
# Preview the first rows to check that the CSV parsed as expected.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Show where the missing values are: the boolean null mask of the table is
# drawn as a heatmap, one column per feature, with row labels and the colour
# bar switched off.
sns.set(rc={'figure.figsize': (19.7, 8.27)})
missing_mask = df.isna()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)


In [None]:
# These two columns are removed wholesale first; afterwards every row that
# still has a missing value in any remaining column is discarded.
to_drop = ['last_review','reviews_per_month']
# errors='ignore' keeps the cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df.drop(to_drop, inplace=True, axis=1, errors='ignore')
df.dropna(inplace=True)
# Shape after cleaning, for comparison with the raw table.
df.shape

# Exploratory Data Analysis

In [None]:
sns.distplot(df['price'])

In [None]:
sns.countplot(df["neighbourhood_group"])

In [None]:
sns.scatterplot(x='price',y='minimum_nights',data=df)

In [None]:
# Map of the listings: each one is drawn at its longitude/latitude and
# coloured by neighbourhood group.
plt.figure(figsize=(10,6))
# x/y/hue are given as column names with data=df. Positional x and y raise
# TypeError on seaborn 0.12+, and the keyword form works on older releases too.
sns.scatterplot(x='longitude', y='latitude', hue='neighbourhood_group', data=df)
# plt.show() renders the figure; plt.ioff() would instead switch matplotlib's
# interactive mode off for the rest of the session.
plt.show()

# Price distribution by neighbourhood group

In [None]:
# Compare the price spread across neighbourhood groups, using only listings
# priced below 250 (presumably to keep extreme prices from dominating the
# scale — confirm the threshold with the author).
ng = df.loc[df['price'] < 250]
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='neighbourhood_group', y='price', data=ng, ax=ax)
ax.set_title("neighbourhood_group price distribution < 250")
plt.show()

# Room types occupied by neighbourhood_group

In [None]:
# Count of listings per room type, split into one bar per neighbourhood group.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='room_type', hue='neighbourhood_group', ax=ax)
ax.set_title("Room types occupied by the neighbourhood_group")
plt.show()

In [None]:
# Price of every listing, grouped by room type.
# catplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only leaves an empty extra figure in the output and
# its size is never applied. The size is set through height/aspect instead:
# height 6 with aspect 10/6 gives the intended 10 x 6 inch figure.
sns.catplot(x="room_type", y="price", data=df, height=6, aspect=10/6)
# plt.show() renders the figure; plt.ioff() would instead switch matplotlib's
# interactive mode off for the rest of the session.
plt.show()

In [None]:
# Word cloud of the listing names.
from wordcloud import WordCloud, ImageColorGenerator
# Join all names into one text; str() guards against non-string entries.
text = " ".join(str(each) for each in df.name)
# Generate the cloud, limited to at most 200 words on a white background.
# The result is bound to `name_cloud` rather than `wordcloud` so the imported
# `wordcloud` module is not shadowed by an image object.
name_cloud = WordCloud(max_words=200, background_color="white").generate(text)
# Exactly one figure is opened: a second plt.figure() call would leave the
# first one behind as an empty figure in the output.
plt.figure(figsize=(15,10))
# Display the generated image (interpolation names are documented in lower case):
plt.imshow(name_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Gather every value of the `name` column, in row order, into a plain list.
_names_ = list(df.name)
# Helper that breaks one listing name into its whitespace-separated words.
def split_name(name):
    """Return the whitespace-delimited tokens of ``name``.

    ``name`` is converted with ``str()`` first, so non-string values (for
    example ``None`` or a number) are tokenised from their string form
    instead of raising.
    """
    return str(name).split()
from collections import Counter

# Every word of every listing name, lower-cased so that e.g. "Room" and
# "room" are counted as the same word.
_names_for_count_ = [word.lower() for x in _names_ for word in split_name(x)]

# The 20 most frequent words as (word, count) pairs, most frequent first.
# most_common(20) returns the same pairs as sorting everything and slicing
# the first 20, without materialising the full ranking.
_top_20_w = Counter(_names_for_count_).most_common(20)

# Put the findings in a DataFrame for the visualisations that follow. Naming
# the columns at construction keeps 'Words' and 'Count' present even when no
# words were found, where a positional rename would leave an empty frame
# without those columns.
sub_w = pd.DataFrame(_top_20_w, columns=['Words', 'Count'])


In [None]:
# Bar chart of the word counts held in sub_w.
plt.figure(figsize=(10, 6))
viz_5 = sns.barplot(data=sub_w, x='Words', y='Count')
viz_5.set_title('Counts of the top 20 used words for listing names')
viz_5.set_xlabel('Words')
viz_5.set_ylabel('Count of words')
# Re-apply the existing tick labels rotated by 80 degrees.
word_tick_labels = viz_5.get_xticklabels()
viz_5.set_xticklabels(word_tick_labels, rotation=80)

# Listings sorted by number of reviews (most reviewed first)

In [None]:
# Order the listings by review count, highest first, and keep the first 1000.
df1 = (
    df
    .sort_values(by='number_of_reviews', ascending=False)
    .head(1000)
)
df1.head()

# Room Availability

In [None]:
# Listings at their coordinates, coloured by the `availability_365` column.
plt.figure(figsize=(10, 6))
availability_points = plt.scatter(
    df['longitude'],
    df['latitude'],
    c=df['availability_365'],
    cmap='spring',
    edgecolor='black',
    linewidth=1,
    alpha=1,
)

# Colour scale for the scatter points, labelled with the column it encodes.
cbar = plt.colorbar(availability_points)
cbar.set_label('availability_365')

In [None]:
# Listings priced under 500, placed at their coordinates and coloured by price.
sub_6=df[df.price<500]
# One explicit figure/axes pair is created and handed to pandas. Calling
# plt.figure() and then letting DataFrame.plot open a second figure through
# its own figsize argument leaves the first figure empty in the output.
price_fig, price_ax = plt.subplots(figsize=(10, 10))
# The legend label describes the plotted points; the colour encodes `price`,
# so labelling the series 'availability_365' would be misleading.
viz_4=sub_6.plot(kind='scatter', x='longitude', y='latitude', label='listings priced under 500',
                 c='price', cmap=plt.get_cmap('jet'), colorbar=True, alpha=0.4, ax=price_ax)
viz_4.legend()
# plt.show() renders the figure; plt.ioff() would instead switch matplotlib's
# interactive mode off for the rest of the session.
plt.show()

# Prediction Model

In [None]:
# Prepare data: drop the `name`, `id` and `host_name` columns before modelling.
# errors='ignore' keeps the cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df.drop(['name','id','host_name'],axis=1,inplace=True,errors='ignore')

In [None]:
# Convert categorical features to numeric features.
#
# LabelEncoder encodes labels with a value between 0 and n_classes-1.
# NOTE(review): the integer codes follow the sorted order of the labels, not
# any meaningful ranking, yet a linear model will treat them as ordered —
# consider one-hot encoding if the feature selection cell is updated to match.
label_encoders = {}
for column in ('neighbourhood_group', 'neighbourhood', 'room_type'):
    le = preprocessing.LabelEncoder()
    # Fit the encoder on the column and replace the labels with their codes.
    df[column] = le.fit_transform(df[column])
    # Keep each fitted encoder so the codes can be mapped back to the original
    # labels later with inverse_transform.
    label_encoders[column] = le

df.sort_values(by='price',ascending=True,inplace=True)

df.head()

In [None]:
# Fit a linear regression of `price` on the selected listing attributes.
feature_columns = [
    'host_id',
    'neighbourhood_group',
    'neighbourhood',
    'latitude',
    'longitude',
    'room_type',
    'minimum_nights',
    'number_of_reviews',
    'calculated_host_listings_count',
    'availability_365',
]
X = df[feature_columns]
y = df['price']

# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101
)

lm = LinearRegression()
lm.fit(X_train, y_train)

# getting predictions

In [None]:
# Predict the price of the held-out rows and place the predictions next to
# the true values, both as flat 1-D arrays.
predicts = lm.predict(X_test)
actual_values = np.array(y_test).reshape(-1)
predicted_values = predicts.reshape(-1)
error_airbnb = pd.DataFrame({
    'Actual Values': actual_values,
    'Predicted Values': predicted_values,
})
error_airbnb.head()

In [None]:
# Grouped bar chart with one predicted bar and one actual bar per test row.
title = 'Pred vs Actual'
fig = go.Figure(data=[
    go.Bar(name='Predicted', x=error_airbnb.index, y=error_airbnb['Predicted Values']),
    go.Bar(name='Actual', x=error_airbnb.index, y=error_airbnb['Actual Values'])
])

# barmode='group' places the two bars of a row side by side. The title is
# applied to the layout here; defining it alone does not label the chart.
fig.update_layout(barmode='group', title_text=title)
fig.show()