# Part A - Data Visualization Project (preparation for dashboard)
***
## Numan SAHNOU - Matthieu ECCHER
<br></br>
### Analysis on the New York AirBnb Open Dataset (2019)
Kaggle : https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data

* __Context__

Since 2008, guests and hosts have used Airbnb to expand their travel possibilities and to offer a more unique, personalized way of experiencing the world. This dataset describes the listing activity and metrics in NYC, NY for 2019.

* __Content__

This data file includes all the information needed to find out more about hosts and geographical availability, along with the metrics necessary to make predictions and draw conclusions.

### __Open questions :__

* How are the room types distributed?
* What are the most frequent words that occur in rental ads? 
* How are prices distributed depending on the district?
* I want to stay for a certain amount of time, so how many Airbnb listings can I choose from? 
* Is there any noticeable difference in traffic among the different districts, and what could be the reason for it?
* Which neighborhoods are the most reviewed?

In [1]:
import os

import dash
import dash_core_components as dcc
import pandas as pd
from plotly import graph_objs as go
import plotly.express as px
from plotly.graph_objs import *
from dash.dependencies import Input, Output, State
import plotly.graph_objs as go

# API keys and datasets
# The Mapbox token is read from the MAPBOX_ACCESS_TOKEN environment variable when it
# is set, so the credential can be rotated or supplied without editing the notebook.
# The literal below is only a fallback that keeps existing runs working.
# NOTE(review): a token stored in the notebook is readable by anyone with the file;
# consider revoking it and relying on the environment variable alone.
mapbox_access_token = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoiZ29sZGVkaXRpb24yMTIiLCJhIjoiY2tld3dvMGxmMGJsbjM1bXV5cXNjam84cSJ9.32Xt4hp12-2Fa3Rk2XFLgQ',  # personal token to access mapbox api
)
# The CSV path is relative to the notebook's working directory.
airbnb_data = pd.read_csv("AB_NYC_2019.csv")

## Observation & statistics

In [2]:
# Preview the first three listings to check the columns that were loaded.
airbnb_data.head(n=3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365


In [3]:
# Summary statistics of the nightly price before any filtering.
airbnb_data.loc[:, 'price'].describe()

count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64

In [4]:
# Remove price outliers: keep only listings priced below 4 sample standard deviations.
# The cutoff is measured from zero (the mean is NOT added to it), which keeps the
# rooms in the most common price range.
# NOTE: this cell rebinds airbnb_data, so running it a second time recomputes the
# cutoff on the already-filtered data and trims further; re-run from the load cell.
price_cutoff = airbnb_data['price'].std() * 4  # Series.std() is the sample (n-1) standard deviation
# .copy() hands later cells an independent frame, so assigning columns on it does
# not operate on a slice of the unfiltered data.
airbnb_data = airbnb_data[airbnb_data['price'] < price_cutoff].copy()
airbnb_data['price'].describe()

count    48565.000000
mean       139.675692
std        110.802396
min          0.000000
25%         69.000000
50%        105.000000
75%        175.000000
max        956.000000
Name: price, dtype: float64

In [5]:
# List the distinct room types present in the filtered data.
pd.unique(airbnb_data['room_type'])

array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

In [6]:
# The distinct neighbourhoods are computed but not rendered: a notebook cell only
# displays its final expression, which here is the list of neighbourhood groups.
pd.unique(airbnb_data['neighbourhood'])
pd.unique(airbnb_data['neighbourhood_group'])

array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
      dtype=object)

In [7]:
# Median number of reviews per listing.
airbnb_data.loc[:, 'number_of_reviews'].median()

5.0

### NLP - Data preprocessing for wordcloud (Tokenization)

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence

# Turn every listing title into a list of word tokens for the word cloud.
tokenizer = Tokenizer()
# Missing titles become empty strings first: a bare astype(str) would convert NaN
# into the literal word "nan", which would then be counted in the word cloud.
airbnb_data['name'] = airbnb_data['name'].fillna('').astype(str)
# The fitted tokenizer is not read by any other cell visible in this notebook.
tokenizer.fit_on_texts(airbnb_data['name'])
# text_to_word_sequence (with its default settings) maps each title to a list of tokens.
airbnb_data['name'] = airbnb_data['name'].apply(text_to_word_sequence)

## Wordcloud 

Because Plotly does not have a native way to create a word cloud, we searched the web and found someone who had done it with their own data. We took inspiration from their code and applied it to our data (after analyzing the code and making sure we understood how it works).

In [9]:
from wordcloud import WordCloud, STOPWORDS
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

def plotly_wordcloud(text):
    """Build a Plotly figure that renders ``text`` as a word cloud.

    A ``WordCloud`` object (at most 200 words, maximum font size 100, the
    package's ``STOPWORDS`` excluded) computes the words, their relative
    frequencies, positions and colours. Each word is then drawn as a text
    item on a single Plotly scatter trace.

    Parameters
    ----------
    text : str
        The text passed to ``WordCloud.generate``.

    Returns
    -------
    plotly.graph_objs.Figure
        Figure holding one text-mode scatter trace with grid, tick labels and
        zero lines hidden on both axes. The figure is returned, not shown.
    """
    wc = WordCloud(stopwords=set(STOPWORDS),
                   max_words=200,
                   max_font_size=100)
    wc.generate(text)

    word_list = []
    freq_list = []
    position_list = []
    color_list = []

    # Each layout_ entry is ((word, freq), fontsize, position, orientation, color).
    # The layout's font size and orientation are not used by the Plotly trace.
    for (word, freq), _fontsize, position, _orientation, color in wc.layout_:
        word_list.append(word)
        freq_list.append(freq)
        position_list.append(position)
        color_list.append(color)

    # get the positions
    x = [position[0] for position in position_list]
    y = [position[1] for position in position_list]

    # text size of each word: its relative occurrence frequency scaled by 100
    new_freq_list = [freq * 100 for freq in freq_list]

    # x and y are passed as-is. Multiplying a Python list by 20 repeats it 20 times
    # (it does not scale the values), which only added 19 text-less duplicate
    # points per word to the trace.
    trace = go.Scatter(x=x,
                       y=y,
                       textfont=dict(size=new_freq_list,
                                     color=color_list),
                       hoverinfo='text',
                       hovertext=['{0}{1}'.format(w, f) for w, f in zip(word_list, freq_list)],
                       mode="text",
                       text=word_list
                       )

    # Hide every axis decoration so only the words remain visible.
    layout = go.Layout(autosize=True,
                       xaxis=dict(showgrid=False,
                                  showticklabels=False,
                                  zeroline=False,
                                  automargin=True),
                       yaxis=dict(showgrid=False,
                                  showticklabels=False,
                                  zeroline=False,
                                  automargin=True),
                       margin=go.layout.Margin(pad=1000),
                       )

    fig = go.Figure(data=[trace], layout=layout)

    return fig

# Flatten the tokenised titles back into plain strings, then into one corpus string.
# Values that are already strings are left untouched, so re-running this cell does
# not split titles into single characters (" ".join on a str joins its characters).
airbnb_data['name'] = airbnb_data['name'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)
# tolist() replaces an apply() that was called only for its append side effect.
tab = airbnb_data['name'].tolist()

text = " ".join(tab)

In [10]:
# Build the word-cloud figure from the concatenated listing titles, then render it.
wordcloud_name = plotly_wordcloud(text=text)
wordcloud_name.show()

# Visualizing Airbnb Data

### Mapbox

In [11]:
# Map of every listing, coloured by neighbourhood group; hovering shows the title
# together with the neighbourhood group, neighbourhood, room type and price.
fig = px.scatter_mapbox(
    airbnb_data,
    lat="latitude",
    lon="longitude",
    color="neighbourhood_group",
    hover_name="name",
    hover_data=["neighbourhood_group", "neighbourhood", "room_type", "price"],
    color_continuous_scale=px.colors.cyclical.IceFire,
    zoom=9.5,
    height=600,
    width=1000,
)
# Dark Mapbox style (uses the access token) with all four figure margins set to 0.
fig.update_layout(
    mapbox_style="dark",
    mapbox_accesstoken=mapbox_access_token,
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()

### Distribution of the room types

### Pie chart

In [12]:
# Number of listings per room type, computed once and reused below.
room_counts = airbnb_data.groupby('room_type').size()
# Share of each room type in percent; only its index is used, as the slice labels.
roomdf = room_counts / airbnb_data['room_type'].count() * 100
labels = roomdf.index

# Pie chart of the room types; `pull` offsets the second slice from the centre and
# each slice is annotated with its percentage and label.
room_type_pie = go.Pie(
    labels=labels,
    values=room_counts,
    pull=[0, 0.2, 0],
    textinfo='percent+label',
)
fig = go.Figure(data=[room_type_pie])

fig.show()

#### Histogram

In [13]:
# Same room-type breakdown as absolute listing counts.
fig8 = px.histogram(
    airbnb_data,
    x="room_type",
)
fig8.show()

### Distribution of the minimum nights requirement for each type of room (log scale)

In [14]:
# Histogram of the minimum-nights requirement, split by room type; both axes are
# logarithmic.
fig5 = px.histogram(
    airbnb_data,
    x="minimum_nights",
    color="room_type",
    log_x=True,
    log_y=True,
    width=1000,
)
fig5.show()

### Distribution of the number of reviews for each neighborhood within a neighborhood group (Manhattan shown here; the same applies to Brooklyn, etc.)

In [15]:
# Bar chart of review counts per neighbourhood, restricted to the listings whose
# neighbourhood group is Manhattan; the review axis is logarithmic.
manhattan_listings = airbnb_data[airbnb_data['neighbourhood_group'] == 'Manhattan']
fig7 = px.bar(
    manhattan_listings,
    x="number_of_reviews",
    y="neighbourhood",
    color="neighbourhood",
    log_x=True,
)
fig7.show()

### Price distribution according to room types

In [16]:
# Box plot of the price for each room type, on a logarithmic price axis.
fig3 = px.box(
    airbnb_data,
    x='room_type',
    y='price',
    log_y=True,
)

fig3.show()

### Price distribution among each neighborhood group and different room types

In [17]:
# Box plot of the price per neighbourhood group, with one box per room type, on a
# logarithmic price axis.
fig4 = px.box(
    airbnb_data,
    x='neighbourhood_group',
    y='price',
    color="room_type",
    log_y=True,
)

fig4.show()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=56c1b310-8e15-4ecf-bb4f-7753d0a34f78' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>