# Airbnb NYC 2019


In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Global matplotlib text sizes used by every figure in the notebook.
mpl.rcParams.update({'font.size': 12, 'axes.labelsize': 10})

Load data and perform first exploration

In [None]:
# Load the raw listings; the path is resolved relative to the working directory.
csv_path = './Data/AB_NYC_2019.csv'
df_airbnb = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the raw frame.
df_airbnb.head()

In [None]:
# shape is (rows, columns); unpack it straight into the message.
print('Number of entries: {} \nNumber of features: {}'.format(*df_airbnb.shape))

A first look suggests that columns such as **id**, **name**, **host_id** and **host_name** can be discarded from the analysis. 

The column **name** could be used to incorporate some more features (keywords appearing in the name). It will be left out of the first simple models.

In [None]:
# Drop the identifier-like columns (id, name, host_id, host_name); they are
# not used as features.
id_like_columns = ['id', 'name', 'host_id', 'host_name']
df_airbnb.drop(columns=id_like_columns, inplace=True)

Now we will look for missing data on the remaining features

In [None]:
# Number of missing values per remaining column.
df_airbnb.isnull().sum()

We have missing data for the following features:
* last_review --> comes from entries without any review
* reviews_per_month --> comes from entries without any review; we will replace it by 0

In [None]:
# Listings without any review have no reviews_per_month value; treat it as 0.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the attribute-accessed Series is a chained in-place modification, which is
# not guaranteed to update df_airbnb (and does nothing under Copy-on-Write).
df_airbnb['reviews_per_month'] = df_airbnb['reviews_per_month'].fillna(0)

So far we have 12 columns, 11 of them corresponding to features and 1 corresponding to our target variable, price. 

First we will start by taking a look at the price to see its distribution and check there are no inconsistencies.

Then we will explore the features and their distributions and correlation among them. We can expect high correlation between:
* neighbourhood_group, neighbourhood, latitude and longitude
* number_of_reviews and reviews_per_month (can also be related with last_review)

### Exploration of price

In [None]:
# Price histogram twice: linear axes on the left, log-log axes on the right.
f, ax = plt.subplots(1, 2, figsize=(15, 5))
linear_ax, log_ax = ax
price = df_airbnb['price']
price.plot(kind='hist', bins=80, title='Price distribution', ax=linear_ax)
price.plot(kind='hist', bins=80, logx=True, logy=True,
           title='Price distribution', ax=log_ax)
linear_ax.set_xlabel('Price $')
log_ax.set_xlabel('Log Price $')
plt.show()

In [None]:
# Summary statistics of the target variable.
df_airbnb['price'].describe()

In [None]:
# Count the listings advertised at a price of exactly 0.
zero_price_rows = df_airbnb[df_airbnb['price'] == 0]
print('Number of cases with price = $0: ', len(zero_price_rows))

In [None]:
# Drop listings priced below $10. This removes the $0 listings counted above
# (treated as data errors) together with any other price under $10.
df_airbnb.drop(df_airbnb[df_airbnb.price < 10].index, axis=0, inplace=True)

In [None]:
def print_pct_price(attribute, value, df=None):
    """Print the share of listings whose ``attribute`` is strictly above ``value``.

    Args:
        attribute: Name of the column to compare against the threshold.
        value: Threshold; rows with ``attribute > value`` are counted.
        df: DataFrame to inspect. Defaults to the notebook-level
            ``df_airbnb`` so existing two-argument calls keep working.

    An empty frame is reported as 0.00% instead of raising
    ``ZeroDivisionError``.
    """
    frame = df_airbnb if df is None else df
    total = len(frame)
    # Skip the column lookup entirely when there is nothing to count.
    above = len(frame[frame[attribute] > value]) if total else 0
    pct = 100 * above / total if total else 0.0
    print('Appartments with {} over ${}: {:.2f}%'.format(attribute, value, pct))

# Share of listings above a few reference prices.
for threshold in (200, 300, 500, 1000):
    print_pct_price('price', threshold)

In [None]:
# Plot the bulk of the prices and the expensive tail separately so the tail
# does not flatten the main histogram. The titles are built from the same
# threshold as the filters so the two cannot drift apart.
price_split = 400
f, ax = plt.subplots(1, 2, figsize=(15, 5))
df_airbnb[df_airbnb['price'] <= price_split].price.plot(
    kind='hist', bins=100,
    title='Price distribution up to ${}'.format(price_split), ax=ax[0])
df_airbnb[df_airbnb['price'] > price_split].price.plot(
    kind='hist', bins=100,
    title='Price distribution above ${}'.format(price_split), ax=ax[1])
ax[0].set_xlabel('Price $')
ax[1].set_xlabel('Price $')
plt.show()


Insights: 
* Mean price is \\$153 and median price is \\$106. This reflects large outliers that increase the mean price.
* Min price is \\$0 while max price is \\$10k. We will drop the cases with price = \\$0 since they may correspond to errors.
* Around 95% of the apartments have a price lower than \\$300. Of these, a good number are concentrated in the range below \\$100. We can also observe some peaks around round prices (100, 150, etc.)
* Apartments over \\$1k represent less than 0.5\% of the cases.

### Exploration of features

The feature **neighbourhood** is left out of the plot due to the large number of possible values. The feature **last_review** is left out of the plot since we cannot relate it to the date when the prices were retrieved.

In [None]:
# One panel per feature: a bar chart of category counts for non-numeric
# columns and a 50-bin histogram for numeric ones.
f, ax = plt.subplots(3, 3, figsize=(40, 30))
ax = ax.flatten()
plot_columns = df_airbnb.drop(['neighbourhood', 'price', 'last_review'], axis=1).columns
for idx, col in enumerate(plot_columns):
    # Test for "numeric" rather than `dtype == object`, so string columns are
    # still routed to the bar chart whatever dtype pandas stores them with.
    if pd.api.types.is_numeric_dtype(df_airbnb[col]):
        df_airbnb[col].plot(kind='hist', bins=50, ax=ax[idx], title=col)
    else:
        # size() counts the rows of each category directly, instead of reading
        # the non-null count of whichever column happens to sit at position 1.
        df_airbnb.groupby(col).size().plot(kind='bar', ax=ax[idx], title=col)
        ax[idx].tick_params(axis='x', rotation=0)

Insights: 
* Neighbourhood: Brooklyn and Manhattan are the most represented ones.
* Most locations offered are entire apartments or private rooms. Shared rooms are very rare.
* Number of reviews and reviews per month seem to follow a Poisson distribution.
* Minimum nights and host listing count may offer more insights when taking logarithm of the values. 
* Availability is highly concentrated around 1 and then there is a great decrease for the rest of values.

#### Analysis per neighbourhood

In [None]:
# Per-borough summary: how many listings / distinct neighbourhoods each group
# holds, plus min/mean/median/max of price and of the number of reviews.
summary_stats = ['min', 'mean', 'median', 'max']
per_group_aggs = {
    'neighbourhood': ['count', 'nunique'],
    'price': list(summary_stats),
    'number_of_reviews': list(summary_stats),
}
df_airbnb.groupby('neighbourhood_group').agg(per_group_aggs)

If we analyse by neighbourhood, we can highlight:
* Manhattan is the one grouping the fewest neighbourhoods, with around 60% of the neighbourhoods that Queens groups (which is the one with the largest value).
* In terms of price, they all have almost the same minimum price. Nevertheless, Manhattan shows a median price that is almost 3 times the median of the lowest price (Bronx). The maximum price at Bronx is 25% of the maximum at Manhattan, Brooklyn and Staten Island. 
* In terms of number of reviews, the median is much lower than the mean in all cases, showing the presence of large sporadic numbers and a great amount of low values.

In [None]:
# Price distribution per neighbourhood group: listings under $300 on the
# left, all listings on the right.
f, ax = plt.subplots(1, 2, figsize=(20, 5))
ax = ax.flatten()

# sns.violinplot has no `title` parameter, so the title is set on the Axes.
sns.violinplot(data=df_airbnb[df_airbnb.price < 300], x='neighbourhood_group', y='price', ax=ax[0])
ax[0].set_title('Price vs Neighbourhood')
sns.violinplot(data=df_airbnb, x='neighbourhood_group', y='price', ax=ax[1])
ax[1].set_title('Price vs Neighbourhood')
plt.show()

#### Analysis of correlation

In [None]:
# Feature frame for the correlation analysis: everything except the target
# and the two columns left out of the exploration, one-hot encoded.
features = df_airbnb.drop(columns=['price', 'neighbourhood', 'last_review'])
X = pd.get_dummies(features)


In [None]:
# Request a float array explicitly: one-hot columns may be boolean, and mixing
# them with numeric columns would otherwise yield an object-dtype array.
X = X.to_numpy(dtype=float)

In [None]:
# X holds one listing per row and one feature per column, so the variables
# are the columns: rowvar=False gives the feature-by-feature correlation.
# The default (rowvar=True) would correlate listings with each other and
# build an n_listings x n_listings matrix.
corr_matrix = np.corrcoef(X, rowvar=False).round(decimals=2)
corr_matrix

#### Analysis type of room

In [None]:
# Price distribution per room type: listings under $500 on the left, all
# listings on the right.
f, ax = plt.subplots(1, 2, figsize=(20, 5))
ax = ax.flatten()

# sns.violinplot has no `title` parameter, so the title is set on the Axes.
# The titles name the variable actually plotted (room type).
sns.violinplot(data=df_airbnb[df_airbnb.price < 500], x='room_type', y='price', ax=ax[0])
ax[0].set_title('Price vs Room type')
sns.violinplot(data=df_airbnb, x='room_type', y='price', ax=ax[1])
ax[1].set_title('Price vs Room type')
plt.show()

#### Analysis minimum nights

In [None]:
# Encode categorical variables with one-hot encoding


In [None]:
# Scatter plots of price against four numeric features, one per panel.
f, ax = plt.subplots(2, 2, figsize=(20, 12))
ax = ax.flatten()

scatter_specs = [
    ('calculated_host_listings_count', 'Price vs Number of properties per host'),
    ('number_of_reviews', 'Price vs Number of reviews per property'),
    ('minimum_nights', 'Price vs Number of min nights'),
    ('availability_365', 'Price vs Availability'),
]
for panel, (feature, panel_title) in zip(ax, scatter_specs):
    df_airbnb.plot(kind='scatter', x=feature, y='price', ax=panel, title=panel_title)
plt.show()

We will use the neighbourhood group as categorical variable.

## TODO THINGS:

* First models based on numerical and categorical data --> drop name, host_name


In [None]:
# Drop the features left out of the first models. 'name' is not listed here:
# it was already removed together with the id columns, and dropping a missing
# label would raise KeyError.
df_models = df_airbnb.drop(['neighbourhood', 'last_review'], axis=1)
# convert categorical variables to one-hot encoding
df_models = pd.get_dummies(df_models)

In [None]:
# Summary statistics of the modelling frame after one-hot encoding.
df_models.describe()

In [None]:
# Histograms of the numeric features; the four count-like features use
# log-log axes, availability stays on linear axes.
f, ax = plt.subplots(3, 2, figsize=(20, 14))
ax = ax.flatten()
df_models.minimum_nights.plot(kind='hist', ax=ax[0], bins=100, logx=True,
                              logy=True, title='Distribution minimum nights')
df_models.number_of_reviews.plot(kind='hist', ax=ax[1], bins=100, logx=True,
                                 logy=True, title='Distribution number of reviews')
df_models.reviews_per_month.plot(kind='hist', ax=ax[2], bins=100, logx=True,
                                 logy=True, title='Distribution reviews per month')
df_models.calculated_host_listings_count.plot(kind='hist', ax=ax[3], bins=100, logx=True,
                                              logy=True, title='Distribution hosts listings')
df_models.availability_365.plot(kind='hist', ax=ax[4], bins=100, title='Distribution availability')
# Only five of the six panels are used; hide the empty one.
ax[5].set_visible(False)
plt.show()

In [None]:
# 95th percentile of every column of the modelling frame.
df_models.quantile(0.95)

In [None]:
# Feature matrix X (every column except the target) and target vector y.
# X is requested as float explicitly: one-hot columns may be boolean, and
# mixing them with numeric columns would otherwise yield an object array.
X = df_models.drop(['price'], axis=1).to_numpy(dtype=float)
y = df_models['price'].to_numpy()

In [None]:
# Hold out 30% of the listings for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

# `clf` must be fitted before it can be scored. Price is a continuous target,
# so a plain linear regression serves as the baseline; score() returns R^2 on
# the held-out split.
clf = LinearRegression().fit(X_train_transformed, y_train)
clf.score(X_test_transformed, y_test)

In [None]:
# Plot the second column of the scaled test matrix against row position.
plt.plot(X_test_transformed[:,1])