# Question 3: Is it possible to predict rates per night and person of Airbnb listings in Berlin?
The third question about the Airbnb Berlin data set asks whether it is possible to predict the rate per night and person for Airbnb homes in Berlin. The steps involved in answering the question are as follows:
* Load Libraries
* Import Data Set
* Pre-Processing
* Analyze Data Set & Evaluate Results

## Load Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import folium

%matplotlib inline

## Import Data Set

In [2]:
# Import listings data
# The CSV location can be overridden with the AIRBNB_LISTINGS_CSV environment variable so the
# notebook is runnable on other machines; when the variable is unset, the original local path is used.
LISTINGS_CSV = os.environ.get(
    "AIRBNB_LISTINGS_CSV",
    '/Users/patrick.peltier/Documents/Udacity Data Science Nanodegree/Data/Airbnb/listings_berlin.csv',
)
df_listings = pd.read_csv(LISTINGS_CSV)

# Preview data
print(df_listings.shape)
# Show every column when a frame is displayed (no truncation of wide frames)
pd.options.display.max_columns = None
df_listings.head(1)

(22572, 96)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,2015,https://www.airbnb.com/rooms/2015,20180912025131,2018-09-12,Berlin-Mitte Value! Quiet courtyard/very central,Great location! 30 of 75 sq meters. This wood...,A+++ location! This „Einliegerwohnung“ is an e...,Great location! 30 of 75 sq meters. This wood...,none,It is located in the former East Berlin area o...,"This is my home, not a hotel. I rent out occas...","Close to U-Bahn U8 and U2 (metro), Trams M12, ...","Simple kitchen/cooking, refrigerator, microwav...",Always available,"No parties No events No pets No smoking, not e...",,,https://a0.muscache.com/im/pictures/260fd609-7...,,2217,https://www.airbnb.com/users/show/2217,Ian,2008-08-18,"Key Biscayne, Florida, United States",Believe in sharing economy.,within an hour,100%,,t,https://a0.muscache.com/im/pictures/21428a22-4...,https://a0.muscache.com/im/pictures/21428a22-4...,Mitte,3.0,3.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,t,"Berlin, Berlin, Germany",Mitte,Brunnenstr. Süd,Mitte,Berlin,Berlin,10119,Berlin,"Berlin, Germany",DE,Germany,52.534537,13.402557,f,Guesthouse,Entire home/apt,3,1.0,1.0,2.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,Gym,Heating,""Famil...",,$60.00,,,$200.00,$30.00,1,$28.00,4,1125,6 weeks ago,t,5,34,64,154,2018-09-12,109,2016-04-11,2018-09-08,92.0,9.0,9.0,10.0,10.0,10.0,9.0,t,,,f,f,strict_14_with_grace_period,f,f,3,3.69


## Pre-Processing

## Analyses

In [None]:
# Show the names of all columns whose dtype is neither float64 nor int
df_listings.select_dtypes(
    exclude=["float64", "int"],
).columns

In [None]:
# Can we predict price by picture?
# Peek at the first five picture URLs as a plain array
picture_urls = df_listings["picture_url"]
picture_urls.iloc[:5].values

In [None]:
# Binary column conversion
# These columns hold 't' / 'f' flags in the raw CSV and are re-coded as 1 / 0 integers.
tf_cols = ["host_is_superhost","host_has_profile_pic","host_identity_verified","is_location_exact",
          "has_availability","requires_license","instant_bookable","is_business_travel_ready",
           "require_guest_profile_picture","require_guest_phone_verification"]

for col in tf_cols:
    # 't' (raw value) and 1 (already converted) both map to 1, so re-running this cell does not
    # wipe the flags to 0. Everything else -- including 'f' and missing values -- maps to 0.
    df_listings[col] = df_listings[col].isin(["t", 1]).astype(int)

In [None]:
# Create "host_age" column: number of days elapsed between "host_since" and
# "calendar_last_scraped" for each listing
df_listings["host_since"] = pd.to_datetime(df_listings["host_since"])
df_listings["calendar_last_scraped"] = pd.to_datetime(df_listings["calendar_last_scraped"])
host_tenure = df_listings["calendar_last_scraped"] - df_listings["host_since"]
# Dividing a timedelta by one day yields a float number of days (NaN where a date is missing)
df_listings["host_age"] = host_tenure / np.timedelta64(1,'D')

# Summary statistics
# Printed explicitly: a notebook cell only renders its last expression, so a bare
# describe() in the middle of the cell would be computed and then thrown away.
print(df_listings["host_age"].describe())

# Distribution of host age
ax = df_listings["host_age"].hist(bins=50)
ax.set_xlabel("Host age in days")
ax.set_ylabel("Number of listings")
ax.set_title("Distribution of Host Age");

In [None]:
# Cast "host_response_rate" column to float
# The values are turned into text, stripped of '%' and ',' characters, and then parsed as floats.
response_rate_text = df_listings["host_response_rate"].astype(str)
response_rate_text = (
    response_rate_text
    .str.replace('%', '', regex=False)
    .str.replace(',', '', regex=False)
)
df_listings["host_response_rate"] = response_rate_text.astype(float)

# Summary statistics
df_listings["host_response_rate"].describe()

In [None]:
# Scatterplot(s)
# "price" is stored as currency text in the raw CSV (e.g. "$60.00"), so it is parsed into floats
# here before any numeric comparison. The parsing also accepts an already-numeric column.
# Values that cannot be parsed become NaN and are excluded from the plot.
price_numeric = pd.to_numeric(
    df_listings["price"].astype(str).str.replace(r"[$,]", "", regex=True),
    errors="coerce",
)

# Keep only listings priced at or below the 99th percentile; the mask is computed once and
# applied to both axes so x and y always refer to the same rows.
price_cap = price_numeric.quantile(.99)
in_range = price_numeric <= price_cap

fig, ax = plt.subplots()
ax.scatter(price_numeric[in_range], df_listings.loc[in_range, "bedrooms"])
ax.set_ylabel("Bedrooms")
ax.set_xlabel("Airbnb Listing Price in $")
ax.set_title("Bedrooms vs. Airbnb Listing Price (up to 99th percentile)");

In [None]:
# Numeric columns
# Each column is listed exactly once: duplicate names would produce duplicate-labelled columns
# in df_listings[num_cols] and distort the summaries and plots built from it.
num_cols = [
    "accommodates", "guests_included", "minimum_nights",
    "availability_30", "availability_60", "availability_90", "availability_365",
    "number_of_reviews", "calculated_host_listings_count",
    "host_listings_count", "host_total_listings_count",
    "bathrooms", "bedrooms", "beds", "square_feet",
    "price", "weekly_price", "monthly_price", "security_deposit", "cleaning_fee", "extra_people",
    "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness",
    "review_scores_checkin", "review_scores_communication", "review_scores_location",
    "host_age", "review_scores_value", "reviews_per_month",
]

# The money columns are stored as currency text in the raw CSV (e.g. "$60.00"); strip the "$" and
# "," characters and parse them as floats so they really are numeric. The conversion is safe to
# re-run on already-converted columns. Unparseable values become NaN.
currency_cols = ["price", "weekly_price", "monthly_price", "security_deposit", "cleaning_fee", "extra_people"]
for col in currency_cols:
    df_listings[col] = pd.to_numeric(
        df_listings[col].astype(str).str.replace(r"[$,]", "", regex=True),
        errors="coerce",
    )

# Summary statistics
df_listings[num_cols].describe()

In [None]:
# Pairplots
sns.set(style='darkgrid', context='notebook')

# num_cols may name a column more than once; select every column a single time (order preserved)
# so the frame handed to seaborn has unique column labels.
pairplot_cols = list(dict.fromkeys(num_cols))

# NOTE(review): dropna(axis=0) discards every listing that has a missing value in ANY selected
# column. The previewed row already lacks square_feet and weekly_price -- confirm that enough
# rows survive for the plot to be meaningful.
pairplot_data = df_listings[pairplot_cols].dropna(axis=0)

sns.pairplot(pairplot_data)
plt.show()

In [None]:
# Correlation Plot
# Select each column once and keep only numeric dtypes, so corr() is never asked to correlate
# text columns.
corr_cols = list(dict.fromkeys(num_cols))
corrs = df_listings[corr_cols].select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(25,20))
sns.set(font_scale=1)
# Tick labels are taken from the correlation matrix itself (seaborn's default for a DataFrame),
# which guarantees they match the rows/columns actually present in `corrs`.
sns.heatmap(corrs, cbar=True, annot=True, square=True, fmt=".2f", ax=ax);

In [None]:
# Private property types
# NOTE(review): presumably the property_type values regarded as private accommodation -- the list
# is not used in the visible cells, so confirm against where it is consumed.
private_property = [
    "Apartment", "Loft", "House", "Townhouse", "Boat", "Bungalow", "Tiny house",
    "Houseboat", "Camper/RV", "Villa", "Cabin", "Cottage", "Castle", "Train",
    "Treehouse", "Cave", "Hut", "Chalet", "In-law", "Barn", "Tipi",
]

In [None]:
# Frequencies / property type
# Kept under its own name: previously this result was overwritten by the price table below
# before it could be used. It is not needed for the plot in this cell.
property_type_counts = df_listings.groupby("property_type").size().reset_index(name="counts")

# Median rate / property type
# "price" is stored as currency text in the raw CSV (e.g. "$60.00"); parse it into floats first
# so a median can be computed. The parsing also accepts an already-numeric column, and values
# that cannot be parsed become NaN (ignored by median()).
price_numeric = pd.to_numeric(
    df_listings["price"].astype(str).str.replace(r"[$,]", "", regex=True),
    errors="coerce",
)
prices_properties = (
    df_listings[["property_type"]]
    .assign(price=price_numeric)
    .groupby("property_type", as_index=False)["price"]
    .median()
)

# Plot
fig, ax = plt.subplots(figsize=(15,5))
sns.set(style="darkgrid")
ax = sns.barplot(x="property_type",
                 y="price",
                 data=prices_properties,
                 ax=ax)
plt.xticks(rotation=90)
ax.set_title("Median Rate / Property Type")
ax.set_xlabel("Airbnb Property Type")
ax.set_ylabel("Median Price");