# Hotel Data Exploration

We chose the following cities for an analysis of hotel listing factors:
* Amsterdam 
* Beijing
* Las Vegas
* Los Angeles
* Madrid
* New York
* Paris
* Sydney

The source for hotel data in this notebook is a 2016 World Hotel dataset downloaded from here https://github.com/lucasmonteiro001/free-world-hotel-database/blob/master/hotels.csv.zip. Because our Airbnb dataset was much larger and from the current year, we continued to look for other sources and did additional exploration using data downloaded from TripAdvisor (see "hotel-tripadvisor" directory).

Plots were used intermittently to help direct data exploration and data cleaning. For discussion of conclusions, see Airbnb_vs_Hotel.ipynb in main directory.

---

In [6]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import os
import math
import json
# gmaps and geojson
import geopy.distance
import gmaps
import gmaps.geojson_geometries

# Google developer API key
from config import gkey
gmaps.configure(api_key=gkey)

# List of Eight World Cities Chosen
cityL=["Amsterdam","Beijing","Las Vegas","Los Angeles","Madrid","New York","Paris","Sydney"]

# file to save geocodes for each city
geo_data_file = os.path.join("Resources","geocodes.csv")

# Hotels CSV File
file_one = 'hotels.csv'

In [7]:
# Get location and bounding box latitude and longitude for each city using Google geocode
base_url = "https://maps.googleapis.com/maps/api/geocode/json?"

geobounds=[]
params={
    "key": gkey
}
for city in cityL:
    params["address"] = city
    response = requests.get(base_url, params=params)
    geo_data = response.json()
    cdict={"city":city}
    cdict["lat"] = geo_data["results"][0]["geometry"]["location"]["lat"]
    cdict["lng"] = geo_data["results"][0]["geometry"]["location"]["lng"]
    cdict["lat1"] = geo_data["results"][0]["geometry"]["bounds"]["northeast"]["lat"]
    cdict["lng1"] = geo_data["results"][0]["geometry"]["bounds"]["northeast"]["lng"]
    cdict["lat2"] = geo_data["results"][0]["geometry"]["bounds"]["southwest"]["lat"]
    cdict["lng2"] = geo_data["results"][0]["geometry"]["bounds"]["southwest"]["lng"]
    geobounds.append(cdict)

geodf = pd.DataFrame(geobounds)

# save geography coords
geodf.to_csv(geo_data_file, index = False)

geodf

IndexError: list index out of range

In [None]:
# retrieve geography coords without re-running cell above
geodf = pd.read_csv(geo_data_file, index_col = "city")
geodf

In [None]:
# Hotels DataFrame
citydf = pd.read_csv(file_one)
citydf.head()

In [None]:
# Convert latitude into numeric
citydf['latitude'] = pd.to_numeric(citydf['latitude'], errors='coerce')

In [None]:
citydf.dtypes

In [None]:
# retrieve Hotels info saved in folders with city names, merge into one dataframe

# function for returning distance to point center for row with latitude and longitude in df
def get_distance (row, center):
    start = (row['latitude'], row['longitude'])
    return geopy.distance.distance(start, center).km

dfexists = False

for city in cityL:
    #print(path)
    citydf = pd.read_csv(file_one)
    citydf['latitude'] = pd.to_numeric(citydf['latitude'], errors='coerce')
    citydf["key"]=city
    # make sure this only includes locations within city bounding box
    coords = geodf.loc[city]
    citydf = citydf.loc[(citydf["latitude"] <= coords["lat1"]) & (citydf["latitude"] >= coords["lat2"])&(citydf["longitude"] <= coords["lng1"]) & (citydf["longitude"] >= coords["lng2"])]
    # add a new column, distance to center
    center = (coords["lat"],coords["lng"])
    citydf["center_distance"] = citydf.apply (lambda row: get_distance(row, center),axis=1)
    if dfexists:
        allcitydf = pd.concat([allcitydf, citydf], ignore_index=True)
    else:
        allcitydf = citydf
        dfexists = True


allcitydf.head()

In [None]:
# get approx city bounding box size for metro size area comparison, approx listing density
sizeL = []

for city in cityL:
    cdict={"city":city}
    coords = geodf.loc[city]
    width = geopy.distance.distance((coords['lat1'],coords['lng1']),(coords['lat2'],coords['lng1'])).km
    height = geopy.distance.distance((coords['lat1'],coords['lng1']),(coords['lat1'],coords['lng2'])).km
    cdict["width"]=width
    cdict["height"]=height
    cdict["area"] = width * height
    #cdict["listing_count"] = allcitydf.loc[allcitydf["key"]==city]['id'].count()
    #cdict["list/sq_km"] = cdict["listing_count"]/cdict["area"]
    sizeL.append(cdict)
    
boundsizedf = pd.DataFrame(sizeL).set_index("city")
boundsizedf

In [None]:
# list cols in db
allcitydf.columns

In [None]:
# Convert Price into Numeric
allcitydf['price']= allcitydf['price'].astype(float)
allcitydf.count()

In [None]:
# Multiple Rating by 20
allcitydf['stars'] *= 20
allcitydf.head(2)

In [None]:
# Delete Prices equal to 99999.0

allcitydf = allcitydf.loc[allcitydf['price']!=99999.0]
allcitydf["price"].describe()

In [None]:
# dataframe with useful categories
catsdf = allcitydf[['hotelName','latitude','longitude','price','stars','key','center_distance']]
catsdf.head()

In [None]:
# Rename star Column
catsdf = catsdf.rename(columns={"stars":"rating"})
catsdf.head(2)

In [None]:
# Number of counts after removing the value 99999
catsdf.count()

In [None]:
# get average ratings/price for each city 
meandf = catsdf.groupby(["key"])[['price','rating']].mean()
meandf.head(10)

In [None]:
# bar plot mean price comparison for each city
meandf["price"].plot(kind='bar')
plt.xlabel("City")
plt.ylabel("Price per Night (US$)")
plt.title ("Average Price for Hotels Rooms")
plt.savefig("Images/Hotels_Prices.png")
plt.show()

In [None]:
# bar plot mean rating comparison for each city
meandf["rating"].plot(kind='bar')
plt.xlabel("City")
plt.ylabel("Rating")
plt.title ("Average Rating for Hotels")
plt.savefig("Images/Hotels_Rating.png")
plt.show()

In [8]:
# Plot Heatmap of New York Listings with Intensity by Price

newyorkdf= catsdf.loc[catsdf["key"]=="New York"]
locations = newyorkdf[["latitude", "longitude"]]
pricing = newyorkdf["price"]

coords = geodf.loc["New York"]
cen = (coords["lat"],coords["lng"])
fig = gmaps.figure(map_type='HYBRID', center=cen, zoom_level=11)
heatmap_layer = gmaps.heatmap_layer(locations, weights=pricing,
                                    max_intensity=400, point_radius=5)
fig.add_layer(heatmap_layer)
plt.savefig("Images/NY_price_heatmap.png")
fig

NameError: name 'catsdf' is not defined

In [61]:
# Heatmap with intensity by rating
rating = newyorkdf["rating"]

coords = geodf.loc["New York"]
cen = (coords["lat"],coords["lng"])
fig = gmaps.figure(map_type='HYBRID', center=cen, zoom_level=11)
heatmap_layer = gmaps.heatmap_layer(locations, weights=rating,
                                    max_intensity=400, point_radius=5)
fig.add_layer(heatmap_layer)
plt.savefig("Images/NY_rating_heatmap.png")
fig

Figure(layout=FigureLayout(height='420px'))

<Figure size 432x288 with 0 Axes>