In [1]:
# Load necessary packages
import requests
import time
import os
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pprint import pprint
from bs4 import BeautifulSoup

# Get info for specified cities

In [2]:
# Cities whose Inside Airbnb data should be pulled - swap in other names as needed
cities_list = [
    'New York City',
    'Los Angeles',
]

In [3]:
# Request the Inside Airbnb "get the data" index page.
# The timeout keeps the notebook from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the data table.
r = requests.get('http://insideairbnb.com/get-the-data/', timeout=30)
r.raise_for_status()

In [4]:
# Parse the downloaded page and collect the cell values of every table row.
# The parser is named explicitly so the result does not depend on which optional
# parser libraries happen to be installed in the environment.
soup = BeautifulSoup(r.text, 'html.parser')
nodes = soup.find_all('tr')
data_list = [] # For storing row data; one inner list per row that has <td> cells
for node in nodes: # Walk through all rows
    node_data = [] # Values for this specific row, in column order
    columns = node.find_all('td') # Get all columns for row
    if not columns: # Rows without <td> cells carry no data and are skipped
        continue
    for c in columns:
        link = c.a # First <a> tag inside the cell, or None when there is none
        if link is not None and link.has_attr('href'):
            node_data.append(link['href']) # Link cell: keep the URL
        else:
            node_data.append(c.text) # Otherwise, we just need the text value
    data_list.append(node_data)

In [5]:
# Assemble every scraped row into a single dataframe
col_names = [
    'Data Date',
    'Country/City',
    'Data URL',
    'Description',
] # Labels applied to the scraped columns, in order
all_data = pd.DataFrame(data=data_list, columns=col_names)
all_data.head()

Unnamed: 0,Data Date,Country/City,Data URL,Description
0,"08 March, 2022",Amsterdam,http://data.insideairbnb.com/the-netherlands/n...,Detailed Listings data
1,"08 March, 2022",Amsterdam,http://data.insideairbnb.com/the-netherlands/n...,Detailed Calendar Data
2,"08 March, 2022",Amsterdam,http://data.insideairbnb.com/the-netherlands/n...,Detailed Review Data
3,"08 March, 2022",Amsterdam,http://data.insideairbnb.com/the-netherlands/n...,Summary information and metrics for listings i...
4,"08 March, 2022",Amsterdam,http://data.insideairbnb.com/the-netherlands/n...,Summary Review data and Listing ID (to facilit...


In [6]:
# Keep only the rows whose city appears in cities_list
specific_data = all_data[all_data['Country/City'].isin(cities_list)]
specific_data.head()

Unnamed: 0,Data Date,Country/City,Data URL,Description
308,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Detailed Listings data
309,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Detailed Calendar Data
310,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Detailed Review Data
311,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Summary information and metrics for listings i...
312,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Summary Review data and Listing ID (to facilit...


In [7]:
# Report which requested cities were and were not found in the scraped table
matches = specific_data['Country/City'].unique() # Distinct cities present in the subset
print('Data found for:', matches)
# Only requested cities can be missing, so take a one-way difference rather than a
# symmetric one. Walking cities_list keeps the report in the order the cities were
# requested; dict.fromkeys drops repeated entries while preserving that order.
found = set(matches)
print('No Data found for:', [city for city in dict.fromkeys(cities_list) if city not in found])

Data found for: ['Los Angeles' 'New York City']
No Data found for: []


# Download Listings, Calendar, Review data

In [8]:
# Subset to the URLs of interest: the detailed listings, calendar and review files.
# na=False treats a missing description as "no match" instead of producing a
# mask with missing values.
detailed_mask = specific_data['Description'].str.contains(
    r'Detailed (?:Listings|Calendar|Review)', na=False)
# Filter first, then copy: only the kept rows are duplicated, and the column added
# below lands on an independent frame rather than on a slice of a temporary copy.
to_download = specific_data.loc[detailed_mask].copy()
# expand=False returns a Series for the single capture group instead of a
# one-column dataframe.
to_download['DataType'] = to_download['Description'].str.extract(
    r'(Listings|Calendar|Review)', expand=False) # Flag to use for storing data
to_download.head()

Unnamed: 0,Data Date,Country/City,Data URL,Description,DataType
308,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Detailed Listings data,Listings
309,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Detailed Calendar Data,Calendar
310,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Detailed Review Data,Review
427,"05 March, 2022",New York City,http://data.insideairbnb.com/united-states/ny/...,Detailed Listings data,Listings
428,"05 March, 2022",New York City,http://data.insideairbnb.com/united-states/ny/...,Detailed Calendar Data,Calendar


In [9]:
# Download every selected file and group the resulting dataframes by data type
start = time.time() # For timing purposes
listings, calendar, reviews = [], [], [] # One list of dataframes per data type
chunk_size = 5000 # Number of rows to read in at once
# Routes each DataType flag to the list that collects its dataframes
frames_by_type = {'Listings': listings, 'Calendar': calendar, 'Review': reviews}
for idx, row in to_download.iterrows():
    # Pandas allows reading directly from the url link, here in chunks.
    # The context manager closes the underlying reader even when a chunk fails
    # to download or parse part-way through.
    with pd.read_csv(row['Data URL'], chunksize=chunk_size, compression='gzip') as raw_data:
        combined_data = pd.concat(raw_data) # Concat all chunks
    combined_data['Country_City'] = row['Country/City'] # Add identifier
    combined_data['Data Date'] = row['Data Date'] # Date identifier for data
    target = frames_by_type.get(row['DataType'])
    if target is not None: # A row with an unrecognised flag is skipped
        target.append(combined_data)

# Combine to single df per data type
listings_df = pd.concat(listings)
calendar_df = pd.concat(calendar)
reviews_df = pd.concat(reviews)
print('Data read completed in:', str(time.time() - start), 'sec')

Data read completed in: 182.50235271453857 sec


In [21]:
# Write data to parquet datasets for use in model
# exist_ok=True creates the data folder only when it is missing, without a
# separate existence check.
os.makedirs('./Data/', exist_ok=True)

# Generate pyarrow tables
listings_pq = pa.Table.from_pandas(listings_df)
calendar_pq = pa.Table.from_pandas(calendar_df)
reviews_pq = pa.Table.from_pandas(reviews_df)

# Write each table to its own dataset folder, all with the same partitioning.
# NOTE(review): re-running this cell may add further files to existing partitions
# rather than replacing them - confirm against the pyarrow version in use.
for table, root_path in [
    (listings_pq, './Data/listings/'),
    (calendar_pq, './Data/calendar/'),
    (reviews_pq, './Data/reviews/'),
]:
    pq.write_to_dataset(table, root_path=root_path,
                        partition_cols=['Country_City', 'Data Date'],
                        use_legacy_dataset=False)