In [157]:
import requests
import re
import pandas as pd
from pprint import pprint
from bs4 import BeautifulSoup

In [158]:
# Cities whose datasets should be kept from the scraped table.
# Edit this list to target other locations.
cities_list = [
    'New York City',
    'Los Angeles',
]

In [159]:
# Download the page that lists every available dataset.
# A timeout keeps the cell from hanging forever if the server never answers,
# and raise_for_status() stops the notebook on an HTTP error (4xx/5xx) instead
# of letting later cells parse an error page as if it were the data table.
r = requests.get('http://insideairbnb.com/get-the-data/', timeout=30)
r.raise_for_status()

In [160]:
# Parse the downloaded HTML and collect the cell values of every table row.
# The parser is named explicitly so the result does not depend on which
# optional parser libraries happen to be installed (and to avoid bs4's
# "no parser was explicitly specified" warning).
soup = BeautifulSoup(r.text, 'html.parser')
nodes = soup.find_all('tr')
data_list = []  # One inner list of cell values per data row
for node in nodes:  # Walk through all table rows
    columns = node.find_all('td')  # All data cells of this row
    if not columns:
        continue  # Rows without any <td> cell carry no data; skip them
    node_data = []  # Values collected for this specific row
    for c in columns:
        link = c.a  # First <a> tag inside the cell, or None if there is none
        # Check for the link explicitly instead of wrapping the lookup in a
        # bare `except`, which would also hide unrelated errors.
        if link is not None and link.has_attr('href'):
            node_data.append(link['href'])  # Cell holds a link: keep the URL
        else:
            node_data.append(c.text)  # Otherwise keep the cell's text value
    data_list.append(node_data)

In [161]:
# Turn the scraped rows into a single table with readable column headers
col_names = [
    'Data Date',
    'Country/City',
    'Data URL',
    'Description',
]
all_data = pd.DataFrame(data=data_list, columns=col_names)
all_data.head()

Unnamed: 0,Data Date,Country/City,Data URL,Description
0,"08 March, 2022",Amsterdam,http://data.insideairbnb.com/the-netherlands/n...,Detailed Listings data
1,"08 March, 2022",Amsterdam,http://data.insideairbnb.com/the-netherlands/n...,Detailed Calendar Data
2,"08 March, 2022",Amsterdam,http://data.insideairbnb.com/the-netherlands/n...,Detailed Review Data
3,"08 March, 2022",Amsterdam,http://data.insideairbnb.com/the-netherlands/n...,Summary information and metrics for listings i...
4,"08 March, 2022",Amsterdam,http://data.insideairbnb.com/the-netherlands/n...,Summary Review data and Listing ID (to facilit...


In [162]:
# Keep only the rows whose city is one of those requested in cities_list
city_mask = all_data['Country/City'].isin(cities_list)
specific_data = all_data.loc[city_mask]
specific_data.head()

Unnamed: 0,Data Date,Country/City,Data URL,Description
308,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Detailed Listings data
309,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Detailed Calendar Data
310,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Detailed Review Data
311,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Summary information and metrics for listings i...
312,"08 March, 2022",Los Angeles,http://data.insideairbnb.com/united-states/ca/...,Summary Review data and Listing ID (to facilit...


In [164]:
# Report which requested cities were and were not present in the scraped data
matches = specific_data['Country/City'].unique()  # Distinct city names that matched
print('Data found for:', matches)
# A city is "missing" when it was requested but never matched. Filtering the
# requested cities directly says exactly that (the previous symmetric
# difference would also have listed any matched-but-not-requested name), and
# iterating in request order keeps the report stable between runs.
found_cities = set(matches)
missing_cities = [city for city in dict.fromkeys(cities_list) if city not in found_cities]
print('No Data found for:', missing_cities)

Data found for: ['Los Angeles' 'New York City']
No Data found for: []
