# Listings

In [5]:
import pandas as pd

df_2020_03 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2020-03-16/data/listings.csv.gz', compression='gzip')
df_2019_18 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2019-10-18/data/listings.csv.gz', compression='gzip')
df_2018_10 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2018-10-11/data/listings.csv.gz', compression='gzip')
df_2017_10 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2017-10-06/data/listings.csv.gz', compression='gzip')
df_2016_09 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2016-09-07/data/listings.csv.gz', compression='gzip')
df_2015_10 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2015-10-03/data/listings.csv.gz', compression='gzip')

df_original_listings = pd.concat([df_2020_03, df_2019_18, df_2018_10, df_2017_10, df_2016_09, df_2015_10]).drop_duplicates('id')
print('Len total =', len(df_original_listings))

df_listings = df_original_listings
df_listings = df_listings.replace('nan', '', regex=True)
df_listings['summary'] = df_listings['summary'].astype(str)
df_listings['space'] = df_listings['space'].astype(str)
df_listings['description'] = df_listings['description'].astype(str)
df_listings['neighborhood_overview'] = df_listings['neighborhood_overview'].astype(str)
df_listings['host_neighbourhood'] = df_listings['host_neighbourhood'].astype(str)
df_listings['neighbourhood_cleansed'] = df_listings['neighbourhood_cleansed'].astype(str)
df_listings['notes'] = df_listings['notes'].astype(str)
df_listings['review_scores_rating'] = df_listings['review_scores_rating'].astype(float)
df_listings = df_listings[df_listings['review_scores_rating'].notna()]
print(len(df_listings))
df_listings['review_scores_rating']

Len total = 12480
9684


0        99.0
1        95.0
2        96.0
3        93.0
4        95.0
        ...  
2552     96.0
2554     99.0
2555     99.0
2556    100.0
2557     87.0
Name: review_scores_rating, Length: 9684, dtype: float64

In [33]:
from collections import Counter
from itertools import chain
import itertools
import string
import re
import spacy

# Download the natural language dataset for english, if this is the first run
# !python -m spacy download en_core_web_sm

# Load spacy
nlp = spacy.load('en_core_web_sm')

# Add in common words that occurr in all listings, that really don't mean anything useful here
nlp.Defaults.stop_words |= {'many', 'great', 'nearby', 'short', 'major', 'north', 'south', 'east', 'west', 'easy', 'true', 'local', 'public', 'mi', 'several', 'such', 'main', 'other'}

print('great' in nlp.Defaults.stop_words)

# Creates a Counter of adjectives used in the "neighborhood overview" field of all listings
# within the neighborhood name provided. 
def get_neighborhood_adjectives(name):
  
  # List of the "neighborhood_overview" field for all listings for this neighborhood
  overviews = df_listings.loc[df_listings['neighbourhood_cleansed'] == name]['neighborhood_overview']
  
  list_overviews = '  '.join([overview for overview in overviews])
    
  # Combine all overviews into one, and load it into spacy
  text = nlp(list_overviews)
  
  # Get the adjectives
  adjectives = [token.lemma_ for token in text if token.pos_ == "ADJ" and not token.is_stop]
  
  # Count the adjectives
  counter = Counter(adjectives)
  
  # Remove pronoun forms
  counter.pop('-PRON-', None)
  
  return (len(overviews), counter)

  
north_end = get_neighborhood_adjectives('North End')
print('Listings in north end:', north_end[0])
print('North end is:', north_end[1])

True
Listings in north end: 347
North end is: Counter({'italian': 184, 'historic': 120, 'good': 90, 'old': 75, 'true': 53, 'entire': 45, 'great': 42, 'narrow': 41, 'fine': 31, 'famous': 29, 'amazing': 27, 'residential': 26, 'american': 26, 'public': 25, 'close': 23, 'european': 19, 'easy': 19, 'short': 18, 'rich': 16, 'fresh': 16, 'accessible': 16, 'authentic': 15, 'beautiful': 15, 'yummy': 14, 'friendly': 13, 'safe': 13, 'perfect': 13, 'favorite': 13, 'notable': 13, 'local': 12, 'delicious': 12, 'vibrant': 12, 'little': 9, 'homemade': 9, 'baked': 9, 'incredible': 9, 'convenient': 9, 'countless': 9, 'square': 8, 'historical': 8, 'young': 8, 'wonderful': 7, 'quiet': 7, 'right': 7, 'lively': 7, 'unbeatable': 7, 'charming': 7, 'unique': 6, 'cozy': 6, 'new': 6, 'sure': 6, 'popular': 6, 'small': 6, 'brief': 6, 'bountiful': 6, 'breathtaking': 6, 'plenty': 6, 'colonial': 6, 'central': 5, 'colorful': 5, 'green': 5, 'walkable': 5, 'main': 5, 'endless': 4, 'ideal': 4, 'modern': 4, 'quaint': 4, '

In [30]:
# for neighborhood in df_listings['neighbourhood_cleansed'].unique():
#     ads = get_neighborhood_adjectives(neighborhood)
#     print(neighborhood, ':', ads[0])
#     print(neighborhood, 'is:', ads[1].most_common(5))

# Reviews

In [4]:
import pandas as pd

reviews_2020_03 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2020-03-16/data/reviews.csv.gz', compression='gzip')
reviews_2019_18 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2019-10-18/data/reviews.csv.gz', compression='gzip')
reviews_2018_10 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2018-10-11/data/reviews.csv.gz', compression='gzip')
reviews_2017_10 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2017-10-06/data/reviews.csv.gz', compression='gzip')
reviews_2016_09 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2016-09-07/data/reviews.csv.gz', compression='gzip')
reviews_2015_10 = pd.read_csv('http://data.insideairbnb.com/united-states/ma/boston/2015-10-03/data/reviews.csv.gz', compression='gzip')

df_reviews = pd.concat([reviews_2020_03, reviews_2019_18, reviews_2018_10, reviews_2017_10, reviews_2016_09, reviews_2015_10])
df_reviews['comments'] = df_reviews['comments'].astype(str)
print('Len total =', len(df_reviews))

Len total = 794377


# Vibe Factor Calculations

In [7]:
# Calculates the neighborhood accuracy statistic for a listing
# Neighborhood accuracy is the dot product between the neighborhood description
# and the reviews left for the place. This metric is greatest when the experience
# of the visitors most matched the vibe that was portrayed by the listing.

# Neighborhood accuracy is calculated as the sum of the number of adjectives that appear 
# in both the neighborhood description and in the reviews for a listing,
# weighted by the frequency by which those adjectives occur in all 
# neighborhood descriptions within that neighborhood.

# calc_neighborhood_accuracy(comments_list) takes in a list of strings as the comments
# for that listing. It returns a scalar neighborhood accuracy score.
def calc_neighborhood_accuracy(comments_list):
  neighborhood_accuracy = 0

  listing_comments = re.sub('[^\w\s]+', '', ' '.join(comments_list)).split()
  comment_freq = Counter(listing_comments)
  comment_words = list(comment_freq.keys())

  for i in range(len(adjectives)):
    if (adjectives[i] in comment_words):
      neighborhood_accuracy += frequencies[i] * comment_freq[adjectives[i]]

  rating = df_listings.loc[df_listings['id'] == listing_id]['review_scores_rating']    
  if len(rating) == 0:
    print('listing id:', listing_id, 'rating:', rating)    
  neighborhood_accuracy /= len(comments_list)

  return neighborhood_accuracy

# Calculates the mean sentiment of all reviews for a listing.
def calc_happiness(comments_list):
  

# Generate all data

In [19]:
def gen_data():
  vibedict = []
  for neighborhood in df_listings['neighbourhood_cleansed'].unique():

    # Primary adjectives and their frequencies, used by all listings within to describe this neighborhood
    neighborhood_adjectives = get_neighborhood_adjectives(neighborhood)[1]
    adjectives = list(neighborhood_adjectives)
    frequencies = [neighborhood_adjectives[adjective] for adjective in adjectives]

    # for each listing within this neighborhood
    for listing_id in df_listings.loc[df_listings['neighbourhood_cleansed'] == neighborhood]['id']:
      
      # List of comments for that listing
      comments_list = df_reviews.loc[df_reviews['listing_id'] == listing_id]['comments'].tolist()
      if len(comments_list) == 0:
        continue

      # Neighborhood accuracy metric ("Consistency")
      consistency = calc_neighborhood_accuracy(comments_list)

      # Average rating ("Enjoyability")
      enjoyability = df_listings.loc[df_listings['id'] == listing_id]['review_scores_rating'].values[0]
      
      # Number of ratings ("Popularity")
      popularity = len(comments_list)
      
      # Add this listing and its metrics to the dictionary
      vibedict.append({'listing_id': listing_id,
                       'lat': df_listings.loc[df_listings['id'] == listing_id]['latitude'].values[0],
                       'lon': df_listings.loc[df_listings['id'] == listing_id]['longitude'].values[0],
                       'enjoyability': enjoyability,
                       'consistency': consistency,
                       'popularity': popularity})

  vibetable = pd.DataFrame(columns=['listing_id', 'lat', 'lon', 'enjoyability',  'consistency', 'popularity'])
  vibetable = vibetable.append(vibedict, ignore_index=True)
  return vibetable

In [20]:
vibetable = gen_data()

In [21]:
import numpy as np

scaled_table = vibetable

perception_accuracy = vibetable['consistency']
perception_90 = np.quantile(perception_accuracy, .90)
scaled_table['consistency'] = vibetable['consistency'].div(perception_90).clip(upper=1)

num_reviews = vibetable['popularity']
reviews_90 = np.quantile(num_reviews, .90)
scaled_table['popularity'] = vibetable['popularity'].div(reviews_90).clip(upper=1)

ratings = vibetable['enjoyability']
ratings_90 = np.quantile(ratings, .90)
scaled_table['enjoyability'] = np.log(vibetable['enjoyability'].div(ratings_90).clip(upper=1))

In [22]:
scaled_table.to_csv("dist/data/vibetable.csv", index=False)
scaled_table.to_json("dist/data/vibetable.json", orient='index')

In [None]:
import matplotlib.pyplot as plt
plt.hist(scaled_table['rating'], bins=50)

In [None]:
df_listings.iloc[1]['id']

In [None]:
df_listings.loc[df_listings['id'] == 5506]['review_scores_rating'].values[0]

In [None]:
list(df_reviews)

In [None]:
df_reviews


In [None]:
len(df_reviews)