# Listings

In [2]:
import numpy as np
import pandas as pd

# The listings snapshots differ only by the scrape date embedded in the URL, so the
# date list is the single place to edit when adding or removing a snapshot.
# NOTE(review): the recorded run of this cell ended in "HTTP Error 403: Forbidden";
# these URLs may no longer be served as-is -- confirm before relying on them.
LISTINGS_URL_TEMPLATE = 'http://data.insideairbnb.com/united-states/ma/boston/{date}/data/listings.csv.gz'

# Newest first: drop_duplicates('id') below keeps the first occurrence of every id,
# so a listing present in several snapshots is represented by its most recent row.
LISTINGS_SNAPSHOT_DATES = ['2020-03-16', '2019-10-18', '2018-10-11', '2017-10-06', '2016-09-07', '2015-10-03']

listing_snapshots = [
  pd.read_csv(LISTINGS_URL_TEMPLATE.format(date=snapshot_date), compression='gzip')
  for snapshot_date in LISTINGS_SNAPSHOT_DATES
]

# The per-snapshot names are kept so any cell that refers to them keeps working.
# Note that df_2019_18 is the 2019-10-18 snapshot (its suffix is the day, not the month).
df_2020_03, df_2019_18, df_2018_10, df_2017_10, df_2016_09, df_2015_10 = listing_snapshots

df_original_listings = pd.concat(listing_snapshots).drop_duplicates('id')
print('Len total =', len(df_original_listings))

HTTPError: HTTP Error 403: Forbidden

In [None]:
# Free-text columns that are cast to plain strings for the text processing below.
TEXT_COLUMNS = ['summary', 'space', 'description', 'neighborhood_overview',
                'host_neighbourhood', 'neighbourhood_cleansed', 'notes']

# Work on a copy so the raw concatenated snapshots stay untouched and this cell
# gives the same result every time it is re-run.
df_listings = df_original_listings.copy()

# Missing text becomes '' instead of the literal string 'nan' that astype(str) would
# produce. The previous replace('nan', '', regex=True) did not achieve that: it ran
# before the cast (so real NaN values were untouched) and, being a regex substring
# replacement, it deleted "nan" from inside ordinary words in every string column.
df_listings[TEXT_COLUMNS] = df_listings[TEXT_COLUMNS].fillna('').astype(str)
df_listings['review_scores_rating'] = df_listings['review_scores_rating'].astype(float)

# Keep only the listings that actually have a review score.
df_listings = df_listings[df_listings['review_scores_rating'].notna()]
print(len(df_listings))
df_listings['review_scores_rating']

In [None]:
from collections import Counter
from itertools import chain
import itertools
import string
import re

from nltk.corpus import stopwords
import nltk
# Fetch the NLTK resources this notebook relies on: the stop-word corpus read just
# below and the perceptron tagger model (nltk.pos_tag is called further down).
for nltk_resource in ('stopwords', 'averaged_perceptron_tagger'):
  nltk.download(nltk_resource)

# English stop words as a set.
# NOTE(review): no cell visible here reads stop_words -- confirm whether it is still needed.
stop_words = {word for word in stopwords.words('english')}

# Words that are excluded from the overviews before tagging. Built once, as a set,
# instead of being re-created as a list for every overview.
USELESS_WORDS = frozenset([
  'many', 'great', 'nearby', 'short', 'major', 'north', 'south', 'east', 'west',
  'easy', 'true', 'local', 'public', 'mi', 'several', 'such', 'main', 'other',
])


def get_neighborhood_adjectives(name):
  """Count the adjectives used in the overviews of one neighbourhood.

  Reads the module-level ``df_listings`` frame, selects the listings whose
  ``neighbourhood_cleansed`` equals ``name`` and POS-tags each listing's
  ``neighborhood_overview`` text with NLTK.

  Args:
    name: value of ``neighbourhood_cleansed`` to select.

  Returns:
    A ``(listing_count, counter)`` tuple: the number of selected listings and a
    ``Counter`` mapping every word tagged ``'JJ'`` to the number of overviews it
    occurs in (a word repeated inside one overview is counted once).
  """
  neigh = df_listings.loc[df_listings['neighbourhood_cleansed'] == name]

  counter = Counter()
  for overview in neigh['neighborhood_overview']:
    # Strip punctuation *before* filtering, so tokens such as "great," or "nearby."
    # are recognised as useless words too. The pattern is a raw string because
    # '\w' in a normal string literal is an invalid escape sequence.
    words = re.sub(r'[^\w\s]+', '', overview).split()
    words = [word for word in words if word.lower() not in USELESS_WORDS]

    # FreqDist collapses repeated (word, tag) pairs, which is what limits each
    # adjective to one count per overview.
    tagged = nltk.FreqDist(nltk.pos_tag(words))
    counter.update(word for ((word, tag), _) in tagged.most_common() if tag == 'JJ')

  return (len(neigh), counter)

# north_end = get_neighborhood_adjectives('North End')
# print('In north end:', north_end[0])
# print('North end is:', north_end[1])

In [None]:
# for neighborhood in df_listings['neighbourhood_cleansed'].unique():
#     ads = get_neighborhood_adjectives(neighborhood)
#     print(neighborhood, ':', ads[0])
#     print(neighborhood, 'is:', ads[1].most_common(5))

# Reviews

In [None]:
import pandas as pd

# The reviews snapshots differ only by the scrape date embedded in the URL.
REVIEWS_URL_TEMPLATE = 'http://data.insideairbnb.com/united-states/ma/boston/{date}/data/reviews.csv.gz'

# Newest first, matching the order used for the listings snapshots.
REVIEWS_SNAPSHOT_DATES = ['2020-03-16', '2019-10-18', '2018-10-11', '2017-10-06', '2016-09-07', '2015-10-03']

review_snapshots = [
  pd.read_csv(REVIEWS_URL_TEMPLATE.format(date=snapshot_date), compression='gzip')
  for snapshot_date in REVIEWS_SNAPSHOT_DATES
]

# The per-snapshot names are kept so any cell that refers to them keeps working.
# Note that reviews_2019_18 is the 2019-10-18 snapshot (its suffix is the day, not the month).
reviews_2020_03, reviews_2019_18, reviews_2018_10, reviews_2017_10, reviews_2016_09, reviews_2015_10 = review_snapshots

# Keep one row per review, the same way the listings are de-duplicated by 'id':
# successive snapshots presumably repeat older reviews, which would otherwise be
# counted once per snapshot in the per-listing review counts.
# NOTE(review): assumes reviews.csv carries a unique review 'id' column -- confirm
# against list(df_reviews).
df_reviews = pd.concat(review_snapshots).drop_duplicates('id')

# Missing comments become '' rather than the literal string 'nan' that astype(str)
# would otherwise inject into the review text.
df_reviews['comments'] = df_reviews['comments'].fillna('').astype(str)
print('Len total =', len(df_reviews))

In [None]:
# Show the combined reviews frame as the cell output.
df_reviews

# Vibe Vectors

In [None]:
VIBE_COLUMNS = ['listing_id', 'lat', 'lon', 'rating', 'neighborhood_accuracy', 'num_reviews']


def neighborhood_accuracy_score(adjective_weights, comments_list):
  """Score how strongly a listing's reviews echo its neighbourhood's adjectives.

  The score is essentially a dot product between the neighbourhood description and
  the reviews: for every adjective, its weight in the neighbourhood is multiplied
  by the number of times the word occurs in the listing's reviews, and the sum is
  divided by the number of reviews.

  Args:
    adjective_weights: mapping of adjective -> weight (here: the Counter returned
      by get_neighborhood_adjectives).
    comments_list: non-empty list of review texts for one listing.

  Returns:
    The weighted overlap divided by ``len(comments_list)``.
  """
  # Raw-string pattern: '\w' in a normal string literal is an invalid escape sequence.
  words = re.sub(r'[^\w\s]+', '', ' '.join(comments_list)).split()
  comment_freq = Counter(words)
  # A Counter returns 0 for absent words, so no separate membership test is needed.
  weighted_overlap = sum(weight * comment_freq[adjective]
                         for adjective, weight in adjective_weights.items())
  return weighted_overlap / len(comments_list)


# Collect the comments of every listing once, instead of filtering the whole
# reviews frame again for each listing.
comments_by_listing = df_reviews.groupby('listing_id')['comments'].apply(list).to_dict()

vibedict = []

# sort=False keeps the neighbourhoods in order of first appearance.
for neighborhood, neighborhood_listings in df_listings.groupby('neighbourhood_cleansed', sort=False):
  neighborhood_adjectives = get_neighborhood_adjectives(neighborhood)[1]

  # Read id, coordinates and rating straight from the listing's own row rather
  # than looking the listing up again by id for every field.
  listing_fields = neighborhood_listings[['id', 'latitude', 'longitude', 'review_scores_rating']]
  for listing_id, lat, lon, rating in listing_fields.itertuples(index=False, name=None):
    comments_list = comments_by_listing.get(listing_id, [])
    if len(comments_list) == 0:
      # Listings without any review cannot be scored.
      continue

    vibedict.append({'listing_id': listing_id,
                     'lat': lat,
                     'lon': lon,
                     'rating': rating,
                     'neighborhood_accuracy': neighborhood_accuracy_score(neighborhood_adjectives, comments_list),
                     'num_reviews': len(comments_list)})

# Build the frame in one step; DataFrame.append no longer exists in pandas 2.x.
vibetable = pd.DataFrame(vibedict, columns=VIBE_COLUMNS)

In [None]:
def scale_to_quantile(values, q=0.95):
  """Divide ``values`` by their ``q``-quantile and cap the result at 1.

  Args:
    values: numeric pandas Series.
    q: quantile used as the divisor (0.95 by default).

  Returns:
    A new Series; entries at or above the quantile become 1.
  """
  return values.div(np.quantile(values, q)).clip(upper=1)


# Build a new frame instead of aliasing vibetable: with the alias, the assignments
# below also rewrote vibetable, so re-running the cell rescaled already-scaled data.
scaled_table = vibetable.assign(
    neighborhood_accuracy=scale_to_quantile(vibetable['neighborhood_accuracy']),
    num_reviews=scale_to_quantile(vibetable['num_reviews']),
    # The rating is additionally log-transformed after scaling, so its values are <= 0.
    rating=np.log(scale_to_quantile(vibetable['rating'])),
)



In [None]:
# Write the scaled table in both formats; the JSON file is keyed by row index.
csv_path = "dist/data/vibetable.csv"
json_path = "dist/data/vibetable.json"

scaled_table.to_csv(csv_path, index=False)
scaled_table.to_json(json_path, orient='index')

In [None]:
import matplotlib.pyplot as plt
# Distribution of the scaled rating column, drawn on an explicit Axes and labelled
# so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(scaled_table['rating'], bins=50)
ax.set(title='Distribution of scaled listing ratings',
       xlabel='log(rating / 95th-percentile rating, capped at 1)',
       ylabel='Number of listings');

In [None]:
# id of the row at positional index 1 of the listings frame.
df_listings.iloc[1]['id']

In [None]:
# Review score of the listing with id 5506; .values[0] raises IndexError when no
# row of df_listings has that id.
df_listings.loc[df_listings['id'] == 5506]['review_scores_rating'].values[0]

In [None]:
# Column labels of the reviews frame.
list(df_reviews)

In [None]:
# Show the reviews frame as the cell output.
df_reviews


In [None]:
# Number of rows in the reviews frame.
len(df_reviews)