In [33]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
import seaborn as sns

In [34]:
# Load the listings data from 'listings.csv' (a path relative to the
# notebook's working directory) into a DataFrame.
listings = pd.read_csv('listings.csv')
# Display (rows, columns) as a quick sanity check on the load.
listings.shape

(3818, 92)

In [37]:
# Reduce the data to the listing id, the cleaning fee and the two review scores.
listings_cleaning = listings[['id','cleaning_fee','review_scores_cleanliness','review_scores_rating']]
# Drop every row that has a NaN in any of the kept columns
# (1408 rows were dropped in the recorded run).
listings_cleaning_dropped = listings_cleaning.dropna()
# Convert the fee from a string to a float by stripping '$' and ',' characters.
# Work on a copy so the dropped-NaN frame keeps its original string column.
listings_cleaning_scores = listings_cleaning_dropped.copy()
# The pattern is a raw string: '\$' in a normal string literal is an invalid
# escape sequence, which Python warns about and will eventually reject.
listings_cleaning_scores['cleaning_fee'] = (
    listings_cleaning_dropped['cleaning_fee']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)
listings_cleaning_scores

Unnamed: 0,id,cleaning_fee,review_scores_cleanliness,review_scores_rating
1,953595,40.0,10.0,96.0
2,3308979,300.0,10.0,97.0
4,278830,125.0,9.0,92.0
5,5956968,40.0,10.0,95.0
7,856550,25.0,10.0,97.0
...,...,...,...,...
3809,6120046,75.0,7.0,80.0
3810,262764,85.0,8.0,92.0
3811,8578490,20.0,8.0,100.0
3813,8101950,230.0,10.0,80.0


In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column, not only the cleaning fee. `id` is an identifier, so its statistics
# carry no meaning and can be ignored.
listings_cleaning_scores.describe()

Unnamed: 0,id,cleaning_fee,review_scores_cleanliness,review_scores_rating
count,2410.0,2410.0,2410.0,2410.0
mean,4923788.0,60.956432,9.585477,94.5361
std,2778773.0,48.418408,0.75821,6.515599
min,4291.0,5.0,4.0,20.0
25%,2591475.0,25.0,9.0,93.0
50%,5175170.0,50.0,10.0,96.0
75%,7346451.0,80.0,10.0,99.0
max,9995551.0,300.0,10.0,100.0


In [39]:
# Bucket each listing's cleaning fee into four ordered price bands.
# Bin edges are right-inclusive, and include_lowest=True also admits a fee of 0:
#   [0, 50]    -> 'low'
#   (50, 100]  -> 'mid'
#   (100, 150] -> 'high'
#   (150, 300] -> 'very high'
# pd.cut yields NaN for any fee that falls outside these bins.
# .assign returns a new frame, so listings_cleaning_scores itself is unchanged.
cleaning_categorized = listings_cleaning_scores.assign(
    fees_categorized=pd.cut(
        listings_cleaning_scores['cleaning_fee'],
        bins=[0, 50, 100, 150, 300],
        labels=['low', 'mid', 'high', 'very high'],
        include_lowest=True,
    )
)
cleaning_categorized

Unnamed: 0,id,cleaning_fee,review_scores_cleanliness,review_scores_rating,fees_categorized
1,953595,40.0,10.0,96.0,low
2,3308979,300.0,10.0,97.0,very high
4,278830,125.0,9.0,92.0,high
5,5956968,40.0,10.0,95.0,low
7,856550,25.0,10.0,97.0,low
...,...,...,...,...,...
3809,6120046,75.0,7.0,80.0,mid
3810,262764,85.0,8.0,92.0,mid
3811,8578490,20.0,8.0,100.0,low
3813,8101950,230.0,10.0,80.0,very high


In [42]:
# Average cleanliness score and overall rating for each cleaning-fee category.
# The grouping key is categorical, so `observed` is passed explicitly:
# observed=False keeps the current behaviour (every declared category gets a
# row) and avoids the FutureWarning newer pandas emits when it is omitted.
categories_average_scores = (
    cleaning_categorized
    .groupby(['fees_categorized'], observed=False)
    [['review_scores_cleanliness', 'review_scores_rating']]
    .mean()
)
categories_average_scores.head()

Unnamed: 0_level_0,review_scores_cleanliness,review_scores_rating
fees_categorized,Unnamed: 1_level_1,Unnamed: 2_level_1
low,9.56044,94.744322
mid,9.597641,94.137615
high,9.686047,94.645349
very high,9.654545,94.545455
