# Imports

In [109]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings as warn
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings raised by the cells below -- consider narrowing the filter.
warn.filterwarnings("ignore")

# Gathering Data

In [110]:
# Load the raw listings export from the notebook-relative data folder.
# A forward slash is used as the separator because it is accepted on Windows,
# macOS and Linux alike, whereas a backslash is only a separator on Windows
# (elsewhere it would be treated as part of the file name).
new_york = pd.read_csv("data/NY_Listings.csv", encoding="ISO-8859-1")
# Preview the first two rows to check that the file parsed as expected.
new_york.head(2)

Unnamed: 0,Listing ID,Name,Host ID,Host Name,Host Response Rate,Host Is Superhost,Host total listings count,Street,City,Neighbourhood cleansed,...,Number of reviews,Last Review Date,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Reviews per month
0,2515,Sunny Private Room,16286162,Pat,1.0,False,4.0,Bronx| NY| United States,Bronx,Allerton,...,66,8/25/17,96,10,9,10,10,9,10,1.77
1,2539,Comfy bedroom minutes to Manhattan,44260966,Alicia,1.0,False,1.0,Bronx| NY| United States,Bronx,Soundview,...,38,9/8/17,89,10,9,9,10,9,9,1.54


# Cleaning

In [111]:
# Columns that must be strictly positive for a listing to be kept. Rows where
# any of them is missing are removed as well, because NaN > 0 is False.
positive_columns = [
    "Number of reviews",
    "Availability 365",
    "Host total listings count",
    "Review Scores Accuracy",
    "Review Scores Cleanliness",
    "Review Scores Checkin",
    "Review Scores Communication",
    "Review Scores Location",
    "Review Scores Value",
]

# Columns removed from the working frame.
unused_columns = [
    "Last Review Date",
    "Maximum nights",
    "Minimum nights",
    "Amenities",
    "longitude",
    "latitude",
    "State",
    "Country",
    "City",
    "Street",
    "Host Response Rate",
    "Host Name",
    "Name",
]

# One combined mask replaces nine successive re-slices of the frame.
has_positive_values = new_york[positive_columns].gt(0).all(axis=1)

# drop(columns=...) returns a new, independent frame, so later column
# assignments on new_york do not write into a slice of the raw data
# (which is what the former inplace=True drops on a filtered slice risked).
new_york = new_york.loc[has_positive_values].drop(columns=unused_columns)


### Making the graphs

Reviews correlation graph

In [112]:
# Collect the columns whose name mentions "review" in the singular only,
# skipping any whose name contains the plural "reviews".
review_cols = []
for column_name in new_york.columns:
    lowered_name = column_name.lower()
    if "reviews" in lowered_name:
        continue
    if "review" in lowered_name:
        review_cols.append(column_name)

# Pairwise correlation matrix between the selected columns.
corr_between_reviews = new_york.loc[:, review_cols].corr()

# Show how each column correlates with the overall rating; the rating's
# correlation with itself is removed from the displayed result.
rating_correlations = corr_between_reviews.loc[:, "Review Scores Rating"]
rating_correlations.drop(labels="Review Scores Rating")


Review Scores Accuracy         0.741678
Review Scores Cleanliness      0.696431
Review Scores Checkin          0.590115
Review Scores Communication    0.631062
Review Scores Location         0.445268
Review Scores Value            0.748207
Name: Review Scores Rating, dtype: float64

Price groups graph

In [114]:
# Labels for the 20 quantile-based price buckets: "0-5%", "5-10%", ..., "95-100%".
price_bucket_labels = [f"{low}-{low + 5}%" for low in range(0, 100, 5)]

# Assign every listing to one of the 20 price quantile buckets.
new_york["price_group"] = pd.qcut(new_york["Price"], q=20, labels=price_bucket_labels)

# Mean "value" review score per price bucket. `observed` is passed explicitly
# because price_group is categorical: relying on the implicit default emits a
# deprecation warning and its meaning differs between pandas versions.
new_york.groupby("price_group", observed=False)["Review Scores Value"].mean()

Unnamed: 0,Listing ID,Host ID,Host Is Superhost,Host total listings count,Neighbourhood cleansed,Property type,Room type,Accommodates,Bathrooms,Bedrooms,...,Number of reviews,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Reviews per month,price_group
0,2515,16286162,False,4.0,Allerton,House,Private room,1.0,1.0,1.0,...,66,96,10,9,10,10,9,10,1.77,5-10%
1,2539,44260966,False,1.0,Soundview,House,Private room,1.0,1.0,1.0,...,38,89,10,9,9,10,9,9,1.54,0-5%
2,2595,105394139,False,16.0,Fordham,House,Private room,4.0,3.0,2.0,...,18,90,9,9,10,9,9,9,3.83,30-35%
3,3330,104262517,False,1.0,Fordham,Apartment,Entire home/apt,4.0,1.0,1.0,...,7,85,9,10,8,9,9,9,0.67,60-65%
4,3647,58126473,False,1.0,Fordham,Apartment,Private room,2.0,1.0,1.0,...,56,95,10,10,10,10,9,10,3.70,15-20%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44301,21176357,5949541,True,1.0,Grant City,House,Entire home/apt,3.0,1.0,1.0,...,18,99,10,10,10,10,9,10,3.44,30-35%
44303,21176433,2265022,False,1.0,Mariners Harbor,Apartment,Entire home/apt,6.0,2.0,3.0,...,11,95,9,9,10,10,9,10,0.40,65-70%
44305,21177066,6319915,False,2.0,St. George,Apartment,Entire home/apt,2.0,1.0,1.0,...,56,100,10,10,10,10,10,10,2.25,20-25%
44306,21177156,1715301,True,2.0,St. George,House,Entire home/apt,6.0,1.0,2.0,...,27,98,10,10,10,10,10,10,4.76,40-45%


Property types graph

In [117]:
# Number of listings per property type.
relevent_property_types = new_york["Property type"].value_counts()

# Property types that occur in at least 10 listings.
frequent_type_names = relevent_property_types.loc[relevent_property_types.ge(10)].index

# Keep only the listings that belong to one of those property types.
is_frequent_type = new_york["Property type"].isin(frequent_type_names)
relevent_apartments = new_york.loc[is_frequent_type]

# Mean overall rating per remaining property type.
relevent_apartments.groupby("Property type")["Review Scores Rating"].mean()

Property type
Apartment          93.566119
Bed & Breakfast    93.010417
Boutique hotel     94.384615
Bungalow           92.785714
Condominium        95.387755
Dorm               88.347826
Guest suite        95.878049
Guesthouse         91.000000
Hostel             85.857143
House              92.993743
Loft               95.089580
Other              90.821839
Timeshare          91.900000
Townhouse          94.578045
Villa              90.000000
Name: Review Scores Rating, dtype: float64

Bedrooms and Bathrooms graph

In [128]:
# Mean overall rating and number of listings for every
# (Bedrooms, Bathrooms) combination.
bedrooms_and_bathrooms = new_york.groupby(["Bedrooms", "Bathrooms"]).agg(
    Average_Rating = ("Review Scores Rating", "mean"),
    Amount = ("Review Scores Rating", "size")
)
# Keep combinations with at least 10 listings, best rated first. The filter
# must use "Amount" -- the name given to the size column in the aggregation
# above; the previous "Count" key does not exist and raised a KeyError.
bedrooms_and_bathrooms[bedrooms_and_bathrooms["Amount"] >= 10].sort_values("Average_Rating", ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Average_Rating,Count
Bedrooms,Bathrooms,Unnamed: 2_level_1,Unnamed: 3_level_1
3.0,3.5,97.923077,13
2.0,3.0,96.5,10
4.0,3.0,96.310345,29
3.0,3.0,96.051282,39
3.0,2.5,95.95,60
4.0,2.5,95.583333,36
2.0,2.5,94.521739,46
2.0,2.0,94.275488,461
2.0,1.5,94.198758,161
4.0,3.5,94.105263,19


Neighbourhoods & Locations graph

In [139]:
# Per neighbourhood: mean location score and number of listings.
neighbourhood_location_rating = new_york.groupby("Neighbourhood cleansed").agg(
    Average_Score=pd.NamedAgg(column="Review Scores Location", aggfunc="mean"),
    Amount=pd.NamedAgg(column="Review Scores Location", aggfunc="size"),
)

# Show only neighbourhoods with at least 10 listings, highest score first.
has_enough_listings = neighbourhood_location_rating["Amount"].ge(10)
neighbourhood_location_rating.loc[has_enough_listings].sort_values(
    by="Average_Score", ascending=False
)

Unnamed: 0_level_0,Average_Score,Amount
Neighbourhood cleansed,Unnamed: 1_level_1,Unnamed: 2_level_1
Battery Park City,10.000000,17
Flatiron District,9.979167,48
NoHo,9.978261,46
Cobble Hill,9.957447,47
Greenwich Village,9.943128,211
...,...,...
Borough Park,8.564103,39
Brownsville,8.515152,33
Morris Heights,8.416667,12
East New York,8.394958,119


Superhost graphs

In [141]:
# Number of listings for each value of the superhost flag.
host_types = new_york.groupby("Host Is Superhost").size()

# Express each group's size as a percentage of all counted listings.
total_listings = host_types.sum()
host_types.div(total_listings).mul(100)


Host Is Superhost
False    82.39098
True     17.60902
dtype: float64

In [143]:
# Mean overall rating for each value of the superhost flag.
# Note: this rebinds host_types, which previously held the group sizes.
host_types = new_york["Review Scores Rating"].groupby(new_york["Host Is Superhost"]).mean()

In [142]:
# Display the column labels currently present on new_york.
new_york.columns

Index(['Listing ID', 'Host ID', 'Host Is Superhost',
       'Host total listings count', 'Neighbourhood cleansed', 'Property type',
       'Room type', 'Accommodates', 'Bathrooms', 'Bedrooms', 'Price',
       'Availability 365', 'Calendar last scraped', 'Number of reviews',
       'Review Scores Rating', 'Review Scores Accuracy',
       'Review Scores Cleanliness', 'Review Scores Checkin',
       'Review Scores Communication', 'Review Scores Location',
       'Review Scores Value', 'Reviews per month', 'price_group'],
      dtype='object')