In [35]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'browser'

In [36]:
%matplotlib inline
df = pd.read_csv('data/listings 2.csv')

In [37]:
cols_to_keep=[
    'id','name','host_id','host_name','host_is_superhost','host_listings_count',
    'price', 'number_of_reviews','review_scores_rating','availability_365','room_type',
    'latitude','longitude'
]
df=df[cols_to_keep].copy()

In [38]:
#data cleaning
# Clean price column
df['price'] = df['price'].replace('[\$,]', '', regex=True)
df = df[df['price'].notnull() & (df['price'] != '')]
df['price'] = df['price'].astype(float)
df['reviews_per_listing'] = df['number_of_reviews'] / df['host_listings_count'].replace(0, np.nan)

df['price_per_review'] = df['price'] / df['number_of_reviews'].replace(0, np.nan)

# Estimate reviews per month (if reviews_per_month column doesn't exist)
df['estimated_reviews_per_month'] = df['number_of_reviews'] / 12
# Drop missing review scores
df.dropna(subset=['price', 'review_scores_rating'], inplace=True)

# Map superhost values to 'Yes'/'No'
df['host_is_superhost'] = df['host_is_superhost'].map({'t': 'Yes', 'f': 'No'})


In [39]:

#  Price Distribution by Superhost Status
fig = px.box(
    df,
    x='host_is_superhost',
    y='price',
    color='host_is_superhost',
    points='all',
    title='Price Distribution by Superhost Status',
    labels={'host_is_superhost': 'Superhost', 'price': 'Price ($)'}
)
fig.update_layout(showlegend=False, template='plotly_white')
fig.show()


In [40]:
# Price per Review (Efficiency)
fig = px.box(
    df,
    x='host_is_superhost',
    y='price_per_review',
    color='host_is_superhost',
    title='Price Per Review by Superhost Status',
    labels={'host_is_superhost': 'Superhost', 'price_per_review': 'Price per Review ($)'},
    points='all'
)
fig.update_layout(showlegend=False, template='plotly_white')
fig.show()

In [41]:
#Review Score by Superhost Status

fig = px.box(
    df,
    x='host_is_superhost',
    y='review_scores_rating',
    color='host_is_superhost',
    points='all',
    title='Review Scores by Superhost Status',
    labels={'host_is_superhost': 'Superhost', 'review_scores_rating': 'Review Rating'}
)
fig.update_layout(showlegend=False, template='plotly_white')
fig.show()



In [42]:
# Number of Reviews by Superhost Status
fig = px.box(
    df,
    x='host_is_superhost',
    y='number_of_reviews',
    color='host_is_superhost',
    points='all',
    title='Number of Reviews by Superhost Status',
    labels={'host_is_superhost': 'Superhost', 'number_of_reviews': 'Number of Reviews'}
)
fig.update_layout(showlegend=False, template='plotly_white')
fig.show()

In [43]:
df_map = df[['latitude', 'longitude', 'host_is_superhost', 'price', 'room_type']].dropna().copy()
fig = px.scatter_mapbox(
    df_map,
    lat="latitude",
    lon="longitude",
    color="host_is_superhost",
    hover_data=["price", "room_type"],
    zoom=11,
    height=500,
    mapbox_style="carto-positron",
    title="Airbnb Listings by Superhost Status (Map View)"
)

fig.show()



*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



In [44]:
print(df.columns.tolist())

['id', 'name', 'host_id', 'host_name', 'host_is_superhost', 'host_listings_count', 'price', 'number_of_reviews', 'review_scores_rating', 'availability_365', 'room_type', 'latitude', 'longitude', 'reviews_per_listing', 'price_per_review', 'estimated_reviews_per_month']
