# Key Insights
**1. Market Dynamics & Pricing Intelligence**
> * **Weekend vs Weekday Premium Analysis**
> * **Price Elasticity by City**
> * **Superhost Premium**
___________________________________________________________________________________________________________________________________________________
**2. Location Intelligence & Accessibility**
>  * **Metro Proximity ROI**
> * **City Center vs Periphery Analysis**
___________________________________________________________________________________________________________________________________________________
**3. Guest Experience & Revenue Optimization**
> * **Satisfaction-Price Correlation**
> * **Capacity Utilization**
> * **Cleanliness Premium**
___________________________________________________________________________________________________________________________________________________
**4. Market Segmentation & Business Intelligence**
> * **Business vs Leisure Demand**
> * **Room Type Profitability**
> * **Multi-listing Strategy**   

# Importing libraries

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px 
import numpy as np 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default="iframe_connected"

# Loading Data

In [2]:
# Cities covered by the dataset; each one ships as a "<city>_weekdays.csv"
# and a "<city>_weekends.csv" file in the working directory.
cities = [
    'amsterdam', 'athens', 'barcelona', 'berlin', 'budapest',
    'lisbon', 'london', 'paris', 'rome', 'vienna',
]

# Raw frames keyed by lower-case city name, one dictionary per day type.
weekend_data = {}
weekdays_data = {}

# Labelled copies of every raw frame, in city order (weekday first, then weekend).
all_data = []

for city in cities:
    # (value stored in the day_type column, file-name suffix, dictionary for the raw frame)
    for day_label, file_suffix, raw_store in (
        ('weekday', 'weekdays', weekdays_data),
        ('weekend', 'weekends', weekend_data),
    ):
        raw_frame = pd.read_csv(f"{city}_{file_suffix}.csv")
        raw_store[city] = raw_frame

        # Tag a copy so the raw frames in the dictionaries stay untouched.
        labelled = raw_frame.copy()
        labelled['city'] = city.title()
        labelled['day_type'] = day_label
        all_data.append(labelled)

# Stack all twenty frames into a single dataset with a fresh 0..n-1 index.
df = pd.concat(all_data, ignore_index=True)

In [3]:
# Preview the combined dataset (pandas truncates the display to the first and last rows).
df

Unnamed: 0.1,Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,...,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,city,day_type
0,0,194.033698,Private room,False,True,2.0,False,1,0,10.0,...,5.022964,2.539380,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,Amsterdam,weekday
1,1,344.245776,Private room,False,True,4.0,False,0,0,8.0,...,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,Amsterdam,weekday
2,2,264.101422,Private room,False,True,2.0,False,0,1,9.0,...,5.748312,3.651621,75.275877,3.985908,95.386955,6.646700,4.97512,52.36103,Amsterdam,weekday
3,3,433.529398,Private room,False,True,4.0,False,0,1,9.0,...,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,Amsterdam,weekday
4,4,485.552926,Private room,False,True,2.0,True,0,0,10.0,...,0.544738,0.318693,552.830324,29.272733,815.305740,56.811677,4.90051,52.37508,Amsterdam,weekday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51702,1794,715.938574,Entire home/apt,False,False,6.0,False,0,1,10.0,...,0.530181,0.135447,219.402478,15.712158,438.756874,10.604584,16.37940,48.21136,Vienna,weekend
51703,1795,304.793960,Entire home/apt,False,False,2.0,False,0,0,8.0,...,0.810205,0.100839,204.970121,14.678608,342.182813,8.270427,16.38070,48.20296,Vienna,weekend
51704,1796,637.168969,Entire home/apt,False,False,2.0,False,0,0,10.0,...,0.994051,0.202539,169.073402,12.107921,282.296424,6.822996,16.38568,48.20460,Vienna,weekend
51705,1797,301.054157,Private room,False,True,2.0,False,0,0,10.0,...,3.044100,0.287435,109.236574,7.822803,158.563398,3.832416,16.34100,48.19200,Vienna,weekend


# Exploring Data

In [4]:
# Count missing values per column.
df.isna().sum()
# Every count in the output is 0, so the dataset has no null values.

Unnamed: 0                    0
realSum                       0
room_type                     0
room_shared                   0
room_private                  0
person_capacity               0
host_is_superhost             0
multi                         0
biz                           0
cleanliness_rating            0
guest_satisfaction_overall    0
bedrooms                      0
dist                          0
metro_dist                    0
attr_index                    0
attr_index_norm               0
rest_index                    0
rest_index_norm               0
lng                           0
lat                           0
city                          0
day_type                      0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


Unnamed: 0.1,Unnamed: 0,realSum,person_capacity,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat
count,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0
mean,1620.502388,279.879591,3.161661,0.291353,0.350204,9.390624,92.628232,1.15876,3.191285,0.68154,294.204105,13.423792,626.856696,22.786177,7.426068,45.671128
std,1217.380366,327.948386,1.298545,0.45439,0.477038,0.954868,8.945531,0.62741,2.393803,0.858023,224.754123,9.807985,497.920226,17.804096,9.799725,5.249263
min,0.0,34.779339,2.0,0.0,0.0,2.0,20.0,0.0,0.015045,0.002301,15.152201,0.926301,19.576924,0.592757,-9.22634,37.953
25%,646.0,148.752174,2.0,0.0,0.0,9.0,90.0,1.0,1.453142,0.24848,136.797385,6.380926,250.854114,8.75148,-0.0725,41.39951
50%,1334.0,211.343089,3.0,0.0,0.0,10.0,95.0,1.0,2.613538,0.413269,234.331748,11.468305,522.052783,17.542238,4.873,47.50669
75%,2382.0,319.694287,4.0,1.0,1.0,10.0,99.0,1.0,4.263077,0.73784,385.756381,17.415082,832.628988,32.964603,13.518825,51.471885
max,5378.0,18545.450285,6.0,1.0,1.0,10.0,100.0,10.0,25.284557,14.273577,4513.563486,100.0,6696.156772,100.0,23.78602,52.64141


* **There are outliers in `realSum`, which represents the Airbnb price (the distribution is right-skewed).**
* **There are outliers in the distance from the city center (some listings are very far away and some are very close).**
* **There are outliers in the metro distance as well, just as for the city-center distance.**
 

In [6]:
# Summary of the object (string) columns: number of distinct values, most frequent value and its frequency.
df.describe(include="O")

Unnamed: 0,room_type,city,day_type
count,51707,51707,51707
unique,3,10,2
top,Entire home/apt,London,weekend
freq,32648,9993,26207


# Feature Engineering

**What will each person pay for an Airbnb in each city?**
**Price per person**

In [7]:

# Price each guest pays when the listing is filled to capacity:
# total listing price divided by the number of people it accommodates.
# (The operands were previously swapped, which yielded persons per unit of
# price instead of price per person.)
df["Price_Per_Person"] = df["realSum"] / df["person_capacity"]

**What is the price per bedroom? For a single-bedroom listing it is simply the listing price, but what if a family rents a larger place and each member pays for their own room?**

In [8]:
# Listing price divided by the number of bedrooms.
# Some listings report 0 bedrooms (the minimum in df.describe() is 0), and
# dividing by zero would produce +inf, which distorts any later mean/plot.
# Those rows get NaN instead, so aggregations skip them by default.
df["Price_Per_Bedroom"] = df["realSum"] / df["bedrooms"].where(df["bedrooms"] > 0)

**Is the listing central (close to the city center) or not?**

In [9]:
# Flag listings in the closest 25% by distance to the city center.
# The cutoff must stay a float: casting the quantile itself to int (as the
# previous version did) truncated it and silently moved the threshold.
df["Is_Central"] = df["dist"] <= df["dist"].quantile(0.25)

**As we did for the city center, we do the same for the metro.**

In [10]:
# Flag listings in the closest 25% by distance to the nearest metro station.
# The cutoff must stay a float: the 25th percentile of metro_dist is below 1,
# so casting it to int (as the previous version did) turned the threshold
# into 0 and made the flag False for every listing.
df["Is_metro_Close"] = df["metro_dist"] <= df["metro_dist"].quantile(0.25)

**Mapping the level of satisfaction**

In [11]:
# Bucket the overall guest satisfaction score into four ordered bands.
# pd.cut uses right-closed intervals by default:
# (0, 80] -> Low, (80, 90] -> Medium, (90, 95] -> High, (95, 100] -> Excellent.
satisfaction_edges = [0, 80, 90, 95, 100]
satisfaction_names = ["Low", "Medium", "High", "Excellent"]
df["Satisfaction_Level"] = pd.cut(
    df["guest_satisfaction_overall"],
    bins=satisfaction_edges,
    labels=satisfaction_names,
)

# EDA

In [12]:
# Histogram of listing prices (realSum) in 50 bins to show the overall shape
# of the price distribution.
histogram_options = dict(
    x="realSum",
    nbins=50,
    title="Distribution of RealSum Values",
)
fig = px.histogram(df, **histogram_options)
fig.show()


**Very obvious right skewness**

> **Weekend vs Weekday Analysis**

In [13]:
# Mean price and listing count for every (city, day_type) pair.
weekend_analysis = (
    df.groupby(['city', 'day_type'])['realSum']
    .agg(['mean', 'count'])
    .reset_index()
)

# One row per city with the weekday and weekend mean side by side, plus the
# weekend premium: percentage change of the weekend mean relative to the
# weekday mean, rounded to two decimals. Cities are ordered from the largest
# premium to the smallest.
weekend_pivot = (
    weekend_analysis
    .pivot(index='city', columns='day_type', values='mean')
    .assign(
        weekend_premium=lambda prices: (
            (prices['weekend'] - prices['weekday']) / prices['weekday'] * 100
        ).round(2)
    )
    .sort_values('weekend_premium', ascending=False)
)
weekend_pivot



day_type,weekday,weekend,weekend_premium
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amsterdam,545.020526,604.828018,10.97
Budapest,168.429367,185.120628,9.91
Barcelona,288.391667,300.27794,4.12
Berlin,240.220422,249.252516,3.76
Rome,201.618053,209.130063,3.73
Lisbon,236.345459,240.044051,1.56
London,360.230348,364.389747,1.15
Vienna,240.384834,242.739524,0.98
Paris,398.786678,387.028589,-2.95
Athens,155.866982,147.580456,-5.32


In [14]:

# Move the city index back into a regular column so plotly can use it as x.
weekend_pivot = weekend_pivot.reset_index()

# Layout settings shared by the premium bar chart.
premium_layout = dict(
    xaxis_title="City",
    yaxis_title="Weekend Premium (%)",
    title_font=dict(size=14, family="Arial", color="black"),
    xaxis_tickangle=-45,
    yaxis_gridcolor='rgba(0,0,0,0.3)',
    template='plotly_white',
)

# One bar per city showing the weekend premium in percent.
fig = px.bar(
    weekend_pivot,
    x="city",
    y="weekend_premium",
    title="Weekend Premium by City (%)",
    color_discrete_sequence=["blue"],
)
fig.update_layout(**premium_layout)
fig.show()

> **Super Hosting**

In [15]:
# Per (city, superhost flag): mean price and listing count, plus mean
# satisfaction and mean cleanliness, all rounded to two decimals.
superhost_metrics = {
    'realSum': ["mean", "count"],
    'guest_satisfaction_overall': 'mean',
    'cleanliness_rating': 'mean',
}
super_host_comp = df.groupby(['city', 'host_is_superhost']).agg(superhost_metrics).round(2)

# Superhost premium per city: percentage difference between the mean price of
# superhost listings and the mean price of regular listings.
super_host_premium = {}
for city in df['city'].unique():
    city_data = df[df['city'] == city]
    city_prices = city_data['realSum']
    superhost_flag = city_data['host_is_superhost']

    superhost_price = city_prices[superhost_flag.eq(True)].mean()
    regular_price = city_prices[superhost_flag.eq(False)].mean()

    super_host_premium[city] = ((superhost_price - regular_price) / regular_price) * 100

# Cities ordered from the largest premium to the smallest.
superhost_premium_series = pd.Series(super_host_premium).sort_values(ascending=False)



In [16]:

# Turn the sorted premium series into a two-column frame for plotting.
df_superhost = superhost_premium_series.reset_index()
df_superhost.columns = ['City', 'Superhost Premium (%)']

# Layout settings for the superhost premium bar chart.
superhost_layout = dict(
    xaxis_title='City',
    yaxis_title='Superhost Premium (%)',
    title_font=dict(size=14, family='Arial', color='black'),
    xaxis_tickangle=-45,
    yaxis_gridcolor='rgba(0,0,0,0.3)',
    template='plotly_white',
)

# One bar per city showing the superhost premium in percent.
fig = px.bar(
    df_superhost,
    x='City',
    y='Superhost Premium (%)',
    title='Superhost Premium by City (%)',
    color_discrete_sequence=['purple'],
)
fig.update_layout(**superhost_layout)
fig.show()

> **Location Analysis**

In [17]:
#categorizing metro and city center distances
df['distance_citycenter_category']=pd.cut(df['dist'],bins=[0,2,5,10,float('inf')],labels=['Very Close (<500m)', 'Close (500m-1km)', 
                                    'Medium (1-2km)', 'Far (>2km)'])
df["metro_category"]=pd.cut(df["metro_dist"], bins=[0, 500, 1000, 2000, float('inf')],
                             labels=['Very Close (<500m)', 'Close (500m-1km)', 
                                    'Medium (1-2km)', 'Far (>2km)'])
# Price by distance from city center and metro
center_analysis = df.groupby('distance_citycenter_category')['realSum'].agg(['mean', 'count'])


metro_analysis = df.groupby('metro_category')['realSum'].agg(['mean', 'count'])







In [18]:

# Two side-by-side bar charts: mean price per city-center distance bucket
# (left) and per metro distance bucket (right).
location_panels = [
    # (summary table, trace name, bar colour)
    (center_analysis, "City Center Distance", 'lightcoral'),
    (metro_analysis, "Metro Distance", 'lightblue'),
]

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        "Price by Distance from City Center",
        "Price by Metro Distance",
    ),
)

for panel_col, (panel_summary, panel_name, panel_colour) in enumerate(location_panels, start=1):
    fig.add_trace(
        go.Bar(
            x=panel_summary.index,
            y=panel_summary["mean"],
            name=panel_name,
            marker_color=panel_colour,
        ),
        row=1,
        col=panel_col,
    )

fig.update_layout(
    height=500,
    width=1000,
    title_text="Price Impact by Location Factors",
    template='plotly_white',
    showlegend=False,
)

# Tilt the category labels on both x axes so they do not overlap.
fig.update_xaxes(tickangle=45)

fig.show()

> **Guest Satisfaction analysis**

In [19]:
# Mean price and listing count per satisfaction band.
satisfaction_analysis = (
    df.groupby('Satisfaction_Level')['realSum']
    .agg(['mean', 'count'])
)

# Box plot of the price distribution within each satisfaction band;
# only outlier points are drawn individually.
box_options = dict(
    x="Satisfaction_Level",
    y="realSum",
    title="Price Distribution by Satisfaction Level",
    color="Satisfaction_Level",
    points="outliers",
    template="plotly_white",
)
fig = px.box(df, **box_options)

# The x axis already names each band, so the legend is hidden.
fig.update_layout(
    xaxis_title="Satisfaction Level",
    yaxis_title="Price (€)",
    showlegend=False,
)

fig.show()





> **Satisfaction and Cleanliness**

In [20]:

# Scatter of cleanliness rating against overall guest satisfaction, with an
# ordinary-least-squares trend line drawn through the points.
scatter_options = dict(
    x="cleanliness_rating",
    y="guest_satisfaction_overall",
    title="Cleanliness vs Satisfaction",
    opacity=0.5,
    color_discrete_sequence=["red"],
    trendline="ols",
)
fig = px.scatter(df, **scatter_options)

fig.update_layout(
    xaxis_title="Cleanliness Rating",
    yaxis_title="Guest Satisfaction Overall",
    template="plotly_white",
)

fig.show()

> **Room Type & Capacity Analysis**

In [21]:
# Mean price and listing count grouped, in turn, by room type, by person
# capacity and by number of bedrooms.
room_analysis, capacity_analysis, bedroom_analysis = (
    df.groupby(group_key)['realSum'].agg(['mean', 'count'])
    for group_key in ('room_type', 'person_capacity', 'bedrooms')
)


In [22]:

# 2x2 dashboard: mean price by room type, by person capacity and by bedroom
# count, plus the share of each room type.
panel_titles = [
    "Average Price by Room Type",
    "Price by Person Capacity",
    "Price by Number of Bedrooms",
    "Room Type Distribution",
]
# The bottom-right cell must be a "domain" subplot to host the pie chart.
panel_specs = [[{}, {}], [{}, {"type": "domain"}]]

fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles, specs=panel_specs)

# Top-left: mean price per room type.
room_price_bars = go.Bar(
    x=room_analysis.index,
    y=room_analysis["mean"],
    marker_color="lightgreen",
    name="Room Price",
)
fig.add_trace(room_price_bars, row=1, col=1)

# Top-right: mean price per person capacity (first 10 capacity values at most).
capacity_top = capacity_analysis.head(10)
capacity_line = go.Scatter(
    x=capacity_top.index,
    y=capacity_top["mean"],
    mode="lines+markers",
    marker=dict(color="blue"),
    name="Capacity Price",
)
fig.add_trace(capacity_line, row=1, col=2)

# Bottom-left: mean price per bedroom count (first 8 bedroom values at most).
bedroom_top = bedroom_analysis.head(8)
bedroom_bars = go.Bar(
    x=bedroom_top.index,
    y=bedroom_top["mean"],
    marker_color="coral",
    name="Bedroom Price",
)
fig.add_trace(bedroom_bars, row=2, col=1)

# Bottom-right: share of listings per room type.
room_counts = df["room_type"].value_counts()
room_share_pie = go.Pie(
    labels=room_counts.index,
    values=room_counts.values,
    textinfo="label+percent",
    marker=dict(colors=["#FFDDC1", "#FFC2D1", "#C1E1FF", "#C2F5C2"]),
)
fig.add_trace(room_share_pie, row=2, col=2)

fig.update_layout(
    height=700,
    width=1000,
    title_text="Room Type Distribution and Pricing Analysis",
    template="plotly_white",
    showlegend=False,
)

# Tilt the tick labels on every cartesian x axis.
fig.update_xaxes(tickangle=45)

fig.show()

> **Business vs Leisure Analysis**

In [23]:
# Per (city, business flag): mean price and listing count, plus mean
# satisfaction and mean person capacity, all rounded to two decimals.
business_metrics = {
    'realSum': ['mean', 'count'],
    'guest_satisfaction_overall': 'mean',
    'person_capacity': 'mean',
}
business_analysis = df.groupby(['city', 'biz']).agg(business_metrics).round(2)

# Business premium per city: percentage difference between the mean price of
# business listings and the mean price of non-business (leisure) listings.
business_premium = {}
for city in df['city'].unique():
    city_data = df[df['city'] == city]
    business_rows = city_data[city_data['biz'].eq(True)]

    # Cities without any business listing are left out of the result.
    if len(business_rows) == 0:
        continue

    business_price = business_rows['realSum'].mean()
    leisure_price = city_data[city_data['biz'].eq(False)]['realSum'].mean()
    business_premium[city] = (business_price - leisure_price) / leisure_price * 100

# Cities ordered from the largest premium to the smallest.
business_premium_series = pd.Series(business_premium).sort_values(ascending=False)

In [24]:

# Turn the sorted premium series into a two-column frame for plotting.
df_business = business_premium_series.reset_index()
df_business.columns = ['City', 'Business Premium (%)']

# One bar per city showing the business-listing premium in percent.
fig = go.Figure()

fig.add_trace(
    go.Bar(
        x=df_business['City'],
        y=df_business['Business Premium (%)'],
        marker_color='navy',
        name='Business Premium'
    )
)

# The dashed red zero line is defined once, here in the layout. The previous
# version also called fig.add_shape() with data-coordinate endpoints, but the
# `shapes=` entry below superseded that shape, so the extra call was redundant.
# xref='paper' makes the line span the full plot width regardless of how many
# cities are shown.
fig.update_layout(
    title="Business Listing Premium by City (%)",
    xaxis_title="City",
    yaxis_title="Business Premium (%)",
    xaxis_tickangle=45,
    template="plotly_white",
    height=500,
    width=1000,
    showlegend=False,
    shapes=[dict(type='line', xref='paper', x0=0, x1=1, yref='y', y0=0, y1=0,
                 line=dict(color='red', width=2, dash='dash'))]
)

fig.show()

# Summary

****

> * **Prices vary widely: some Airbnbs have very high prices, so the price distribution is right-skewed.**
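> * **Amsterdam has the highest average Airbnb prices of the cities analysed and also the largest weekend premium (about 11%). Most cities are pricier on weekends, but not all: Paris and Athens are slightly cheaper on weekends. Vienna has the smallest positive premium (about 1%), while Athens has both the lowest average prices and the lowest weekend premium (about -5%).**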
> * **Quantifying the revenue advantage of being a superhost across cities shows that Athens ranks highest while London ranks lowest.**
> * **Most Airbnbs are close to a metro station, but they are not all at the same distance from the city center.**
> * **Price by satisfaction level shows that pricier listings tend to get better reviews, which is normal and predictable: you get what you pay for!**
> * **The relation between cleanliness and satisfaction is directly proportional (no one is satisfied in an unclean place).**
> * **Entire homes and apartments are the most expensive; however, shared rooms are more convenient.**
> * **As the person capacity of a place increases, the price also increases.**
> * **Airbnbs with 6 bedrooms are the most found and the most expensive, unlike those with 8 bedrooms, which are of course pricey but not as available.**
> * **Most Airbnbs are entire homes, and shared rooms make up a very small percentage.**
> * **The best city for business-listing performance is Barcelona, while the worst is Amsterdam due to its prices.**