In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the cleaned listings dataset into `df`.
# NOTE: os.getcwd() is the kernel's current working directory, which is not
# necessarily the folder that contains this notebook, so the relative
# "../data/clean/..." path only resolves when the kernel is started from the
# expected directory.
script_dir = os.getcwd()
data_path = os.path.join(script_dir, "../data/clean/cleaned_listings.csv")
df = pd.read_csv(data_path)

In [3]:
# Preview the first five rows to confirm the columns loaded as expected
df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price
0,197677,Oshiage Holiday Apartment ...,964081,Yoshimi & Marek ...,Sumida Ku,35.71707,139.82608,Entire home/apt,12000
1,776070,Kero-kero house room 1 ...,801494,Kei ...,Kita Ku,35.73844,139.76917,Private room,9652
2,905944,4F - Near Shinjuku & Shibuya ...,4847803,Best Stay In Tokyo! ...,Shibuya Ku,35.67878,139.67847,Entire home/apt,25738
3,1016831,5 mins Shibuya Cat modern sunny Shimokita ...,5596383,Wakana ...,Setagaya Ku,35.658,139.67134,Private room,23286
4,1196177,Stay with host Cozy private room Senju area ...,5686404,Yukiko ...,Adachi Ku,35.744731,139.797384,Private room,7500


In [4]:
# Print each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14805 entries, 0 to 14804
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             14805 non-null  int64  
 1   name           7548 non-null   object 
 2   host_id        14805 non-null  int64  
 3   host_name      10968 non-null  object 
 4   neighbourhood  14805 non-null  object 
 5   latitude       14805 non-null  float64
 6   longitude      14805 non-null  float64
 7   room_type      14805 non-null  object 
 8   price          14805 non-null  int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 1.0+ MB


In [5]:
# Count the missing entries in each column (isna is the canonical spelling of isnull)
df.isna().sum()

id                  0
name             7257
host_id             0
host_name        3837
neighbourhood       0
latitude            0
longitude           0
room_type           0
price               0
dtype: int64

In [6]:
# Summary statistics for the numeric columns.
# NOTE(review): id and host_id look like identifiers, so their mean/std rows
# carry no analytical meaning. In the saved output the price maximum (999999)
# sits far above the 75th percentile — possibly a placeholder or outlier;
# confirm before using price in further analysis.
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price
count,14805.0,14805.0,14805.0,14805.0,14805.0
mean,6.103899e+17,296040400.0,35.698957,139.736314,20688.608105
std,4.937904e+17,187363200.0,0.041508,0.07292,28521.09957
min,197677.0,322234.0,35.52094,139.081322,1280.0
25%,42007050.0,129096300.0,35.68864,139.698287,9800.0
50%,8.634584e+17,273470500.0,35.70442,139.727374,15000.0
75%,1.049645e+18,500438300.0,35.723244,139.79141,24286.0
max,1.189054e+18,585981900.0,35.840764,139.91402,999999.0


In [7]:
# Median of each numeric column (non-numeric columns are excluded)
df.median(numeric_only=True)

id           8.634584e+17
host_id      2.734705e+08
latitude     3.570442e+01
longitude    1.397274e+02
price        1.500000e+04
dtype: float64

In [8]:
# Most frequent value of each numeric column. DataFrame.mode() returns every
# tied value in sorted order, so .iloc[0] keeps only the smallest of them; for
# a column without repeated values (such as id) the result is simply that
# column's minimum rather than a meaningful mode.
df.mode(numeric_only=True).iloc[0]

id           1.976770e+05
host_id      2.296271e+08
latitude     3.569625e+01
longitude    1.397864e+02
price        6.105000e+03
Name: 0, dtype: float64

In [9]:
# Number of distinct values in every column (numeric as well as categorical)
df.nunique()

id               14805
name              6509
host_id           3650
host_name         1711
neighbourhood       49
latitude         11373
longitude        11946
room_type            4
price             5103
dtype: int64

In [10]:
# Distribution of listings across neighbourhoods: number of rows per
# neighbourhood, largest first (value_counts sorts in descending order by default)
df["neighbourhood"].value_counts()

neighbourhood
Shinjuku Ku             2897
Sumida Ku               1945
Taito Ku                1711
Toshima Ku              1419
Shibuya Ku               977
Ota Ku                   600
Minato Ku                532
Setagaya Ku              516
Nakano Ku                454
Katsushika Ku            429
Kita Ku                  428
Suginami Ku              357
Koto Ku                  299
Edogawa Ku               297
Itabashi Ku              269
Bunkyo Ku                262
Chuo Ku                  232
Arakawa Ku               226
Shinagawa Ku             186
Adachi Ku                181
Chiyoda Ku               128
Meguro Ku                102
Nerima Ku                 80
Musashino Shi             38
Hachioji Shi              28
Machida Shi               21
Ome Shi                   21
Hino Shi                  20
Kokubunji Shi             19
Mitaka Shi                16
Chofu Shi                 14
Komae Shi                 12
Higashimurayama Shi        9
Akiruno Shi                8


In [11]:
# Correlation Analysis
# Pairwise correlation (DataFrame.corr defaults to Pearson) between all
# numeric columns.
# NOTE(review): id and host_id are numeric but look like identifiers, so their
# coefficients against price should not be read as a real relationship.
numerical_df = df.select_dtypes(include=[np.number])
correlation_matrix = numerical_df.corr()
# Keep only the 'price' column, strongest positive correlation first
price_correlations = correlation_matrix["price"].sort_values(ascending=False)
# sep="\n" starts the Series on its own line. Passing "...:\n" as the first
# argument let print() insert its default space separator after the newline,
# which shifted the first output row by one character.
print("Price Correlations:", price_correlations, sep="\n")

Price Correlations:
 price        1.000000
longitude    0.017443
id           0.012482
host_id      0.004989
latitude    -0.015501
Name: price, dtype: float64


In [12]:
# Distribution and Analysis of Key Metrics
# Listings count by host: one row per host_id, hosts with the most listings first
listings_per_host = (
    df.groupby("host_id")
    .size()
    .sort_values(ascending=False)
    .reset_index(name="listings_count")
)
# sep="\n" starts the table on its own line. Passing "...:\n" as the first
# argument let print() insert its default space separator after the newline,
# which pushed the header row one character out of alignment with the data.
print("Listings per Host:", listings_per_host, sep="\n")

Listings per Host:
         host_id  listings_count
0     229627088             105
1     110379659              92
2      42244167              77
3     228133407              74
4     151970670              73
...         ...             ...
3645  583413018               1
3646    5710367               1
3647  579471652               1
3648  581650135               1
3649  584786875               1

[3650 rows x 2 columns]


In [13]:
# Summary statistics for neighbourhoods: listing count plus mean, median and
# standard deviation of price per neighbourhood, busiest neighbourhoods first
neighbourhood_stats = (
    df.groupby("neighbourhood")["price"]
    .agg(["count", "mean", "median", "std"])
    .sort_values("count", ascending=False)
    .reset_index()
)
# sep="\n" starts the table on its own line. Passing "...:\n" as the first
# argument let print() insert its default space separator after the newline,
# which pushed the header row one character out of alignment with the data.
print("Neighbourhood Stats:", neighbourhood_stats, sep="\n")
print("\nEND OF EXPLORATORY DATA ANALYSIS!")

Neighbourhood Stats:
            neighbourhood  count          mean   median           std
0   Shinjuku Ku            2897  22459.536072  16500.0  24805.046004
1   Sumida Ku              1945  18002.829820  13571.0  22919.144744
2   Taito Ku               1711  24927.583285  17476.0  42568.128466
3   Toshima Ku             1419  18149.673714  14429.0  11963.605358
4   Shibuya Ku              977  27029.126919  20349.0  25916.258595
5   Ota Ku                  600  14878.225000  12103.5  10798.170345
6   Minato Ku               532  25747.605263  19593.5  23159.709331
7   Setagaya Ku             516  20669.333333  13642.5  19761.025448
8   Nakano Ku               454  17948.704846  13395.0  13534.548962
9   Katsushika Ku           429  20627.491841  17000.0  16562.734660
10  Kita Ku                 428  17317.348131  12900.0  15985.344087
11  Suginami Ku             357  16094.380952  11643.0  15949.593956
12  Koto Ku                 299  23269.257525  12571.0  52483.089493
13  Edogawa 