In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_theme()

# Notebook-wide pandas display settings, applied in one pass.
# A width of None lets pandas pick the output width itself.
PANDAS_DISPLAY_OPTIONS = {
    'display.max_rows': 25,
    'display.max_columns': 100,
    'display.width': None,
    'display.max_colwidth': 100,
    "display.precision": 6,
}
for option_name, option_value in PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [2]:
# Load the raw listings CSV into df_rome; the path is relative to the
# notebook's working directory.
df_rome = pd.read_csv("rom_airbnb_listings.csv")

In [3]:
# Preview the first three rows of the freshly loaded frame.
df_rome.head(3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,75474,Villa in Rome · 9 bedrooms · 10 beds · 6.5 baths,400717,Marina,,III Monte Sacro,41.98959,12.52154,Entire home/apt,1973.0,3,2,2023-06-20,0.31,1,332,2,
1,2737,Place to stay in Rome · ★4.80 · 1 bedroom · 1 bed · 1.5 baths,3047,Elif,,VIII Appia Antica,41.87136,12.48215,Private room,50.0,7,5,2015-05-08,0.05,6,345,0,
2,3079,Rental unit in Rome · ★4.53 · 1 bedroom · 1 bed · 1 bath,3504,Laura,,I Centro Storico,41.895,12.49117,Entire home/apt,120.0,30,21,2022-04-30,0.13,6,289,0,


In [4]:
# Number of distinct (non-null) values in each column.
df_rome.nunique()

id                                29357
name                               9263
host_id                           15645
host_name                          3648
neighbourhood_group                   0
neighbourhood                        15
latitude                          16230
longitude                         18566
room_type                             4
price                               871
minimum_nights                       52
number_of_reviews                   585
last_review                        1813
reviews_per_month                   825
calculated_host_listings_count       47
availability_365                    366
number_of_reviews_ltm               137
license                            5711
dtype: int64

In [5]:
# (rows, columns) of the raw frame.
df_rome.shape

(29357, 18)

In [6]:
# Per-column dtype and non-null count.
df_rome.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29357 entries, 0 to 29356
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              29357 non-null  int64  
 1   name                            29357 non-null  object 
 2   host_id                         29357 non-null  int64  
 3   host_name                       29348 non-null  object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   29357 non-null  object 
 6   latitude                        29357 non-null  float64
 7   longitude                       29357 non-null  float64
 8   room_type                       29357 non-null  object 
 9   price                           27381 non-null  float64
 10  minimum_nights                  29357 non-null  int64  
 11  number_of_reviews               29357 non-null  int64  
 12  last_review                     

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df_rome.describe()

Unnamed: 0,id,host_id,neighbourhood_group,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm
count,29357.0,29357.0,0.0,29357.0,29357.0,27381.0,29357.0,29357.0,25086.0,29357.0,29357.0,29357.0
mean,3.411734e+17,170193600.0,,41.891868,12.480778,172.378912,3.213714,55.229145,1.72192,9.096468,184.374187,15.867868
std,4.223487e+17,180446700.0,,0.035648,0.048908,712.298687,16.04447,89.37523,1.671394,29.698329,132.577118,21.009852
min,2737.0,1944.0,,41.656792,12.2385,8.0,1.0,0.0,0.01,1.0,0.0,0.0
25%,20146280.0,20428280.0,,41.885448,12.46116,76.0,1.0,3.0,0.4,1.0,50.0,0.0
50%,45694500.0,86835400.0,,41.89658,12.47789,105.0,2.0,18.0,1.21,2.0,196.0,6.0
75%,8.120685e+17,297561100.0,,41.90648,12.50441,157.0,3.0,68.0,2.6,6.0,314.0,25.0
max,1.046345e+18,550825600.0,,42.12131,12.835699,80000.0,999.0,1879.0,38.56,265.0,365.0,601.0


# Null values

In [8]:
# Count of missing values in each column.
df_rome.isnull().sum()

id                                    0
name                                  0
host_id                               0
host_name                             9
neighbourhood_group               29357
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                              1976
minimum_nights                        0
number_of_reviews                     0
last_review                        4271
reviews_per_month                  4271
calculated_host_listings_count        0
availability_365                      0
number_of_reviews_ltm                 0
license                           22564
dtype: int64

In [9]:
# Drop "neighbourhood_group" (null in all 29357 rows) and "license"
# (null in 22564 rows), per the null counts shown above.
#
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" skips columns that are already gone, so re-running this
# cell no longer raises KeyError.
df_rome = df_rome.drop(columns=["neighbourhood_group", "license"], errors="ignore")

In [10]:
# Drop the "last_review" column.
#
# Rebinding with errors="ignore" (instead of inplace=True) keeps the cell
# re-runnable: a second execution is a no-op rather than a KeyError.
df_rome = df_rome.drop(columns=["last_review"], errors="ignore")

In [11]:
# Drop the identifier columns "id" and "host_id" --> not important for the analysis.
#
# Rebinding with errors="ignore" (instead of two inplace=True calls) keeps
# the cell re-runnable: a second execution is a no-op rather than a KeyError.
df_rome = df_rome.drop(columns=["id", "host_id"], errors="ignore")

In [12]:
# Remove the rows whose "host_name" is missing (9 rows in this dataset).
missing_host_rows = df_rome.index[df_rome["host_name"].isna()]
df_rome.drop(index=missing_host_rows, inplace=True)

In [13]:
# Column "price" --> fill null values with the median (50 % quantile).
#
# The median is computed from the current frame instead of hard-coding the
# 105 read off the earlier describe() output, so the imputed value cannot
# go stale when the data or the preceding cleaning steps change.
median_price = df_rome["price"].median()
df_rome["price"] = df_rome["price"].fillna(median_price)

In [14]:
# Column "reviews_per_month" --> fill null values with the column mean.
#
# Assign the result back rather than calling fillna(..., inplace=True) on the
# selected column: that chained in-place form emits a FutureWarning on
# pandas 2.x and leaves df_rome unchanged under Copy-on-Write. This also
# matches the assignment form used for "price".
mean_reviews_per_month = df_rome["reviews_per_month"].mean()
df_rome["reviews_per_month"] = df_rome["reviews_per_month"].fillna(mean_reviews_per_month)

# Checking outliers

In [15]:
# Listings priced above this cap are treated as outliers.
PRICE_OUTLIER_CAP = 10000

# Checking price > 10,000:
# A bare expression is only rendered when it is the last statement of a
# cell, so the check is shown explicitly with display() (IPython makes it
# available as a builtin inside notebooks).
price_outliers = df_rome[df_rome["price"] > PRICE_OUTLIER_CAP]
display(price_outliers)

# Delete price > 10,000 (rebinding instead of mutating in place):
df_rome = df_rome.drop(index=price_outliers.index)

In [16]:
# Listings requiring more nights than this are treated as outliers.
MAX_MINIMUM_NIGHTS = 30

# Checking minimum_nights > 30:
# A bare expression is only rendered when it is the last statement of a
# cell, so the check is shown explicitly with display() (IPython makes it
# available as a builtin inside notebooks).
long_stay_listings = df_rome[df_rome["minimum_nights"] > MAX_MINIMUM_NIGHTS]
display(long_stay_listings)

# Delete minimum_nights > 30 (rebinding instead of mutating in place):
df_rome = df_rome.drop(index=long_stay_listings.index)

In [17]:
# Exporting the cleaned dataframe to CSV (without the index column).
# NOTE: the export below is currently commented out, so running this cell
# writes nothing; uncomment the line to produce the file.

#df_rome.to_csv("rom_airbnb_listings_cleaned.csv",index=False)

In [18]:
# Preview the first three rows after cleaning.
df_rome.head(3)

Unnamed: 0,name,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm
0,Villa in Rome · 9 bedrooms · 10 beds · 6.5 baths,Marina,III Monte Sacro,41.98959,12.52154,Entire home/apt,1973.0,3,2,0.31,1,332,2
1,Place to stay in Rome · ★4.80 · 1 bedroom · 1 bed · 1.5 baths,Elif,VIII Appia Antica,41.87136,12.48215,Private room,50.0,7,5,0.05,6,345,0
2,Rental unit in Rome · ★4.53 · 1 bedroom · 1 bed · 1 bath,Laura,I Centro Storico,41.895,12.49117,Entire home/apt,120.0,30,21,0.13,6,289,0
