## Libraries used for EDA

In [1]:
import pandas as pd #data manipulation and analysis
import matplotlib.pyplot as plt #visualizations

## Overview of the Dataset

In [2]:
# Path of the raw calendar CSV
calendar_csv_path = '/content/drive/MyDrive/Airbnb/Airbnb_original_data/calendar2024.csv'
calendar = pd.read_csv(calendar_csv_path)  # load the dataset into a DataFrame
calendar.head()  # preview the first few rows

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,586421,2024-06-30,f,$250.00,,4.0,31.0
1,586421,2024-07-01,f,$250.00,,4.0,31.0
2,586421,2024-07-02,t,$250.00,,4.0,31.0
3,586421,2024-07-03,f,$250.00,,4.0,31.0
4,586421,2024-07-04,f,$250.00,,4.0,31.0


In [3]:
# Report the dimensions of the dataset as (rows, columns)
n_rows, n_columns = calendar.shape
print((n_rows, n_columns))

(7631731, 7)


In [4]:
# Summarise the structure of the dataset: its dimensions, columns and data types.
# DataFrame.info() writes the report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7631731 entries, 0 to 7631730
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   listing_id      int64  
 1   date            object 
 2   available       object 
 3   price           object 
 4   adjusted_price  float64
 5   minimum_nights  float64
 6   maximum_nights  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 407.6+ MB
None


## Handling Missing Values

In [5]:
# Count the missing values in every column
missing_per_column = calendar.isna().sum()
print(missing_per_column)

listing_id              0
date                    0
available               0
price                   0
adjusted_price    7631731
minimum_nights          1
maximum_nights          1
dtype: int64


In [6]:
# 'adjusted_price' only contains missing values, so it is removed.
# Rebinding the result (instead of dropping in place) together with
# errors='ignore' lets this cell be re-run without raising a KeyError
# once the column is already gone.
calendar = calendar.drop(columns='adjusted_price', errors='ignore')

In [7]:
# Locate the rows where 'minimum_nights' or 'maximum_nights' is missing
nights_missing = calendar[['minimum_nights', 'maximum_nights']].isna().any(axis=1)
missing_rows = calendar[nights_missing]
print(missing_rows)

         listing_id        date available      price  minimum_nights  \
1294655    26895700  2024-06-29         f  $2,200.00             NaN   

         maximum_nights  
1294655             NaN  


In [8]:
# Show every row of listing_id 26895700 to see which 'minimum_nights' and
# 'maximum_nights' values it has on its other dates
is_listing_26895700 = calendar['listing_id'].eq(26895700)
listing_26895700 = calendar.loc[is_listing_26895700]
print(listing_26895700)

         listing_id        date available      price  minimum_nights  \
1294655    26895700  2024-06-29         f  $2,200.00             NaN   
1294656    26895700  2024-06-30         f  $2,200.00             3.0   
1294657    26895700  2024-07-01         f  $2,200.00             3.0   
1294658    26895700  2024-07-02         f  $2,200.00             3.0   
1294659    26895700  2024-07-03         f  $2,200.00             3.0   
...             ...         ...       ...        ...             ...   
1295015    26895700  2025-06-24         t  $2,200.00             3.0   
1295016    26895700  2025-06-25         t  $2,200.00             3.0   
1295017    26895700  2025-06-26         t  $2,200.00             3.0   
1295018    26895700  2025-06-27         t  $2,200.00             3.0   
1295019    26895700  2025-06-28         t  $2,200.00             3.0   

         maximum_nights  
1294655             NaN  
1294656          1125.0  
1294657          1125.0  
1294658          1125.0  
12946

In [9]:
# Fill the missing night limits of listing 26895700 on 2024-06-29 with the
# values the same listing has on the next day (3 and 1125)
gap_row = (calendar['listing_id'] == 26895700) & (calendar['date'] == '2024-06-29')
for nights_column, next_day_value in (('minimum_nights', 3), ('maximum_nights', 1125)):
    calendar.loc[gap_row, nights_column] = next_day_value

In [10]:
# Verify that no missing values remain in any column
remaining_missing = calendar.isna().sum()
print(remaining_missing)

listing_id        0
date              0
available         0
price             0
minimum_nights    0
maximum_nights    0
dtype: int64


## Data Type Adjustment

In [11]:
# 'date': text -> datetime
calendar['date'] = pd.to_datetime(calendar['date'])

# 'available': 't' / 'f' markers -> boolean True / False
availability_flags = {'t': True, 'f': False}
calendar['available'] = calendar['available'].map(availability_flags)

# 'price': strip dollar signs and commas, then convert to float
calendar['price'] = (
    calendar['price']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)

# 'minimum_nights' / 'maximum_nights': convert to integer
for nights_column in ('minimum_nights', 'maximum_nights'):
    calendar[nights_column] = calendar[nights_column].astype(int)

## Handling Duplicates

In [12]:
# Count the rows per (date, listing_id) pair and keep only pairs that occur
# more than once; an empty result means there are no duplicates
duplicates = (
    calendar.groupby(['date', 'listing_id'])
    .size()
    .reset_index(name='count')
    .loc[lambda pair_counts: pair_counts['count'] > 1]
)
duplicates.shape

(0, 3)

## Exploring Individual Variables

In [13]:
# Determine the earliest and latest date present in the dataset
date_column = calendar['date']
min_date, max_date = date_column.min(), date_column.max()
print(f"Minimum Date: {min_date}")
print(f"Maximum Date: {max_date}")

Minimum Date: 2024-06-29 00:00:00
Maximum Date: 2025-06-29 00:00:00


In [14]:
# Distinct values taken by the 'available' column
available_column = calendar['available']
available_dvalues = available_column.unique()
print(available_dvalues)

[False  True]


In [15]:
# Number of distinct listings in the dataset
listingid_dvalues = calendar['listing_id'].unique()
n_listings = len(listingid_dvalues)
print(n_listings)

20909


In [16]:
# Do all listing_ids have 365 days of data?
# NOTE(review): the date range printed above runs from 2024-06-29 to 2025-06-29,
# i.e. 366 calendar days, so a listing with more than 365 rows would not be
# flagged by this check - confirm whether that matters.
listings_date_count = calendar.groupby('listing_id')['date'].count()  # rows per listing
listings_less_than_365 = (
    listings_date_count
    .loc[lambda day_counts: day_counts < 365]
    .sort_values(ascending=False)
)
listings_less_than_365

Unnamed: 0_level_0,date
listing_id,Unnamed: 1_level_1
1167158993163820880,337
1166613185844530189,336


In [17]:
# Remove all entries of listings that have fewer than 365 days of data
listings_to_remove = listings_less_than_365.index
keep_row = ~calendar['listing_id'].isin(listings_to_remove)
calendar = calendar[keep_row]

In [18]:
# Summary statistics describing the distribution of 'price'
price_summary = calendar['price'].describe()
print(price_summary)

count    7.631058e+06
mean     1.245855e+03
std      1.687680e+03
min      2.100000e+01
25%      7.500000e+02
50%      1.000000e+03
75%      1.400000e+03
max      1.000000e+05
Name: price, dtype: float64


In [19]:
# The 'price' column is expressed in DKK, not in US dollars or EUR,
# so the column name is changed to say so.
calendar = calendar.rename(columns={'price': 'price_dkk'})

In [20]:
# Price bounds used to tame extreme values
lower_threshold = 300  # minimum price of 300 DKK (~ 40€), excludes extremely low prices
upper_threshold = 37303  # maximum price of 37303 DKK (~ 5000€), excludes extremely high prices

# Prices below 'lower_threshold' are raised to it and prices above
# 'upper_threshold' are lowered to it; values in between stay untouched.
price_bounds = {'lower': lower_threshold, 'upper': upper_threshold}
calendar['price_dkk'] = calendar['price_dkk'].clip(**price_bounds)
print(calendar['price_dkk'].describe())


count    7.631058e+06
mean     1.241103e+03
std      1.329003e+03
min      3.000000e+02
25%      7.500000e+02
50%      1.000000e+03
75%      1.400000e+03
max      3.730300e+04
Name: price_dkk, dtype: float64


In [21]:
# Does the price of a listing change over time?
# Count the distinct prices per listing; more than one means the price varies.
price_variation = calendar.groupby('listing_id')['price_dkk'].nunique()
variable_price_listings = price_variation.loc[price_variation.gt(1)]
len(variable_price_listings)

0

In [22]:
# Does the minimum_nights of a listing change over time?
# Count the distinct values per listing; more than one means it varies.
minimum_nights_variation = calendar.groupby('listing_id')['minimum_nights'].nunique()
variable_minimum_nights_listings = minimum_nights_variation.loc[minimum_nights_variation.gt(1)]
len(variable_minimum_nights_listings)

5030

In [23]:
# Does the maximum_nights of a listing change over time?
# Count the distinct values per listing; more than one means it varies.
maximum_nights_variation = calendar.groupby('listing_id')['maximum_nights'].nunique()
variable_maximum_nights_listings = maximum_nights_variation.loc[maximum_nights_variation.gt(1)]
len(variable_maximum_nights_listings)

346

In [24]:
# Write the cleaned calendar to CSV, leaving out the DataFrame index
cleaned_csv_path = '/content/drive/MyDrive/Airbnb/Airbnb_cleaned_csvs/calendar_cleaned.csv'
calendar.to_csv(cleaned_csv_path, index=False)