In [1]:
# Data manipulation and numerical operations
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
INPUT_PATH = "Airbnb_Open_Data_Original_2022_2024.csv"
OUTPUT_UNENCODED_PATH = "airbnb_2022_2024_clean_unencoded.csv"
OUTPUT_ENCODED_PATH = "airbnb_2022_2024_clean_encoded.csv"
COLUMNS_TO_DROP = ['id', 'NAME','host id','host name', 'neighbourhood', 'Construction year', 'lat', 'long', 'country', 'country code',
                   'number of reviews', 'last review', 'reviews per month', 'calculated host listings count', 'house_rules', 'availability 365',
                   'license']
COL_RENAME = {'neighbourhood_group': 'boro'}    # Abbreviation for efficiency
BORO_CORRECTIONS = {'brookln': 'brooklyn', 'manhatan': 'manhattan', 'staten island': 'staten'}
ROOM_TYPE_CORRECTIONS = {'Entire home/apt': 'entire', 'Private room': 'privater',  'Shared room': 'sharedr', 'Hotel room': 'hotelr'}
MONEY_REGEX = {'\$': '', ' ': '', ',': ''}
MONEY_COLS = ('price', 'service_fee')

In [3]:
def get_string_value_counts(df):
    # Select string columns
    string_cols = df.select_dtypes(include=['object'])
    
    # Count unique categories for each string column
    unique_counts = string_cols.nunique()
    return unique_counts

In [4]:
# Define the path to the CSV file
df = pd.read_csv(INPUT_PATH).drop(columns=COLUMNS_TO_DROP)
df.columns = df.columns.str.strip().str.replace(' ', '_').str.lower()
df.rename(columns=COL_RENAME, inplace=True)
dup_cols = df.columns.duplicated()
if any(dup_cols):
    print("WARNING! duplicate columns:", df.columns[df.columns.duplicated()].tolist())
print(df.shape)
df.head()

(26263, 9)


Unnamed: 0,host_identity_verified,boro,instant_bookable,cancellation_policy,room_type,price,service_fee,minimum_nights,review_rate_number
0,verified,Manhattan,False,moderate,Entire home/apt,$142,$28,30.0,4.0
1,,Manhattan,True,flexible,Private room,$620,$124,3.0,5.0
2,verified,Manhattan,,moderate,Entire home/apt,$281,$56,7.0,3.0
3,verified,Manhattan,,strict,Private room,$274,$55,4.0,
4,verified,Brooklyn,False,moderate,Private room,$203,$41,60.0,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26263 entries, 0 to 26262
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   host_identity_verified  26188 non-null  object 
 1   boro                    26259 non-null  object 
 2   instant_bookable        26243 non-null  object 
 3   cancellation_policy     26247 non-null  object 
 4   room_type               26263 non-null  object 
 5   price                   26234 non-null  object 
 6   service_fee             26220 non-null  object 
 7   minimum_nights          26120 non-null  float64
 8   review_rate_number      26176 non-null  float64
dtypes: float64(2), object(7)
memory usage: 1.8+ MB


In [6]:
print("Columns with more than 50% missing values: ")
print(df.columns[df.isnull().mean() > 0.5], "\n")
# df2= df.drop(df.columns[df.isnull().mean() > 0.5], axis=1)
# print("Total columns after dropping:", len(df.columns))

Columns with more than 50% missing values: 
Index([], dtype='object') 



In [7]:
# Dropping all rows with missing values
df2 = df.dropna()
del df
print(df2.shape)

(25883, 9)


In [8]:
# Remove rows with minimum nights < 0
neg_min_night_mask_s = df2['minimum_nights'] < 0
print(f"Removing {neg_min_night_mask_s.sum()} rows")
df2 = df2[~neg_min_night_mask_s]

Removing 2 rows


In [9]:
df2.describe()

Unnamed: 0,minimum_nights,review_rate_number
count,25881.0,25881.0
mean,9.963409,3.368572
std,32.56001,1.231214
min,1.0,1.0
25%,1.0,2.0
50%,3.0,3.0
75%,7.0,4.0
max,3455.0,5.0


In [10]:
get_string_value_counts(df2)

host_identity_verified       2
boro                         5
instant_bookable             2
cancellation_policy          3
room_type                    4
price                     1151
service_fee                231
dtype: int64

In [11]:
# Replace 'Binary features' values
df2['host_identity_verified'] = df2['host_identity_verified'].replace({'verified': True, 'unconfirmed': False})
# df2['neighbourhood_group'] = df2['neighbourhood_group'].replace({'Staten Island': 0, 'Bronx': 2, 'Manhattan': 3, 'Brooklyn': 4, 'Queens': 5})
df2['instant_bookable'] = df2['instant_bookable'].replace({'f': False, 't': True})
# df2['room_type'] = df2['room_type'].replace({'Hotel room': 0, 'Shared room': 1, 'Private room': 2, 'Entire home/apt': 3})
df2.dtypes

  df2['host_identity_verified'] = df2['host_identity_verified'].replace({'verified': True, 'unconfirmed': False})
  df2['instant_bookable'] = df2['instant_bookable'].replace({'f': False, 't': True})


host_identity_verified       bool
boro                       object
instant_bookable             bool
cancellation_policy        object
room_type                  object
price                      object
service_fee                object
minimum_nights            float64
review_rate_number        float64
dtype: object

In [12]:
# Clean the 'price' and 'service_fee' columns
for col in MONEY_COLS:
    df2[col] = df2[col].replace(MONEY_REGEX, regex=True).astype(float)
df2.dtypes

host_identity_verified       bool
boro                       object
instant_bookable             bool
cancellation_policy        object
room_type                  object
price                     float64
service_fee               float64
minimum_nights            float64
review_rate_number        float64
dtype: object

In [13]:
# Correcting typos on the dataset
df2['boro'] = df2['boro'].str.lower().replace(BORO_CORRECTIONS)
df2['room_type'] = df2['room_type'].replace(ROOM_TYPE_CORRECTIONS)

In [14]:
get_string_value_counts(df2)

boro                   5
cancellation_policy    3
room_type              4
dtype: int64

In [15]:
# Save the cleaned, un-encoded data for visualizations
df2.to_csv(OUTPUT_UNENCODED_PATH, index_label="index")

## Encode the categorical columns for ML and save

In [16]:
df_encoded = pd.get_dummies(df2)
del df2

In [17]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25881 entries, 0 to 26256
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   host_identity_verified        25881 non-null  bool   
 1   instant_bookable              25881 non-null  bool   
 2   price                         25881 non-null  float64
 3   service_fee                   25881 non-null  float64
 4   minimum_nights                25881 non-null  float64
 5   review_rate_number            25881 non-null  float64
 6   boro_bronx                    25881 non-null  bool   
 7   boro_brooklyn                 25881 non-null  bool   
 8   boro_manhattan                25881 non-null  bool   
 9   boro_queens                   25881 non-null  bool   
 10  boro_staten                   25881 non-null  bool   
 11  cancellation_policy_flexible  25881 non-null  bool   
 12  cancellation_policy_moderate  25881 non-null  bool   
 13  cancel

In [18]:
df_encoded.to_csv(OUTPUT_ENCODED_PATH, index_label="index")