# Prepping airbnb new york's listing data for prediction

In [59]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Importing my data

Data was downloaded at airbnb's open portal, i chose Bordeaux data dated until the 15th June 2025

Gathering the columns i'll need onward

In [60]:
df_airbnb = pd.read_csv('Data/listings.csv')

wanted_cols = ['price','host_is_superhost','host_total_listings_count','neighbourhood_cleansed','property_type', 'room_type', 'accommodates', 'bathrooms','bedrooms', 'beds','availability_365','number_of_reviews']

excluded_cols = list(set(df_airbnb.columns) - set(wanted_cols))

cols_to_drop = [col for col in excluded_cols if col in df_airbnb.columns]
print(f"Dropping {len(cols_to_drop)} columns: {cols_to_drop[:5]}...")  # Show first 5

df_airbnb.drop(cols_to_drop, inplace=True, axis=1)

print(f"df_airbnb now has {df_airbnb.shape[1]} columns")
print("Remaining columns:", df_airbnb.columns.tolist())

Dropping 67 columns: ['host_name', 'neighborhood_overview', 'host_identity_verified', 'last_scraped', 'picture_url']...
df_airbnb now has 12 columns
Remaining columns: ['host_is_superhost', 'host_total_listings_count', 'neighbourhood_cleansed', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price', 'availability_365', 'number_of_reviews']


Checking column types and null counts

In [61]:
print(df_airbnb.dtypes,'\n')
print(df_airbnb.isnull().sum())

host_is_superhost             object
host_total_listings_count      int64
neighbourhood_cleansed        object
property_type                 object
room_type                     object
accommodates                   int64
bathrooms                    float64
bedrooms                     float64
beds                         float64
price                         object
availability_365               int64
number_of_reviews              int64
dtype: object 

host_is_superhost             248
host_total_listings_count       0
neighbourhood_cleansed          0
property_type                   0
room_type                       0
accommodates                    0
bathrooms                    4008
bedrooms                     1052
beds                         4023
price                        4038
availability_365                0
number_of_reviews               0
dtype: int64


# Null drops and type casts

dropping rows with null price and casting the needed types

checking null counts and types after the changes

In [62]:
df_airbnb.dropna(subset=['price'], inplace=True)

#type casting
df_airbnb['price'] =  df_airbnb['price'].str.replace('$','').str.replace(',','').astype(float)
df_airbnb['host_is_superhost'] = df_airbnb['host_is_superhost'].map({'t': 1, 'f': 0}).astype(bool)
df_airbnb['bedrooms'] = df_airbnb['bedrooms'].astype('Int64')
df_airbnb['beds'] = df_airbnb['beds'].astype('Int64')

print(df_airbnb.dtypes,'\n')
print(df_airbnb.isnull().sum())

host_is_superhost               bool
host_total_listings_count      int64
neighbourhood_cleansed        object
property_type                 object
room_type                     object
accommodates                   int64
bathrooms                    float64
bedrooms                       Int64
beds                           Int64
price                        float64
availability_365               int64
number_of_reviews              int64
dtype: object 

host_is_superhost             0
host_total_listings_count     0
neighbourhood_cleansed        0
property_type                 0
room_type                     0
accommodates                  0
bathrooms                     3
bedrooms                     15
beds                          4
price                         0
availability_365              0
number_of_reviews             0
dtype: int64


# Missing values % calc

Calculating the missing values % to see if imputing is viable

In [63]:
(df_airbnb[['bathrooms', 'bedrooms', 'beds']].isnull().sum() / len(df_airbnb)) * 100

bathrooms    0.035950
bedrooms     0.179748
beds         0.047933
dtype: float64

# Imputing missing values
every ratio was below 5% so i'll imput missing values using the cols mean

In [64]:
df_airbnb['bathrooms'] = df_airbnb['bathrooms'].fillna(df_airbnb['bathrooms'].median())
df_airbnb['bedrooms'] = df_airbnb['bedrooms'].fillna(df_airbnb['bedrooms'].median())
df_airbnb['beds'] = df_airbnb['beds'].fillna(df_airbnb['beds'].median())

print(df_airbnb.isnull().sum())

host_is_superhost            0
host_total_listings_count    0
neighbourhood_cleansed       0
property_type                0
room_type                    0
accommodates                 0
bathrooms                    0
bedrooms                     0
beds                         0
price                        0
availability_365             0
number_of_reviews            0
dtype: int64


# Checking Anormal values

In [65]:
df_airbnb.describe()

Unnamed: 0,host_total_listings_count,accommodates,bathrooms,bedrooms,beds,price,availability_365,number_of_reviews
count,8345.0,8345.0,8345.0,8345.0,8345.0,8345.0,8345.0,8345.0
mean,15.959137,3.617376,1.279509,1.625884,2.040743,125.325824,195.32858,46.403595
std,131.519603,2.166303,0.63677,1.169893,1.458607,366.010877,122.199195,94.762861
min,1.0,1.0,0.0,0.0,0.0,12.0,0.0,0.0
25%,1.0,2.0,1.0,1.0,1.0,52.0,81.0,3.0
50%,2.0,3.0,1.0,1.0,2.0,79.0,196.0,14.0
75%,4.0,4.0,1.0,2.0,3.0,130.0,314.0,49.0
max,8726.0,16.0,7.5,9.0,24.0,10000.0,365.0,2764.0


host_total_listings_count, checked the big values, related to agencys rather than people so ok no abnormal ! everything else seems ok !

In [66]:
# Check unique values in categorical columns
categorical_cols = ['neighbourhood_cleansed', 'property_type', 'room_type']

for col in categorical_cols:
    unique_count = df_airbnb[col].nunique()
    print(f"{col}: {unique_count} unique values")
    if unique_count <= 10:  # Show top values if not too many
        print(f"Values: {df_airbnb[col].unique()[:10]}")  # Show first 10
    print()

neighbourhood_cleansed: 62 unique values

property_type: 54 unique values

room_type: 4 unique values
Values: ['Entire home/apt' 'Private room' 'Shared room' 'Hotel room']



# Standardization

Since the df will be use to guess prices, i'll standardize my values for better results !