1) INSTALL DEPENDENCIES

In [64]:
!pip install xgboost




2) LOAD DATA

In [116]:
import pandas as pd

df = pd.read_csv("/content/Airbnb_Data.csv", engine="python")


3) AMENITIES FEATURE

In [117]:
df['amenities_count'] = df['amenities'].apply(
    lambda x: x.count(',') if isinstance(x, str) else 0
)
df = df.drop(columns=['amenities'], errors='ignore')


4) BASIC CLEANING

In [118]:
for col in ['bathrooms', 'bedrooms', 'beds']:
    df[col] = df[col].fillna(df[col].median())

df['review_scores_rating'] = df['review_scores_rating'].fillna(0)
df = df.drop(columns=['thumbnail_url'], errors='ignore')

5) DATE FEATURES

In [119]:
date_cols = ['host_since', 'first_review', 'last_review']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

ref_date = df['last_review'].max()

df['host_experience_days'] = (ref_date - df['host_since']).dt.days.fillna(0)
df['days_since_last_review'] = (ref_date - df['last_review']).dt.days.fillna(0)

df = df.drop(columns=date_cols)

6) LOCATION & CITY FEATURES

In [120]:
from sklearn.cluster import KMeans
df['location_cluster'] = KMeans(n_clusters=25, random_state=42)\
    .fit_predict(df[['latitude', 'longitude']])

city_avg = df.groupby('city')['log_price'].mean()
df['city_avg_price'] = df['city'].map(city_avg)

7) HOST RESPONSE & RATE CLEANING

In [121]:
df['host_response_rate'] = (
    df['host_response_rate']
    .astype(str)
    .str.replace('%', '', regex=False)
    .replace('nan', '0')
    .astype(float)
)

8)  BOOLEAN CLEANING

In [122]:
bool_cols = [
    'instant_bookable','host_has_profile_pic',
    'host_identity_verified','cleaning_fee'
]

for col in bool_cols:
    df[col] = (
        df[col].astype(str).str.lower()
        .map({'t':1,'true':1,'yes':1,'f':0,'false':0,'no':0})
        .fillna(0).astype(int)
    )

9) DROP UNUSED COLUMNS

In [123]:
df = df.drop(columns=['id','name','description','zipcode','neighbourhood'], errors='ignore')

10) FEATURE LISTS

In [124]:
numeric_features = [
    'accommodates','bathrooms','bedrooms','beds',
    'latitude','longitude','number_of_reviews',
    'review_scores_rating','host_experience_days',
    'days_since_last_review','amenities_count',
    'city_avg_price','host_response_rate'
]

categorical_features = [
    'property_type','room_type','bed_type',
    'cancellation_policy','city','location_cluster'
]

boolean_features = [
    'cleaning_fee','instant_bookable',
    'host_has_profile_pic','host_identity_verified'
]

11) TRAINâ€“TEST SPLIT

In [125]:
X = df.drop(columns=['log_price'])
y = df['log_price']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

12. PREPROCESSOR

In [126]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('bool', 'passthrough', boolean_features)
])

13. XGBOOST MODEL

In [127]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

xgb_final = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(
        n_estimators=800,
        learning_rate=0.03,
        max_depth=8,
        min_child_weight=3,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.1,
        reg_lambda=1.0,
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1
    ))
])

xgb_final.fit(X_train, y_train)
y_pred = xgb_final.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

rmse, mae, r2

(np.float64(0.37086995688107804), 0.2672973559165455, 0.7322608704214877)

In [128]:
import joblib

joblib.dump(xgb_final, "airbnb_price_model.pkl")


['airbnb_price_model.pkl']