In [14]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import boto3

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor


In [15]:

s3 = boto3.client("s3", region_name="us-east-2")

obj = s3.get_object(
    Bucket="staywise-airbnb-data",
    Key="airbnb/processed/cleaned_airbnb.csv"
)

df = pd.read_csv(obj["Body"])
df.head()


Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,days_since_last_review,neighbourhood_group_Brooklyn,...,neighbourhood_Williamsbridge,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Private room,room_type_Shared room
0,40.64749,-73.97237,149,1,9,0.21,6,365,1900.0,True,...,False,False,False,False,False,False,False,False,True,False
1,40.75362,-73.98377,225,1,45,0.38,2,355,1686.0,False,...,False,False,False,False,False,False,False,False,False,False
2,40.80902,-73.9419,150,3,0,0.0,1,365,4662.0,False,...,False,False,False,False,False,False,False,False,True,False
3,40.68514,-73.95976,89,1,270,4.64,1,194,1641.0,True,...,False,False,False,False,False,False,False,False,False,False
4,40.79851,-73.94399,80,10,9,0.1,1,0,1869.0,False,...,False,False,False,False,False,False,False,False,False,False


In [16]:
X = df.drop("price", axis=1)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling only for Linear Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [17]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    return rmse, mae


In [None]:
import mlflow
mlflow.set_experiment("airbnb_price_prediction")


<Experiment: artifact_location='file:///c:/Users/NItihlash/Downloads/mlruns/623490250443542741', creation_time=1763678560871, experiment_id='623490250443542741', last_update_time=1763678560871, lifecycle_stage='active', name='airbnb_price_prediction', tags={}>

In [25]:
with mlflow.start_run(run_name="LinearRegression"):

    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train)
    preds = lr.predict(X_test_scaled)

    rmse, mae = eval_metrics(y_test, preds)

    mlflow.log_param("model", "LinearRegression")
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE", mae)

    # log model
    mlflow.sklearn.log_model(lr, "linear_regression_model")

    print(f"LinearRegression → RMSE: {rmse:.2f}, MAE: {mae:.2f}")




LinearRegression → RMSE: 94.71, MAE: 56.05


In [26]:
with mlflow.start_run(run_name="RandomForest"):

    rf = RandomForestRegressor(n_estimators=200, random_state=42)
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)

    rmse, mae = eval_metrics(y_test, preds)

    mlflow.log_param("n_estimators", 200)
    mlflow.log_param("model", "RandomForest")
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE", mae)

    mlflow.sklearn.log_model(rf, "random_forest_model")

    print(f"RandomForest → RMSE: {rmse:.2f}, MAE: {mae:.2f}")




RandomForest → RMSE: 89.02, MAE: 50.57


In [27]:
with mlflow.start_run(run_name="XGBoost"):

    xgbm = XGBRegressor(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    xgbm.fit(X_train, y_train)
    preds = xgbm.predict(X_test)

    rmse, mae = eval_metrics(y_test, preds)

    mlflow.log_param("model", "XGBoost")
    mlflow.log_param("n_estimators", 300)
    mlflow.log_param("max_depth", 6)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE", mae)

    mlflow.xgboost.log_model(xgbm, "xgboost_model")

    print(f"XGBoost → RMSE: {rmse:.2f}, MAE: {mae:.2f}")




XGBoost → RMSE: 86.65, MAE: 49.34


In [29]:
import mlflow
import pandas as pd


client = mlflow.tracking.MlflowClient()
experiment_name = "airbnb_price_prediction"
experiment = client.get_experiment_by_name(experiment_name)

runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.RMSE ASC"]
)

runs[["run_id", "metrics.RMSE", "metrics.MAE"]].head()



Unnamed: 0,run_id,metrics.RMSE,metrics.MAE
0,356b1cbefd0049608447462963ddc53e,86.65208,49.337254
1,bbcae1ca39134dc58b53841b7b3e220d,86.65208,49.337254
2,cc863933a4984ea7b5afb489d6fb2f01,86.954286,49.260193
3,5ddccd873b1a4f649c38740f0962ad22,86.954286,49.260193
4,2942b44a14134e5f858c502ec3973cbe,89.020111,50.572292


In [30]:
best_run_id = "356b1cbefd0049608447462963ddc53e"
best_run_id


'356b1cbefd0049608447462963ddc53e'

In [31]:
model_name = "airbnb_best_model"

mlflow.register_model(
    model_uri=f"runs:/{best_run_id}/xgboost_model",
    name=model_name
)


  return FileStore(store_uri)
Successfully registered model 'airbnb_best_model'.
Created version '1' of model 'airbnb_best_model'.


<ModelVersion: aliases=[], creation_timestamp=1763682249422, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1763682249422, metrics=[<Metric: dataset_digest=None, dataset_name=None, key='MAE', model_id='m-711800426f6e466abbee053ad8e9a3c1', run_id='356b1cbefd0049608447462963ddc53e', step=0, timestamp=1763681690275, value=49.33725357055664>,
 <Metric: dataset_digest=None, dataset_name=None, key='RMSE', model_id='m-711800426f6e466abbee053ad8e9a3c1', run_id='356b1cbefd0049608447462963ddc53e', step=0, timestamp=1763681690271, value=86.65208022784277>], model_id='m-711800426f6e466abbee053ad8e9a3c1', name='airbnb_best_model', params={'max_depth': '6', 'model': 'XGBoost', 'n_estimators': '300'}, run_id='356b1cbefd0049608447462963ddc53e', run_link=None, source='models:/m-711800426f6e466abbee053ad8e9a3c1', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [32]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

client.transition_model_version_stage(
    name=model_name,
    version=1,
    stage="Production"
)


  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1763682249422, current_stage='Production', deployment_job_state=None, description=None, last_updated_timestamp=1763682254440, metrics=[<Metric: dataset_digest=None, dataset_name=None, key='MAE', model_id='m-711800426f6e466abbee053ad8e9a3c1', run_id='356b1cbefd0049608447462963ddc53e', step=0, timestamp=1763681690275, value=49.33725357055664>,
 <Metric: dataset_digest=None, dataset_name=None, key='RMSE', model_id='m-711800426f6e466abbee053ad8e9a3c1', run_id='356b1cbefd0049608447462963ddc53e', step=0, timestamp=1763681690271, value=86.65208022784277>], model_id='m-711800426f6e466abbee053ad8e9a3c1', name='airbnb_best_model', params={'max_depth': '6', 'model': 'XGBoost', 'n_estimators': '300'}, run_id='356b1cbefd0049608447462963ddc53e', run_link=None, source='models:/m-711800426f6e466abbee053ad8e9a3c1', status='READY', status_message=None, tags={}, user_id=None, version=1>