In [1]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import joblib
import boto3
from io import BytesIO



In [23]:
# Raw rental-ads export, read straight from the lab bucket.
# NOTE(review): reading an s3:// URL through pandas presumably relies on s3fs
# being installed and AWS credentials being available — confirm.
file_path = "s3://udemy-ds-lab/house_rentals.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [24]:
# Keep only the "Locations" ads. The explicit .copy() makes this an independent
# frame, so adding a column below does not write into a slice of the raw data
# (the pattern that triggers pandas' SettingWithCopyWarning).
df = df[df["ADS_CATEGORY_NAME"] == "Locations"].copy()

# Price per unit of area.
df["ADS_PRICE_SQUARE"] = df["ADS_PRICE"] / df["ADS_ATTR_SQUARE"]

# One group per (zipcode, city, real-estate type); for each group compute the
# number of ads and the median per-area price, then attach both statistics
# back onto every ad belonging to that group.
group_keys = ["ADS_GEO_ZIPCODE", "ADS_GEO_CITY", "ADS_ATTR_REAL_ESTATE_TYPE"]
df_grouped = df.groupby(group_keys)
df_aggregated = df_grouped.agg(
    COUNT_ADS=("ADS_ID", "count"),
    MED_PRICE=("ADS_PRICE_SQUARE", "median"),
)
df = df.merge(df_aggregated, on=group_keys, how="inner")

# Share of the ad's per-area price in (price + group median). For positive
# prices, a value in [0.25, 0.75) means the ad is priced between 1/3 and
# 3 times its group's median. Computed once and reused for both bounds.
price_ratio = df["ADS_PRICE_SQUARE"] / (df["ADS_PRICE_SQUARE"] + df["MED_PRICE"])

# Drop outliers: ads priced far from their group median, implausible per-area
# prices or surfaces, and groups with too few ads for a meaningful median.
# The trailing .copy() lets later cells add columns without slice warnings.
df = df[
    (price_ratio >= 0.25) &
    (price_ratio < 0.75) &
    (df["ADS_PRICE_SQUARE"] < 150) &
    (df["ADS_PRICE_SQUARE"] > 0) &
    (df["ADS_ATTR_SQUARE"] >= 9) &
    (df["ADS_ATTR_SQUARE"] <= 300) &
    (df["COUNT_ADS"] >= 5)
].copy()

In [6]:
def encode_attributes(df):
    """Return the numeric modelling frame derived from the ads frame.

    Two categorical columns are turned into small integer codes:

    * ``ADS_ATTR_FURNISHED``        -> ``ADS_ATTR_FURNISHED_NUM``
      ("Meublé" = 2, "Non meublé" = 1, anything else = 0)
    * ``ADS_ATTR_REAL_ESTATE_TYPE`` -> ``ADS_ATTR_REAL_ESTATE_TYPE_NUM``
      ("Maison" = 2, "Appartement" = 1, anything else = 0)

    The input frame is left untouched: the encoded columns are built on a new
    frame, so calling this on a filtered slice neither mutates the caller's
    data nor raises a SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ADS_ATTR_FURNISHED``, ``ADS_ATTR_REAL_ESTATE_TYPE`` and
        the pass-through columns ``ADS_GEO_LAT``, ``ADS_GEO_LNG``,
        ``ADS_ATTR_ROOMS``, ``ADS_ATTR_SQUARE`` and ``ADS_PRICE``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the seven modelling columns, in fixed order.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # Unknown or missing values fall back to 0 through dict.get below.
    furnished_codes = {"Meublé": 2, "Non meublé": 1}
    real_estate_codes = {"Maison": 2, "Appartement": 1}

    # assign() returns a new frame instead of writing into the caller's one.
    encoded = df.assign(
        ADS_ATTR_FURNISHED_NUM=df["ADS_ATTR_FURNISHED"].map(
            lambda value: furnished_codes.get(value, 0)
        ),
        ADS_ATTR_REAL_ESTATE_TYPE_NUM=df["ADS_ATTR_REAL_ESTATE_TYPE"].map(
            lambda value: real_estate_codes.get(value, 0)
        ),
    )

    # Select the final columns
    final_columns = [
        "ADS_GEO_LAT", "ADS_GEO_LNG", "ADS_ATTR_ROOMS", "ADS_ATTR_SQUARE",
        "ADS_ATTR_REAL_ESTATE_TYPE_NUM", "ADS_ATTR_FURNISHED_NUM", "ADS_PRICE",
    ]
    return encoded[final_columns]

In [25]:
# Swap the filtered ads frame for its numeric, model-ready projection.
# Gotcha: this cell is not re-runnable on its own result, because the returned
# frame no longer carries the raw categorical columns the encoder reads.
df = encode_attributes(df)
df.head(n=5)

Unnamed: 0,ADS_GEO_LAT,ADS_GEO_LNG,ADS_ATTR_ROOMS,ADS_ATTR_SQUARE,ADS_ATTR_REAL_ESTATE_TYPE_NUM,ADS_ATTR_FURNISHED_NUM,ADS_PRICE
0,50.64416,3.08592,2,50,1,2,850
1,50.6282,3.08117,3,68,1,1,1170
2,50.63334,3.04214,1,20,1,2,395
3,50.63716,3.06284,3,70,1,2,850
4,50.63716,3.06284,2,48,1,1,708


In [26]:
# Column to predict; every other column of the encoded frame is a feature.
target = "ADS_PRICE"
predictors = [column for column in df.columns if column != target]

In [None]:
def train_model(X_train, y_train, n_estimators=250, eval_set=None, verbose=True):
    """Fit a gradient-boosted tree regressor on the training data.

    Parameters
    ----------
    X_train : array-like
        Feature matrix passed to ``XGBRegressor.fit``.
    y_train : array-like
        Target values aligned with ``X_train``.
    n_estimators : int, default 250
        Number of boosting rounds.
    eval_set : list of (X, y) tuples, optional
        Datasets whose RMSE is reported after each boosting round. When
        omitted, the training data itself is monitored. XGBoost still labels
        it ``validation_0`` in the log, so that figure is a training error,
        not a held-out one. Pass ``[(X_val, y_val)]`` to monitor real
        validation error.
    verbose : bool or int, default True
        Forwarded to ``fit``; ``False`` silences the per-round log.

    Returns
    -------
    XGBRegressor
        The fitted model.
    """
    if eval_set is None:
        # Backward-compatible default: monitor the training data.
        eval_set = [(X_train, y_train)]

    model = XGBRegressor(
        booster='gbtree',
        objective='reg:squarederror',
        learning_rate=0.05,
        max_depth=5,
        min_child_weight=1,
        gamma=1,
        # Row and column subsampling per tree.
        subsample=0.75,
        colsample_bytree=0.75,
        scale_pos_weight=1,
        n_jobs=-1,
        verbosity=1,
        n_estimators=n_estimators
    )

    model.fit(X_train, y_train, eval_set=eval_set, verbose=verbose)
    return model

In [14]:
# Hold out 10% of the rows for evaluation. The fixed random_state makes the
# split, and therefore the score reported further down, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df[predictors], df[target], test_size=0.1, random_state=42
)
xgb_model = train_model(X_train, y_train)

[0]	validation_0-rmse:605.95617
[1]	validation_0-rmse:577.23892
[2]	validation_0-rmse:550.33712
[3]	validation_0-rmse:524.83629
[4]	validation_0-rmse:500.65446
[5]	validation_0-rmse:477.71842
[6]	validation_0-rmse:456.51855
[7]	validation_0-rmse:436.20705
[8]	validation_0-rmse:417.20453
[9]	validation_0-rmse:398.56055
[10]	validation_0-rmse:381.21341
[11]	validation_0-rmse:364.60241
[12]	validation_0-rmse:348.79119
[13]	validation_0-rmse:334.33763
[14]	validation_0-rmse:320.18251
[15]	validation_0-rmse:307.14939
[16]	validation_0-rmse:294.59989
[17]	validation_0-rmse:282.40871
[18]	validation_0-rmse:270.94208
[19]	validation_0-rmse:260.56304
[20]	validation_0-rmse:250.88853
[21]	validation_0-rmse:241.14214
[22]	validation_0-rmse:232.19141
[23]	validation_0-rmse:223.49799
[24]	validation_0-rmse:215.17607
[25]	validation_0-rmse:207.55444
[26]	validation_0-rmse:200.40853
[27]	validation_0-rmse:193.58054
[28]	validation_0-rmse:187.19902
[29]	validation_0-rmse:181.26946
[30]	validation_0-rm

In [16]:
# Predict on the held-out rows; the cell's displayed value is the R² score
# of those predictions against the true prices.
predictions = xgb_model.predict(X_test)
r2_score(y_true=y_test, y_pred=predictions)

0.8246515672767909

In [19]:
# Destination of the serialized model inside the lab bucket.
bucket_name = "udemy-ds-lab"
file_key = "model-artifact/house_rentability.joblib"

s3 = boto3.client(service_name="s3")

# Serialize into an in-memory buffer so nothing is written to local disk.
buffer = BytesIO()
joblib.dump(value=xgb_model, filename=buffer)

# Upload the buffer's bytes; the response dict is the cell's displayed value.
s3.put_object(Body=buffer.getvalue(), Bucket=bucket_name, Key=file_key)

{'ResponseMetadata': {'RequestId': '5VF0MGEZ3YJMBSWR',
  'HostId': 'N+9R/SOHbLck960RicQaeO0wyKLSUkB5slEl0H0bQ+s4n2oYvMYnx1Wuxpk6V+uA7gapGc5HAyE=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'N+9R/SOHbLck960RicQaeO0wyKLSUkB5slEl0H0bQ+s4n2oYvMYnx1Wuxpk6V+uA7gapGc5HAyE=',
   'x-amz-request-id': '5VF0MGEZ3YJMBSWR',
   'date': 'Tue, 03 Oct 2023 13:25:50 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"fc09b4c36bdc62aebc2e7f3552d969dc"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"fc09b4c36bdc62aebc2e7f3552d969dc"',
 'ServerSideEncryption': 'AES256'}