Pipline for price prediction for Wrocław

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
import pickle

data = pd.read_csv("./../data/processed/1_analysis_rent.csv")

# Data cleaning
data = data[data["city"] == "wroclaw"]
#print(data.columns)
columns_to_drop = ["Unnamed: 0", "id", "city", "latitude", "longitude", "Period","poiCount",'schoolDistance', 'clinicDistance', 'postOfficeDistance',
       'kindergartenDistance', 'restaurantDistance', 'collegeDistance',
       'pharmacyDistance', 'ownership','buildingAge', "condition"]
target = "price"
x = data.drop([target] + columns_to_drop, axis=1)
y = data[target]

# Feature types
numerical = [
    "squareMeters",
    "rooms",
    "floor",
    "floorCount",
    "centreDistance",
]
categorical = [
    "buildingMaterial",
    "hasParkingSpace",
    "hasBalcony",
    "hasElevator",
    "hasSecurity",
    "hasStorageRoom",
]

print(x.columns)
for i in x.columns:
    print(x[i].unique())

# Preprocessing
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numerical),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ]
)

# Pipeline

pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ("model",XGBRegressor(
        n_estimators=100,
        max_depth=7,
        learning_rate=0.05,
        random_state=99,
        subsample=0.6
    ))
])
# Fit for pipeline
pipeline.fit(x,y)

with open("./../ML_models/XGB_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)



Index(['type', 'squareMeters', 'rooms', 'floor', 'floorCount', 'buildYear',
       'centreDistance', 'buildingMaterial', 'hasParkingSpace', 'hasBalcony',
       'hasElevator', 'hasSecurity', 'hasStorageRoom'],
      dtype='object')
['NoData' 'blockOfFlats' 'tenement' 'apartmentBuilding']
[ 50.    38.    56.    44.    61.    49.    80.    59.    47.    26.
  53.    55.    35.    35.3   30.    78.    37.    36.    70.6   52.
  33.    49.81  72.    46.    48.    42.6   45.    43.5   41.    37.56
  64.    44.91  51.    39.    58.    42.5   64.1   54.    43.    39.7
  40.    45.5   65.38  41.82  38.7  103.    90.    63.    32.    74.
  94.    53.17  60.    99.    32.01  42.    75.58  84.    65.    26.29
  31.    66.    70.   142.9   83.   106.    67.    89.    88.    68.
  75.57  51.5   75.    28.    27.    29.    34.    68.66  61.5   25.
 109.    71.71  52.58  57.    40.97  37.4   98.    35.08  60.61  39.6
  65.63  45.02  66.45  52.07  85.    49.08  47.5   37.5   62.3   63.5
  91.    76.  

Test ML model

In [2]:
import pickle
import pandas as pd

with open("./../ML_models/XGB_pipeline.pkl","rb") as f:
    model=pickle.load(f)

input_data = pd.DataFrame([{
    "squareMeters": 42.0,
    "rooms": 4,
    "floor": 4,
    "floorCount": 5,
    "buildYear": 2000,
    "centreDistance": 4.3,
    "type": "apartment",
    "buildingMaterial": "brick",
    'pharmacyDistance':0.1,
    'hasBalcony': "Yes",
    'hasSecurity': "Yes", 
    'ownership': "condominium",
    'restaurantDistance': 0.1, 
    'postOfficeDistance':1.0, 
    'schoolDistance':0.3, 
    'clinicDistance':3, 
    'kindergartenDistance':1.0,
    'condition':"premium",
    'hasParkingSpace':"yes",
    'hasStorageRoom':"no",
    'buildingAge':25,
    'hasElevator':"yes",
    'collegeDistance':1.5
}])

predicted_price = model.predict(input_data)[0]
print(f"Predicted price: {predicted_price:,.2f} PLN")


Predicted price: 3,146.21 PLN
