In [1]:
import numpy as np
import pandas as pd 


In [35]:
# Load the apartment listings; the CSV's "Index" column is used as the row index.
df = pd.read_csv(
    "./appartment.csv",
    index_col="Index",
)

In [36]:
# Show the loaded frame as this cell's output (a bare last expression is rendered by the notebook).
df

Unnamed: 0_level_0,City,Country,Apartment_Surface_Area_SQM,Building_Date,Room_Price_EUR
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,New York,USA,80,2000,274480
2,New York,USA,100,2010,343100
3,New York,USA,20,1995,114980
4,New York,USA,35,2008,160045
5,San Francisco,USA,75,2005,257325
...,...,...,...,...,...
94,Kribi,Cameroon,80,2017,274480
95,Kribi,Cameroon,20,2004,103740
96,Kribi,Cameroon,38,2019,172710
97,Hamburg,Germany,70,2016,240090


In [37]:
# A notebook cell only renders its LAST expression, so with `df.describe()` on a
# line of its own followed by `df.columns`, the summary statistics were computed
# and then thrown away. Print the column index explicitly and leave the summary
# as the final expression so both results are visible.
print(df.columns)
df.describe()

Index(['City', 'Country', 'Apartment_Surface_Area_SQM', 'Building_Date',
       'Room_Price_EUR'],
      dtype='object')

In [71]:
# Baseline model: predict the price from surface area and building date only.
# The categorical columns (City, Country) are left out, so no encoder is needed.
# NOTE(review): in the recorded output row 98 has Room_Price_EUR = 274, while other
# 80 sqm rows show 274480 -- this looks like a data-entry error; confirm and clean
# the row before trusting the fitted model.
X = df.loc[:, ['Apartment_Surface_Area_SQM', 'Building_Date']]
Y = df.loc[:, 'Room_Price_EUR']
print(X, Y, sep="\n")


       Apartment_Surface_Area_SQM  Building_Date
Index                                           
1                              80           2000
2                             100           2010
3                              20           1995
4                              35           2008
5                              75           2005
...                           ...            ...
94                             80           2017
95                             20           2004
96                             38           2019
97                             70           2016
98                             80           2020

[98 rows x 2 columns]
Index
1     274480
2     343100
3     114980
4     160045
5     257325
       ...  
94    274480
95    103740
96    172710
97    240090
98       274
Name: Room_Price_EUR, Length: 98, dtype: int64


In [72]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression

# Hold out 20% of the rows for the final evaluation; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=77,
)

# Estimator instance used for cross-validation here and fitted in a later cell.
roomPriceModel = LinearRegression()

# 5-fold cross-validation on the training split only, scored with R-squared.
cross_val_scores = cross_val_score(
    roomPriceModel,
    X_train,
    Y_train,
    cv=5,
    scoring='r2',
)
# NOTE(review): the recorded output shows the first fold at ~0.30 while the other
# four are ~0.97 -- presumably one fold contains an outlier row; worth investigating.
print("R-squared scores for each fold (cross-validation):", cross_val_scores)





R-squared scores for each fold (cross-validation): [0.29619463 0.97485021 0.96755908 0.97820217 0.96479434]


In [80]:
# Fit the linear regression on the training split (features X_train, target Y_train).
roomPriceModel.fit(X=X_train, y=Y_train)



In [81]:
# R-squared of the fitted model on the data it was trained on and on the held-out split.
train_set_score = roomPriceModel.score(X_train, Y_train)
test_set_score = roomPriceModel.score(X_test, Y_test)

# Predictions on the held-out split, kept in Y_pred for the metric computations
# in the evaluation cell.
Y_pred = roomPriceModel.predict(X_test)

# NOTE(review): the recorded output has the test score (~0.98) above the train
# score (~0.85), which is unusual -- presumably an outlier landed in the training
# split; confirm against the data.
print("R-squared score on the train set:", train_set_score)
print("R-squared score on the test set:", test_set_score)


R-squared score on the train set: 0.8546477210731518
R-squared score on the test set: 0.9787959274513923


In [82]:
# Evaluate the room price model 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Error metrics of the held-out predictions (Y_pred) against the true prices (Y_test).
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
# RMSE is derived from the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` parameter was deprecated
# in scikit-learn 1.4 and removed in 1.6, so the old call fails on current
# releases. The square root gives the same value on every version.
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, Y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

Mean Absolute Error (MAE): 8670.53790465023
Mean Squared Error (MSE): 110369158.63301699
Root Mean Squared Error (RMSE): 10505.672688267849
R-squared (R2): 0.9787959274513923


In [86]:
# Play with your model 


# Two hand-made apartments (surface in SQM, building year), using the same
# column names and order as the training features.
myApartment = pd.DataFrame(
    [[40, 2022], [20, 2000]],
    columns=['Apartment_Surface_Area_SQM', 'Building_Date'],
)
prediction = roomPriceModel.predict(myApartment)
print(prediction)

[156828.01624582 110358.22943254]


In [112]:
from skl2onnx import to_onnx
from skl2onnx.common.data_types import FloatTensorType


# Declare the ONNX graph input: one float tensor named
# 'room_size_and_building_date' with a free batch dimension and 2 feature columns.
# (Only the input type is declared here; no output types are specified.)
initial_type = [('room_size_and_building_date', FloatTensorType([None, 2]))]

# Convert the fitted scikit-learn roomPriceModel to ONNX format.
# The type list is passed by keyword: the second positional parameter of
# to_onnx is the sample-data argument `X`, not `initial_types`, so the previous
# positional call relied on the list being reinterpreted by the library.
onnxRoomPriceModel = to_onnx(roomPriceModel, initial_types=initial_type)

# Serialize the converted model to disk.
with open("roomPriceModel.onnx", "wb") as f:
    f.write(onnxRoomPriceModel.SerializeToString())

# Report success only after the `with` block has closed (and flushed) the file.
print("ONNX model saved as 'roomPriceModel.onnx'")


ONNX model saved as 'roomPriceModel.onnx'


In [121]:
# test the onnx model with onnx runtime 
import onnxruntime as ort

# Load the exported model into an ONNX Runtime session restricted to the CPU provider.
oSession = ort.InferenceSession(
    'roomPriceModel.onnx',
    providers=['CPUExecutionProvider'],
)

# Same two apartments as in the scikit-learn prediction cell, as float32 to match
# the FloatTensorType input declared at export time.
input_data = np.array(
    [
        [40, 2022],
        [20, 2000],
    ],
    dtype=np.float32,
)

# The feed key must match the input name given when the model was converted.
output = oSession.run([], {"room_size_and_building_date": input_data})

# The first returned array holds the predicted prices.
predicted_price = output[0]
print(predicted_price)



[[156828.06]
 [110358.25]]
