### Importing all the required libraries

In [196]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [197]:
# importing the dataset
df = pd.read_csv("../data/house_details_v3.csv")

df.shape

(19976, 31)

### Cleaning the dataset and removing null values

In [198]:
# dropping the rows with 0 price and 0 habitable surface
df = df[(df["Price"] != 0)&(df["Habitable surface"] != 0.0)]
df.shape

(17952, 31)

In [199]:
# printing the head
df.head()

Unnamed: 0,id,Street,Housenumber,Box,Floor,City,Postalcode,Type,Subtype,Lacation area,...,Terrace,Garden,Garden surface,Facades,SwimmingPool,Condition,EPC score,Latitude,Longitude,Property url
0,10683816,Egmont Park,11,0,2.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
1,10683823,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,0,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
2,10683826,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
3,10683825,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
4,10683824,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...


In [200]:
# information about the columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17952 entries, 0 to 19975
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 17952 non-null  int64  
 1   Street             17952 non-null  object 
 2   Housenumber        17952 non-null  object 
 3   Box                17952 non-null  object 
 4   Floor              17952 non-null  float64
 5   City               17952 non-null  object 
 6   Postalcode         17952 non-null  object 
 7   Type               17952 non-null  object 
 8   Subtype            17952 non-null  object 
 9   Lacation area      17952 non-null  object 
 10  Region             17952 non-null  object 
 11  District           17952 non-null  object 
 12  Province           17952 non-null  object 
 13  Price              17952 non-null  float64
 14  Type of sale       17952 non-null  object 
 15  Construction year  17952 non-null  float64
 16  Bedroom Count      17952 no

In [201]:
# info about the different postal codes and its value counts 
df["Postalcode"].value_counts()

Postalcode
1180        516
1000        488
8300        379
1050        366
9000        360
           ... 
4506 JH       1
8851          1
8125-404      1
204 00        1
7973          1
Name: count, Length: 995, dtype: int64

In [202]:
# columns that don't contribute to the prediction 
columns_to_drop = ["id","Street","City","Region","District","Province", "Housenumber", "Box", "Floor","Lacation area", "Type", "Type of sale", "Construction year", "EPC score","Latitude", "Longitude", "Property url" ]

In [203]:
# dropping the above column from the dataset
df.drop(columns=columns_to_drop, inplace=True)
df.columns


Index(['Postalcode', 'Subtype', 'Price', 'Bedroom Count', 'Habitable surface',
       'Kitchen type', 'Furnished', 'Fireplace', 'Terrace', 'Garden',
       'Garden surface', 'Facades', 'SwimmingPool', 'Condition'],
      dtype='object')

In [204]:
#Rename the column names
df.rename(columns={'Postalcode': 'postalcode', 'Subtype' : 'subtype', 'Price' : 'price',
       'Bedroom Count' : 'bedroom_count', 'Habitable surface' : 'habitable_surface', 'Kitchen type': 'kitchen_type','Furnished':'furnished','Fireplace':'fireplace','Terrace' : 'terrace','Garden' : 'garden', 'Garden surface' : 'garden_surface', 'SwimmingPool' : 'swimmingpool', 'Facades' : 'facades','Condition' : 'condition', 'Price' : 'price'}, inplace=True)
#df.rename(columns={'Habitable surface': 'Habitable_surface'}, inplace=True)
df.columns

Index(['postalcode', 'subtype', 'price', 'bedroom_count', 'habitable_surface',
       'kitchen_type', 'furnished', 'fireplace', 'terrace', 'garden',
       'garden_surface', 'facades', 'swimmingpool', 'condition'],
      dtype='object')

In [205]:
# drop the rows with no improper postalcode and with no region and province
#df.drop(df[lendf["postalcode"] != 4].index, inplace = True)

#df['postalcode']=df['postalcode'].astype(str)
desired_length = 4
df['postalcode'] = df['postalcode'][df['postalcode'].str.len() == desired_length]
df['postalcode'].info()




<class 'pandas.core.series.Series'>
Index: 17952 entries, 0 to 19975
Series name: postalcode
Non-Null Count  Dtype 
--------------  ----- 
17848 non-null  object
dtypes: object(1)
memory usage: 280.5+ KB


In [206]:
# removing the columns with bedroom_count more than 15
df['bedroom_count'] = df['bedroom_count'][df['bedroom_count'] <= 15]
df['bedroom_count'].value_counts()

bedroom_count
3.0     5809
2.0     5052
4.0     2936
1.0     1603
5.0     1275
6.0      538
0.0      328
7.0      171
8.0      104
9.0       46
10.0      36
12.0      13
11.0      10
15.0       5
13.0       3
14.0       1
Name: count, dtype: int64

In [207]:
df['kitchen_type'].value_counts()

kitchen_type
INSTALLED             5933
0                     4112
HYPER_EQUIPPED        2800
SEMI_EQUIPPED         2015
USA_HYPER_EQUIPPED    1607
NOT_INSTALLED          804
USA_INSTALLED          502
USA_SEMI_EQUIPPED      147
USA_UNINSTALLED         32
Name: count, dtype: int64

In [208]:
df['furnished'].value_counts()

furnished
False    8781
0        8582
True      589
Name: count, dtype: int64

In [209]:
df['furnished'] = df['furnished'].replace('0', False)
df["furnished"] = df['furnished'].astype(bool)

df['furnished'].value_counts()

furnished
True     9370
False    8582
Name: count, dtype: int64

In [210]:
df["furnished"] = df['furnished'].astype(int)
df['furnished'].value_counts()

furnished
1    9370
0    8582
Name: count, dtype: int64

In [211]:
df['fireplace'] = df['fireplace'].astype(int)
df['fireplace'].value_counts()

fireplace
0    12099
1     5853
Name: count, dtype: int64

In [212]:
df['terrace'].value_counts()

terrace
True    12426
0        5526
Name: count, dtype: int64

In [213]:
df['terrace'] = df['terrace'].replace('0', False)
df["terrace"] = df['terrace'].astype(bool)
df["terrace"] = df['terrace'].astype(int)

df['terrace'].value_counts()

terrace
1    12426
0     5526
Name: count, dtype: int64

In [214]:
df = df[~((df['garden']!='0')&(df['garden_surface']==0))]
df = df[~((df['garden']=='0')&(df['garden_surface']!=0))]
df.shape

(12274, 14)

In [215]:

df['garden_surface'].value_counts()

garden_surface
0.0       6939
170.0       35
220.0       33
200.0       33
180.0       32
          ... 
1409.0       1
1673.0       1
1936.0       1
169.0        1
1778.0       1
Name: count, Length: 1698, dtype: int64

In [216]:
df['garden'] = df['garden'].replace('0', False)
df["garden"] = df['garden'].astype(bool)
df["garden"] = df['garden'].astype(int)


In [217]:
df['garden'].value_counts()

garden
0    6939
1    5335
Name: count, dtype: int64

In [218]:
# Replacing all the 0's to False in the terrace column and changing it to bool datatype
df['swimmingpool'] = df['swimmingpool'].replace('0', False)
df["swimmingpool"] = df['swimmingpool'].astype(bool)
df["swimmingpool"] = df['swimmingpool'].astype(int)

In [219]:
df['swimmingpool'].value_counts()

swimmingpool
1    6595
0    5679
Name: count, dtype: int64

In [220]:
df['condition'].value_counts()

condition
GOOD              4111
AS_NEW            3262
0                 2029
TO_BE_DONE_UP     1082
TO_RENOVATE       1029
JUST_RENOVATED     704
TO_RESTORE          57
Name: count, dtype: int64

In [221]:
df['condition'] = df['condition'].replace('0', 'NOT_KNOWN')


In [222]:
df['condition'].value_counts()

condition
GOOD              4111
AS_NEW            3262
NOT_KNOWN         2029
TO_BE_DONE_UP     1082
TO_RENOVATE       1029
JUST_RENOVATED     704
TO_RESTORE          57
Name: count, dtype: int64

In [223]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12274 entries, 0 to 19975
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   postalcode         12191 non-null  object 
 1   subtype            12274 non-null  object 
 2   price              12274 non-null  float64
 3   bedroom_count      12263 non-null  float64
 4   habitable_surface  12274 non-null  float64
 5   kitchen_type       12274 non-null  object 
 6   furnished          12274 non-null  int64  
 7   fireplace          12274 non-null  int64  
 8   terrace            12274 non-null  int64  
 9   garden             12274 non-null  int64  
 10  garden_surface     12274 non-null  float64
 11  facades            12274 non-null  float64
 12  swimmingpool       12274 non-null  int64  
 13  condition          12274 non-null  object 
dtypes: float64(5), int64(5), object(4)
memory usage: 1.4+ MB


In [239]:
df.columns

Index(['postalcode', 'subtype', 'price', 'bedroom_count', 'habitable_surface',
       'kitchen_type', 'furnished', 'fireplace', 'terrace', 'garden',
       'garden_surface', 'facades', 'swimmingpool', 'condition'],
      dtype='object')

In [224]:
print(df.isna().sum())

postalcode           83
subtype               0
price                 0
bedroom_count        11
habitable_surface     0
kitchen_type          0
furnished             0
fireplace             0
terrace               0
garden                0
garden_surface        0
facades               0
swimmingpool          0
condition             0
dtype: int64


In [247]:
df['condition'].value_counts()

condition
GOOD              4111
AS_NEW            3262
NOT_KNOWN         2029
TO_BE_DONE_UP     1082
TO_RENOVATE       1029
JUST_RENOVATED     704
TO_RESTORE          57
Name: count, dtype: int64

### Creating a pipeline for the model training and testing

In [225]:
# dividing the columns to prediction value and features
X = df.drop(columns=["price"]).to_numpy()
y = df.price.to_numpy().reshape(-1, 1)
# Shapes of X and y
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

#Splitting the train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
# Shapes of diffrent train and test sets
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)
print(X_train)

Shape of X:  (12274, 13)
Shape of y:  (12274, 1)
Shape of X_train:  (9819, 13)
Shape of y_train:  (9819, 1)
Shape of X_test:  (2455, 13)
Shape of y_test:  (2455, 1)
[['6941' 'HOUSE' 6.0 ... 3.0 1 'GOOD']
 ['2960' 'DUPLEX' 4.0 ... 3.0 1 'GOOD']
 ['8370' 'APARTMENT' 3.0 ... 0.0 1 'JUST_RENOVATED']
 ...
 ['4340' 'APARTMENT' 1.0 ... 4.0 0 'NOT_KNOWN']
 ['6920' 'HOUSE' 4.0 ... 3.0 1 'TO_BE_DONE_UP']
 ['3271' 'HOUSE' 2.0 ... 4.0 1 'AS_NEW']]


In [226]:
# first transformation for the pipeline which is SimpleImputer
trans_1 = ColumnTransformer([('simple_imputer',SimpleImputer(strategy='most_frequent'),[0,1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12])], remainder='passthrough' )

In [227]:
# second transformation for the pipeline which is OneHotEncoder
trans_2 = ColumnTransformer([('ohe_trans',OneHotEncoder(sparse_output=False,handle_unknown='ignore'), [0,1,4,12])], remainder='passthrough' )

In [228]:
# third transformation for the pipeline which is scaling
trans_3 = ColumnTransformer([('scale', StandardScaler(), slice(0, 12))], remainder='passthrough')

In [229]:
# fourth transformation for the pipeline which is creating an instance from linear models
trans_4 = RandomForestClassifier()

In [230]:
# creating a pipeline with above transformer
pipe = Pipeline([('trans_1',trans_1),
                 ('trans_2',trans_2),
                 ('trans_3',trans_3),
                 ('trans_4',trans_4)])

In [231]:
# Training the model
pipe.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [232]:
# printing the score
pipe.score(X_train, y_train)

0.9871677360219981

In [233]:
# predicting thr test dataset
y_pred = pipe.predict(X_test)

In [234]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)

In [235]:
score

0.2790224032586558

In [236]:
pipe.score(X_test, y_test)

0.2790224032586558

### Exporting the pipeline

In [237]:
# export
import pickle
pickle_out = open('../models/my_model.pkl', 'wb')
pickle.dump(pipe, pickle_out)
pickle_out.close()


In [None]:
import bz2file as bz2
def compressed_pickle(title, data):

with bz2.BZ2File('my_‘.pbz2’, ‘w’) as f:
    pickle.dump(data, f)


In [240]:
'''['postalcode', 'subtype', 'price', 'bedroom_count', 'habitable_surface',
       'kitchen_type', 'furnished', 'fireplace', 'terrace', 'garden',
       'garden_surface', 'facades', 'swimmingpool', 'condition']'''
pipe.predict([["9000","HOUSE",4,110,"HYPER_EQUIPPED",1,0,0,1,20,2,0,"GOOD"]])

array([469000.])

In [244]:
from pydantic import BaseModel
import json

# Class with required values for the prediction
'''['postalcode', 'subtype', 'price', 'bedroom_count', 'habitable_surface',
       'kitchen_type', 'furnished', 'fireplace', 'terrace', 'garden',
       'garden_surface', 'facades', 'swimmingpool', 'condition']'''
class HouseDetails(BaseModel):
    postalcode : "9000"
    subtype : "HOUSE"
    bedroom_count : 2
    habitable_surface : 112
    kitchen_type : "HYPER_EQUIPPED"
    furnished : 1
    fireplace :0
    terrace: 0
    garden: 1
    garden_surface : 25
    facades : 2
    swimmingpool : 0
    condition : "GOOD"


    
def prediction(input_data : HouseDetails):
    input_data = json.loads(input_data.model_dump())
    print(input_data)

prediction()

TypeError: Forward references must evaluate to types. Got 9000.