### Importing all the required libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline,make_pipeline


In [2]:
# importing the dataset
df = pd.read_csv("../data/house_details_v3.csv")
# printing the shape
df.shape

(19976, 31)

### Cleaning the dataset and removing null values

In [3]:
# dropping the rows with 0 price and 0 habitable surface
df = df[(df["Price"] != 0)&(df["Habitable surface"] != 0.0)]
df.shape

(17952, 31)

In [4]:
# printing the head
df.head()

Unnamed: 0,id,Street,Housenumber,Box,Floor,City,Postalcode,Type,Subtype,Lacation area,...,Terrace,Garden,Garden surface,Facades,SwimmingPool,Condition,EPC score,Latitude,Longitude,Property url
0,10683816,Egmont Park,11,0,2.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
1,10683823,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,0,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
2,10683826,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
3,10683825,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
4,10683824,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...


In [5]:
# information about the columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17952 entries, 0 to 19975
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 17952 non-null  int64  
 1   Street             17952 non-null  object 
 2   Housenumber        17952 non-null  object 
 3   Box                17952 non-null  object 
 4   Floor              17952 non-null  float64
 5   City               17952 non-null  object 
 6   Postalcode         17952 non-null  object 
 7   Type               17952 non-null  object 
 8   Subtype            17952 non-null  object 
 9   Lacation area      17952 non-null  object 
 10  Region             17952 non-null  object 
 11  District           17952 non-null  object 
 12  Province           17952 non-null  object 
 13  Price              17952 non-null  float64
 14  Type of sale       17952 non-null  object 
 15  Construction year  17952 non-null  float64
 16  Bedroom Count      17952 no

In [6]:
# info about the different postal codes and its value counts 
df["Postalcode"].value_counts()

Postalcode
1180        516
1000        488
8300        379
1050        366
9000        360
           ... 
4506 JH       1
8851          1
8125-404      1
204 00        1
7973          1
Name: count, Length: 995, dtype: int64

In [7]:
# columns that don't contribute to the prediction 
columns_to_drop = ["id","Street","City", "Housenumber", "Box", "Floor","Lacation area", "Subtype", "Type of sale", "Construction year", "Kitchen type", "Furnished", "Fireplace", "Garden", "Garden surface", "SwimmingPool", "Condition", "EPC score","Latitude", "Longitude", "Property url" ]

In [8]:
# dropping the above column from the dataset
df.drop(columns=columns_to_drop, inplace=True)


In [9]:
# drop the rows with no improper postalcode and with no region and province
df.drop(df[df["Region"] == '0'].index, inplace = True)
df['Region'].value_counts()




Region
FLANDERS    9199
WALLONIE    5123
BRUSSELS    3525
Name: count, dtype: int64

In [10]:
# Converting the postal code to integer 
def convert_postalcode(postalcode):
    """function to convert the postal code to integer and handel value error"""
    try:
        return int(postalcode)
    except ValueError:
        return None

# applying the above function to the column "Postal code "
df["Postalcode"] = df["Postalcode"].apply(convert_postalcode)
df["Postalcode"] = df["Postalcode"].astype(str)

In [11]:
df["Postalcode"].value_counts()

Postalcode
1180    516
1000    488
8300    379
1050    366
9000    360
       ... 
3668      1
8851      1
2431      1
3473      1
7973      1
Name: count, Length: 922, dtype: int64

In [12]:
df["Terrace"].value_counts()

Terrace
True    12381
0        5466
Name: count, dtype: int64

In [13]:
# Replacing all the 0's to False in the terrace column and changing it to bool datatype
df['Terrace'] = df['Terrace'].replace('0', False)
df["Terrace"] = df['Terrace'].astype(bool)


### Creating a pipeline for the model training and testing

In [14]:
# dividing the columns to prediction value and features
X = df.drop(columns=["Price"]).to_numpy()
y = df.Price.to_numpy().reshape(-1, 1)
# Shapes of X and y
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

#Splitting the train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
# Shapes of diffrent train and test sets
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)
print(X_train[0])

Shape of X:  (17847, 9)
Shape of y:  (17847, 1)
Shape of X_train:  (14277, 9)
Shape of y_train:  (14277, 1)
Shape of X_test:  (3570, 9)
Shape of y_test:  (3570, 1)
['4570' 'HOUSE' 'WALLONIE' 'Huy' 'Liège' 3.0 100.0 True 4.0]


In [15]:
# first transformation for the pipeline which is OneHotEncoder
trans_1 = ColumnTransformer([('ohe_trans',OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [0,1,2,3,4])], remainder='passthrough' )

In [16]:
# second transformation for the pipeline which is scaling
trans_2 = ColumnTransformer([('scale', MinMaxScaler(), slice(0,9))], remainder='passthrough')

In [17]:
# third transformation for the pipeline which is creating an instance from linear models
trans_3 = LinearRegression()

In [18]:
# creating a pipeline with above transformer
pipe = Pipeline([('trans_1',trans_1),
                 ('trans_2',trans_2),
                 ('trans_3',trans_3)])

In [19]:
# Training the model
pipe.fit(X_train, y_train)

In [20]:
# printing the score
pipe.score(X_train, y_train)

0.6390093196539772

In [21]:
# predicting thr test dataset
y_pred = pipe.predict(X_test)

In [22]:
pipe.score(X_test, y_test)

-1576775530260.5044