### Importing all the required libraries

In [30]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline,make_pipeline

In [31]:
df = pd.read_csv("../data/house_details_v3.csv")

df.shape

(19976, 31)

In [32]:
df = df[(df["Price"] != 0)&(df["Habitable surface"] != 0.0)]
df.shape

(17952, 31)

In [33]:
df.head()

Unnamed: 0,id,Street,Housenumber,Box,Floor,City,Postalcode,Type,Subtype,Lacation area,...,Terrace,Garden,Garden surface,Facades,SwimmingPool,Condition,EPC score,Latitude,Longitude,Property url
0,10683816,Egmont Park,11,0,2.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
1,10683823,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,0,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
2,10683826,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
3,10683825,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...
4,10683824,Egmont Park,11,0,1.0,Gavere,9890,APARTMENT,APARTMENT,URBAN,...,True,0,0.0,4.0,False,AS_NEW,A,50.928407,3.679605,https://www.immoweb.be/en/classified/apartment...


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17952 entries, 0 to 19975
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 17952 non-null  int64  
 1   Street             17952 non-null  object 
 2   Housenumber        17952 non-null  object 
 3   Box                17952 non-null  object 
 4   Floor              17952 non-null  float64
 5   City               17952 non-null  object 
 6   Postalcode         17952 non-null  object 
 7   Type               17952 non-null  object 
 8   Subtype            17952 non-null  object 
 9   Lacation area      17952 non-null  object 
 10  Region             17952 non-null  object 
 11  District           17952 non-null  object 
 12  Province           17952 non-null  object 
 13  Price              17952 non-null  float64
 14  Type of sale       17952 non-null  object 
 15  Construction year  17952 non-null  float64
 16  Bedroom Count      17952 no

In [35]:
df["Postalcode"].value_counts()

Postalcode
1180        516
1000        488
8300        379
1050        366
9000        360
           ... 
4506 JH       1
8851          1
8125-404      1
204 00        1
7973          1
Name: count, Length: 995, dtype: int64

In [36]:
columns_to_drop = ["id","Street", "Housenumber", "Box", "Floor","Lacation area", "Subtype", "Type of sale", "Construction year", "Kitchen type", "Furnished", "Fireplace", "Garden", "Garden surface", "SwimmingPool", "Condition", "EPC score","Latitude", "Longitude", "Property url" ]

In [37]:
df.drop(columns=columns_to_drop, inplace=True)
df.head()

Unnamed: 0,City,Postalcode,Type,Region,District,Province,Price,Bedroom Count,Habitable surface,Terrace,Facades
0,Gavere,9890,APARTMENT,FLANDERS,Gent,East Flanders,269000.0,1.0,76.0,True,4.0
1,Gavere,9890,APARTMENT,FLANDERS,Gent,East Flanders,265000.0,1.0,76.0,True,4.0
2,Gavere,9890,APARTMENT,FLANDERS,Gent,East Flanders,316000.0,2.0,97.0,True,4.0
3,Gavere,9890,APARTMENT,FLANDERS,Gent,East Flanders,356000.0,3.0,112.0,True,4.0
4,Gavere,9890,APARTMENT,FLANDERS,Gent,East Flanders,366000.0,2.0,126.0,True,4.0


In [38]:
# drop the rows with no improper postalcode and with no region and province
df.drop(df[df["Region"] == '0'].index, inplace = True)
df['Region'].value_counts()




Region
FLANDERS    9199
WALLONIE    5123
BRUSSELS    3525
Name: count, dtype: int64

In [39]:
def convert_postalcode(postalcode):
    """function to convert the postal code to integer and handel value error"""
    try:
        return int(postalcode)
    except ValueError:
        return None

# applying the above function to the column "Postal code "
df["Postalcode"] = df["Postalcode"].apply(convert_postalcode)
df["Postalcode"] = df["Postalcode"].astype(str)

In [40]:
df["Postalcode"].value_counts()

Postalcode
1180    516
1000    488
8300    379
1050    366
9000    360
       ... 
3668      1
8851      1
2431      1
3473      1
7973      1
Name: count, Length: 922, dtype: int64

In [41]:
df["Terrace"].value_counts()

Terrace
True    12381
0        5466
Name: count, dtype: int64

In [42]:

df['Terrace'] = df['Terrace'].replace('0', False)
df["Terrace"] = df['Terrace'].astype(bool)


In [43]:
X = df.drop(columns=["Price"]).to_numpy()
y = df.Price.to_numpy().reshape(-1, 1)
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

#Splitting the train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)

Shape of X:  (17847, 10)
Shape of y:  (17847, 1)
Shape of X_train:  (14277, 10)
Shape of y_train:  (14277, 1)
Shape of X_test:  (3570, 10)
Shape of y_test:  (3570, 1)


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17847 entries, 0 to 19975
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   City               17847 non-null  object 
 1   Postalcode         17847 non-null  object 
 2   Type               17847 non-null  object 
 3   Region             17847 non-null  object 
 4   District           17847 non-null  object 
 5   Province           17847 non-null  object 
 6   Price              17847 non-null  float64
 7   Bedroom Count      17847 non-null  float64
 8   Habitable surface  17847 non-null  float64
 9   Terrace            17847 non-null  bool   
 10  Facades            17847 non-null  float64
dtypes: bool(1), float64(4), object(6)
memory usage: 1.5+ MB


In [46]:
# list of columns of categorical values
categ = ["City","Postalcode", "Type", "Region", "District", "Province"]

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')
encoded_data = ohe.fit_transform(X_train)


#print(encoded_data.shape)
regressor = LinearRegression()
regressor.fit(encoded_data, y_train)


#ohe.fit_transform(y_train[categ])



In [None]:
print(encoded_data.shape)

(14277, 3592)


In [47]:
print(encoded_data.)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
regressor.score(encoded_data, y_train)

0.8086489204173686

In [None]:
regressor.coef_

array([[ 2.45191237e+05,  6.85497658e+18, -6.38673843e+16, ...,
        -4.83698125e+04,  3.97442500e+04,  6.89501744e+15]])

In [None]:

"""X_test_trans = ohe.fit_transform(X_test)
print(X_test_trans.shape)"""

y_pred = regressor.predict(X_test)
print(y_pred)

ValueError: could not convert string to float: 'Ans'

In [None]:
y_pred = regressor.predict(X_test_trans)
regressor.fit(X_test_trans, y_test)