# House prices pediction using decision forests from scikit-learn

In [151]:
import sklearn as sk
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [152]:
import matplotlib.pyplot as plt

In [153]:
data=pd.read_csv('train.csv')

In [154]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [155]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [208]:

# Summarize missing values
missing_summary = data.isnull().sum().reset_index()
missing_summary.columns = ['Column', 'Missing Values']
missing_summary['% Missing'] = (data.isnull().mean() * 100).values
missing_summary['dtype']=missing_summary.columns.dtype
print(missing_summary[missing_summary['% Missing']>0])

# Output:
#   Column  Missing Values  % Missing
# 0      A               1  33.333333
# 1      B               1  33.333333
# 2      C               3 100.000000



          Column  Missing Values  % Missing   dtype
3    LotFrontage             259  17.739726  object
6          Alley            1369  93.767123  object
25    MasVnrType             872  59.726027  object
26    MasVnrArea               8   0.547945  object
30      BsmtQual              37   2.534247  object
31      BsmtCond              37   2.534247  object
32  BsmtExposure              38   2.602740  object
33  BsmtFinType1              37   2.534247  object
35  BsmtFinType2              38   2.602740  object
42    Electrical               1   0.068493  object
57   FireplaceQu             690  47.260274  object
58    GarageType              81   5.547945  object
59   GarageYrBlt              81   5.547945  object
60  GarageFinish              81   5.547945  object
63    GarageQual              81   5.547945  object
64    GarageCond              81   5.547945  object
72        PoolQC            1453  99.520548  object
73         Fence            1179  80.753425  object
74   MiscFea

In [157]:
list(set(data.dtypes))

[dtype('float64'), dtype('int64'), dtype('O')]

In [158]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

def process(data):
    data=data.drop('Id', axis=1)
    data_cat=data.select_dtypes(include=[('O')])
    data_num=data.select_dtypes(include=[('float64'),('int64')]) 
    
    cat_pipeline=Pipeline(steps=[('impute_cat', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder())])
    num_pipeline=Pipeline(steps=[('impute_num', SimpleImputer(strategy='mean'))])
    
    preprocessor=ColumnTransformer(transformers=[('cat', cat_pipeline, list(data_cat.columns)),('num', num_pipeline, list(data_num.columns) )])
    
    data_proc=preprocessor.fit_transform(data)
    
    newCols=preprocessor.get_feature_names_out()
    data_transformed=pd.DataFrame(data_proc.toarray(), columns=list(newCols))
    
    return data_transformed, newCols

In [159]:
data_tranformed, newCols=process(data)
from sklearn.model_selection import train_test_split

train, test = train_test_split(data_transformed, test_size=0.3, random_state=10)
print(train.shape, test.shape)

(1022, 288) (438, 288)


# As the sklearn fit method only accepts numerical array values, we need to preprocess the data, converting categorical to numerical values using one hot encoding. See the chat gpt chat.

In [160]:

regr=RandomForestRegressor(max_depth=4, random_state=0)


In [161]:
Y=train['num__SalePrice']
X=train.drop('num__SalePrice', axis=1)

In [162]:

regr.fit(X,Y)

In [163]:
Y_test=test['num__SalePrice']
X_test=test.drop('num__SalePrice', axis=1)
Y_pred=regr.predict(X_test)

In [164]:
regr.score(X_test, Y_test)

0.8026189007516111

# 80%, not bad! Now we just have to apply the model on the competition dataset and make our prediction! 

In [165]:
kag_data=pd.read_csv('test.csv')



In [166]:
kag_data.shape

(1459, 80)

## Oh, actually we also need to preprocess the competition dataset....


In [None]:
kag_data_transformed, kag_newCols=process(kag_data)
kag_data_transformed.head()
for col in newCols:
    if col not in kag_data_transformed:
        kag_data_transformed[col]=0
kag_data_transformed=kag_data_transformed[newCols]
kag_data_transformed=kag_data_transformed.drop('num__SalePrice', axis=1)
kag_data_transformed.shape

In [None]:
pred=regr.predict(kag_data_transformed)
pred.shape

In [179]:
index=np.arange(1461,1461+len(pred))

In [180]:
index


array([1461, 1462, 1463, ..., 2917, 2918, 2919])

In [187]:
results=pd.DataFrame({'Id': index, 'SalePrice': pred}, index=index)

In [188]:
results.head()

Unnamed: 0,Id,SalePrice
1461,1461,125589.268584
1462,1462,143076.708134
1463,1463,166758.935091
1464,1464,177627.341261
1465,1465,229180.897041


In [189]:
results.to_csv('results.csv', index=False)