In [49]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

In [42]:
# train
X = pd.read_csv('data/train.csv')
X.head(1)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500


In [43]:
X_test = pd.read_csv('data/test.csv')
X_test.head(1)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal


In [44]:
X = X.drop(columns=['MiscFeature','Fence','PoolQC','FireplaceQu','Alley'])
X.head(1)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500


In [45]:
y = X.SalePrice
X = X.drop(columns=['SalePrice'])

In [60]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1)

In [62]:
num_cols = X_train.select_dtypes(include='number').columns.to_list()
cat_cols = X_train.select_dtypes(exclude='number').columns.to_list()

In [63]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [64]:
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [65]:
ct = ColumnTransformer(remainder='drop',
                       transformers=[
                           ('numerical', num_pipe, num_cols),
                           ('categorical', cat_pipe, cat_cols)
                       ])

In [66]:
model=Pipeline([
    ('transformer', ct),   
    ('predictor', RandomForestRegressor())
])

In [67]:
model.fit(X_train, y_train)

In [69]:
y_pred = model.predict(X_valid)

In [71]:
mean_squared_error(y_pred, y_valid, squared=False)

39107.771476518916

In [72]:
model.fit(X, y)

In [73]:
y_res = model.predict(X_test)

In [74]:
result = pd.DataFrame({'Id': X_test.Id, 'SalePrice': y_res})

In [76]:
result.to_csv('C:/Users/STC/Documents/submission.csv', index=False)

Unnamed: 0,Id,SalePrice
count,1459.0,1459.0
mean,2190.0,178846.216223
std,421.321334,72961.942245
min,1461.0,60921.83
25%,1825.5,129907.87
50%,2190.0,159342.5
75%,2554.5,210347.645
max,2919.0,538816.59
