In [1]:
import pandas as pd 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import MaxAbsScaler,LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
test.shape

(1459, 80)

## preprocess

In [4]:
for c in train.columns:
    if(train[c].isna().sum()>0):
        print(train[c].name+" "+str(train[c].isna().sum()))

LotFrontage 259
Alley 1369
MasVnrType 8
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406


In [5]:
train.drop(['PoolQC'],axis=1,inplace=True)
test.drop(['PoolQC'],axis=1,inplace=True)

In [6]:

le = LabelEncoder()

train = train.apply(lambda x: x.fillna(x.mode()[0]))
test = test.apply(lambda x:x.fillna(x.mode()[0]))

non_numeric_columns = train.select_dtypes(exclude=['number']).columns

for column in non_numeric_columns:
     train[column] = le.fit_transform(train[column])
     test[column] =  le.transform(test[column])  


## decompose and normalize

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler=MaxAbsScaler()

X=train.drop(columns='SalePrice')
Y=train['SalePrice']


xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2)


xtrain=scaler.fit_transform(xtrain)
xtest=scaler.transform(xtest)


## validation

In [16]:
model=GradientBoostingRegressor(loss='squared_error',n_estimators=500,max_depth=128,min_samples_split=2,min_samples_leaf=2)

In [17]:

model.fit(xtrain,ytrain)

predict_trainYn = model.predict(xtrain)
predict_testYn=model.predict(xtest)

r_train=r2_score(ytrain,predict_trainYn)
r_test=r2_score(ytest,predict_testYn)
print(r_train)
print(r_test) 


0.9999999999984781
0.811807151049102


## predict

In [18]:

model.fit(X,Y)

predict_trainY = model.predict(X)
predict_testY=model.predict(test)

print(r2_score(Y,predict_trainY))
print(sqrt(mean_squared_error(Y,predict_trainY)))

result_dataFrame=pd.DataFrame({
    'Id':test['Id'],
    'SalePrice':predict_testY,
    })

result_dataFrame.to_csv('result.csv',index=False)




1.0
1.001493323771931e-05
