In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

df = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
y = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
# Sort by nulls in columns

pd.set_option('display.max_rows', df.shape[0])
pd.DataFrame(df.isnull().sum().sort_values(ascending = False))

Unnamed: 0,0
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
FireplaceQu,690
LotFrontage,259
GarageYrBlt,81
GarageCond,81
GarageType,81
GarageFinish,81


In [3]:
# Delete columns with a lot of nulls (over50%)

df.drop(columns=['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Utilities', 'FireplaceQu'], 
        inplace = True)
a = df.columns[df.isnull().any()]

# In another columns replace nulls - mode

for i in a:
    df[i] = df[i].fillna(df[i].mode()[0])  
    
df.head(10)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,FR2,Gtl,NoRidge,...,0,0,0,0,0,12,2008,WD,Normal,250000
5,50,RL,85.0,14115,Pave,IR1,Lvl,Inside,Gtl,Mitchel,...,0,320,0,0,700,10,2009,WD,Normal,143000
6,20,RL,75.0,10084,Pave,Reg,Lvl,Inside,Gtl,Somerst,...,0,0,0,0,0,8,2007,WD,Normal,307000
7,60,RL,60.0,10382,Pave,IR1,Lvl,Corner,Gtl,NWAmes,...,228,0,0,0,350,11,2009,WD,Normal,200000
8,50,RM,51.0,6120,Pave,Reg,Lvl,Inside,Gtl,OldTown,...,205,0,0,0,0,4,2008,WD,Abnorml,129900
9,190,RL,50.0,7420,Pave,Reg,Lvl,Corner,Gtl,BrkSide,...,0,0,0,0,0,1,2008,WD,Normal,118000


In [4]:
# Extract SalePrice feature as target array

y = df['SalePrice']
del df['SalePrice']


In [5]:
# Transfom objects in coloumns to int64

a = df.select_dtypes(include = object)

for i in a:
    label_encoder = preprocessing.LabelEncoder()
    df[i] = label_encoder.fit_transform(df[i])
    df.drop(columns = [], inplace = True)
    
df.head(10)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,3,3,4,0,5,...,61,0,0,0,0,0,2,2008,8,4
1,20,3,80.0,9600,1,3,3,2,0,24,...,0,0,0,0,0,0,5,2007,8,4
2,60,3,68.0,11250,1,0,3,4,0,5,...,42,0,0,0,0,0,9,2008,8,4
3,70,3,60.0,9550,1,0,3,0,0,6,...,35,272,0,0,0,0,2,2006,8,0
4,60,3,84.0,14260,1,0,3,2,0,15,...,84,0,0,0,0,0,12,2008,8,4
5,50,3,85.0,14115,1,0,3,4,0,11,...,30,0,320,0,0,700,10,2009,8,4
6,20,3,75.0,10084,1,3,3,4,0,21,...,57,0,0,0,0,0,8,2007,8,4
7,60,3,60.0,10382,1,0,3,0,0,14,...,204,228,0,0,0,350,11,2009,8,4
8,50,4,51.0,6120,1,3,3,4,0,17,...,0,205,0,0,0,0,4,2008,8,0
9,190,3,50.0,7420,1,3,3,0,0,3,...,4,0,0,0,0,0,1,2008,8,4


In [6]:
# Train/test splitting

x_train, x_test, y_train, y_test = train_test_split(
    df, y, test_size = 0.2, random_state = 1337)

In [7]:
#Building Model! LGBM Regressor is my favorite regressor for high-dim

lgbm = LGBMRegressor(objective = 'regression', 
                       num_leaves = 13,
                       learning_rate = 0.034428, 
                       n_estimators = 4235,
                       random_state = 1337)

# Fit'n'show rmse

lgbm.fit(x_train, y_train)
lgbm_train_predict = lgbm.predict(x_train)
rmse = np.sqrt(mean_squared_error(y_train, lgbm_train_predict))

print(rmse)

347.46451240544513


In [8]:
# Prediction on specific example

pred_0 = lgbm.predict(df)

a = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
y_0 = a['SalePrice']

abs(pred_0[1337] / y_0[1337])

1.000155720690266

In [9]:
# Preparing test data

test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

pd.set_option('display.max_rows', test.shape[0])
pd.DataFrame(test.isnull().sum().sort_values(ascending = False))

Unnamed: 0,0
PoolQC,1456
MiscFeature,1408
Alley,1352
Fence,1169
FireplaceQu,730
LotFrontage,227
GarageYrBlt,78
GarageQual,78
GarageFinish,78
GarageCond,78


In [10]:
# Convet test data as train

test.drop(columns=[
    'Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Utilities', 'FireplaceQu'], 
          inplace = True)

a = test.columns[test.isnull().any()]

for i in a:
    test[i] = test[i].fillna(test[i].mode()[0]) 
    
label_encoder = preprocessing.LabelEncoder()

a = test.select_dtypes(include = object)

for i in a:
    label_encoder = preprocessing.LabelEncoder()
    test[i] = label_encoder.fit_transform(test[i])
    test.drop(columns = [], inplace = True)
    
test.head(10)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,2,80.0,11622,1,3,3,4,0,12,...,0,0,0,120,0,0,6,2010,8,4
1,20,3,81.0,14267,1,0,3,0,0,12,...,36,0,0,0,0,12500,6,2010,8,4
2,60,3,74.0,13830,1,0,3,4,0,8,...,34,0,0,0,0,0,3,2010,8,4
3,60,3,78.0,9978,1,0,3,4,0,8,...,36,0,0,0,0,0,6,2010,8,4
4,120,3,43.0,5005,1,0,1,4,0,22,...,82,0,0,144,0,0,1,2010,8,4
5,60,3,75.0,10000,1,0,3,0,0,8,...,84,0,0,0,0,0,4,2010,8,4
6,20,3,60.0,7980,1,0,3,4,0,8,...,21,0,0,0,0,500,3,2010,8,4
7,60,3,63.0,8402,1,0,3,4,0,8,...,75,0,0,0,0,0,5,2010,8,4
8,20,3,85.0,10176,1,3,3,4,0,8,...,0,0,0,0,0,0,2,2010,8,4
9,20,3,70.0,8400,1,3,3,0,0,12,...,0,0,0,0,0,0,4,2010,8,4


In [11]:
# Make some predictions...!

sub = lgbm.predict(test)
sub = pd.DataFrame(sub)

In [12]:
#Write to csv

submission = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
submission = submission['Id']
submission = pd.DataFrame(submission)
submission['SalePrice'] = sub

submission.to_csv('/kaggle/working/submission.csv', index = False)

In [13]:
submission = pd.read_csv('/kaggle/working/submission.csv')
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,123359.228097
1,1462,160151.584938
2,1463,179231.135915
3,1464,189976.059573
4,1465,175566.488908
