# Goal of the project:
    Predict house prices:
        Using creative feature engineering and advanced machine learning models

In [None]:
# Basic Imports
import pandas as pd
import numpy as np

# Visualisation imports
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Label encoding
from sklearn.preprocessing import LabelEncoder

# Train test split
from sklearn.model_selection import train_test_split

# Regression Model

## Linear regression
from sklearn.linear_model import LinearRegression

## Random forest
from sklearn.ensemble import RandomForestRegressor

## XGBoost
from xgboost import XGBRegressor

# Score
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

In [None]:
# Input data files are available in the read-only "../input/" directory.
# Running this cell prints the full path of every file found under /kaggle/input.

import os

walked = os.walk('/kaggle/input')
# Lazily build one path per file, in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in walked
    for name in names
)
for path in input_paths:
    print(path)

# Data Input

In [None]:
# Load the competition training and test sets from the Kaggle input directory.
df_train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
df_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
# Independent copy of the freshly loaded test set. Copying the frame gives the
# same data as reading test.csv again, without parsing the file a second time.
test = df_test.copy()
df_train

In [None]:
# Display the test set as loaded (no preprocessing has been applied yet).
df_test

# Preprocessing

        ◦ Check for empty data
        ◦ Delete columns that contain more than approximately 20% missing data
        ◦ If a feature has empty cells but less than 20% missing data, delete or fill the empty entries
        ◦ Check and clean outliers
        ◦ Feature selection

## Check empty data

In [None]:
# Missing-value count per training column; show the 20 largest counts.
train_missing = df_train.isnull().sum()
train_missing.sort_values(ascending=False).iloc[:20]

In [None]:
# Missing-value count per test column; show the 20 largest counts.
test_missing = df_test.isnull().sum()
test_missing.sort_values(ascending=False).iloc[:20]

## Delete columns that contain missing data

In [None]:
# Drop every training column that has at least one missing value.
df_train = df_train.dropna(axis='columns', how='any')
df_train

In [None]:
# Drop every test column that has at least one missing value.
# NOTE(review): this is decided independently of the training frame, so the
# two frames need not end up with the same set of columns.
df_test = df_test.dropna(axis='columns', how='any')
df_test

In [None]:
# Re-check the training frame after dropping columns: 20 largest missing counts.
train_missing_after = df_train.isnull().sum()
train_missing_after.sort_values(ascending=False).iloc[:20]

In [None]:
# Re-check the test frame after dropping columns: 20 largest missing counts.
test_missing_after = df_test.isnull().sum()
test_missing_after.sort_values(ascending=False).iloc[:20]

## Outliers

In [None]:
# Rank the numeric features by their correlation with SalePrice (top 25).
# The text columns have not been encoded yet at this point, and pandas >= 2.0
# raises on non-numeric columns in corr() instead of silently skipping them,
# so the frame is restricted to its numeric columns first.
numeric_train = df_train.select_dtypes(include='number')
numeric_train.corr()['SalePrice'].sort_values(ascending=False).head(25)

In [None]:
# SalePrice against OverallQual with a fitted regression line.
fig, ax = plt.subplots(figsize=(12, 10))
sns.regplot(data=df_train, x='OverallQual', y='SalePrice', ax=ax)

In [None]:
# SalePrice against GrLivArea with a fitted regression line.
fig, ax = plt.subplots(figsize=(12, 10))
sns.regplot(data=df_train, x='GrLivArea', y='SalePrice', ax=ax)

In [None]:
# SalePrice against GarageCars with a fitted regression line.
fig, ax = plt.subplots(figsize=(12, 10))
sns.regplot(data=df_train, x='GarageCars', y='SalePrice', ax=ax)

In [None]:
# SalePrice against GarageArea with a fitted regression line.
fig, ax = plt.subplots(figsize=(12, 10))
sns.regplot(data=df_train, x='GarageArea', y='SalePrice', ax=ax)

In [None]:
# SalePrice against 1stFlrSF with a fitted regression line.
fig, ax = plt.subplots(figsize=(12, 10))
sns.regplot(data=df_train, x='1stFlrSF', y='SalePrice', ax=ax)

In [None]:
# Remove outliers: keep only the rows whose GrLivArea and 1stFlrSF are both
# below 4000.
# .copy() makes the result an independent frame. Columns are assigned on
# df_train later (label encoding), and doing that on a filtered slice triggers
# pandas' SettingWithCopyWarning, which the global warnings filter would hide.
within_area_limits = (df_train['GrLivArea'] < 4000) & (df_train['1stFlrSF'] < 4000)
df_train = df_train[within_area_limits].copy()

In [None]:
# SalePrice against GrLivArea again, now on the outlier-filtered training data.
fig, ax = plt.subplots(figsize=(12, 10))
sns.regplot(data=df_train, x='GrLivArea', y='SalePrice', ax=ax)

## Feature selection
            ◦ Label encoding
            ◦ Check correlation with target

In [None]:
# Names of the columns stored with object dtype in each frame.
train_is_object = df_train.dtypes == 'object'
test_is_object = df_test.dtypes == 'object'
cat_cols_train = df_train.columns[train_is_object]
cat_cols_test = df_test.columns[test_is_object]
cat_cols_train

In [None]:
encoder = LabelEncoder()

for i in cat_cols_train:
    df_train[i] = encoder.fit_transform(df_train[i])
df_train.head()

In [None]:
for i in cat_cols_test:
    df_test[i] = encoder.fit_transform(df_test[i])
df_test.head()

In [None]:
# Correlation of every column with SalePrice after label encoding, top 25.
target_corr = df_train.corr()['SalePrice']
target_corr.sort_values(ascending=False).head(25)

### Select columns from feature correlation with target

In [None]:
# Columns kept for modelling: the target first, followed by the predictors.
_target_col = 'SalePrice'
_predictor_cols = [
    'MSSubClass', 'LotArea', 'Street', 'LotShape', 'LandContour', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation',
    'Heating', 'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF',
    'LowQualFinSF', 'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr',
    'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'PavedDrive', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold', 'YrSold', 'SaleCondition',
]
select_cols = [_target_col, *_predictor_cols]

In [None]:
# Restrict the training frame to the selected columns, in select_cols order.
df_train = df_train.loc[:, select_cols]
df_train

Feature and target separation

In [None]:
# Target vector is the SalePrice column; the feature matrix is everything else.
y = df_train['SalePrice']
X = df_train.drop(columns=['SalePrice'])

Train test split

In [None]:
# Hold out 25% of the rows for evaluation; the seed keeps the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=9,
)
X_train, X_test, y_train, y_test = split_parts

# Regression model

In [None]:
# Model LR and prediction
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [None]:
# Model Random forest

# Random forest: fit on the training split, predict the held-out split.
# The forest is randomised, so without a seed the scores and feature
# importances change on every run. The seed matches the one used for the
# train/test split.
rfr = RandomForestRegressor(random_state=9)
rfr.fit(X_train, y_train)
rfr_pred = rfr.predict(X_test)

In [None]:
# Model XGBR and prediction
# fit() returns the estimator itself, so construction and fitting are chained.
xgb = XGBRegressor().fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

In [None]:
# Features importances Linear regression

# get importances
# The fitted linear-regression coefficients are used as the importance scores.
importances = lr.coef_

# One line per feature: its position in the feature matrix and its score.
for position_and_score in enumerate(importances):
    print('Feature: %0d, Score: %.5f' % position_and_score)

In [None]:
# Features importances Random forest regression

# get importances
# Importance scores reported by the fitted random forest.
importances = rfr.feature_importances_

# One line per feature: its position in the feature matrix and its score.
for position_and_score in enumerate(importances):
    print('Feature: %0d, Score: %.5f' % position_and_score)

In [None]:
# Features importances XGBoost regression

# get importances
# Importance scores reported by the fitted XGBoost model.
importances = xgb.feature_importances_

# One line per feature: its position in the feature matrix and its score.
for position_and_score in enumerate(importances):
    print('Feature: %0d, Score: %.5f' % position_and_score)

In [None]:
# Score
# Hold-out metrics for the linear regression predictions.
mse = mean_squared_error(y_test, lr_pred)
rmse = sqrt(mse)
print ('Mean squared error:', mse)
print ('Root mean squared error:', rmse)
print ('r2:', r2_score(y_test, lr_pred))

In [None]:
# Score
# Hold-out metrics for the random forest predictions.
mse = mean_squared_error(y_test, rfr_pred)
rmse = sqrt(mse)
print ('Mean squared error:', mse)
print ('Root mean squared error:', rmse)
print ('r2:', r2_score(y_test, rfr_pred))

In [None]:
# Score
# Hold-out metrics for the XGBoost predictions.
mse = mean_squared_error(y_test, xgb_pred)
rmse = sqrt(mse)
print ('Mean squared error:', mse)
print ('Root mean squared error:', rmse)
print ('r2:', r2_score(y_test, xgb_pred))

# Prediction

Select the same feature columns from the test set that the models were trained on

In [None]:
# Feed the model exactly the columns it was fitted on, in the same order.
# Taking them from the training feature matrix X (instead of a second
# hand-written copy of the list) means the two can never drift apart.
test_cols = X.columns.tolist()
test = df_test[test_cols]
test

In [None]:
# Predict sale prices for the test features with the fitted XGBoost model.
test_pred = xgb.predict(test)

In [None]:
# Display the predictions returned by xgb.predict.
test_pred

In [None]:
# Store the predictions on the test frame as a SalePrice column (in place).
df_test['SalePrice'] = test_pred
df_test

In [None]:
# Submission table: just the Id and the predicted SalePrice, in that order.
submission_cols = ['Id', 'SalePrice']
df_final = df_test.loc[:, submission_cols]
df_final

In [None]:
# Write the submission to submission.csv; index=False omits the row index.
df_final.to_csv('submission.csv', index=False)

In [None]:
nan