In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Let pandas render up to 500 rows and 500 columns before truncating output.
for option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option, 500)


## Load data

In [None]:
# Name of the column to predict.
target = 'SalePrice'
# The first CSV column is used as the row index of each frame.
train = pd.read_csv(r'../input/house-prices-advanced-regression-techniques/train.csv', index_col=0)
test = pd.read_csv(r'../input/house-prices-advanced-regression-techniques/test.csv', index_col=0)
# Dummy target so that train and test carry the same columns.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
test[target] = np.nan

full_data = [train, test]  # List of both train and test (so that they can be handled in the same way)

# Missing data

In [None]:
# Missing values per training column: absolute count and share of rows, largest first.
null_flags = train.isnull()
total = null_flags.sum().sort_values(ascending=False)
percent = null_flags.mean().sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(20)

If more than 15% of a feature's values are missing, we remove that feature:

In [None]:
# Features whose share of missing values exceeds 15 %.
mask = missing_data['Percent'].gt(0.15)
df_remove = missing_data[mask].copy()
df_remove

In [None]:
# Remove the sparse features from both frames.
# Non-inplace drops with errors='ignore' let this cell be re-run without a
# KeyError once the columns are already gone.
train = train.drop(columns=df_remove.index, errors='ignore')
test = test.drop(columns=df_remove.index, errors='ignore')
full_data = [train, test]  # keep the shared list pointing at the current frames

### Impute the rest of missing values

In [None]:
from sklearn.impute import SimpleImputer

def _imputed_frame(imputer, frame, fit=False):
    """Run `imputer` over `frame` and wrap the result in a DataFrame with the same column labels.

    With `fit=True` the imputer is fitted on `frame` first; otherwise the
    already-fitted imputer is only applied. The returned frame has a default
    row index; the caller restores the original labels.
    """
    values = imputer.fit_transform(frame) if fit else imputer.transform(frame)
    return pd.DataFrame(values, columns=frame.columns)


## Objects: fill gaps with the most frequent value learned from train
object_imputer = SimpleImputer(strategy='most_frequent')
train_objects = _imputed_frame(object_imputer, train.select_dtypes(include='object'), fit=True)
test_objects = _imputed_frame(object_imputer, test.select_dtypes(include='object'))

## Numeric: fill gaps with the column mean learned from train
# (the all-NaN dummy target in test is therefore filled with train's mean SalePrice)
numberic_imputer = SimpleImputer(strategy='mean')
train_numeric = _imputed_frame(numberic_imputer, train.select_dtypes(exclude='object'), fit=True)
test_numeric = _imputed_frame(numberic_imputer, test.select_dtypes(exclude='object'))

# Reassemble each frame (numeric columns first) and put the original row labels back.
train_index = train.index.copy()
test_index = test.index.copy()
train = pd.concat((train_numeric, train_objects), axis=1).set_axis(train_index, axis=0)
test = pd.concat((test_numeric, test_objects), axis=1).set_axis(test_index, axis=0)
full_data = [train, test]



# Data exploration

In [None]:
# Summary statistics of the object-dtype (categorical) columns of train.
train.select_dtypes(include='object').describe()

### Categorical data

In [None]:
# Number of distinct values in each object-dtype column of train.
train.select_dtypes(include='object').nunique()

Look at the categorical features whose categories differ most in median SalePrice.

In [None]:
categorical_columns = train.select_dtypes(include='object').columns

# For each categorical feature: spread (std) of the per-category median SalePrice,
# expressed relative to the overall SalePrice std.
saleprice_std = train['SalePrice'].std()
# Built in one step with an explicit float dtype; a bare `pd.Series()` is
# deprecated in pandas 1.x and would give an object-dtype series grown item by item.
s = pd.Series(
    {
        column: train.groupby(by=column)['SalePrice'].median().std() / saleprice_std
        for column in categorical_columns
    },
    dtype=float,
)

In [None]:
# The 15 categorical features with the largest relative spread.
s.sort_values(ascending=False).head(15)

In [None]:
def plot_categories(key):
    """Draw a wide box plot of SalePrice for every category of column `key`.

    Reads the notebook-level `train` frame. Categories appear on the x axis
    in order of descending median SalePrice, with tick labels rotated by 70 degrees.
    """
    medians = train.groupby(by=key)['SalePrice'].median()
    category_order = medians.sort_values(ascending=False).index
    fig, ax = plt.subplots(figsize=(16, 3))
    sns.boxplot(data=train, x=key, y='SalePrice', order=category_order, ax=ax)
    ax.tick_params(axis='x', rotation=70)

In [None]:
# One box plot per feature, for the 15 features with the largest relative spread.
top_categories = s.sort_values(ascending=False).head(15).index
for category in top_categories:
    plot_categories(key=category)

## One hot encoder


In [None]:
# Encode train and test together so that both end up with identical dummy columns.
df_ = pd.concat([train, test], axis=0)
# The frames are split apart again by row label, which is only correct when
# no label occurs in both train and test.
assert df_.index.is_unique, 'train and test share index labels'
df_ = pd.get_dummies(df_)
train = df_.loc[train.index].copy()
test = df_.loc[test.index].copy()

## Correlation matrix (heatmap style)

In [None]:
# Absolute pairwise correlations between all columns of train, drawn as a heatmap.
corrmat = train.corr().abs()
f, ax = plt.subplots(figsize=(12, 9))
# Colour scale is capped at 0.8; the heatmap is drawn on the axes created above.
sns.heatmap(data=corrmat, vmax=.8, square=True, ax=ax);

## 'SalePrice' correlation matrix (zoomed heatmap style)

In [None]:
#saleprice correlation matrix
k = 10 #number of variables for heatmap
# The k columns with the largest absolute correlation to SalePrice (SalePrice itself included).
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# Convert explicitly to float: dummy columns may be boolean, and a mixed
# bool/float selection would otherwise become an object array that
# np.corrcoef cannot process.
cm = np.corrcoef(train[cols].to_numpy(dtype=float), rowvar=False)
sns.set(font_scale=1.25)

fig, ax = plt.subplots(figsize=(10,10))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values, 
                 ax=ax)

In [None]:
# Absolute correlation of every column with SalePrice, strongest first.
corrmat['SalePrice'].sort_values(ascending=False)

## Remove columns with low correlation

In [None]:
# Keep only the columns whose absolute correlation with SalePrice exceeds 0.05.
# `corrmat` already holds absolute values; NaN correlations compare False and are dropped.
mask = corrmat['SalePrice'] > 0.05
# Take the labels from the mask itself rather than indexing train.columns
# positionally, so the selection stays valid when train's columns no longer
# line up with corrmat (e.g. when this cell is re-run).
columns = mask[mask].index
train = train[columns]
test = test[columns]
full_data = [train, test]

## Scatter plots between 'SalePrice' and correlated variables

In [None]:
#scatterplot
sns.set()
k = 5
# The k columns most correlated (in absolute value) with SalePrice, SalePrice included.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# `height` is the current name of the facet-size argument; the old `size`
# keyword was deprecated in seaborn 0.9 and is rejected by newer releases.
sns.pairplot(train[cols], height=2.5);

## Removing outliers

In [None]:
# Columns with many different values relative to their number of rows.
mask = train.nunique() / train.count() > 0.01
many_valued_columns = train.columns[mask]
for key in many_valued_columns:
    # Sequential trim: each threshold is computed on the rows that survived the
    # previous columns, and rows at or above the 99.5 % quantile are dropped.
    # A separate name is used so the column mask above is not overwritten.
    keep_rows = train[key] < train[key].quantile(0.995)
    train = train.loc[keep_rows].copy()
# `train` was rebound above, so refresh the shared list as the earlier cells do.
full_data = [train, test]
    

In [None]:
#scatterplot
sns.set()
k = 8
# The k columns most correlated (in absolute value) with SalePrice; SalePrice goes on the y axis.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
x_vars = cols.drop('SalePrice')
# `height` replaces the `size` keyword, which was deprecated in seaborn 0.9
# and is rejected by newer releases.
sns.pairplot(data=train[cols], x_vars=x_vars, y_vars=['SalePrice'], height=2.5, kind='reg');

# Feature engineering

In [None]:
# Summary statistics of the current training frame.
train.describe()

In [None]:
#for dataset in full_data:
#    dataset['sale_time'] = dataset['YrSold']*12 + dataset['MoSold']

In [None]:
from sklearn.decomposition import PCA

In [None]:

#pca = PCA(n_components=2, random_state=0)
#pca.fit(train_numeric)
#pca.explained_variance_ratio_

## Trying a simple Pipeline with polynomial regression on the numeric data

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import VarianceThreshold

# Building blocks, listed in the order they run inside the pipeline.
standard_scaler = StandardScaler()
polynomial_features = PolynomialFeatures(degree=3)
select_k_best = SelectKBest(score_func=f_regression, k=15)
linear_regression = LinearRegression()
# Variance filter kept available for experiments; it is not part of the steps below.
selector = VarianceThreshold(threshold=0.9)

# scale -> expand to degree-3 polynomial terms -> keep the 15 best terms -> linear fit
steps = [
    ('scaler', standard_scaler),
    ('polynomial_features', polynomial_features),
    ('select_k_best', select_k_best),
    ('linear_regression', linear_regression),
]
pipeline_polynomial = Pipeline(steps=steps)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the forest, and therefore the cross-validation
# scores computed from it, reproducible between runs.
random_forest_regressor = RandomForestRegressor(n_estimators=100, random_state=0)
# Drops features whose variance does not exceed 0.2.
selector = VarianceThreshold(threshold=0.2)

steps = [
    ('scaler', standard_scaler),
    ('selector', selector),
    ('estimator', random_forest_regressor),
]

pipeline_random_forest = Pipeline(steps=steps)

In [None]:
# Split the frames into features and target without mutating train/test.
y = train[target]
X = train.drop(columns=target)

# The test frame only carries a dummy target column; discard it.
X_test = test.drop(columns=target)

# Sanity checks: the target must not leak into the features, and the model
# must see the same columns at prediction time as during fitting.
assert target not in X.columns
assert X.columns.equals(X_test.columns)

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees behind the shared scaler; the step name 'estimator'
# is what the grid-search parameter names refer to.
xgb_regressor = XGBRegressor(n_estimators=100, max_depth=100)

steps = [('scaler', standard_scaler), ('estimator', xgb_regressor)]
pipeline_xgb_regressor = Pipeline(steps=steps)


## Spot checking

In [None]:
# Candidate models to compare, keyed by the label used in the score table.
pipelines = dict([
    ('poly', pipeline_polynomial),
    ('random forest', pipeline_random_forest),
    ('xgb', pipeline_xgb_regressor),
])

# 5-fold cross-validation repeated 3 times with a fixed seed, shared by all candidates.
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

scores = {
    name: cross_val_score(estimator=estimator, X=X, y=y, cv=cv)
    for name, estimator in pipelines.items()
}

In [None]:
# One column of cross-validation scores per pipeline.
df_scores = pd.DataFrame(scores)

In [None]:
# Box plot of the cross-validation score distribution of each pipeline.
fig,ax=plt.subplots()
sns.boxplot(data=df_scores, ax=ax);

In [None]:
# Mean cross-validation score per pipeline.
df_scores.mean()

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid; the `estimator__` prefix addresses the 'estimator' step of the pipeline.
grid = {
    'estimator__n_estimators': [5000],
    'estimator__max_depth': [2, 10, 20],
}

# A single pass of 3-fold cross-validation with a fixed seed.
cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=0)

# Grid search over the xgb pipeline, scored by negative RMSE, using all available cores.
search = GridSearchCV(
    estimator=pipeline_xgb_regressor,
    param_grid=grid,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    cv=cv,
)
# Run the search.
search_result = search.fit(X, y)

In [None]:
# Hard-coded reference score used for comparison in the next cell.
# NOTE(review): presumably the best negative-RMSE score of an earlier run — confirm its origin.
ref_error = -19854.589503095718

In [None]:
# Ratio of the best grid-search score to the reference score. Both are negative,
# so a value below 1 means the search's score is smaller in magnitude than the reference.
search_result.best_score_/ref_error

In [None]:
# Pipeline with the best parameter combination found by the grid search.
model = search_result.best_estimator_

In [None]:
# Display the selected pipeline.
model

## Predict and Save

In [None]:
# Predict SalePrice for the test rows and write the result to a CSV
# with the row labels stored under the column name 'Id'.
y_pred = model.predict(X_test)
df_result = pd.DataFrame({'SalePrice': y_pred}, index=X_test.index).rename_axis('Id')
df_result.to_csv('my_submission.csv')

In [None]:
# Show the predictions that were written to my_submission.csv.
df_result