In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## DISCOVER AND VISUALIZE DATA

In [None]:
# Read the training and test CSV files into DataFrames.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
    )
]

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Column names, dtypes and non-null counts of the training set.
train.info()

In [None]:
# (rows, columns) of the training set.
train.shape

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# (rows, columns) of the test set.
test.shape

In [None]:
# Summary statistics of the training set's columns.
train.describe()

## Searching for missing values and visualizing them by column

In [None]:
# Draw missingno's matrix view of `train` to see where values are missing in each column.
import missingno as msno
msno.matrix(train)

There are a lot of missing values - we'll handle them later on.

## Take a look at the correlation between variables

In [None]:
# Lower-triangle heatmap of the pairwise correlations between numeric columns.
sns.set(style="white")

# Restrict to numeric columns explicitly: on pandas >= 2.0, DataFrame.corr()
# raises on string columns instead of silently dropping them.
corr_train = train.select_dtypes(include="number").corr()

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin `bool` replaces the `np.bool` alias, which NumPy has removed.
mask_train = np.triu(np.ones_like(corr_train, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))
cmap_train = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr_train, mask=mask_train, cmap=cmap_train, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Rank the numeric columns by their correlation with SalePrice.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# string columns on pandas >= 2.0.
corr_matrix = train.select_dtypes(include="number").corr()
corr_matrix['SalePrice'].sort_values(ascending=False)

Above we can see which variables are most correlated with SalePrice.
However, as the data types listed earlier show, many columns are categorical, so there may be other strong relationships that this numeric-only correlation cannot reveal.
We'll handle those columns later.

## Take a look at the distribution of the dependent variable

In [None]:
from scipy.stats import norm
from scipy import stats
from scipy.stats import norm, skew #for some statistics

# Distribution of SalePrice with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
# because it draws the fitted curve through `fit=` - confirm the installed
# seaborn version still provides it.
sns.distplot(train['SalePrice'], fit=norm);

# Parameters of the normal distribution fitted to SalePrice.
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are not valid Python escape sequences, so the LaTeX
# backslashes must be kept literal without triggering an invalid-escape warning.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Probability (Q-Q) plot of SalePrice on a separate figure.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

As we can see, SalePrice is not normally distributed. That is a problem, because multiple linear regression works best when the target (strictly speaking, the residuals) is approximately normal.
However, we can transform the variable to bring it closer to a normal distribution.

In [None]:
# Skewness and kurtosis of the target column
sale_price = train['SalePrice']
print("Skewness: {}".format(sale_price.skew()))
print("Kurtosis: {}".format(sale_price.kurt()))

## Transform the target variable by applying log-transformation

In [None]:
# Apply numpy's log1p, i.e. log(1 + x), to every element of the target column.
# NOTE: this overwrites train["SalePrice"] in place, so re-running this cell
# without reloading `train` applies the transform a second time.
train["SalePrice"] = np.log1p(train["SalePrice"])

# Check the new distribution
sns.distplot(train['SalePrice'], fit=norm);

# Parameters of the normal distribution fitted to the transformed target
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Label the plot. Raw string: "\m" and "\s" are not valid Python escape
# sequences, so the LaTeX backslashes must be kept literal.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Probability (Q-Q) plot of the transformed target on a separate figure
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

Now the target variable is approximately normally distributed, and we have seen some strong correlations. We could proceed with the regression model, but first we need to clean the data and do some feature engineering.

# **FEATURE ENGINEERING**

## Combining train and test datasets together so that we can do all the work at once. 

In [None]:
# Stack train on top of test with a fresh 0..n-1 index, then remove the target
# column so only the predictor columns remain.
all_data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .drop(columns=['SalePrice'])
)

## What we need to manage in this chapter:
* decision on missing values
* categorical values into numeric values
* numeric values into strings (e.g. years, months)
* create new values (if applicable)
* scaling

## Missing Values

In [None]:
# Percentage of missing values per column
missing_pct = all_data.isnull().sum() / len(all_data) * 100

# Keep only columns that actually have gaps, worst first, capped at 30 entries
all_data_na = missing_pct[missing_pct != 0].sort_values(ascending=False).head(30)

missing_data = all_data_na.to_frame(name='Missing Ratio')
missing_data.head(20)

## Handling categorical and missing values

In [None]:
# Categorical columns whose missing entries are replaced by the literal
# category 'None'.
missing_val_col = [
    "Alley", "PoolQC", "MiscFeature", "Fence", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]

# Fill all listed columns in one assignment instead of looping column by column.
all_data[missing_val_col] = all_data[missing_val_col].fillna('None')

In [None]:
# Numeric columns whose missing entries are replaced by 0.
missing_val_col2 = ['BsmtFinSF1',
                    'BsmtFinSF2',
                    'BsmtUnfSF',
                    'TotalBsmtSF',
                    'BsmtFullBath',
                    'BsmtHalfBath',
                    'GarageYrBlt',
                    'GarageArea',
                    'GarageCars',
                    'MasVnrArea']

for i in missing_val_col2:
    all_data[i] = all_data[i].fillna(0)

# Replace missing LotFrontage values with the median LotFrontage of the house's
# neighborhood. The code previously used the group mean, contradicting this
# stated intent.
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

In [None]:
# Columns holding numeric codes are converted to strings.
for code_col in ('MSSubClass', 'YrSold', 'MoSold'):
    all_data[code_col] = all_data[code_col].astype(str)

# MSZoning: fill gaps with the most frequent zoning within the same MSSubClass.
all_data['MSZoning'] = all_data.groupby('MSSubClass')['MSZoning'].transform(
    lambda zoning: zoning.fillna(zoning.mode()[0])
)

# Columns filled with a fixed category.
constant_fill = {
    'Functional': 'Typ',
    'Utilities': 'AllPub',
    'KitchenQual': "TA",
    'Electrical': "SBrkr",
}
for fill_col, fill_value in constant_fill.items():
    all_data[fill_col] = all_data[fill_col].fillna(fill_value)

# Columns filled with their own most frequent value.
for mode_col in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    all_data[mode_col] = all_data[mode_col].fillna(all_data[mode_col].mode()[0])

In [None]:
# Recompute the percentage of missing values per column after imputation
remaining_pct = all_data.isnull().sum() / len(all_data) * 100

# Only columns that still have gaps, worst first, capped at 30 entries
all_data_na = remaining_pct[remaining_pct != 0].sort_values(ascending=False).head(30)

missing_data = all_data_na.to_frame(name='Missing Ratio')
missing_data.head(20)

In [None]:
# One-hot encode the combined frame, then renumber the rows from 0.
encoded = pd.get_dummies(all_data)
features = encoded.reset_index(drop=True)
features.shape

In [None]:
# Target: the SalePrice column of the training frame.
y = train['SalePrice'].copy()

# Predictors: the training rows of the one-hot encoded `features` frame.
# `all_data` still contains string columns, so it is not a usable numeric
# design matrix. The training rows come first because `all_data` was built as
# concat((train, test)); len(train) replaces the hard-coded 1460.
n_train = len(train)
X = features.iloc[:n_train]

assert len(X) == len(y), "X and y must have the same number of rows"

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
def rmse_crossval(model):
    """Return the per-fold RMSE of `model` from 10-fold cross-validation.

    The model is scored on the notebook-level `X` and `y` with
    "neg_mean_squared_error"; the sign is flipped and the square root taken
    to obtain one RMSE value per fold.
    """
    neg_mse = cross_val_score(model, X, y, cv=10, scoring="neg_mean_squared_error")
    return np.sqrt(-neg_mse)