In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import missingno as msno 

from sklearn import preprocessing

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
# Load the train and test splits from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
test_data = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

# Marker column so the two splits can be separated again after joint preprocessing.
train_data['train_test'] = 1
test_data['train_test'] = 0

# The test split has no target; add it as NaN so both frames share the same columns.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
test_data['SalePrice'] = np.nan

# ignore_index=True gives every row a unique label. Without it both splits keep
# their own 0..n-1 labels and the combined index contains duplicates.
all_data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Print the data description file. The context manager closes the file handle
# once the text has been printed.
with open('/kaggle/input/home-data-for-ml-course/data_description.txt', 'r') as data_desc:
    for line in data_desc:
        # Each line already ends with a newline, so suppress the one print adds.
        print(line, end='')

# Exploratory Data Analysis:

In [None]:
# Row and column counts of the combined train + test frame.
all_data.shape

In [None]:
# Summary statistics of all_data.
all_data.describe()

In [None]:
# Preview the first rows of all_data.
all_data.head()

## Address data skew in target variable:

In [None]:
# Distribution of the raw sale price on a wide, short canvas.
sale_price = all_data['SalePrice']
plt.figure(figsize=(10, 3))
sns.distplot(sale_price)

In [None]:
# Same plot after taking the natural log of the sale price.
log_sale_price = np.log(all_data['SalePrice'])
plt.figure(figsize=(10, 3))
sns.distplot(log_sale_price)

In [None]:
# Swap the raw target for its natural log; the log column is appended last.
all_data = (
    all_data
    .assign(SalePrice_log=np.log(all_data['SalePrice']))
    .drop(columns='SalePrice')
)

## Categorical features:

In [None]:
# Collect the string-typed (categorical) columns in their own frame. The explicit
# copy makes cat_features independent of all_data, so columns added to it later
# do not trigger SettingWithCopyWarning.
cat_features = all_data.select_dtypes(include='object').copy()

cat_features.columns

In [None]:
# Show where the categorical columns have missing values. msno.matrix opens its
# own figure, so the size is passed to it directly; a separate plt.figure call
# would only leave an empty figure behind.
msno.matrix(cat_features, figsize=(15, 6))

### Univariate analysis:

In [None]:
# One bar chart per categorical column showing how often each category occurs.
f = plt.figure(figsize=(30,20))

# Subplot positions are 1-based, so counting from 1 fills the grid from its first slot.
for i, column in enumerate(cat_features, start=1):
    f.add_subplot(8, 8, i)
    counts = cat_features[column].value_counts()
    # Category labels go on the x axis and their frequencies on the y axis.
    sns.barplot(x=counts.index, y=counts.values)
    plt.title(column)

plt.tight_layout()
plt.show()

### Bivariate analysis:

In [None]:
# One box plot per categorical column: log sale price for each category.
f = plt.figure(figsize=(30,30))

# Subplot positions are 1-based, so counting from 1 fills the grid from its first slot.
for i, column in enumerate(cat_features, start=1):
    f.add_subplot(8, 8, i)
    sns.boxplot(x = cat_features[column], y=all_data['SalePrice_log'])
    plt.xticks(rotation=60)

plt.tight_layout()
plt.show()

## Numerical features:

In [None]:
# Non-string columns only, minus the identifier, the target and the split marker.
non_feature_cols = ['Id', 'SalePrice_log', 'train_test']
num_features = all_data.select_dtypes(exclude='object').drop(non_feature_cols, axis=1)

num_features.columns

### Univariate analysis:

In [None]:
# One histogram per numerical column.
f = plt.figure(figsize=(30,30))

# Subplot positions are 1-based, so counting from 1 fills the grid from its first slot.
for i, column in enumerate(num_features, start=1):
    f.add_subplot(8, 8, i)
    sns.distplot(num_features[column], kde=False)
    plt.xticks(rotation=60)

plt.tight_layout()
plt.show()

### Bivariate analysis:

In [None]:
# One scatter plot per numerical column against the log sale price.
f = plt.figure(figsize=(30,30))

# Subplot positions are 1-based, so counting from 1 fills the grid from its first slot.
for i, column in enumerate(num_features, start=1):
    f.add_subplot(8, 8, i)
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.scatterplot(x=num_features[column], y=all_data['SalePrice_log'])
    plt.xticks(rotation=60)

plt.tight_layout()
plt.show()

**Notes for Data Cleaning & Preprocessing:**

Based on a first look at the scatter plots against SalePrice, there appear to be a few outliers in the following features:
* LotFrontage (>200) and LotArea (>100000)
* BsmtFinSF1 (>4000) and TotalBsmtSF (>6000)
* 1stFlrSF (>4000)
* GrLivArea (>4000 AND SalePrice <300000)
* LowQualFinSF (>550)

### Correlations between variables:

In [None]:
plt.figure(figsize=(10,8))
# Correlate the non-string columns only: string columns cannot be correlated and
# make DataFrame.corr raise on recent pandas versions.
sns.heatmap(all_data.select_dtypes(exclude='object').corr())

plt.title("Correlations between variables")

# Preprocessing:

## Categorical features:

In [None]:
# List the columns currently held in cat_features.
cat_features.columns

In [None]:
# Null count per column, keeping only columns that have gaps, largest first.
cat_null_counts = cat_features.isnull().sum()
cat_null_counts[cat_null_counts > 0].sort_values(ascending=False)

### Transform existing features into new, more useful features:
Many of the existing features contain null values, and a null indicates that the house does not have the corresponding attribute (e.g. a null pool quality means there is no pool). I am transforming these features into binary "has / does not have" features to make the data easier for the model to learn from:

#### Air conditioning:

In [None]:
# Frequency of each distinct CentralAir value.
cat_features['CentralAir'].value_counts()

In [None]:
def yn(x):
    """Return 1 when x equals 'Y' and 0 for any other value."""
    return 1 if x == 'Y' else 0

# Binary flag derived from CentralAir: 1 where the value is 'Y', otherwise 0.
cat_features['AirConditioning'] = cat_features['CentralAir'].map(yn)

In [None]:
# Log sale price for each value of the AirConditioning flag.
sns.swarmplot(x=cat_features['AirConditioning'], y=all_data['SalePrice_log'])

#### Create 'Has...' features from null values:

In [None]:
# Maps each new binary column to the existing column it is derived from.
derived_cat_features = dict(
    HasPool='PoolQC',
    HasFence='Fence',
    HasFireplace='FireplaceQu',
    HasGarage='GarageQual',
    HasBsmt='BsmtExposure',
    HasAlley='Alley',
)

def convert_to_binary(x):
    """Return 0 for the placeholder string 'None' and 1 for any other value."""
    return 0 if x == 'None' else 1

# Replace nulls in each source column with 'None', then derive the 0/1 flag from it.
for flag_name, source_col in derived_cat_features.items():
    print(flag_name)
    filled = cat_features[source_col].fillna('None')
    cat_features[source_col] = filled
    cat_features[flag_name] = filled.apply(convert_to_binary)

In [None]:
# Log sale price for houses with and without a fireplace, coloured by the same flag.
sns.swarmplot(x=cat_features['HasFireplace'], y=all_data['SalePrice_log'], hue=cat_features['HasFireplace'])

### Remove useless features:

In [None]:
# Log sale price per Alley value, coloured by the HasAlley flag.
sns.swarmplot(x=cat_features['Alley'], y=all_data['SalePrice_log'], hue=cat_features['HasAlley'])

In [None]:
# Alley and PoolQC have too many null values to be useful to learn from.
# The exterior features are too diverse and have little impact on sale price.
# errors='ignore' keeps the cell safe to re-run: deleting a column that is
# already gone would otherwise raise KeyError.
cat_features = cat_features.drop(
    columns=['Alley', 'PoolQC', 'Exterior1st', 'Exterior2nd', 'ExterCond', 'ExterQual'],
    errors='ignore',
)

### Deal with features with high cardinality:

### Group variables in features with high cardinality:

#### Building type:

In [None]:
# Living area against log sale price, coloured by building type.
sns.scatterplot(x=all_data['GrLivArea'], y=all_data['SalePrice_log'], hue=cat_features['BldgType'])

In [None]:
def group_bldgtype(x):
    """Collapse building types into broader groups.

    'TwnhsE' and 'Twnhs' become 'Twnhs'; '1Fam' and '2fmCon' become 'Fam';
    every other value is returned unchanged.
    """
    if x in ('TwnhsE', 'Twnhs'):
        return 'Twnhs'
    if x in ('1Fam', '2fmCon'):
        return 'Fam'
    return x

# Overwrite BldgType with its grouped value.
cat_features['BldgType'] = cat_features['BldgType'].map(group_bldgtype)

#### Condition1 and Condition2:

In [None]:
# Log sale price for each Condition1 value.
sns.swarmplot(x=cat_features['Condition1'], y=all_data['SalePrice_log'])

In [None]:
def group_condition(x):
    """Collapse a Condition1/Condition2 code into a broader group.

    'PosN'/'PosA' -> 'Park'; 'RRNe'/'RRAe'/'RRAn'/'RRNn' -> 'Railroad';
    'Artery'/'Feedr' -> 'Road'; 'Norm' -> 'Norm'.

    Any other value, including a missing one, is returned unchanged rather than
    raising UnboundLocalError.
    """
    if x in ['PosN', 'PosA']:
        return 'Park'
    if x in ['RRNe', 'RRAe', 'RRAn', 'RRNn']:
        return 'Railroad'
    if x in ['Artery', 'Feedr']:
        return 'Road'
    # 'Norm' and any unmapped value pass through as they are.
    return x
    
# Add a grouped version of each condition column and remove the original.
for source_col in ('Condition1', 'Condition2'):
    cat_features[source_col + '_grouped'] = cat_features[source_col].apply(group_condition)
for source_col in ('Condition1', 'Condition2'):
    del cat_features[source_col]

In [None]:
# Inspect the grouped Condition1 column.
cat_features['Condition1_grouped']

In [None]:
def encode_conds(df):
    """Return (park, railroad, road) 0/1 flags for one row.

    A flag is 1 when either 'Condition1_grouped' or 'Condition2_grouped' of the
    given row equals 'Park', 'Railroad' or 'Road' respectively.
    """
    seen = (df['Condition1_grouped'], df['Condition2_grouped'])
    return tuple(int(label in seen) for label in ('Park', 'Railroad', 'Road'))

# apply(axis=1) yields one (park, railroad, road) tuple per row. Expanding the
# result gives one column per flag, and each is stored under its own name.
# Unpacking the row-wise result straight into three targets fails unless the
# frame has exactly three rows.
condition_flags = cat_features.apply(encode_conds, axis=1, result_type='expand')
for position, flag_name in enumerate(['Park', 'Railroad', 'Road']):
    cat_features[flag_name] = condition_flags[position].values

#### Neighborhood:

In [None]:
# Log sale price for each neighbourhood, on a wide figure so the labels fit.
plt.figure(figsize=(15,5))
sns.swarmplot(x=cat_features['Neighborhood'], y=all_data['SalePrice_log'])

In [None]:
def group_neigh(x):
    """Map a neighbourhood code to one of three numbered groups (1, 2 or 3).

    A value that belongs to none of the groups yields NaN rather than raising
    UnboundLocalError.
    """
    if x in ['OldTown', 'BrkSide', 'Sawyer', 'IDOTRR', 'MeadowV', 'Edwards', 'BrDale']:
        return 1
    if x in ['CollgCr', 'Veenker', 'Crawfor', 'Mitchel', 'Somerst', 'NWAmes', 'NAmes', 'SawyerW', 'Timber', 'Gilbert', 'ClearCr', 'NPkVill', 'Blmngtn', 'SWISU', 'Blueste']:
        return 2
    if x in ['NoRidge', 'NridgHt', 'StoneBr']:
        return 3
    # Unknown or missing neighbourhood: report it as missing.
    return np.nan

# Store the numbered neighbourhood group alongside the original column.
cat_features['Neighborhood_grouped'] = cat_features['Neighborhood'].map(group_neigh)

In [None]:
# Log sale price for each neighbourhood group.
sns.swarmplot(x=cat_features['Neighborhood_grouped'], y=all_data['SalePrice_log'])

### Fill remaining missing values with 'None':

In [None]:
# Replace every remaining null in cat_features with the placeholder string "None".
cat_features = cat_features.fillna("None")

### Encode categorical features:

In [None]:
# One-hot encode the engineered categorical frame. Encoding the untouched string
# columns still held in all_data would discard the grouped and flag features
# built in cat_features.
encoded_cat_features = pd.get_dummies(cat_features)

# Keep the non-string columns of all_data. Dropping any column that the encoded
# frame also provides keeps this cell safe to re-run.
non_categorical = (
    all_data
    .select_dtypes(exclude='object')
    .drop(columns=encoded_cat_features.columns, errors='ignore')
)
all_data = pd.concat([non_categorical, encoded_cat_features], axis=1)

In [None]:
# Id and the train_test marker are bookkeeping columns, not predictors, so they
# are removed from the feature matrices together with the target.
non_predictors = ['SalePrice_log', 'train_test', 'Id']
is_train = all_data['train_test'] == 1
is_test = all_data['train_test'] == 0

train_X = all_data[is_train].drop(non_predictors, axis=1)
train_y = all_data.loc[is_train, 'SalePrice_log']
test_X = all_data[is_test].drop(non_predictors, axis=1)

## Numerical features:

In [None]:
# List the columns currently held in num_features.
num_features.columns

### Missing values:

In [None]:
# Null count per numerical column, keeping only columns that have gaps, largest first.
num_null_counts = num_features.isnull().sum()
num_null_counts[num_null_counts > 0].sort_values(ascending=False)

In [None]:
# LotFrontage against log sale price.
sns.scatterplot(x=num_features['LotFrontage'], y=all_data['SalePrice_log'])

In [None]:
# Assign the filled column back explicitly. Calling fillna(inplace=True) on a
# selected column acts on a temporary object and does not update num_features
# when pandas copy-on-write is active.
num_features['LotFrontage'] = num_features['LotFrontage'].fillna(0)

### Remove outlier values:

In [None]:
# Remove outliers based on observations on scatter plots against SalePrice:
train_data_only = all_data[all_data['train_test']==1]

# The target is stored on a log scale, so the 300000 price cut-off is
# log-transformed before it is compared with SalePrice_log. Compared as a raw
# number it is larger than every log price and the condition is always true.
price_cutoff_log = np.log(300000)

# A row is an outlier when it matches any of the rules below.
outlier_mask = (
    (train_data_only['LotFrontage'] > 200)
    | (train_data_only['LotArea'] > 100000)
    | (train_data_only['BsmtFinSF1'] > 4000)
    | (train_data_only['TotalBsmtSF'] > 6000)
    | (train_data_only['1stFlrSF'] > 4000)
    | ((train_data_only['GrLivArea'] > 4000)
       & (train_data_only['SalePrice_log'] < price_cutoff_log))
    | (train_data_only['LowQualFinSF'] > 550)
)
train_data_only = train_data_only[~outlier_mask]

### Normalise all numerical features:

In [None]:
# Log-transform every numerical column that has not been transformed yet and
# rename it with a '_log' suffix. log1p (log(1 + x)) is used because a plain log
# turns zero values into -inf. Skipping columns that already carry the suffix
# keeps the cell safe to re-run.
raw_columns = [column for column in num_features.columns if not column.endswith('_log')]
num_features = (
    num_features
    .assign(**{column: np.log1p(num_features[column]) for column in raw_columns})
    .rename(columns={column: column + '_log' for column in raw_columns})
)

In [6]:
# Inspect num_features. NOTE(review): the recorded output below is a NameError,
# so this cell appears to have been run before num_features was defined.
num_features

NameError: name 'num_features' is not defined

# Feature selection and engineering:

## SelectFromModel:

# Model:

In [None]:
def inv_y(transformed_y):
    """Undo a natural-log transform by exponentiating the given value(s)."""
    original_scale = np.exp(transformed_y)
    return original_scale

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Hold out part of the training rows so the error is measured on data the model
# was not fitted on. Scoring the rows used for fitting understates the real error.
fit_X, val_X, fit_y, val_y = train_test_split(
    train_X, train_y, test_size=0.2, random_state=5
)

lasso_model = Lasso(alpha=0.0005, random_state=5)
lasso_model.fit(fit_X, fit_y)
lasso_val_predictions = lasso_model.predict(val_X)

# Compare on the original price scale (true values first, then predictions).
lasso_val_mae = mean_absolute_error(inv_y(val_y), inv_y(lasso_val_predictions))
lasso_val_mae

In [None]:
from sklearn.model_selection import GridSearchCV

# Tuning Lasso: 5-fold search over a few alpha values, scored by negative MSE.
candidate_alphas = [0.0007, 0.0005, 0.005]
param_grid = [{'alpha': candidate_alphas}]
top_reg = Lasso()

grid_search = GridSearchCV(
    top_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(train_X, train_y)
print(grid_search.best_params_)

# Submission:

In [None]:
final_model = Lasso(alpha=0.0005, random_state=5)

final_model.fit(train_X, train_y)

# The model predicts log prices, so convert back to the price scale first.
# Truncating the log values to integers before exponentiating collapses the
# predictions onto a handful of values (exp(11), exp(12), ...).
predictions = inv_y(final_model.predict(test_X))

output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Preview the first 20 rows of the submission frame.
output.head(20)