## Baseline Model
### Strategy:
- Normalize numerical variables
- Dimensionality reduction of numerical variables using PCA
- One-hot encoding of categorical variables
- Feature engineering: duration between the house's build date and its remodel
- Algorithm to use: Linear Regression

### Load Libraries

In [57]:

# Data manipulation libraries
import pandas as pd
import numpy as np

# Scikit-learn modules needed for Linear Regression
from sklearn.linear_model import LinearRegression ,Ridge , Lasso
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder,MinMaxScaler , StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Plotting libraries
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes = True)
%matplotlib inline

In [None]:
# Load the training set from the parent directory.
df = pd.read_csv("../train.csv")

# Only the last expression of a cell is rendered automatically, so the
# intermediate views are shown explicitly with display(); a bare df.head()
# in the middle of the cell would be silently discarded.
display(df.head())
display(df.describe())
df.dtypes

In [53]:
# Keep only the numeric columns; they feed the correlation study and the
# numeric branch of the preprocessing pipeline.
# select_dtypes() is the public pandas API for this. The private
# DataFrame._get_numeric_data() is not part of the supported interface.
# NOTE(review): boolean columns, if the CSV had any, are not selected by
# include="number" - confirm none are expected.
df_numeric = df.select_dtypes(include="number")
print(df_numeric.columns)

# Columns removed from the numeric feature set: the row identifier, the
# year/month columns and the target. The variable name is kept because later
# cells refer to it, even though not every entry is a date.
exclude_dates = ['Id', 'YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold', 'SalePrice']
df_numeric = df_numeric.drop(columns=exclude_dates)
print(df_numeric.columns)

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea',
       'WoodD

### Data Exploration

In [None]:
# Visual exploration: pairwise correlations of the numeric predictors, drawn
# as an annotated heatmap so that multicollinearity is easy to spot.
correlation = df_numeric.corr()

# 18 x 18 inch canvas; the heatmap is drawn on the axes created here.
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(data=correlation, annot=True, cmap="YlGnBu", ax=ax)

### Build Data Transformation Pipeline

In [64]:
# Preprocessing pipelines for the numeric and the categorical columns,
# combined into one end-to-end prediction pipeline.

# Numeric branch: mean-impute, standardise, then project onto 2 principal
# components.
numeric_features = df_numeric.columns.tolist()
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
])

# Every column of df that is neither a numeric feature nor listed in
# exclude_dates is handled as categorical (original column order is kept).
all_numeric_columns = exclude_dates + numeric_features
categorical_features = df.columns[~df.columns.isin(all_numeric_columns)].tolist()

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full pipeline: preprocessing followed by the estimator. The step is named
# 'classifier' although the estimator is a regressor.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression()),
])

In [None]:
# Feature columns: numeric features followed by categorical ones, with the
# target column filtered out should it appear in either list.
X_columns = [
    column
    for column in numeric_features + categorical_features
    if column != 'SalePrice'
]
X_columns

### Split Train & Test Data

In [55]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is
# repeatable. Features are the numeric + categorical columns, target is
# SalePrice.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, numeric_features + categorical_features],
    df.loc[:, "SalePrice"],
    test_size=0.2,
    random_state=42,
)

### Build Model / Grid Search

In [67]:
# Hyper-parameters explored for the linear-regression pipeline: only the
# imputation strategy of the numeric branch is tuned.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
}

# 10-fold cross-validated grid search using all available cores.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
# On releases that still had it (<= 0.21) omitting it emits a warning and
# weights fold scores by fold size instead.
grid_search_lr = GridSearchCV(clf, param_grid, cv=10, verbose=2, n_jobs=-1)
grid_search_lr.fit(X_train, y_train)

# Score of the refit best pipeline on the held-out split, using the default
# scorer of the pipeline's final estimator.
print(("best Linear Regression from grid search: %.3f"
       % grid_search_lr.score(X_test, y_test)))
print("Best Parameter Setting is {}".format(grid_search_lr.best_params_))

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


best Linear Regression from grid search: 0.867
Best Parameter Setting is {'preprocessor__num__imputer__strategy': 'mean'}


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.7s finished


### Ridge Regression

In [69]:
# Preprocessing pipelines for numeric and categorical data, rebuilt here so
# the Ridge pipeline does not share transformer objects with the
# linear-regression pipeline.

# Numeric branch: mean-impute, standardise, then keep 2 principal components.
numeric_features = list(df_numeric.columns)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2))])

# Every column of df that is neither a numeric feature nor listed in
# exclude_dates is treated as categorical.
all_numeric_columns = exclude_dates + numeric_features
categorical_features = [x for x in df.columns if x not in all_numeric_columns]

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Full prediction pipeline: preprocessing followed by Ridge regression.
# The step keeps the name 'classifier' because the parameter grid addresses
# it as 'classifier__...'.
# random_state is fixed because the grid search tries the stochastic
# 'sag'/'saga' solvers; without a seed their fits differ from run to run.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', Ridge(random_state=42))])

In [70]:
# Hyper-parameter grid for the Ridge pipeline: imputation strategy of the
# numeric branch plus solver, regularisation strength and iteration cap of
# the Ridge estimator (2 * 7 * 3 * 3 = 126 candidates).
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    'classifier__alpha': [1.0, 0.5, 0.01],
    'classifier__max_iter': [100, 150, 200]
}

# 10-fold cross-validated grid search using all available cores.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
grid_search = GridSearchCV(clf, param_grid, cv=10, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Score of the refit best pipeline on the held-out split. The label names
# Ridge so this result is not confused with the plain linear-regression run.
print(("best Ridge Regression from grid search: %.3f"
       % grid_search.score(X_test, y_test)))
print("Best Parameter Setting is {}".format(grid_search.best_params_))

Fitting 10 folds for each of 126 candidates, totalling 1260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  1.6min


best Linear Regression from grid search: 0.858
Best Parameter Setting is {'classifier__alpha': 1.0, 'classifier__max_iter': 100, 'classifier__solver': 'sag', 'preprocessor__num__imputer__strategy': 'median'}


[Parallel(n_jobs=-1)]: Done 1260 out of 1260 | elapsed:  2.1min finished


### Evaluate Test Data and prepare submission file

In [76]:
# Read the competition test set and keep the names of all its columns that
# are not listed in exclude_dates (original column order preserved).
test_df = pd.read_csv("../test.csv")
test_df_columns = test_df.columns[~test_df.columns.isin(exclude_dates)].tolist()

# Template submission file shipped with the data
sample_submission = pd.read_csv("../sample_submission.csv")

In [96]:
# Predict with the refit best pipeline from the Ridge grid search.
y_prediction = grid_search.predict(test_df.loc[:, test_df_columns])

In [97]:
# Peek at the first ten rows of the template to check the expected layout.
sample_submission.head(10)

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977
5,1466,177150.989247
6,1467,172070.659229
7,1468,175110.95652
8,1469,162011.698832
9,1470,160726.247831


In [98]:
# Build the submission frame. Ids are taken from test_df - the frame the
# predictions were computed from - so each prediction is paired with the row
# it belongs to, independent of the row order of the sample submission file.
submission = pd.DataFrame({"Id": test_df["Id"].values, "SalePrice": y_prediction})

In [99]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf="submission_ridge_V1.csv", index=False)