In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

%matplotlib inline
from sklearn.linear_model import LinearRegression,LassoCV, Ridge, LassoLarsCV,ElasticNetCV
from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler
import xgboost as xgb
import lightgbm as lgb
import warnings

# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')

# Global seaborn look used by all figures below.
sns.set(context='notebook', style='white', palette='deep')

In [None]:
# Read the labelled training set and the unlabelled test set.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Names of the int64/float64 columns.
train.select_dtypes(include=['int64','float64']).columns


In [None]:
# Names of the object-dtype columns.
train.select_dtypes(include=['object']).columns


In [None]:
# Summary statistics of the SalePrice column (used as the target further down).
train['SalePrice'].describe()

In [None]:
# Distribution of the raw sale prices; the legend reports the sample skewness.
price = train['SalePrice']
skew_label = 'Skewness : {:.2f}'.format(price.skew())
g = sns.distplot(price, color="gray")
g = g.legend([skew_label], loc='best')

In [None]:
# Correlate the numeric columns only: the frame also holds object (string)
# columns, and DataFrame.corr() raises on those in pandas >= 2.0.
corrmat = train.select_dtypes(include='number').corr()
# Keep the columns whose absolute correlation with SalePrice exceeds 0.5.
top_corr_features = corrmat.index[corrmat["SalePrice"].abs() > .5]
# corr() is computed pairwise, so the sub-matrix equals a fresh corr() on these columns.
g = sns.heatmap(corrmat.loc[top_corr_features, top_corr_features], annot=True, cmap="RdYlGn")

In [None]:
# Box plot of SalePrice per OverallQual level.
# `factorplot` was renamed to `catplot` in seaborn 0.9 and later removed;
# `catplot` accepts the same arguments.
g = sns.catplot(x="OverallQual", y="SalePrice", data=train, kind='box', aspect=2.5)


In [None]:
def multiplot(data,features,plottype,nrows,ncols,figsize,y=None,colorize=False):
    """Draw a grid of seaborn plots, one subplot per entry of ``features``.

    Supported plot types are "regplot", "distplot" and "countplot".

    Parameters
    ----------
    data : DataFrame
        Frame indexed by column name; ``data[name]`` is plotted for each feature.
    features : sequence of str
        Column names, assigned to the subplots in row-major order. Entries
        beyond ``nrows * ncols`` are not drawn; subplots left without a
        feature are hidden.
    plottype : str
        One of "regplot", "distplot", "countplot".
    nrows, ncols : int
        Shape of the subplot grid.
    figsize : tuple
        Forwarded to ``plt.subplots``.
    y : str, optional
        Column used as the y variable of a regplot (required for that type).
        It is forwarded unchanged to ``sns.countplot`` and unused by distplot.
    colorize : bool
        If true, each subplot gets its own colour from ``sns.color_palette``;
        otherwise ``color=None`` is passed to seaborn.

    Raises
    ------
    ValueError
        If ``plottype`` is not supported, or is "regplot" while ``y`` is None.
    """
    supported = ("regplot", "distplot", "countplot")
    # Validate before creating any figure so a bad call leaves nothing behind.
    if plottype not in supported:
        raise ValueError("plottype must be one of {}, got {!r}".format(supported, plottype))
    if plottype == 'regplot' and y is None:
        raise ValueError('y value is needed with regplot type')

    n_cells = nrows * ncols
    # squeeze=False keeps `axes` two-dimensional even for a single row or column.
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)

    if colorize:
        colors = sns.color_palette(n_colors=n_cells)
    else:
        colors = [None] * n_cells

    # ravel() walks the (nrows, ncols) grid row by row.
    for n, ax in enumerate(axes.ravel()):
        if n >= len(features):
            # Fewer features than cells: hide the empty subplot.
            ax.set_visible(False)
            continue

        feature = features[n]

        if plottype == 'regplot':
            sns.regplot(data=data, x=feature, y=y, ax=ax, color=colors[n])
            correlation = np.corrcoef(data[feature], data[y])[0, 1]
            ax.set_title("Correlation {:.2f}".format(correlation))

        elif plottype == 'distplot':
            sns.distplot(a=data[feature], ax=ax, color=colors[n])
            skewness = data[feature].skew()
            ax.legend(["Skew : {:.2f}".format(skewness)])

        else:  # countplot
            sns.countplot(x=data[feature], y=y, ax=ax, color=colors[n])
            plt.setp(ax.get_xticklabels(), rotation=45)

    plt.tight_layout()
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
# Four numeric columns, each drawn against SalePrice as a regression plot on a 2x2 grid.
feats = ["YearBuilt", "TotalBsmtSF", "GrLivArea", "GarageArea"]

regplot_options = dict(plottype="regplot", nrows=2, ncols=2,
                       figsize=(10, 6), y="SalePrice", colorize=True)
multiplot(data=train, features=feats, **regplot_options)

In [None]:
# Correlate the numeric columns only: the frame also holds object (string)
# columns, and DataFrame.corr() raises on those in pandas >= 2.0.
corrmat = train.select_dtypes(include='number').corr()
# Keep the columns whose absolute correlation with SalePrice exceeds 0.5.
top_corr_features = corrmat.index[corrmat["SalePrice"].abs() > .5]
# corr() is computed pairwise, so the sub-matrix equals a fresh corr() on these columns.
g = sns.heatmap(corrmat.loc[top_corr_features, top_corr_features], annot=True, cmap="RdYlGn")

In [None]:
# Number of missing values per training column, largest first.
total = train.isnull().sum().sort_values(ascending=False)
missing_data = total.to_frame(name="Total")
missing_data.head(30)

In [None]:
# Remember where the training rows end, then stack train on top of test so
# both receive identical preprocessing.
# NOTE(review): no column is dropped here, so an identifier column such as
# "Id" (read back for the submission at the end) would stay in `dataset` and
# reach the models as a feature — confirm this is intended.
train_len = train.shape[0]
dataset = pd.concat([train, test], axis=0, ignore_index=True)
dataset.info()


In [None]:
# Number of missing values per column of the combined frame, largest first.
total = dataset.isnull().sum().sort_values(ascending=False)
missing_data = total.to_frame(name="Total")
missing_data.head(30)

In [None]:
# For these columns a missing value is replaced by the explicit level "No".
for column in ("Alley", "MiscFeature", "Fence", "PoolQC", "FireplaceQu"):
    dataset[column] = dataset[column].fillna("No")

In [None]:
g = sns.countplot(dataset["Utilities"])

In [None]:
dataset["Utilities"] = dataset["Utilities"].fillna("AllPub")


In [None]:
# Missing basement descriptors become the explicit level "No".
for column in ("BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1"):
    dataset[column] = dataset[column].fillna("No")

# Wherever the descriptor (left) is "No", the numeric column (right) is set to 0.
zero_when_no = [
    ("BsmtCond", "BsmtUnfSF"),
    ("BsmtFinType1", "BsmtFinSF1"),
    ("BsmtFinType2", "BsmtFinSF2"),
    ("BsmtQual", "TotalBsmtSF"),
    ("BsmtCond", "BsmtHalfBath"),
    ("BsmtCond", "BsmtFullBath"),
]
for flag_column, measure_column in zero_when_no:
    dataset.loc[dataset[flag_column] == "No", measure_column] = 0

dataset["BsmtExposure"] = dataset["BsmtExposure"].fillna("No")

In [None]:
# Pass the column as `x`: newer seaborn releases take the first positional
# argument as `data`, so the positional form no longer counts the levels.
g = sns.countplot(x=dataset["SaleType"])

# Missing SaleType entries get the level "WD".
dataset["SaleType"] = dataset["SaleType"].fillna("WD")

In [None]:
# Pass the column as `x`: newer seaborn releases take the first positional
# argument as `data`, so the positional form no longer counts the levels.
g = sns.countplot(x=dataset["MSZoning"])

# Missing MSZoning entries get the level "RL".
dataset["MSZoning"] = dataset["MSZoning"].fillna("RL")

In [None]:
# Pass the column as `x`: newer seaborn releases take the first positional
# argument as `data`, so the positional form no longer counts the levels.
g = sns.countplot(x=dataset["KitchenQual"])

# Missing KitchenQual entries get the level "TA".
dataset["KitchenQual"] = dataset["KitchenQual"].fillna("TA")

In [None]:
# Missing garage descriptors become the explicit level "No".
for column in ("GarageType", "GarageFinish", "GarageQual", "GarageCond"):
    dataset[column] = dataset[column].fillna("No")

# Rows without a garage: reuse the house's build year and zero the size columns.
no_garage = dataset["GarageType"] == "No"
dataset.loc[no_garage, "GarageYrBlt"] = dataset.loc[no_garage, "YearBuilt"]
dataset.loc[no_garage, "GarageCars"] = 0
dataset.loc[no_garage, "GarageArea"] = 0

# Whatever is still missing gets the column median.
for column in ("GarageArea", "GarageCars", "GarageYrBlt"):
    dataset[column] = dataset[column].fillna(dataset[column].median())

In [None]:
# Level counts of four categorical columns on a 2x2 grid, followed by a fixed
# fill value for the missing entries of each of them.
Function_feat = ["Functional", "Exterior2nd", "Exterior1st", "Electrical"]

multiplot(data=dataset, features=Function_feat, plottype="countplot",
          nrows=2, ncols=2, figsize=(11, 9), colorize=True)

fill_values = {
    "Functional": "Typ",
    "Exterior2nd": "VinylSd",
    "Exterior1st": "VinylSd",
    "Electrical": "SBrkr",
}
for column, value in fill_values.items():
    dataset[column] = dataset[column].fillna(value)

In [None]:
# Missing veneer type becomes the level "None"; such rows get a veneer area of 0.
dataset["MasVnrType"] = dataset["MasVnrType"].fillna("None")
no_veneer = dataset["MasVnrType"] == "None"
dataset.loc[no_veneer, "MasVnrArea"] = 0

In [None]:
dataset = dataset.replace({'MSSubClass': {20: 'SubClass_20', 30: 'SubClass_30',40: 'SubClass_40',
45: 'SubClass_45',50: 'SubClass_50',60: 'SubClass_60',70: 'SubClass_70',
75: 'SubClass_75',80: 'SubClass_80',85: 'SubClass_85',90: 'SubClass_90',
120: 'SubClass_120',150: 'SubClass_150',160: 'SubClass_160',180: 'SubClass_180',
190: 'SubClass_190'}})

In [None]:
dataset = dataset.replace({'MoSold': {1: 'Jan', 2: 'Feb',3: 'Mar',
4: 'Apr',5: 'May',6: 'Jun',7: 'Jul',8: 'Aug',9: 'Sep',10: 'Oct',
11: 'Nov',12: 'Dec'}})

In [None]:
dataset['YrSold'] = dataset['YrSold'].astype(str)

In [None]:
# Ordered levels (worst -> best) for every ordinal column.
# The previous `astype("category", [...])` form passed the list as the `copy`
# argument of astype, so the categories were never applied and the codes
# followed alphabetical order instead of this ranking.
ordinal_levels = {
    "BsmtCond": ['No','Po','Fa','TA','Gd','Ex'],
    "BsmtExposure": ['No','Mn','Av','Gd'],
    "BsmtFinType1": ['No','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
    "BsmtFinType2": ['No','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
    "BsmtQual": ['No','Po','Fa','TA','Gd','Ex'],
    "ExterCond": ['Po','Fa','TA','Gd','Ex'],
    "ExterQual": ['Po','Fa','TA','Gd','Ex'],
    "Fence": ['No','MnWw','GdWo','MnPrv','GdPrv'],
    "FireplaceQu": ['No','Po','Fa','TA','Gd','Ex'],
    "Functional": ['Sal','Sev','Maj2','Maj1','Mod','Min2','Min1','Typ'],
    "GarageCond": ['No','Po','Fa','TA','Gd','Ex'],
    "GarageFinish": ['No','Unf','RFn','Fin'],
    "GarageQual": ['No','Po','Fa','TA','Gd','Ex'],
    "HeatingQC": ['Po','Fa','TA','Gd','Ex'],
    "KitchenQual": ['Po','Fa','TA','Gd','Ex'],
    "PavedDrive": ['N','P','Y'],
    "PoolQC": ['No','Fa','TA','Gd','Ex'],
    "Utilities": ['ELO','NoSeWa','NoSewr','AllPub'],
}

# Replace each column by the position of its value in the ordered level list.
# A value that is missing or not listed is encoded as -1.
for column, levels in ordinal_levels.items():
    dataset[column] = pd.Categorical(dataset[column], categories=levels, ordered=True).codes

In [None]:
# One-hot encode the nominal columns; drop_first removes the first level of each.
nominal_columns = [
    "Alley", "BldgType", "CentralAir", "Condition1", "Condition2",
    "Electrical", "Exterior1st", "Exterior2nd", "Foundation", "GarageType",
    "Heating", "HouseStyle", "LandContour", "LandSlope", "LotConfig",
    "LotShape", "MSZoning", "MasVnrType", "MiscFeature", "Neighborhood",
    "RoofMatl", "RoofStyle", "SaleCondition", "SaleType", "Street",
    "MSSubClass", 'MoSold', 'YrSold',
]
dataset = pd.get_dummies(dataset, columns=nominal_columns, drop_first=True)

# Three indicator columns are discarded after encoding.
dropped_dummies = ['MSSubClass_SubClass_150', 'Condition2_PosN', 'MSSubClass_SubClass_160']
dataset = dataset.drop(columns=dropped_dummies)

In [None]:
skewed_features = ["BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","GarageArea","MasVnrArea"
                   ,"TotalBsmtSF","1stFlrSF","2ndFlrSF","3SsnPorch","EnclosedPorch",
                   "GrLivArea","LotArea","LowQualFinSF","OpenPorchSF","PoolArea",
                   "ScreenPorch","WoodDeckSF"]

In [None]:
multiplot(data = dataset,features = skewed_features,plottype = "distplot",
          nrows = 4, ncols = 4, figsize = (11,9), colorize = True)

In [None]:
for feature in skewed_features:
    dataset[feature] = np.log1p(dataset[feature])

In [None]:
multiplot(data = dataset,features = skewed_features,plottype = "distplot",
          nrows = 4, ncols = 4, figsize = (11,9), colorize = True)

In [None]:
# Side-by-side distribution of SalePrice before (left) and after (right) log1p.
log_price = np.log1p(train["SalePrice"])

fig, axes = plt.subplots(1, 2, figsize=(15, 7))

sns.distplot(train["SalePrice"], ax=axes[0])
sns.distplot(log_price, ax=axes[1], color="g")

axes[0].legend(["Skew : {:.2f}".format(train["SalePrice"].skew())])
# Skewness of the transformed prices — not log1p applied to the raw skewness.
axes[1].legend(["Skew : {:.2f}".format(log_price.skew())])

plt.tight_layout()
plt.show()
# Release the figure once it has been shown.
plt.close(fig)

In [None]:
# Target on the log1p scale, then removed from the feature frame.
Y = np.log1p(dataset["SalePrice"])
dataset = dataset.drop(columns="SalePrice")

In [None]:
# Set LotFrontage aside; it is imputed separately and appended back to the
# feature matrix as its LAST column further down.
LotF = dataset["LotFrontage"]
dataset = dataset.drop(columns="LotFrontage")

# Column labels in the order of that final matrix. Capturing dataset.columns
# before the drop would leave "LotFrontage" at its original position and shift
# every later label by one when coefficients are matched to names.
features = dataset.columns.append(pd.Index(["LotFrontage"]))

In [None]:
# Fit a RobustScaler on the combined frame and replace the frame by its scaled
# values (the result of transform is what `dataset` holds from here on).
N = RobustScaler().fit(dataset)
dataset = N.transform(dataset)

In [None]:
X_train_LotF = dataset[LotF.notnull()] 
Y_train_LotF = LotF[LotF.notnull()]
Y_train_LotF = np.log1p(Y_train_LotF)

In [None]:
test_LotF = dataset[LotF.isnull()]


In [None]:
# 5-fold cross-validated R^2 of a LassoCV model on the LotFrontage rows; the mean is displayed.
lassocv = LassoCV(eps=1e-8)

cv_results = cross_val_score(estimator=lassocv, X=X_train_LotF, y=Y_train_LotF,
                             scoring="r2", cv=5, n_jobs=4)
cv_results.mean()

In [None]:
lassocv.fit(X_train_LotF,Y_train_LotF)

# The model was fitted on log1p(LotFrontage), so its predictions must be
# mapped back with expm1 before they are mixed with the observed values.
LotF_pred = np.expm1(lassocv.predict(test_LotF))

# Work on an independent copy and fill only the missing positions.
LotF = LotF.copy()
LotF.loc[LotF.isnull()] = LotF_pred

In [None]:
# Scale LotFrontage as a single column (this re-fits N on that column) and
# append it to the feature matrix as the last column.
LotF = N.fit_transform(LotF.values.reshape(-1, 1))

dataset = np.hstack((dataset, LotF))

**Modelling**

In [None]:
X_train = dataset[:train_len]
test = dataset[train_len:]

In [None]:
Y_train = Y[:train_len]

In [None]:
# Four linear regressors that are cross-validated and fitted in the cells below.
# Note that `lassocv` is rebound here; the earlier instance was the one used
# for the LotFrontage imputation.
lassocv = LassoCV(eps=1e-7) 
ridge = Ridge(alpha=1e-6) 
lassolarscv = LassoLarsCV()
elasticnetcv = ElasticNetCV(eps=1e-15)

In [None]:
def RMSE(estimator,X_train, Y_train, cv=5,n_jobs=4):
    """Return the mean over CV folds of the root mean squared error.

    ``estimator`` is scored with ``cross_val_score`` using the
    "neg_mean_squared_error" metric; each fold's score is negated and
    square-rooted, and the average of those values is returned.
    ``cv`` and ``n_jobs`` are passed through to ``cross_val_score``.
    """
    neg_mse_per_fold = cross_val_score(
        estimator, X_train, Y_train,
        scoring="neg_mean_squared_error", cv=cv, n_jobs=n_jobs,
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()

In [None]:
# Cross-validated RMSE of LassoCV on the training rows.
RMSE(lassocv, X_train, Y_train)

In [None]:
# Cross-validated RMSE of Ridge.
RMSE(ridge, X_train, Y_train)

In [None]:
# Cross-validated RMSE of LassoLarsCV.
RMSE(lassolarscv, X_train, Y_train)

In [None]:
# Cross-validated RMSE of ElasticNetCV.
RMSE(elasticnetcv, X_train, Y_train)

In [None]:
# Fit every linear model on the full training rows.
for linear_model in (lassocv, ridge, lassolarscv, elasticnetcv):
    linear_model.fit(X_train, Y_train)

In [None]:
# For each fitted model, report how many coefficients are non-zero out of all columns.
conserved_reports = [
    ("LassoCV regression has conserved %d features over %d", lassocv),
    ("Ridge regression has conserved %d features over %d", ridge),
    ("LassoLarsCV regression has conserved %d features over %d", lassolarscv),
    ("ElasticNetCV regression has conserved %d features over %d", elasticnetcv),
]
for template, fitted_model in conserved_reports:
    kept = len(features[fitted_model.coef_ != 0])
    print(template % (kept, X_train.shape[1]))

In [None]:
# One horizontal bar chart per linear model with its 40 largest (signed) coefficients.
nrows = ncols = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex="all", figsize=(15, 15))

names_regressors = [
    ("LassoCV", lassocv),
    ("Ridge", ridge),
    ("LassolarsCV", lassolarscv),
    ("ElasticNetCV", elasticnetcv),
]

# axes.ravel() walks the grid row by row, matching the order of the list above.
for ax, (name, regressor) in zip(axes.ravel(), names_regressors):
    # Positions of the coefficients sorted from largest to smallest, top 40 kept.
    indices = np.argsort(regressor.coef_)[::-1][:40]
    g = sns.barplot(y=features[indices], x=regressor.coef_[indices], orient='h', ax=ax)
    g.set_xlabel("Coefficient", fontsize=12)
    g.set_ylabel("Features", fontsize=12)
    g.tick_params(labelsize=9)
    g.set_title(name + " regression coefs")

plt.tight_layout()
plt.show()
plt.gcf().clear()

In [None]:
# Test-set predictions of three linear models, mapped back from the log1p scale.
Y_pred_lassocv, Y_pred_lassolarscv, Y_pred_elasticnetcv = (
    np.expm1(fitted_model.predict(test))
    for fitted_model in (lassocv, lassolarscv, elasticnetcv)
)

In [None]:
# XGBoost regressor with default parameters; the cell shows its cross-validated RMSE.
model_xgb = xgb.XGBRegressor()

RMSE(model_xgb,X_train,Y_train)

In [None]:
# Fit on all training rows and predict the test rows; expm1 undoes the log1p on the target.
model_xgb.fit(X_train,Y_train)
Y_pred_xgb = np.expm1(model_xgb.predict(test))

In [None]:
# scikit-learn gradient boosting with default parameters; cross-validated RMSE.
GBoost = GradientBoostingRegressor()
RMSE(GBoost,X_train,Y_train)

In [None]:
# Fit and predict, back on the original price scale.
GBoost.fit(X_train,Y_train)
Y_pred_GBoost = np.expm1(GBoost.predict(test))

In [None]:
# Light GBM
# Default-parameter LightGBM regressor; cross-validated RMSE.
LightGB = lgb.LGBMRegressor()

RMSE(LightGB,X_train,Y_train)

In [None]:
# Fit and predict, back on the original price scale.
LightGB.fit(X_train,Y_train)
Y_pred_LightGB = np.expm1(LightGB.predict(test))

In [None]:
# `test` now holds the scaled feature matrix, so the ids are read back from the
# CSV. The same absolute path as the initial load is used (instead of a path
# relative to the working directory), and only the Id column is needed.
results = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
                      usecols=["Id"])

# Blend of the three linear models (weights 0.4 / 0.3 / 0.3) ...
linear_blend = Y_pred_lassocv*0.4 + Y_pred_elasticnetcv*0.3 + Y_pred_lassolarscv*0.3
# ... which gets weight 0.4 in the final ensemble; each boosted model gets 0.2.
results["SalePrice"] = linear_blend*0.4 + Y_pred_xgb*0.2 + Y_pred_GBoost*0.2 + Y_pred_LightGB*0.2
results=results[["Id","SalePrice"]]
results.to_csv("submission.csv",index=False)

In [None]:
# Display the submission frame (Id and predicted SalePrice).
results