In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import probplot
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LassoCV
from xgboost import XGBRegressor

In [2]:
# Load the competition's train and test splits from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Quick sanity check of the sizes of both splits.
print('Number of Training Examples = {}'.format(train.shape[0]))
print('Number of Test Examples = {}\n'.format(test.shape[0]))
# NOTE: train.shape is the shape of the whole frame, i.e. it still includes the SalePrice column.
print('Training X Shape = {}'.format(train.shape))
print('Training y Shape = {}\n'.format(train['SalePrice'].shape[0]))
print('Test X Shape = {}'.format(test.shape))
# NOTE: no target column is read from `test` here; this line only reports its row count.
print('Test y Shape = {}\n'.format(test.shape[0]))

Number of Training Examples = 1460
Number of Test Examples = 1459

Training X Shape = (1460, 81)
Training y Shape = 1460

Test X Shape = (1459, 80)
Test y Shape = 1459



In [3]:
def get_data_for_analysis():
    """Stack the current ``train`` and ``test`` frames row-wise for joint inspection.

    Reads the module-level ``train`` and ``test`` at call time, so the result
    reflects whatever preprocessing has been applied to them so far.
    Columns are sorted by name (``sort=True``) and the original row labels of
    each split are kept, so labels repeat across the two splits.

    Returns:
        pandas.DataFrame: a new frame containing the rows of both splits.
    """
    splits = [train, test]
    combined = pd.concat(splits, sort=True)
    return combined

In [4]:
# Column dtypes and non-null counts of the combined train+test frame.
get_data_for_analysis().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 81 columns):
1stFlrSF         2919 non-null int64
2ndFlrSF         2919 non-null int64
3SsnPorch        2919 non-null int64
Alley            198 non-null object
BedroomAbvGr     2919 non-null int64
BldgType         2919 non-null object
BsmtCond         2837 non-null object
BsmtExposure     2837 non-null object
BsmtFinSF1       2918 non-null float64
BsmtFinSF2       2918 non-null float64
BsmtFinType1     2840 non-null object
BsmtFinType2     2839 non-null object
BsmtFullBath     2917 non-null float64
BsmtHalfBath     2917 non-null float64
BsmtQual         2838 non-null object
BsmtUnfSF        2918 non-null float64
CentralAir       2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
Electrical       2918 non-null object
EnclosedPorch    2919 non-null int64
ExterCond        2919 non-null object
ExterQual        2919 non-null object
Exterior1st      291

In [5]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the raw target.
train['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [6]:
# Report every column of the combined frame that contains at least one NaN.
data = get_data_for_analysis()
null_counts = data.isnull().sum()  # one NaN count per column, computed once
for column in data.columns.tolist():
    n_missing = null_counts[column]
    if n_missing:
        print("{} column has {} missing value from {}".format(column, n_missing, len(data)))

Alley column has 2721 missing value from 2919
BsmtCond column has 82 missing value from 2919
BsmtExposure column has 82 missing value from 2919
BsmtFinSF1 column has 1 missing value from 2919
BsmtFinSF2 column has 1 missing value from 2919
BsmtFinType1 column has 79 missing value from 2919
BsmtFinType2 column has 80 missing value from 2919
BsmtFullBath column has 2 missing value from 2919
BsmtHalfBath column has 2 missing value from 2919
BsmtQual column has 81 missing value from 2919
BsmtUnfSF column has 1 missing value from 2919
Electrical column has 1 missing value from 2919
Exterior1st column has 1 missing value from 2919
Exterior2nd column has 1 missing value from 2919
Fence column has 2348 missing value from 2919
FireplaceQu column has 1420 missing value from 2919
Functional column has 2 missing value from 2919
GarageArea column has 1 missing value from 2919
GarageCars column has 1 missing value from 2919
GarageCond column has 159 missing value from 2919
GarageFinish column has 15

In [7]:
# Inspect the raw MasVnrType values of the combined frame before imputing them.
get_data_for_analysis()['MasVnrType']

0       BrkFace
1          None
2       BrkFace
3          None
4       BrkFace
         ...   
1454       None
1455       None
1456       None
1457       None
1458    BrkFace
Name: MasVnrType, Length: 2919, dtype: object

In [8]:
def filling_mansory_features(data):
    """Impute missing masonry, basement, garage and miscellaneous features in place.

    Numeric columns get ``0`` and categorical columns get the string ``'None'``
    (presumably NaN in these columns means the house lacks the feature; verify
    against the data description).

    Args:
        data: pandas.DataFrame holding every column listed below; it is
            modified in place.

    Returns:
        None
    """
    # Numeric columns whose NaNs become 0.
    zero_filled = (
        'MasVnrArea',
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
        'GarageArea', 'GarageCars', 'GarageYrBlt',
    )
    # Categorical columns whose NaNs become the literal category 'None'.
    none_filled = (
        'MasVnrType',
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
        'Alley', 'Fence', 'FireplaceQu', 'MiscFeature', 'PoolQC',
    )

    for column in zero_filled:
        data[column] = data[column].fillna(0)
    for column in none_filled:
        data[column] = data[column].fillna('None')
# Impute both splits in place.
filling_mansory_features(train)
filling_mansory_features(test)

# Re-check which columns of the combined frame still contain NaNs.
data = pd.concat([train, test], sort=True)
remaining_nulls = data.isnull().sum()
for column in remaining_nulls.index[remaining_nulls.values > 0]:
    print("{} column has {} missing value from {}".format(column, remaining_nulls[column], len(data)))

Electrical column has 1 missing value from 2919
Exterior1st column has 1 missing value from 2919
Exterior2nd column has 1 missing value from 2919
Functional column has 2 missing value from 2919
KitchenQual column has 1 missing value from 2919
LotFrontage column has 486 missing value from 2919
MSZoning column has 4 missing value from 2919
SalePrice column has 1459 missing value from 2919
SaleType column has 1 missing value from 2919
Utilities column has 2 missing value from 2919


In [9]:
# Preview the first rows of the combined frame after imputation.
data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,,3,1Fam,TA,No,706.0,0.0,...,WD,0,Pave,8,856.0,AllPub,0,2003,2003,2008
1,1262,0,0,,3,1Fam,TA,Gd,978.0,0.0,...,WD,0,Pave,6,1262.0,AllPub,298,1976,1976,2007
2,920,866,0,,3,1Fam,TA,Mn,486.0,0.0,...,WD,0,Pave,6,920.0,AllPub,0,2001,2002,2008
3,961,756,0,,3,1Fam,Gd,No,216.0,0.0,...,WD,0,Pave,7,756.0,AllPub,0,1915,1970,2006
4,1145,1053,0,,4,1Fam,TA,Av,655.0,0.0,...,WD,0,Pave,9,1145.0,AllPub,192,2000,2000,2008


In [10]:
# Columns that still contain NaNs after imputation are dropped from both splits
# rather than imputed.
incomplete_columns = [
    'Electrical', 'Exterior1st', 'Exterior2nd', 'Functional', 'KitchenQual',
    'MSZoning', 'SaleType', 'Utilities', 'LotFrontage',
]
for frame in (train, test):
    frame.drop(columns=incomplete_columns, inplace=True)

In [11]:
# Final NaN check on the combined frame: only columns with at least one NaN are printed.
data = pd.concat([train, test], sort=True)
total_rows = len(data)
for column, n_missing in data.isnull().sum().items():
    if n_missing:
        print("{} column has {} missing value from {}".format(column, n_missing, total_rows))

SalePrice column has 1459 missing value from 2919


In [12]:
# Remove training rows that contradict the general price trend:
# low overall quality sold above 200000 ...
low_quality_high_price = (train['OverallQual'] < 5) & (train['SalePrice'] > 200000)
train.drop(train.loc[low_quality_high_price].index, inplace=True)

# ... and very large living area sold below 300000 (evaluated after the first removal).
large_area_low_price = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train.drop(train.loc[large_area_low_price].index, inplace=True)

# Street and PoolQC are removed from both splits.
for frame in (train, test):
    frame.drop(columns=['Street', 'PoolQC'], inplace=True)

In [13]:
# Confirm the remaining training columns and that none of them contains NaNs.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1457 entries, 0 to 1459
Data columns (total 70 columns):
Id               1457 non-null int64
MSSubClass       1457 non-null int64
LotArea          1457 non-null int64
Alley            1457 non-null object
LotShape         1457 non-null object
LandContour      1457 non-null object
LotConfig        1457 non-null object
LandSlope        1457 non-null object
Neighborhood     1457 non-null object
Condition1       1457 non-null object
Condition2       1457 non-null object
BldgType         1457 non-null object
HouseStyle       1457 non-null object
OverallQual      1457 non-null int64
OverallCond      1457 non-null int64
YearBuilt        1457 non-null int64
YearRemodAdd     1457 non-null int64
RoofStyle        1457 non-null object
RoofMatl         1457 non-null object
MasVnrType       1457 non-null object
MasVnrArea       1457 non-null float64
ExterQual        1457 non-null object
ExterCond        1457 non-null object
Foundation       1457 non

In [14]:
# Ordinal encodings: each dict maps a category label to an integer code.
# Where present, the key 'None' (the placeholder written by filling_mansory_features) maps to 0.
# Labels that are not keys of a dict are not covered by that encoding.
bsmtcond_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4}
bsmtexposure_map = {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
# Shared by BsmtFinType1 and BsmtFinType2 in perform_map.
bsmtfintype_map = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
bsmtqual_map = {'None': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
centralair_map = {'Y': 1, 'N': 0}
extercond_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
exterqual_map = {'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
fireplacequ_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
# Min1/Min2 share code 1 and Maj1/Maj2 share code 3. Not applied by perform_map.
functional_map = {'Typ': 0, 'Min1': 1, 'Min2': 1, 'Mod': 2, 'Maj1': 3, 'Maj2': 3, 'Sev': 4}
garagecond_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
garagefinish_map = {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
garagequal_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
heatingqc_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
# Not applied by perform_map.
kitchenqual_map = {'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
landslope_map = {'Gtl': 1, 'Mod': 2, 'Sev': 3}
lotshape_map = {'Reg': 0, 'IR1': 1, 'IR2': 2, 'IR3': 3}
paveddrive_map = {'N': 0, 'P': 1, 'Y': 2}
def perform_map(df_all):
    """Replace ordinal category labels with their integer codes, in place.

    Each listed column is passed through ``Series.map`` with the matching
    module-level ``*_map`` dictionary. A value that is not a key of its
    dictionary becomes NaN.

    Args:
        df_all: pandas.DataFrame containing every column listed below; it is
            modified in place.

    Returns:
        None
    """
    column_encoders = (
        ('BsmtCond', bsmtcond_map),
        ('BsmtExposure', bsmtexposure_map),
        ('BsmtFinType1', bsmtfintype_map),
        ('BsmtFinType2', bsmtfintype_map),
        ('BsmtQual', bsmtqual_map),
        ('CentralAir', centralair_map),
        ('ExterCond', extercond_map),
        ('ExterQual', exterqual_map),
        ('FireplaceQu', fireplacequ_map),
        ('GarageCond', garagecond_map),
        ('GarageFinish', garagefinish_map),
        ('GarageQual', garagequal_map),
        ('HeatingQC', heatingqc_map),
        ('LandSlope', landslope_map),
        ('LotShape', lotshape_map),
        ('PavedDrive', paveddrive_map),
    )
    for column, encoder in column_encoders:
        df_all[column] = df_all[column].map(encoder)
    
# Encode the ordinal columns of both splits in place.
# NOTE(review): re-running this cell on already-encoded frames would presumably turn the
# integer codes into NaN, since they are not keys of the mapping dicts — run it once per kernel.
perform_map(train)
perform_map(test)

In [15]:
# Unordered (nominal) columns; they are removed from both splits instead of being encoded.
nominal_features = [
    'Alley', 'BldgType', 'Condition1', 'Condition2', 'Fence', 'Foundation', 'GarageType',
    'Heating', 'HouseStyle', 'LandContour', 'LotConfig', 'MSSubClass',
    'MasVnrType', 'MiscFeature', 'MoSold', 'Neighborhood',
    'RoofMatl', 'RoofStyle', 'SaleCondition', 'YrSold',
]

for frame in (train, test):
    frame.drop(columns=nominal_features, inplace=True)

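Important step: log-transform the target `SalePrice` with `log1p`. The model is trained on this scale, so predictions must be converted back with `expm1`.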

In [16]:
# Replace the target with log(1 + SalePrice); the model is trained on this scale and
# predictions are converted back with np.expm1. Overwrites the column, so run once per kernel.
train['SalePrice'] = np.log1p(train['SalePrice'])

In [17]:
# Build the NumPy design matrices; Id is an identifier, not a feature.
drop_cols = ['Id']
non_feature_cols = drop_cols + ['SalePrice']

train_features = train.drop(columns=non_feature_cols)
test_features = test.drop(columns=drop_cols)

X_train = train_features.values
y_train = train['SalePrice'].values
X_test = test_features.values

print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(y_train.shape))
print('X_test shape: {}'.format(X_test.shape))

X_train shape: (1457, 48)
y_train shape: (1457,)
X_test shape: (1459, 48)


In [18]:
def rmse(y_train, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Computed with NumPy directly: the previous body called
    ``mean_squared_error``, which this notebook never imports, so every call
    raised ``NameError``.

    Args:
        y_train: array-like of true target values.
        y_pred: array-like of predicted values with the same shape.

    Returns:
        float: ``sqrt(mean((y_train - y_pred) ** 2))``.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    actual = np.asarray(y_train, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Refuse mismatched shapes instead of letting NumPy broadcast them silently.
    if actual.shape != predicted.shape:
        raise ValueError(
            'y_train and y_pred must have the same shape, got {} and {}'.format(
                actual.shape, predicted.shape))
    return np.sqrt(np.mean(np.square(actual - predicted)))

def cv_rmse(model, X=None, y=None, cv=None):
    """Return the per-fold cross-validated RMSE of ``model``.

    Defaults are resolved when the function is called, not when it is defined,
    so rebuilding ``X_train`` / ``y_train`` / ``kf`` in another cell is picked
    up without re-running this definition.

    Args:
        model: estimator accepted by ``cross_val_score``.
        X: feature matrix; defaults to the module-level ``X_train``.
        y: target vector; defaults to the module-level ``y_train``.
        cv: cross-validation splitter; defaults to the module-level ``kf``.

    Returns:
        numpy.ndarray: one RMSE value per fold.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    if cv is None:
        cv = kf
    # The scorer returns negated MSE (higher is better), so flip the sign before the root.
    neg_mse_per_fold = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)
    return np.sqrt(-neg_mse_per_fold)

# Number of cross-validation folds.
K = 10
# Shuffled K-fold splitter with a fixed seed; used by cv_rmse and by LassoCV below.
kf = KFold(n_splits=K, shuffle=True, random_state=42)

In [19]:
# Display the fully preprocessed training frame (SalePrice is on the log1p scale here).
train

Unnamed: 0,Id,LotArea,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SalePrice
0,1,8450,0,1,7,5,2003,2003,196.0,3,...,3,2,0,61,0,0,0,0,0,12.247699
1,2,9600,0,1,6,8,1976,1976,0.0,2,...,3,2,298,0,0,0,0,0,0,12.109016
2,3,11250,1,1,7,5,2001,2002,162.0,3,...,3,2,0,42,0,0,0,0,0,12.317171
3,4,9550,1,1,7,5,1915,1970,0.0,2,...,3,2,0,35,272,0,0,0,0,11.849405
4,5,14260,1,1,8,5,2000,2000,350.0,3,...,3,2,192,84,0,0,0,0,0,12.429220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,7917,0,1,6,5,1999,2000,0.0,2,...,3,2,0,40,0,0,0,0,0,12.072547
1456,1457,13175,0,1,6,6,1978,1988,119.0,2,...,3,2,349,0,0,0,0,0,0,12.254868
1457,1458,9042,0,1,7,9,1941,2006,0.0,4,...,3,2,0,60,0,0,0,0,2500,12.493133
1458,1459,9717,0,1,5,6,1950,1996,0.0,2,...,3,2,366,0,112,0,0,0,0,11.864469


A Lasso model will be used.

In [20]:
# Pipeline: RobustScaler followed by LassoCV, which selects alpha from the given grid
# using the shared `kf` splitter.
lasso = make_pipeline(RobustScaler(), LassoCV(alphas=np.arange(0.0001, 0.0009, 0.0001), random_state=42, cv=kf))
# Registry of models to run, keyed by display name.
models = {'lasso': lasso}
predictions = {}  # name -> predictions on X_train, converted back from the log1p scale
scores = {}       # name -> (mean, std) of the per-fold CV RMSE on the log1p scale

for name, model in models.items():
    print('Running {}'.format(name))
    
    # Fit on the full training set; this fitted pipeline is what later predicts on X_test.
    model.fit(X_train, y_train)
    # NOTE: these are predictions on the training data, not on the test set.
    predictions[name] = np.expm1(model.predict(X_train))
    
    # Cross-validated RMSE of the same pipeline on the training data.
    score = cv_rmse(model, X=X_train, y=y_train)
    scores[name] = (score.mean(), score.std())
    
    print(' Finished Running {}'.format(name))
    print(' {} Mean RMSE: {:.6f} / Std: {:.6f}\n'.format(name, scores[name][0], scores[name][1]))

Running lasso
 Finished Running lasso
 lasso Mean RMSE: 0.126891 / Std: 0.011869



In [21]:
# Build the submission: one row per test Id with the predicted price on the original scale.
submission_df = pd.DataFrame(columns=['Id', 'SalePrice'])
submission_df['Id'] = test['Id']
# Sanity check: the number of Ids must match the number of rows being predicted.
print(test['Id'].shape)
print(X_test.shape)
# The model was trained on log1p(SalePrice), so expm1 converts predictions back to prices.
submission_df['SalePrice'] = np.expm1(lasso.predict(X_test))
submission_df.to_csv('submissions.csv', header=True, index=False)
print(submission_df.shape)
# Keep the preview as the last expression of the cell; a notebook only renders the
# final expression, so the earlier mid-cell head(10) call was silently discarded.
submission_df.head(10)

(1459,)
(1459, 48)
(1459, 2)
