In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import Imputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.svm import  SVR
from sklearn.ensemble import IsolationForest

# Importing the dataset 

In [3]:
train_data_csv = pd.read_csv("train.csv")
test_data_csv = pd.read_csv("test.csv")

In [4]:
train_SalePrice = pd.DataFrame(train_data_csv["SalePrice"])

In [5]:
train_SalePrice.shape

(1460, 1)

In [6]:
train_SalePrice.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


In [7]:
train_data = train_data_csv.drop(["SalePrice"],axis=1)

## Concatenate the train and the test set 

In [8]:
full_features = pd.concat([train_data,test_data_csv],ignore_index= True)

In [9]:
full_features.to_csv("full.csv")

In [10]:
full_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 80 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2915 non-null object
LotFrontage      2433 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            198 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2918 non-

In [11]:
full_features = full_features.drop(["Id"],axis=1)

In [12]:
full_features.shape

(2919, 79)

# Data Preprocessing

###  1-Missing Values

In [13]:
full_features.isnull().sum().sort_values(ascending = False)

PoolQC           2909
MiscFeature      2814
Alley            2721
Fence            2348
FireplaceQu      1420
LotFrontage       486
GarageCond        159
GarageQual        159
GarageYrBlt       159
GarageFinish      159
GarageType        157
BsmtCond           82
BsmtExposure       82
BsmtQual           81
BsmtFinType2       80
BsmtFinType1       79
MasVnrType         24
MasVnrArea         23
MSZoning            4
BsmtHalfBath        2
Utilities           2
Functional          2
BsmtFullBath        2
BsmtFinSF2          1
BsmtFinSF1          1
Exterior2nd         1
BsmtUnfSF           1
TotalBsmtSF         1
Exterior1st         1
SaleType            1
                 ... 
YearRemodAdd        0
YearBuilt           0
SaleCondition       0
HeatingQC           0
ExterQual           0
ExterCond           0
YrSold              0
MoSold              0
MiscVal             0
PoolArea            0
ScreenPorch         0
3SsnPorch           0
EnclosedPorch       0
OpenPorchSF         0
WoodDeckSF

In [14]:
full_features.isnull().sum().sort_values(ascending = False)

PoolQC           2909
MiscFeature      2814
Alley            2721
Fence            2348
FireplaceQu      1420
LotFrontage       486
GarageCond        159
GarageQual        159
GarageYrBlt       159
GarageFinish      159
GarageType        157
BsmtCond           82
BsmtExposure       82
BsmtQual           81
BsmtFinType2       80
BsmtFinType1       79
MasVnrType         24
MasVnrArea         23
MSZoning            4
BsmtHalfBath        2
Utilities           2
Functional          2
BsmtFullBath        2
BsmtFinSF2          1
BsmtFinSF1          1
Exterior2nd         1
BsmtUnfSF           1
TotalBsmtSF         1
Exterior1st         1
SaleType            1
                 ... 
YearRemodAdd        0
YearBuilt           0
SaleCondition       0
HeatingQC           0
ExterQual           0
ExterCond           0
YrSold              0
MoSold              0
MiscVal             0
PoolArea            0
ScreenPorch         0
3SsnPorch           0
EnclosedPorch       0
OpenPorchSF         0
WoodDeckSF

In [15]:
full_features.loc[2576,'GarageArea'] = 0
full_features.loc[2576,'GarageCars'] = 0
full_features.loc[2120,'BsmtUnfSF'] = 0
full_features.loc[2120,'BsmtFinSF1'] = 0
full_features.loc[2120,'TotalBsmtSF'] = 0
full_features.loc[2120,'BsmtFinSF2'] = 0
full_features.loc[2120,'BsmtFullBath'] = 0
full_features.loc[2188,'BsmtFullBath'] = 0
full_features.loc[2120,'BsmtHalfBath'] = 0
full_features.loc[2188,'BsmtHalfBath'] = 0
full_features['Functional'] = full_features['Functional'].fillna('None')
full_features['Utilities'] = full_features['Utilities'].fillna('None')
full_features.loc[1555,'KitchenQual'] = 'None'
full_features.loc[2151,'Exterior1st'] = 'None'
full_features.loc[2151,'Exterior2nd'] = 'None'
full_features["Electrical"] = full_features["Electrical"].fillna('SBrkr')
full_features["MasVnrArea"] = full_features["MasVnrArea"].fillna(0)
full_features["MasVnrType"] = full_features["MasVnrType"].fillna('None')
full_features["GarageFinish"] = full_features["MasVnrType"].fillna(0)
full_features['SaleType']=full_features['SaleType'].fillna(full_features['SaleType'].mode()[0])


cols = ['BsmtQual','BsmtCond','FireplaceQu','GarageType','GarageQual','GarageCond',
        'PoolQC','MiscFeature','Fence','BsmtFinType1','Alley','BsmtFinType2','BsmtExposure']
for c in cols:
    full_features[c].fillna('None', inplace=True)

In [16]:
imp=Imputer(missing_values="NaN", strategy="median" )
imp.fit(full_features[["GarageYrBlt"]])
full_features["GarageYrBlt"]=imp.transform(full_features[["GarageYrBlt"]]).ravel()

imp=Imputer(missing_values="NaN", strategy="mean" )
imp.fit(full_features[["LotFrontage"]])
full_features["LotFrontage"]=imp.transform(full_features[["LotFrontage"]]).ravel()

mean = full_features['LotFrontage'].agg(['mean'])
full_features['LotFrontage'] = full_features['LotFrontage'].fillna(value=mean)

In [17]:
subclass_group = full_features.groupby('MSSubClass')
Zoning_modes = subclass_group['MSZoning'].apply(lambda x : x.mode()[0])
Zoning_modes

MSSubClass
20     RL
30     RM
40     RL
45     RM
50     RL
60     RL
70     RM
75     RM
80     RL
85     RL
90     RL
120    RL
150    RL
160    RM
180    RM
190    RL
Name: MSZoning, dtype: object

In [18]:
full_features['MSZoning'] = full_features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

In [19]:
full_features.isnull().sum().sort_values(ascending = False)

SaleCondition    0
Foundation       0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
BsmtQual         0
YearRemodAdd     0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
RoofStyle        0
YearBuilt        0
Heating          0
Utilities        0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
LotConfig        0
                ..
ScreenPorch      0
GarageFinish     0
PoolArea         0
PoolQC           0
Fence            0
MiscFeature      0
MiscVal          0
MoSold           0
YrSold           0
GarageCars       0
GarageYrBlt      0
CentralAir       0
FullBath         0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
HalfBath         0
GarageType  

In [20]:
full_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 79 columns):
MSSubClass       2919 non-null int64
MSZoning         2919 non-null object
LotFrontage      2919 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            2919 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2919 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2919 non-null object
Exterior2nd      2919 no

### 2-Categorical Data

In [21]:
full_features = pd.get_dummies(data=full_features,columns=['MSSubClass','Fence','Alley','MiscFeature','MSZoning','Street','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','OverallQual','OverallCond','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','PavedDrive','PoolQC','SaleType','SaleCondition'])

### 3-Split the train and test data

In [22]:
train_data = full_features.iloc[:1460,:]
test_data = full_features.iloc[1460:,:]

In [23]:
print(train_data.shape)
print(test_data.shape)

(1460, 339)
(1459, 339)


In [24]:
train_data['SalePrice'] = train_SalePrice

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [25]:
train_data.shape

(1460, 340)

In [26]:
y_train = pd.DataFrame(index = train_data.index, columns=["SalePrice"])
y_train["SalePrice"] = np.log(train_data["SalePrice"])
X_train = train_data.drop('SalePrice',axis= 1 )

In [27]:
X_train

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.000000,8450,2003,2003,196.0,706.0,0.0,150.0,856.0,856,...,0,0,0,1,0,0,0,0,1,0
1,80.000000,9600,1976,1976,0.0,978.0,0.0,284.0,1262.0,1262,...,0,0,0,1,0,0,0,0,1,0
2,68.000000,11250,2001,2002,162.0,486.0,0.0,434.0,920.0,920,...,0,0,0,1,0,0,0,0,1,0
3,60.000000,9550,1915,1970,0.0,216.0,0.0,540.0,756.0,961,...,0,0,0,1,1,0,0,0,0,0
4,84.000000,14260,2000,2000,350.0,655.0,0.0,490.0,1145.0,1145,...,0,0,0,1,0,0,0,0,1,0
5,85.000000,14115,1993,1995,0.0,732.0,0.0,64.0,796.0,796,...,0,0,0,1,0,0,0,0,1,0
6,75.000000,10084,2004,2005,186.0,1369.0,0.0,317.0,1686.0,1694,...,0,0,0,1,0,0,0,0,1,0
7,69.305795,10382,1973,1973,240.0,859.0,32.0,216.0,1107.0,1107,...,0,0,0,1,0,0,0,0,1,0
8,51.000000,6120,1931,1950,0.0,0.0,0.0,952.0,952.0,1022,...,0,0,0,1,1,0,0,0,0,0
9,50.000000,7420,1939,1950,0.0,851.0,0.0,140.0,991.0,1077,...,0,0,0,1,0,0,0,0,1,0


In [28]:
y_train

Unnamed: 0,SalePrice
0,12.247694
1,12.109011
2,12.317167
3,11.849398
4,12.429216
5,11.870600
6,12.634603
7,12.206073
8,11.774520
9,11.678440


In [29]:
print(test_data.shape)
test_data.head(n = 10)

(1459, 339)


Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1460,80.0,11622,1961,1961,0.0,468.0,144.0,270.0,882.0,896,...,0,0,0,1,0,0,0,0,1,0
1461,81.0,14267,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,...,0,0,0,1,0,0,0,0,1,0
1462,74.0,13830,1997,1998,0.0,791.0,0.0,137.0,928.0,928,...,0,0,0,1,0,0,0,0,1,0
1463,78.0,9978,1998,1998,20.0,602.0,0.0,324.0,926.0,926,...,0,0,0,1,0,0,0,0,1,0
1464,43.0,5005,1992,1992,0.0,263.0,0.0,1017.0,1280.0,1280,...,0,0,0,1,0,0,0,0,1,0
1465,75.0,10000,1993,1994,0.0,0.0,0.0,763.0,763.0,763,...,0,0,0,1,0,0,0,0,1,0
1466,69.305795,7980,1992,2007,0.0,935.0,0.0,233.0,1168.0,1187,...,0,0,0,1,0,0,0,0,1,0
1467,63.0,8402,1998,1998,0.0,0.0,0.0,789.0,789.0,789,...,0,0,0,1,0,0,0,0,1,0
1468,85.0,10176,1990,1990,0.0,637.0,0.0,663.0,1300.0,1341,...,0,0,0,1,0,0,0,0,1,0
1469,70.0,8400,1970,1970,0.0,804.0,78.0,0.0,882.0,882,...,0,0,0,1,0,0,0,0,1,0


In [30]:
test_data.reset_index(drop= True)

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,80.000000,11622,1961,1961,0.0,468.0,144.0,270.0,882.0,896,...,0,0,0,1,0,0,0,0,1,0
1,81.000000,14267,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,...,0,0,0,1,0,0,0,0,1,0
2,74.000000,13830,1997,1998,0.0,791.0,0.0,137.0,928.0,928,...,0,0,0,1,0,0,0,0,1,0
3,78.000000,9978,1998,1998,20.0,602.0,0.0,324.0,926.0,926,...,0,0,0,1,0,0,0,0,1,0
4,43.000000,5005,1992,1992,0.0,263.0,0.0,1017.0,1280.0,1280,...,0,0,0,1,0,0,0,0,1,0
5,75.000000,10000,1993,1994,0.0,0.0,0.0,763.0,763.0,763,...,0,0,0,1,0,0,0,0,1,0
6,69.305795,7980,1992,2007,0.0,935.0,0.0,233.0,1168.0,1187,...,0,0,0,1,0,0,0,0,1,0
7,63.000000,8402,1998,1998,0.0,0.0,0.0,789.0,789.0,789,...,0,0,0,1,0,0,0,0,1,0
8,85.000000,10176,1990,1990,0.0,637.0,0.0,663.0,1300.0,1341,...,0,0,0,1,0,0,0,0,1,0
9,70.000000,8400,1970,1970,0.0,804.0,78.0,0.0,882.0,882,...,0,0,0,1,0,0,0,0,1,0


## Model
* Tree Based Model 
* SVR
* Lasso Regression
* XGBoost Regressor

### Tree based model

In [31]:
regressor = DecisionTreeRegressor(random_state= 0)

In [32]:
regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [33]:
y_pred_tree =regressor.predict(test_data)

## SVR

In [34]:
svr = SVR(kernel= 'rbf')

In [35]:
svr.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [36]:
y_pred_svr = svr.predict(test_data)

### LASSO 

In [37]:
best_alpha = 0.00099


In [38]:
regr = Lasso(alpha=best_alpha, max_iter=50000)


In [39]:
regr.fit(X_train, y_train)

Lasso(alpha=0.00099, copy_X=True, fit_intercept=True, max_iter=50000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [40]:
y_pred_lasso =regressor.predict(test_data)

In [41]:
#y_pred = (y_pred_lasso + y_pred_svr) / 2

In [42]:
#y_pred = np.exp(y_pred)

### XGBoost 

In [43]:
from xgboost import  XGBRegressor

In [44]:
xgboost = XGBRegressor(learning_rate=0.05, n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006, random_state=42)

In [45]:
xgboost.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, importance_type='gain',
       learning_rate=0.05, max_delta_step=0, max_depth=3,
       min_child_weight=0, missing=None, n_estimators=3460, n_jobs=1,
       nthread=-1, objective='reg:linear', random_state=42,
       reg_alpha=6e-05, reg_lambda=1, scale_pos_weight=1, seed=27,
       silent=True, subsample=0.7)

In [46]:
predictions = xgboost.predict(test_data)
y_pred = (y_pred_lasso + predictions) / 2
y_pred = np.exp(y_pred)

## Submission File

In [47]:
pred_df = pd.DataFrame(y_pred, index=test_data_csv["Id"], columns=["SalePrice"])
pred_df.to_csv('output_xgb.csv', header=True, index_label='Id')