In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore  warning (from sklearn)

#print(os.listdir("../input"))

Let's Load the Data

In [2]:
house_data = pd.read_csv('../input/train.csv')
house_data_test = pd.read_csv('../input/test.csv')

**Data Cleaning**

Will store the Id column(information) from test dataframe ,in test_parent dataframe.

Then let's drop the Id column from both test and train data. 

In [3]:
train_parent=house_data
test_parent=house_data_test 
house_data = house_data.drop('Id', axis=1)
house_data_test = house_data_test.drop('Id', axis=1)

Now we will analyze the NaN values present in the dataset and deal with them .

In [4]:
#We will find all the columns which have more than 40 % NaN data and drop then
threshold=0.4 * len(house_data)
df=pd.DataFrame(len(house_data) - house_data.count(),columns=['count'])
df.index[df['count'] > threshold]

Index(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')

In [5]:
house_data = house_data.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
house_data_test = house_data_test.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)

Will verify that the Label is a numeric data

In [6]:
house_data['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

Find all the numeric columns and replace the NaN values with 0 ,
and for categorical columns ,replace NaN values with 'None'.

In [7]:
house_data.select_dtypes(include=np.number).columns #will give all numeric columns ,we will remove the SalePrice column 
for col in ('MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'):
    
    house_data[col] = house_data[col].fillna(0)
    house_data_test[col] = house_data_test[col].fillna('0')

In [8]:
house_data.select_dtypes(exclude=np.number).columns
for col in ('MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'):
    
    house_data[col] = house_data[col].fillna('None')
    house_data_test[col] = house_data_test[col].fillna('None')

Verify that there are no null values in the data set

In [9]:
house_data[house_data.isnull().any(axis=1)]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


In [10]:
house_data_test[house_data_test.isnull().any(axis=1)]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition


 **Combining the two datasets and then doing One Hot Encoding on the combined dataset.**

In [11]:
train=house_data
test=house_data_test

#Assigning a flag to training and testing dataset for segregation after OHE .
train['train']=1 
test['train']=0

#Combining training and testing dataset

combined=pd.concat([train,test])

In [12]:
#Applying One Hot Encoding to categorical data
ohe_data_frame=pd.get_dummies(combined, 
                           columns=['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      )

In [13]:
#Splitting the combined dataset after doing OHE .
train_df=ohe_data_frame[ohe_data_frame['train']==1]
test_df=ohe_data_frame[ohe_data_frame['train']==0]
train_df.drop(['train'],axis=1,inplace=True)             #Drop the Flag(train) coloumn from training dataset
test_df.drop(['train','SalePrice'],axis=1,inplace=True)     #Drop the Flag(train),Label(SalePrice) coloumn from test dataset

In [14]:
house_data=train_df
house_data_test=test_df

**Data Cleaning is now complete We can now use our data to build our models**

In [15]:
X_train = house_data.drop('SalePrice', axis=1)
# Taking the labels (price)
Y_train = house_data['SalePrice']
X_test = house_data_test

Let's apply Gradient Boosting for regression and find the best parameter for GBR using GridSearchCV    

In [16]:
"""  
from sklearn.model_selection import GridSearchCV

num_estimators = [100, 200, 500]
learn_rates = [0.01, 0.02, 0.05, 0.1]
max_depths = [4, 6, 8]

param_grid = {'n_estimators': num_estimators,
              'learning_rate': learn_rates,
              'max_depth': max_depths}

grid_search = GridSearchCV(GradientBoostingRegressor(min_samples_split=2, loss='ls'),
                           param_grid, cv=3, return_train_score=True)
grid_search.fit(X_train, Y_train)

grid_search.best_params_  
"""

"  \nfrom sklearn.model_selection import GridSearchCV\n\nnum_estimators = [100, 200, 500]\nlearn_rates = [0.01, 0.02, 0.05, 0.1]\nmax_depths = [4, 6, 8]\n\nparam_grid = {'n_estimators': num_estimators,\n              'learning_rate': learn_rates,\n              'max_depth': max_depths}\n\ngrid_search = GridSearchCV(GradientBoostingRegressor(min_samples_split=2, loss='ls'),\n                           param_grid, cv=3, return_train_score=True)\ngrid_search.fit(X_train, Y_train)\n\ngrid_search.best_params_  \n"

In [17]:
#GardientBoosting
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.1, 'loss': 'ls'}
gbr_model = GradientBoostingRegressor(**params)
gbr_model.fit(X_train, Y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [18]:
gbr_model.score(X_train, Y_train)

0.997857379737625

In [19]:
#Predicting the SalePrice for the test data
y_grad_predict = gbr_model.predict(X_test)
print(y_grad_predict)

[126094.58131761 166852.84527976 186235.76392715 ... 158488.63572855
 114096.7144326  222305.81656333]


In [20]:
#Submission 
my_submission = pd.DataFrame({'Id': test_parent.Id, 'SalePrice': y_grad_predict})
print(my_submission)

my_submission.to_csv('submission.csv', encoding='utf-8', index=False)

        Id      SalePrice
0     1461  126094.581318
1     1462  166852.845280
2     1463  186235.763927
3     1464  194985.629689
4     1465  187131.078478
5     1466  175615.964225
6     1467  162572.584677
7     1468  166966.515617
8     1469  189027.850122
9     1470  127764.454796
10    1471  195926.187678
11    1472   92454.967681
12    1473   96009.334597
13    1474  153991.708051
14    1475  132908.261123
15    1476  383883.963959
16    1477  256019.802808
17    1478  302340.504341
18    1479  267992.743345
19    1480  460225.606419
20    1481  327837.728008
21    1482  208823.790673
22    1483  166843.028693
23    1484  162177.057440
24    1485  176622.765288
25    1486  192391.753091
26    1487  369029.353883
27    1488  235527.873627
28    1489  213695.518167
29    1490  221906.760850
...    ...            ...
1429  2890   85240.298746
1430  2891  131798.329840
1431  2892   37604.777241
1432  2893   81657.075834
1433  2894   45442.553932
1434  2895  333246.538864
1435  2896  

If you found this notebook helpful or you just liked it , some upvotes would be very much appreciated - That will keep me motivated .

Please drop down suggestions and comments if any, so that i can learn to build better solutions.

**Thank You** :-)