In [1]:
import numpy as np 
import pandas as pd 
import math

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Lasso

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error


import xgboost as xgb
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train.corr()["SalePrice"].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

In [5]:
all_data = pd.concat([train,test],axis=0,sort=False)

In [6]:
numericals = list(all_data.drop('SalePrice',axis=1).select_dtypes(include='number'))
categoricals = list(all_data.select_dtypes(exclude='number'))

In [7]:
all_data[categoricals] = all_data[categoricals].replace(np.nan,'None')

In [8]:
for feature in numericals:
    all_data[feature].fillna(all_data[feature].median(), inplace=True)

In [9]:
test_ID = all_data[1460:2919]["Id"]
all_data.drop("Id",axis=1,inplace=True)

In [10]:
numericals.remove('Id')

In [11]:
features = test.drop('Id',axis=1).columns
label_encoder = LabelEncoder()

for feature in features:
    if all_data[feature].dtype == 'O':
        all_data[feature] = label_encoder.fit_transform(all_data[feature])
        
    elif all_data[feature].dtype == 'bool':
        all_data[feature] = all_data[feature].astype('int')

In [12]:
std_scaler = StandardScaler()
all_data[numericals] = std_scaler.fit_transform(all_data[numericals])

In [13]:
train = all_data[0:1460]
test = all_data[1460:2919]

In [14]:
X=train.drop(["SalePrice"],axis=1)
y=train["SalePrice"]

In [15]:
X.shape

(1460, 79)

In [16]:
ranks = {}

points = X.shape[0]
for i in range(points):
    X_ = X.drop(X.index[i])
    y_ = y.drop(y.index[i])

    lasso = Lasso(alpha=0.01,selection='random').fit(X_, y_)
    
    
    features_ = np.where(lasso.coef_>0)[0]
    for feature in features_:
        ranks[feature] = ranks.get(feature,0) + 1
        
best = {}
best = {k:v for k,v in zip(ranks.keys(),ranks.values()) if v > (points/2)}
len(best)

42

In [17]:
X = X.iloc[:,list(best.keys())]
X.shape

(1460, 42)

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=1)

In [19]:
param_grid = {'max_depth': [6,10,50], 
           'learning_rate': [0.01, 0.02, 0.05], 
           'n_estimators': [100, 500, 1000], 
           'colsample_bytree': [0.7, 1.0]} 

XGB = xgb.XGBRegressor(seed=20)

gridcv = GridSearchCV(estimator=XGB, param_grid=param_grid, cv=2)
gridcv.fit(X_train,y_train)

GridSearchCV(cv=2,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=None, gpu_id=None, grow_policy=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_bin=None,
                                    max_cat..._step=None,
                                    max_depth=None, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estimators=100,
                       

In [20]:
predictions = gridcv.predict(X_test)

RMSE = math.sqrt(mean_squared_error(y_test,predictions))
RMSE

22128.354235994007

In [21]:
test = test.iloc[:,list(best.keys())]

In [22]:
submission_df = pd.DataFrame(columns=["Id","SalePrice"])
submission_df["Id"] = test_ID
submission_df["SalePrice"] = gridcv.predict(test)
submission_df["Id"] = submission_df["Id"].astype(int)

In [23]:
submission_df.to_csv('submission.csv',index = False)
