In [1]:
import pandas as pd
import numpy as np
import eda
import preprocess

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

pd.pandas.set_option('display.max_columns',None)

In [2]:
# Reading in the trainset.
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [3]:
train,test = train_test_split(data, test_size=0.15)

In [4]:
# A pipe line created in feature engineering stage. (Look at 2)Preprocessing.ipynb
preprocess_pipe = Pipeline(steps=[('cat_impute', FunctionTransformer(preprocess.impute_cat_features)),
                                  ('numeric_impute',FunctionTransformer(preprocess.impute_numeric_data)),
                                  ('age_feature',FunctionTransformer(preprocess.create_age_feature)),
                                  ('log_transform',FunctionTransformer(preprocess.log_normalize)),
                                  ('rare_class', FunctionTransformer(preprocess.rare_categories)),
                                  ('cat_to_numeric', FunctionTransformer(preprocess.cat_to_numeric)),
                                  ('drop_feature',FunctionTransformer(preprocess.drop_feature) )
                                 ])

In [5]:
preprocess = preprocess_pipe.fit(train)
train_set = preprocess.transform(train)
test_set = preprocess.transform(test)

In [6]:
X_train, y_train = train_set.drop('SalePrice',axis=1), train_set[['SalePrice']]
X_test, y_test = test_set.drop('SalePrice',axis=1), test_set[['SalePrice']]

print(f"Xtrain shape: {X_train.shape}, ytrain shape:{y_train.shape}")
print(f"Xtest shape: {X_test.shape}, yest shape:{y_test.shape}")

Xtrain shape: (1241, 82), ytrain shape:(1241, 1)
Xtest shape: (219, 82), yest shape:(219, 1)


# Feature selection

In [10]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error

In [11]:
def lasso_FeatureSelector(Xtrain,Xtest,ytrain):
    """Selects the most relevant features out of the features based on 
    Lasso regression.
    """
    # A pipeline to specify the steps of applying the model.
    select_feature_pipe = Pipeline([('scaler', MinMaxScaler()),
                                    ('lasso', Lasso(random_state=0))
                                   ])
    
    # A grid of values for alpha to search through thme for the best one.
    params = {"lasso__alpha": [0.001,.005,0.02,.05,0.5,1]}
    grid_search = GridSearchCV(estimator=select_feature_pipe,
                               param_grid=params
                              )

    grid_search.fit(Xtrain, ytrain)
    featureSelector = grid_search.best_estimator_
    featureSelector=featureSelector.fit(Xtrain,ytrain)
    selected_features=featureSelector.steps[1][1].coef_.astype('bool')
    features = Xtrain.loc[:,selected_features].columns
    return Xtrain[features],Xtest[features]

def randomForest_FeatureSelector(Xtrain, Xtest,ytrain, threshold=0.003):
    
    select_feature_pipe = Pipeline([('scaler', MinMaxScaler()),
                                    ('rf', RandomForestRegressor(random_state=0))
                                       ])

    # A grid of values for alpha to search through thme for the best one.
    params = {"rf__n_estimators": [100,200,300,500]}
    grid_search = GridSearchCV(estimator=select_feature_pipe,
                               param_grid=params
                              )

    grid_search.fit(Xtrain, ytrain)
    featureSelector = grid_search.best_estimator_
    featureSelector=featureSelector.fit(Xtrain,ytrain)
    fi=featureSelector.steps[1][1].feature_importances_
    df=pd.DataFrame({'FI':fi}, index=Xtrain.columns)
    mask = df['FI'] >= threshold
    features = df[mask].index
    return Xtrain[features],Xtest[features]


In [12]:
X_train_lasso,X_test_lasso = lasso_FeatureSelector(X_train,X_test,y_train)
X_train_rf,X_test_rf = randomForest_FeatureSelector(X_train,X_test,y_train)

# Prediction

### Random Forest

In [13]:
# Using lasso feature selection
pipeline = Pipeline([('scaler', MinMaxScaler()),
                     ('regressor', RandomForestRegressor(random_state=0))
                    ])

params = {'regressor__n_estimators':(200,500),
          
         }

grid_search = GridSearchCV(pipeline, params, verbose=0)

grid_search.fit(X_train_lasso, y_train)

#Best score and parameters
print("Best Parameters:\n")
for k,v in grid_search.best_params_.items():
    print(f"\t{k} : {v}")
print()
print(f"Best Score: {grid_search.best_score_}")


#Best estimator
print("Performance on the test data")
bestEstimator = grid_search.best_estimator_

bestEstimator.fit(X_train_lasso, y_train)
pred = bestEstimator.predict(X_test_lasso)

print(f"R2 score: {r2_score(y_test,pred)}")
print(f"Mean squared error: {mean_squared_error(y_test,pred)}")
print(f"Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred))}")

Best Parameters:

	regressor__n_estimators : 500

Best Score: 0.8645870117657747
Performance on the test data
R2 score: 0.9039519805908277
Mean squared error: 0.015990387080673157
Root mean squared error: 0.1264531022975441


In [14]:
# Usin random forest feature selection
pipeline = Pipeline([('scaler', MinMaxScaler()),
                     ('regressor', RandomForestRegressor(random_state=0))
                    ])

params = {'regressor__n_estimators':(200, 500),
          
         }

grid_search = GridSearchCV(pipeline, params, verbose=0)

grid_search.fit(X_train_rf, y_train)

#Best score and parameters
print("Best Parameters:\n")
for k,v in grid_search.best_params_.items():
    print(f"\t{k} : {v}")
print()
print(f"Best Score: {grid_search.best_score_}")


#Best estimator
print("Performance on the test data")
bestEstimator = grid_search.best_estimator_

bestEstimator.fit(X_train_rf, y_train)
pred = bestEstimator.predict(X_test_rf)

print(f"R2 score: {r2_score(y_test,pred)}")
print(f"Mean squared error: {mean_squared_error(y_test,pred)}")
print(f"Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred))}")

Best Parameters:

	regressor__n_estimators : 500

Best Score: 0.8614109972974479
Performance on the test data
R2 score: 0.9052619440636296
Mean squared error: 0.015772300095428723
Root mean squared error: 0.12558781826048546


# Kaggle test set

In [15]:
final_test = pd.read_csv('test.csv')

In [19]:
def kaggle_submission(data=final_test, model=bestEstimator,
                       Xtrain=X_train_rf, preprocess=preprocess):
    # Passing the test data through the preprocessing pipeline
    df = preprocess.transform(data)
    # Selecting the features based on Xtrain features
    df = df[Xtrain.columns]
    # Make prediction
    pred = bestEstimator.predict(df)
    # Prepare the final dataframe for submission
    df = pd.DataFrame({'Id':data.Id, 'SalePrice':np.exp(pred)})
    # Save the submission
    df.to_csv('submission.csv',index=False)
    return df
    

In [20]:
kaggle_submission()

Unnamed: 0,Id,SalePrice
0,1461,114214.068431
1,1462,152545.549193
2,1463,176651.221730
3,1464,182475.682296
4,1465,198196.980397
...,...,...
1454,2915,86643.104534
1455,2916,85754.589384
1456,2917,163568.625774
1457,2918,109405.192564
