# House Price - Pipeline, GridSearch, CrossValidation

## 1. Import and split

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn import set_config


Other classification models that could be tried are listed (commented out) in the next cell:

In [None]:
#other models:
#from sklearn.model_selection import train_test_split
#from sklearn.datasets import make_moons, make_circles, make_classification

#from sklearn.neural_network import MLPClassifier
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.naive_bayes import GaussianNB
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### 1.1. Training data

In [None]:
# Load the labelled training data from disk
data = pd.read_csv("housing-classification-iter6_Liane.csv")

# Features and target come from one frame: `X` is the same object as `data`,
# and popping "Expensive" removes the target column from it in place
X = data
y = X.pop("Expensive")

In [None]:
#check missing values
#X.isna().sum()

In [None]:
# Remove the "Id" column from the features.
# Re-binding `X` (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run; the in-place version raised a KeyError on the second run.
X = X.drop(columns="Id", errors="ignore")

# Check data types and non-null counts (which columns are not numeric and
# which ones contain missing values)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   MSZoning       1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Heating        1460 non-null   object 
 12  Street         1460 non-null   object 
 13  CentralAir     1460 non-null   object 
 14  Foundation     1460 non-null   object 
 15  ExterQual      1460 non-null   object 
 16  ExterCond      1460 non-null   object 
 17  BsmtQual       1423 non-null   object 
 18  BsmtCond

**Conclusion:** 

**We have some missing values:**
* 1   LotFrontage    1201 non-null   float64
* 28  MasVnrArea     1452 non-null   float64
* 42  GarageYrBlt    1379 non-null   float64
* 17  BsmtQual       1423 non-null   object 
* 18  BsmtCond       1423 non-null   object 
* 19  BsmtExposure   1422 non-null   object 
* 20  BsmtFinType1   1423 non-null   object 
* 22  FireplaceQu    770 non-null    object 
* 51  Alley          91 non-null     object 
* 65  MasVnrType     1452 non-null   object 
* 66  BsmtFinType2   1422 non-null   object 
* 68  Electrical     1459 non-null   object 
* 70  GarageType     1379 non-null   object 
* 71  GarageFinish   1379 non-null   object 
* 72  GarageQual     1379 non-null   object 
* 73  GarageCond     1379 non-null   object 
* 75  PoolQC         7 non-null      object 
* 76  Fence          281 non-null    object 
* 77  MiscFeature    54 non-null     object 



**columns to be ordinal encoded:** 

* 15  ExterQual      1460 non-null   object ordinal
* 16  ExterCond      1460 non-null   object ordinal
* 17  BsmtQual       1423 non-null   object ordinal
* 18  BsmtCond       1423 non-null   object ordinal
* 19  BsmtExposure   1422 non-null   object ordinal
* 20  BsmtFinType1   1423 non-null   object ordinal
* 21  KitchenQual    1460 non-null   object ordinal
* 22  FireplaceQu    770 non-null    object ordinal
* 52  LotShape       1460 non-null   object ordinal
* 66  BsmtFinType2   1422 non-null   object ordinal
* 67  HeatingQC      1460 non-null   object ordinal
* 71  GarageFinish   1379 non-null   object ordinal
* 72  GarageQual     1379 non-null   object ordinal
* 73  GarageCond     1379 non-null   object ordinal
* 75  PoolQC         7 non-null      object ordinal
* 76  Fence          281 non-null    object ordinal


In [None]:
# Hold out 20% of the rows as a test split; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31416,
)

### 1.2. Competition Data

In [None]:
# Read the competition file once and keep that frame untouched;
# `competition_data` is an independent working copy that later cells modify
# (previously the same CSV was parsed from disk twice).
competition_data_original = pd.read_csv("test-housing-classification_Liane.csv")
competition_data = competition_data_original.copy()

In [None]:
# `X_test_comp` is the same frame object as `competition_data`;
# popping "Id" removes it from the features and keeps it for the submission file
X_test_comp = competition_data
competition_id = X_test_comp.pop("Id")

In [None]:
# Show dtypes and non-null counts of the competition features
X_test_comp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1459 non-null   int64  
 1   LotFrontage    1232 non-null   float64
 2   TotalBsmtSF    1458 non-null   float64
 3   BedroomAbvGr   1459 non-null   int64  
 4   Fireplaces     1459 non-null   int64  
 5   PoolArea       1459 non-null   int64  
 6   GarageCars     1458 non-null   float64
 7   WoodDeckSF     1459 non-null   int64  
 8   ScreenPorch    1459 non-null   int64  
 9   MSZoning       1455 non-null   object 
 10  Condition1     1459 non-null   object 
 11  Heating        1459 non-null   object 
 12  Street         1459 non-null   object 
 13  CentralAir     1459 non-null   object 
 14  Foundation     1459 non-null   object 
 15  ExterQual      1459 non-null   object 
 16  ExterCond      1459 non-null   object 
 17  BsmtQual       1415 non-null   object 
 18  BsmtCond

## 2. Building `preprocessor` pipeline

In [None]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
772,7819,94.0,1029,3,1,0,2,144,0,RL,...,Detchd,Unf,TA,TA,Y,,MnPrv,,WD,Abnorml
157,12003,92.0,774,4,1,0,3,0,0,RL,...,BuiltIn,Fin,TA,TA,Y,,,,New,Partial
360,7540,,888,2,1,0,2,0,192,RL,...,Attchd,RFn,TA,TA,Y,,MnPrv,,WD,Normal
744,5395,41.0,1337,2,1,0,2,96,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
150,10356,120.0,969,3,0,0,2,0,0,RL,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal


In [None]:
# 2.a) Select categorical and numerical columns
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# 2.b) Numerical pipeline: only a SimpleImputer. With strategy="constant" and no
#      explicit fill_value, missing numeric values are replaced by 0.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="constant")
)

# 2.c) Categorical Pipeline

# # # i. Defining the categorical encoder
ordinal_col_names = ['ExterQual',
                     'ExterCond',
                     'BsmtQual',
                     'BsmtCond',
                     'BsmtExposure',
                     'BsmtFinType1',
                     'KitchenQual',
                     'FireplaceQu',
                     'LotShape',
                     'BsmtFinType2',
                     'HeatingQC',
                     'GarageFinish',
                     'GarageQual',
                     'GarageCond',
                     'PoolQC',
                     'Fence']

# Positional indices are needed (not names) because the imputer in `categoric_pipe`
# hands a plain array to the encoder, so columns can only be addressed by position.
ordinal_cols = X_cat.columns.get_indexer(ordinal_col_names)

# Every remaining categorical column is one-hot encoded. The list keeps the original
# column order: building it from a set made the order depend on string hashing,
# which differs between kernel sessions, so the encoded feature order (and with it
# the fitted models) was not reproducible.
onehot_col_names = [col for col in X_cat.columns if col not in ordinal_col_names]
onehot_cols = X_cat.columns.get_indexer(onehot_col_names)

# # # ii. Defining values for Ordinal Encoding
# Each list is ordered from lowest to highest; "NA" (the value filled in by the
# imputer below) comes first and is therefore encoded as 0.
ExterQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["NA", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats = ["NA", 'Reg', 'IR1', 'IR2', 'IR3']
BsmtFinType2_cats = ['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ']
HeatingQC_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageFinish_cats = ['NA','Unf','RFn','Fin']
GarageQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
Fence_cats = ['NA','MnWw','GdWo','MnPrv','GdPrv']

# Must be in the same order as `ordinal_col_names`
ordinal_cats_list = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats,
                    BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats, FireplaceQu_cats,
                    LotShape_cats, BsmtFinType2_cats, HeatingQC_cats, GarageFinish_cats, GarageQual_cats,
                    GarageCond_cats, PoolQC_cats, Fence_cats]


# # # iii. Defining the ColumnTransformer with 2 branches: ordinal & onehot (categorical encoder)
categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=ordinal_cats_list), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols),
    ]
)

# # # iv. Categorical pipeline = "NA" imputer + categorical encoder
categoric_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="NA"),
                                 categorical_encoder
                                )

# 2.d) Preprocessor: route numeric and categorical columns to their own pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

**EXPLANATION FOR CATEGORICAL PIPELINE: --2.c)--**

**Start from the bottom! (# # # iv.):**
* `categoric_pipe` = make_pipeline(SimpleImputer(), `categorical_encoder`)



**needs `categorical_encoder` (# # # iii.):**
* `categorical_encoder` = ColumnTransformer(transformers=[
    * ("cat_ordinal", OrdinalEncoder(categories=`ordinal_cats_list`), `ordinal_cols`), 
    * ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), `onehot_cols`)])


**needs `ordinal_cats_list` (# # # ii.) and needs `ordinal_cols` and `onehot_cols` (# # # i.):**

(\# # # i.) `ordinal_cols` and `onehot_cols`:
* ordinal_col_names = ['ExterQual', '...']
* `ordinal_cols` = X_cat.columns.get_indexer(ordinal_col_names)
* `onehot_cols` = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_col_names)))


(\# # # ii.) `ordinal_cats_list`: define the ordered values per column and then collect them in a list:
* ExterQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
* ExterCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
* etc.
* `ordinal_cats_list` = [ExterQual_cats, ExterCond_cats, ...]

## 3. Create `full_pipe` with Decision Tree

In [None]:
# Full pipeline: preprocessing -> scaling -> decision tree.
# with_mean=False because the one-hot branch can produce a sparse matrix,
# which StandardScaler cannot centre.
full_pipe = make_pipeline(preprocessor,
                          StandardScaler(with_mean=False),
                          DecisionTreeClassifier(random_state=123))

# Search space for the tree hyperparameters (commented entries are optional
# extra dimensions that were switched off)
param_grid = {
#    "columntransformer__num_pipe__simpleimputer__strategy":["constant", "median"],
#    "standardscaler__with_mean":[True, False],
#    "standardscaler__with_std":[True, False],
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(2, 12),
#    "decisiontreeclassifier__min_samples_split": range(3, 40, 2),
    "decisiontreeclassifier__criterion":["gini", "entropy"]
}

# Randomized search with 10-fold cross validation.
# random_state fixes which 100 candidates are sampled from the grid; without it
# every run evaluated a different subset and the best score could not be reproduced.
search = RandomizedSearchCV(full_pipe,
                      param_grid,
                      cv=10,
                      verbose=1,
                      scoring="accuracy",
                      n_jobs=-2,
                      n_iter=100,
                      random_state=123)

# fit
search.fit(X_train, y_train)

# Dictionaries that collect the cross-validated score and the best
# hyperparameters of every model tried in this notebook
scores = {"dtree" : search.best_score_}
best_params = {"dtree" : search.best_params_}


Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [None]:
# Accuracy of the searched tree on the ENTIRE training split,
# stored in a per-model dictionary
y_train_pred = search.predict(X_train)
acc_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
scores_train_data = dict(dtree=acc_train)

In [None]:
# Accuracy of the searched tree on the held-out test split,
# stored in a per-model dictionary
y_test_pred = search.predict(X_test)
acc_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)
scores_test_data = dict(dtree=acc_test)

In [None]:
# Render estimators as diagrams and inspect the structure of full_pipe.
# set_config is already imported in the imports cell at the top of the notebook,
# so the duplicate import that used to live here was removed.
set_config(display="diagram")
full_pipe

In [None]:
# Cross-validated best score per model collected so far
scores

{'dtree': 0.9315207780725023}

In [None]:
# Hyperparameter combination that achieved the best cross-validated score
search.best_params_

{'decisiontreeclassifier__min_samples_leaf': 8,
 'decisiontreeclassifier__max_depth': 3,
 'decisiontreeclassifier__criterion': 'gini'}

### 3.2. Decision Tree for competition

In [None]:
# Predict the competition set with the searched decision-tree pipeline.
# NOTE: if this raises an error that names "column 6", that index refers to one of the
# encoded columns inside the pipeline, not to the raw column at position 6.
y_test_comp = search.predict(X_test_comp)

In [None]:
# Pair every competition Id with its predicted label ...
pred = dict(Id=competition_id, Expensive=y_test_comp)

# ... and turn the pairs into the submission table
prediction_tree = pd.DataFrame(data=pred)

In [None]:
# Display the decision-tree submission table
prediction_tree

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [None]:
# Write the decision-tree submission without the DataFrame index column
prediction_tree.to_csv(path_or_buf="prediction_tree_Liane.csv", index=False)

## 4. Create `knn_full_pipe` for KNN

In [None]:
# Full pipeline: preprocessing -> scaling -> k-nearest-neighbours
knn_full_pipe = make_pipeline(
    preprocessor,
    StandardScaler(with_mean=False),
    KNeighborsClassifier(),
)

# Hyperparameter grid (commented entries are optional dimensions that were switched off)
param_grid = {
    # "columntransformer__num_pipe__simpleimputer__strategy": ["constant", "median"],
    # "standardscaler__with_mean": [True, False],
    # "standardscaler__with_std": [True, False],
    "kneighborsclassifier__n_neighbors": range(2, 50),
    "kneighborsclassifier__weights": ["uniform", "distance"],
}

# Exhaustive grid search with 10-fold cross validation
knn_search = GridSearchCV(
    knn_full_pipe,
    param_grid,
    cv=10,
    verbose=1,
    scoring="accuracy",
    n_jobs=-2,
)
knn_search.fit(X_train, y_train)

# Record the cross-validated score and the winning hyperparameters
scores["knn"] = knn_search.best_score_
best_params["knn"] = knn_search.best_params_

# Accuracy on the ENTIRE training split
y_train_pred = knn_search.predict(X_train)
acc_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
scores_train_data["knn"] = acc_train

# Accuracy on the held-out test split
y_test_pred = knn_search.predict(X_test)
acc_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)
scores_test_data["knn"] = acc_test

Fitting 10 folds for each of 96 candidates, totalling 960 fits


In [None]:
# Cross-validated best scores of the models so far, as a one-row table
pd.DataFrame(scores, index=["best_score_"])

Unnamed: 0,dtree,knn
best_score_,0.931521,0.92637


### 4.2. knn for competition

In [None]:
# Predict the competition set with the searched KNN pipeline
y_test_comp_knn = knn_search.predict(X_test_comp) 

In [None]:
# Pair every competition Id with its KNN prediction ...
pred_knn = dict(Id=competition_id, Expensive=y_test_comp_knn)

# ... and turn the pairs into the submission table
prediction_knn = pd.DataFrame(data=pred_knn)

In [None]:
# Write the KNN submission without the DataFrame index column
prediction_knn.to_csv(path_or_buf="prediction_knn_Liane.csv", index=False)

## 5. Create `rforest_full_pipe` for RandomForest

I couldn't recreate the model with the best score (most likely because neither the search nor the forest used a fixed random seed), but these were its parameters:
* 'rforest': {'randomforestclassifier__n_estimators': 200,
* 'randomforestclassifier__min_samples_leaf': 5,
* 'randomforestclassifier__max_depth': 11,
* 'randomforestclassifier__criterion': 'gini'}}

In [None]:
# Full pipeline: preprocessing -> scaling -> random forest.
# random_state on the forest fixes its bootstrap samples and feature subsets;
# without it the same hyperparameters produced a different model on every fit.
rforest_full_pipe = make_pipeline(preprocessor,
                          StandardScaler(with_mean=False),
                          RandomForestClassifier(random_state=123)
                             )

# Search space for the forest hyperparameters (commented entries are optional
# extra dimensions that were switched off)
param_grid = {
#    "columntransformer__num_pipe__simpleimputer__strategy":["constant", "median"],
#    "standardscaler__with_mean":[True, False],
#    "standardscaler__with_std":[True, False],
#    "randomforestclassifier__warm_start": [True, False],
    "randomforestclassifier__n_estimators": [100, 200],
    "randomforestclassifier__max_depth": range(2, 14),
    "randomforestclassifier__min_samples_leaf": range(2, 10),
    "randomforestclassifier__criterion":["gini", "entropy"]
}

# Randomized search with 10-fold cross validation.
# random_state fixes which 50 candidates are sampled from the grid, so the
# best score and best parameters can be recreated on a later run.
rforest_search = RandomizedSearchCV(rforest_full_pipe,
                      param_grid,
                      cv=10,
                      verbose=1,
                      scoring="accuracy",
                      n_jobs=-2,
                      n_iter=50,
                      random_state=123
                    )

# fit
rforest_search.fit(X_train, y_train)

# Record the cross-validated score and the winning hyperparameters
scores["rforest"] = rforest_search.best_score_
best_params["rforest"] = rforest_search.best_params_


# Accuracy on the ENTIRE training split
y_train_pred = rforest_search.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred)
scores_train_data["rforest"] = acc_train

# Accuracy on the held-out test split
y_test_pred = rforest_search.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)
scores_test_data["rforest"] = acc_test

Fitting 10 folds for each of 50 candidates, totalling 500 fits


In [None]:
# Cross-validated best scores of all three models, as a one-row table
pd.DataFrame(scores, index=["best_score_"])


Unnamed: 0,dtree,knn,rforest
best_score_,0.931521,0.92637,0.950339


In [None]:
# Best hyperparameters found by each model's search
best_params

{'dtree': {'decisiontreeclassifier__min_samples_leaf': 8,
  'decisiontreeclassifier__max_depth': 3,
  'decisiontreeclassifier__criterion': 'gini'},
 'knn': {'kneighborsclassifier__n_neighbors': 7,
  'kneighborsclassifier__weights': 'distance'},
 'rforest': {'randomforestclassifier__n_estimators': 200,
  'randomforestclassifier__min_samples_leaf': 5,
  'randomforestclassifier__max_depth': 11,
  'randomforestclassifier__criterion': 'gini'}}

In [None]:
# Accuracy of each model on the held-out test split, as a one-row table
pd.DataFrame(scores_test_data, index=["acc_test"])

Unnamed: 0,dtree,knn,rforest
acc_test,0.931507,0.934932,0.945205


### 5.2. RandomForest for competition

In [None]:
# Predict the competition set with the searched random-forest pipeline
y_test_comp_rforest = rforest_search.predict(X_test_comp)

In [None]:
# Pair every competition Id with its random-forest prediction ...
pred_rforest = dict(Id=competition_id, Expensive=y_test_comp_rforest)

# ... and turn the pairs into the submission table
prediction_rforest = pd.DataFrame(data=pred_rforest)

In [None]:
# Write the random-forest submission without the DataFrame index column
prediction_rforest.to_csv(path_or_buf="prediction_rforest_Liane_x.csv", index=False)