# Slowly building the preprocessor

In [None]:
import pandas as pd

In [None]:
# Read the iteration-6 data set from its Google Drive share link.
url = "https://drive.google.com/file/d/1TV-AIuArlnuOSLcsA66afqvJvJAnKVDs/view?usp=sharing"
# The file id is the second-to-last "/"-separated piece of the share URL; it is
# appended to the direct-download endpoint so that read_csv can fetch the CSV.
path = f"https://drive.google.com/uc?export=download&id={url.split('/')[-2]}"
data = pd.read_csv(path)

In [None]:
# Sanity check: (rows, columns) of the frame that was just loaded.
data.shape

(1460, 81)

In [None]:
# Preview the first rows to eyeball column names and values.
data.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


Make X and y

In [None]:
# Target: the "Expensive" column. Features: everything else except "Id".
y = data["Expensive"].copy()
X = data.drop(columns=["Id", "Expensive"])

Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.2,
)

Categorical and Numerical split

In [None]:
# Split the feature frame by dtype: numeric columns vs. everything else.
# Only the resulting column sets are used further down (to route columns to
# the matching preprocessing branch); copies keep X itself untouched.
X_num = X.select_dtypes(include=["number"]).copy()
X_cat = X.select_dtypes(exclude=["number"]).copy()

Pipeline 1: Numerical

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# Numerical branch: a single imputation step using SimpleImputer's default settings.
# make_pipeline auto-names the step "simpleimputer"; the parameter grids further
# down address it as "...num_pipe__simpleimputer__strategy" to try other strategies.
numerical_pipe = make_pipeline(SimpleImputer())

Pipeline 2: Categorical

Pipeline 2.1: Ordinal encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder

## explicitly determine categories for ordinal encoding
## (each list is ordered from lowest to highest code; "N_A" is the placeholder
##  that the categorical imputer writes in place of missing values)
ExterQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["N_A", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["N_A", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]

# Pair every ordinal column with its category order in ONE place, so the column
# list and the category list handed to the encoder can never drift out of step.
ordinal_categories = {
    "ExterQual": ExterQual_cats,
    "ExterCond": ExterCond_cats,
    "BsmtQual": BsmtQual_cats,
    "BsmtCond": BsmtCond_cats,
    "BsmtExposure": BsmtExposure_cats,
    "BsmtFinType1": BsmtFinType1_cats,
    "KitchenQual": KitchenQual_cats,
    "FireplaceQu": FireplaceQu_cats,
}

# Columns are addressed by position within X_cat rather than by name: inside
# categorical_pipe the encoder runs after a SimpleImputer, whose output
# presumably no longer carries column names.
columns_to_ordinal = X_cat.columns.get_indexer(list(ordinal_categories))

# get_indexer reports labels it cannot find as -1 instead of raising. A -1 passed
# on as a column position would not point at the intended column, so fail loudly.
missing_ordinal = [name
                   for name, position in zip(ordinal_categories, columns_to_ordinal)
                   if position == -1]
if missing_ordinal:
    raise KeyError(f"Ordinal columns missing from X_cat: {missing_ordinal}")

# Same order as columns_to_ordinal (dicts preserve insertion order).
cats_ord = list(ordinal_categories.values())

ordinal_encoder = OrdinalEncoder(categories=cats_ord)

Pipeline 2.2: One-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every categorical column that is NOT ordinal-encoded.
# columns_to_ordinal holds integer positions, so it is translated back to column
# names before taking the complement. Subtracting the positions directly from
# the set of column names removes nothing: the ordinal columns would then be
# one-hot encoded as well, and set iteration order can vary between sessions.
ordinal_names = set(X_cat.columns[columns_to_ordinal])
ohe_names = [name for name in X_cat.columns if name not in ordinal_names]
# Positions in original column order, so the encoded feature order is deterministic.
columns_to_ohe = X_cat.columns.get_indexer(ohe_names)

# handle_unknown="ignore": categories not seen during fit do not raise at
# transform time (see the scikit-learn OneHotEncoder documentation).
ohe_encoder = OneHotEncoder(handle_unknown="ignore")

Bringing ordinal pipe and ohe pipe together

In [None]:
from sklearn.compose import ColumnTransformer

# Route categorical columns (addressed by position) to their encoders: the
# positions in columns_to_ordinal go to the OrdinalEncoder, the positions in
# columns_to_ohe go to the OneHotEncoder.
categorical_encoder = ColumnTransformer([
    ("cat_ordinal", ordinal_encoder, columns_to_ordinal),
    ("cat_onehot", ohe_encoder, columns_to_ohe),
])

In [None]:
# Categorical branch: replace missing values with the literal "N_A" (the first
# entry of every ordinal category list), then apply the combined encoder.
categorical_pipe = make_pipeline(
    SimpleImputer(fill_value="N_A", strategy="constant"),
    categorical_encoder,
)

Bringing together the numerical and categorical pipeline

In [None]:
# Top-level preprocessor: numeric columns go through numerical_pipe and
# non-numeric columns through categorical_pipe, both selected by column name.
# The transformer names ("num_pipe", "cat_pipe") are what the grid-search
# parameter keys refer to.
final_preprocessor = ColumnTransformer([
    ("num_pipe", numerical_pipe, X_num.columns),
    ("cat_pipe", categorical_pipe, X_cat.columns),
])

# Our preprocessor in one cell

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# --- Load the data ----------------------------------------------------------
# The file id is the second-to-last "/"-separated piece of the share URL; it is
# appended to Google Drive's direct-download endpoint.
url = "https://drive.google.com/file/d/1TV-AIuArlnuOSLcsA66afqvJvJAnKVDs/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = pd.read_csv(path)

# --- Features / target ------------------------------------------------------
X = data.drop(columns=["Id"])
y = X.pop("Expensive")

# 80/20 split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Column groups by dtype; only their column sets are used below.
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# --- Numerical branch -------------------------------------------------------
numerical_pipe = make_pipeline(SimpleImputer())

# --- Categorical branch: ordinal columns ------------------------------------
# Positions (not names) within X_cat of the columns that get ordinal codes.
columns_to_ordinal = X_cat.columns.get_indexer(['ExterQual', 'ExterCond', 
                                                'BsmtQual', 'BsmtCond',
                                                'BsmtExposure', 'BsmtFinType1', 
                                                'KitchenQual', 'FireplaceQu'])

# Category order per ordinal column; "N_A" is the imputer's fill value for missing entries.
ExterQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["N_A", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["N_A", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]

# Must stay in the same order as the column list passed to get_indexer above.
cats_ord = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats, 
            BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats, FireplaceQu_cats]

ordinal_encoder = OrdinalEncoder(categories=cats_ord)

# --- Categorical branch: one-hot columns ------------------------------------
# Every categorical column that is NOT ordinal-encoded. columns_to_ordinal holds
# integer positions, so it is translated back to names before taking the
# complement. Subtracting positions from the set of column names removes
# nothing, which would one-hot encode the ordinal columns too and leave the
# column order dependent on set iteration order.
ordinal_names = set(X_cat.columns[columns_to_ordinal])
ohe_names = [name for name in X_cat.columns if name not in ordinal_names]
columns_to_ohe = X_cat.columns.get_indexer(ohe_names)

ohe_encoder = OneHotEncoder(handle_unknown="ignore")

categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", ordinal_encoder, columns_to_ordinal),
        ("cat_onehot", ohe_encoder, columns_to_ohe),
    ]
)

# Missing categorical values become "N_A" before encoding.
categorical_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="N_A"),
                                 categorical_encoder)

# --- Full preprocessor ------------------------------------------------------
final_preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numerical_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)

# Modelling with GridSearchCV or RandomizedSearchCV

## Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Full model: preprocessing followed by a decision tree.
# random_state is pinned because scikit-learn's tree permutes features at each
# split, so equally good splits can be resolved differently from run to run;
# without a seed the search result and the submission are not reproducible.
final_pipe_dt = make_pipeline(final_preprocessor,
                              DecisionTreeClassifier(random_state=123))

# Keys follow "<step>__<sub-step>__<param>" using the names make_pipeline /
# ColumnTransformer assign. 2 strategies x 6 depths x 7 leaf sizes = 84 candidates.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(2, 15, 2)
}

# Exhaustive search scored by accuracy with 5-fold cross-validation.
dt_search = GridSearchCV(final_pipe_dt,
                        param_grid,
                        cv=5,
                        scoring='accuracy',
                        verbose=1)

# Fit on the training split only; the test split stays untouched.
dt_search.fit(X_train, y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipe',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAd...
       'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object'))])),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'columntransformer

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Full model: the shared preprocessor followed by a k-nearest-neighbours classifier.
# NOTE(review): no scaling step sits between the preprocessor and the KNN -
# confirm that distances on the raw feature magnitudes are intended.
final_pipe_knn = make_pipeline(
    final_preprocessor,
    KNeighborsClassifier(),
)

# 2 imputation strategies x 48 neighbour counts x 2 weightings = 192 candidates.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "kneighborsclassifier__n_neighbors": range(2, 50),
    "kneighborsclassifier__weights": ["uniform", "distance"],
}

# Exhaustive search scored by accuracy with 5-fold cross-validation.
knn_search = GridSearchCV(
    estimator=final_pipe_knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

# Fit on the training split only.
knn_search.fit(X_train, y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipe',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAd...
       'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object'))])),
                                       ('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             param_grid={'columntransformer__nu

# Making a diagram if you're ever confused

In [None]:
# Switch scikit-learn's estimator display mode to "diagram", so that evaluating
# a pipeline as the last expression of a cell shows its structure as a diagram.
from sklearn import set_config
set_config(display="diagram")

In [None]:
# Display the numerical branch on its own.
numerical_pipe

In [None]:
# Display the ordinal / one-hot ColumnTransformer.
categorical_encoder

In [None]:
# Display the categorical branch: imputer followed by the combined encoder.
categorical_pipe

In [None]:
# Display the complete pipeline: preprocessor plus decision tree.
final_pipe_dt

# Making a submission CSV

In [None]:
# import competition data
url = "https://drive.google.com/file/d/15PfmTxmavQCT-f7iY9tgwWxm9t4GRees/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
competition_data = pd.read_csv(path)

In [None]:
# Competition features: drop "Id", exactly as was done for the training features.
my_test_X = competition_data.drop("Id", axis="columns")

In [None]:
# Submission frame: the competition "Id" column next to the predictions of the
# fitted decision-tree grid search.
my_submission = pd.DataFrame({
    "Id": competition_data["Id"],
    "Expensive": dt_search.predict(my_test_X),
})

# Write without the index so the file holds only the two submission columns.
my_submission.to_csv('my_submission_5.csv', index=False)

# Extras needed on colab
# from google.colab import files
# files.download("my_submission_5.csv")

In [None]:
# my_submission = pd.DataFrame(competition_data["Id"])
# my_submission["Expensive"] = knn_search.predict(my_test_X)

# my_submission.to_csv('my_submission_6.csv', index=False)

# # Extras needed on colab
# from google.colab import files
# files.download("my_submission_6.csv")