# ***IMPORTING LIBRARIES AND MODULES***

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn import set_config

In [None]:
# Show every column when a DataFrame is displayed (None lifts pandas' column limit)
pd.set_option('display.max_columns', None)


# ***GET DATA***

In [None]:
# Turn the Google Drive sharing link into a direct-download link and read the CSV.
url = "https://drive.google.com/file/d/1NFHZhCOxgW1bu5q32OqVIVBDtSo2Alkh/view?usp=sharing"
file_id = url.split('/')[-2]  # second-to-last path segment of the sharing link
path = 'https://drive.google.com/uc?export=download&id=' + file_id
df = pd.read_csv(path)
data = df  # both names refer to the same DataFrame object

# ***SPLIT DATA***

In [None]:
# X and y creation: drop the row identifier, then split off the target column.
X = data.drop(columns=["Id"])  # new frame, so `data` itself is left untouched
y = X.pop("Expensive")

# Data splitting: hold out 20% of the rows for the final test.
# A fixed random_state keeps the split (and every score computed from it)
# identical across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ***PREPROCESSOR***

In [None]:
# 1. split the features into categorical (non-numeric) & numerical column groups
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# 2. numerical pipeline: impute missing values (the strategy is tuned in the searches)
numeric_pipe = make_pipeline(SimpleImputer())

# 3. categorical pipeline
# 3.1 defining ordinal & onehot columns
ordinal_col_names = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'KitchenQual', 'FireplaceQu', 'LotShape',
       'BsmtFinType2', 'HeatingQC', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PoolQC', 'Fence']

# Fail fast on a missing/misspelled column: Index.get_indexer() returns -1 for
# labels it cannot find, and -1 used as a position would silently select the
# LAST categorical column instead of raising.
missing_ordinal = [name for name in ordinal_col_names if name not in X_cat.columns]
if missing_ordinal:
    raise KeyError(f"Ordinal columns not found among categorical features: {missing_ordinal}")

# Positions are relative to X_cat's columns. The encoder below runs after the
# imputer step of `categorical_pipe`, so columns are addressed by position.
ordinal_cols = X_cat.columns.get_indexer(ordinal_col_names)

# Keep the remaining columns in their original order. Building this list from a
# set difference made the one-hot column order depend on string hashing, i.e.
# it could change from one kernel session to the next.
ordinal_name_set = set(ordinal_col_names)
ohe_col_names = [name for name in X_cat.columns if name not in ordinal_name_set]
ohe_cols = X_cat.columns.get_indexer(ohe_col_names)

X_cat_ordinal = X_cat.columns[ordinal_cols]
X_cat_ohe = X_cat.columns[ohe_cols]

# 3.2 explicitly determine categories for ordinal encoding including "N_A"
# ("N_A" is the placeholder the imputer in 3.3 writes for missing values;
# each list is ordered from lowest to highest encoded value)
ExterQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["N_A", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["N_A", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats = ["N_A", 'Reg', 'IR1', 'IR2', 'IR3']
BsmtFinType2_cats = ['N_A', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
HeatingQC_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
GarageFinish_cats = ['N_A', 'Unf', 'RFn', 'Fin']
GarageQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
# NOTE(review): pandas.read_csv parses a literal "NA" as missing by default, so
# the 'NA' level presumably never occurs next to the imputed "N_A" - confirm.
Fence_cats = ["N_A", 'NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

# Must stay in the same order as `ordinal_col_names`: the encoder pairs the
# i-th category list with the i-th selected column.
cats_ord = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats,
            BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats, FireplaceQu_cats,
            LotShape_cats, BsmtFinType2_cats, HeatingQC_cats, GarageFinish_cats, GarageQual_cats,
            GarageCond_cats, PoolQC_cats, Fence_cats]
assert len(cats_ord) == len(ordinal_col_names), "one category list per ordinal column"

# 3.2.2 defining the categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot
categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=cats_ord), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ]
)

# 3.3 categorical pipeline = "N_A" imputer + categorical encoder
categorical_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    categorical_encoder,
)

# 4. full preprocessing: a ColumnTransformer with 2 branches: numeric & categorical
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)


In [None]:
# Inspect which positions (within X_cat's columns) are routed to the one-hot encoder
print(set(ohe_cols))

{0, 1, 2, 3, 4, 5, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33, 37, 40, 41, 42}


In [None]:
# Inspect which positions (within X_cat's columns) are routed to the ordinal encoder
ordinal_cols

array([ 6,  7,  8,  9, 10, 11, 12, 13, 15, 29, 30, 34, 35, 36, 38, 39])

# ***MODELLING***

## *DecisionTreeClassifier* 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier


scaler = StandardScaler()

# preprocessing -> scaling -> decision tree
full_pipeline = make_pipeline(full_preprocessing,
                              scaler,
                              DecisionTreeClassifier())

param_grid = {
    # Must be valid SimpleImputer strategies: the misspelling "contant" made
    # every candidate that drew it fail to fit (scored as NaN).
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "decisiontreeclassifier__max_depth": range(2, 15, 2),
    "decisiontreeclassifier__min_samples_leaf": range(10, 100, 10),
    "decisiontreeclassifier__criterion": ["gini", "entropy"],
}

# Randomly sample 100 parameter combinations, each scored with 5-fold CV accuracy.
# random_state pins the sampled candidates so the search is repeatable.
search = RandomizedSearchCV(full_pipeline,
                            param_grid,
                            cv=5,
                            scoring='accuracy',
                            verbose=1,
                            n_iter=100,
                            random_state=42)

search.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models
scores = {"dtree": search.best_score_}


Fitting 5 folds for each of 100 candidates, totalling 500 fits


170 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
170 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 355, in _fit
    **fit_params_steps[name],
  File "/usr/local/lib/python3.7/dist-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packa

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the tuned decision-tree search on the held-out test split.
dt_pred_test = search.predict(X_test)
accuracy_dt = accuracy_score(y_test, dt_pred_test)

scores["accuracy_dt"] = accuracy_dt

## *KNN*

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# preprocessing -> scaling -> k-nearest neighbours
knn_full_pipeline = make_pipeline(full_preprocessing,
                                  scaler,
                                  KNeighborsClassifier()
                                 )

param_grid = {
    # Must be valid SimpleImputer strategies: the misspelling "contant" made
    # every candidate that drew it fail to fit (scored as NaN).
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "kneighborsclassifier__n_neighbors": range(2, 50),
    "kneighborsclassifier__weights": ["uniform", "distance"]
}

# Randomly sample 100 parameter combinations, each scored with 5-fold CV accuracy.
# random_state pins the sampled candidates so the search is repeatable.
knn_search = RandomizedSearchCV(knn_full_pipeline,
                                param_grid,
                                cv=5,
                                scoring='accuracy',
                                verbose=1,
                                n_iter=100,
                                random_state=42)

knn_search.fit(X_train, y_train)

# best mean cross-validated accuracy found by the search
scores["knn"] = knn_search.best_score_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


175 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
175 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 355, in _fit
    **fit_params_steps[name],
  File "/usr/local/lib/python3.7/dist-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packa

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the tuned KNN search on the held-out test split.
knn_pred_test = knn_search.predict(X_test)
accuracy_knn = accuracy_score(y_test, knn_pred_test)

scores["accuracy_knn"] = accuracy_knn

## *Logistic Regression*

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV


scaler = StandardScaler()

# preprocessing -> scaling -> logistic regression
LR_full_pipeline = make_pipeline(full_preprocessing, scaler, LogisticRegression())

param_grid = {
    # Must be valid SimpleImputer strategies: the misspelling "contant" made
    # every candidate that drew it fail to fit (scored as NaN).
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    # NOTE(review): small max_iter values stop the solver before it converges,
    # which is what floods the cell output with ConvergenceWarning messages.
    # Consider fixing a large max_iter and tuning C instead - confirm intent.
    "logisticregression__max_iter": range(1, 100, 10),
    #"logisticregression__solver" : ["newton-cg", "lbfgs", "liblinear"]
}

# Randomly sample 100 parameter combinations, each scored with 13-fold CV accuracy.
# random_state pins the sampled candidates so the search is repeatable.
LR_search = RandomizedSearchCV(LR_full_pipeline,
                               param_grid,
                               cv=13,
                               scoring='accuracy',
                               verbose=1,
                               n_iter=100,
                               random_state=42)

LR_search.fit(X_train, y_train)

# best mean cross-validated accuracy found by the search
scores["LRegression"] = LR_search.best_score_

Fitting 13 folds for each of 100 candidates, totalling 1300 fits


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessi

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the tuned logistic-regression search on the held-out test split.
LR_pred_test = LR_search.predict(X_test)
accuracy_LR = accuracy_score(y_test, LR_pred_test)

scores["accuracy_LR"] = accuracy_LR

## *SVC*

In [None]:
from sklearn.svm import SVC

# preprocessing -> scaling -> support vector classifier
SVC_full_pipeline = make_pipeline(full_preprocessing, scaler, SVC())

param_grid = {
    # Must be valid SimpleImputer strategies: the misspelling "contant" made
    # every candidate that drew it fail to fit (scored as NaN).
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "svc__gamma": ['scale', 'auto'],
    "svc__max_iter": range(1, 200, 10),
}

# Randomly sample 100 parameter combinations, each scored with 10-fold CV accuracy.
# random_state pins the sampled candidates so the search is repeatable.
SVC_search = RandomizedSearchCV(SVC_full_pipeline,
                                param_grid,
                                cv=10,
                                scoring="accuracy",
                                verbose=1,
                                n_iter=100,
                                random_state=42)

SVC_search.fit(X_train, y_train)

# best mean cross-validated accuracy found by the search
scores["SVC"] = SVC_search.best_score_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


360 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 355, in _fit
    **fit_params_steps[name],
  File "/usr/local/lib/python3.7/dist-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-pack

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the tuned SVC search on the held-out test split.
SVC_pred_test = SVC_search.predict(X_test)
accuracy_SVC = accuracy_score(y_test, SVC_pred_test)

scores["accuracy_SVC"] = accuracy_SVC

## *Random Forest*

In [None]:
from sklearn.ensemble import RandomForestClassifier

# preprocessing -> scaling -> random forest
RandomForest_pipeline = make_pipeline(full_preprocessing, scaler, RandomForestClassifier())

param_grid = {
    # Must be valid SimpleImputer strategies: the misspelling "contant" made
    # every candidate that drew it fail to fit (scored as NaN).
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "randomforestclassifier__n_estimators": range(10, 100, 20),
    "randomforestclassifier__max_depth": range(2, 15, 2),
    "randomforestclassifier__min_samples_leaf": range(10, 300, 50),
    "randomforestclassifier__criterion": ["gini", "entropy"]
}

# Randomly sample 100 parameter combinations, each scored with 5-fold CV accuracy.
# random_state pins the sampled candidates so the search is repeatable.
RandomForest_search = RandomizedSearchCV(RandomForest_pipeline,
                                         param_grid,
                                         cv=5,
                                         scoring="accuracy",
                                         verbose=1,
                                         n_iter=100,
                                         random_state=42)

RandomForest_search.fit(X_train, y_train)

# best mean cross-validated accuracy found by the search
scores['RandomForest'] = RandomForest_search.best_score_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


210 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
210 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 355, in _fit
    **fit_params_steps[name],
  File "/usr/local/lib/python3.7/dist-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packa

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the tuned random-forest search on the held-out test split.
RF_pred_test = RandomForest_search.predict(X_test)
accuracy_RF = accuracy_score(y_test, RF_pred_test)

scores["accuracy_RF"] = accuracy_RF

In [None]:
# Compare the models: plain keys hold each search's best cross-validated accuracy,
# "accuracy_*" keys hold the accuracy measured on the held-out test split
scores

{'dtree': 0.9358240710172041,
 'accuracy_dt': 0.9006849315068494,
 'knn': 0.9366567624078354,
 'accuracy_knn': 0.9143835616438356,
 'LRegression': 0.9511956208585423,
 'accuracy_LR': 0.958904109589041,
 'SVC': 0.9460580607132331,
 'accuracy_SVC': 0.9178082191780822,
 'RandomForest': 0.9409522761454092,
 'accuracy_RF': 0.9417808219178082}

In [None]:
# Load the competition data from its Google Drive sharing link
# (stray leading space removed from the URL literal).
url = "https://drive.google.com/file/d/15PfmTxmavQCT-f7iY9tgwWxm9t4GRees/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id=' + url.split('/')[-2]
Competition_DF = competition_df = pd.read_csv(path)

# Features for prediction: everything except the row identifier.
Compet_DF = Competition_DF.drop(columns=["Id"])

# The submission starts as the Id column; predictions are added below.
Alex_Compet_Submission = pd.DataFrame(Competition_DF["Id"])

# NOTE(review): the variable name says "LR" but the predictions come from the
# random-forest search. The model is left unchanged here - confirm which
# model is meant to produce the submission.
LR_pred_sub = RandomForest_search.predict(Compet_DF)

Alex_Compet_Submission["Expensive"] = LR_pred_sub
Alex_Compet_Submission.to_csv('Alex_Compet.csv', index=False)

# Trigger a browser download when running on Google Colab; elsewhere the CSV
# is already on disk, so a missing google.colab module must not crash the cell.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available - submission saved to Alex_Compet.csv")
else:
    files.download("Alex_Compet.csv")

# Last expression of the cell, so the preview is actually displayed.
Alex_Compet_Submission.head()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>