In [7]:
## import pandas & SKlearn

import pandas as pd
import numpy as np
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingClassifier

In [3]:
## reading
url = "https://drive.google.com/file/d/1fGzr9j80wgtijYqaHqCDLX8Bqfrzk3Mr/view?usp=drive_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = df = pd.read_csv(path)
data = data.drop(['Id'],axis=1)

In [4]:
y=data.pop('Expensive')
X=data

In [None]:
## split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=999)

In [None]:
#pipeline 

In [8]:
numeric_pipe = make_pipeline(
    KNNImputer(n_neighbors=5, missing_values=np.nan)
)
 
categoric_pipe1 = make_pipeline(
    SimpleImputer(strategy="constant", fill_value='0'),
    OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist', min_frequency=6)
)

categoric_pipe2 = make_pipeline(
    SimpleImputer(strategy="constant", fill_value='0'),
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value= np.nan)
)

filtered_columns = X_train.columns[~X_train.columns.isin(['MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'CentralAir', 'MiscFeature'])]
X_train_filtered = X_train[filtered_columns]
X_test_filtered = X_test[filtered_columns]

preprocessor = make_column_transformer(
        (numeric_pipe, make_column_selector(dtype_include='number')),
        (categoric_pipe1, make_column_selector(pattern= "MSZoning|Neighborhood|Condition1|Condition2|CentralAir|MiscFeature")),
        (categoric_pipe2, make_column_selector(dtype_include=object))
)

scaler = QuantileTransformer(n_quantiles= 25)

full_pipeline = make_pipeline(preprocessor, 
                              scaler,
                              HistGradientBoostingClassifier(),
                              memory=None)

full_pipeline.fit(X_train_filtered, y_train)

Exception in thread Thread-6 (_readerthread):
Traceback (most recent call last):
  File "C:\Users\Lenovo\anaconda3\Lib\threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "C:\Users\Lenovo\anaconda3\Lib\threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Lenovo\anaconda3\Lib\subprocess.py", line 1568, in _readerthread
    buffer.append(fh.read())
                  ^^^^^^^^^
  File "C:\Users\Lenovo\anaconda3\Lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 3: character maps to <undefined>
found 0 physical cores < 1
  File "C:\Users\Lenovo\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


In [None]:
# full_pipeline.named_steps['columntransformer'].transform(X_train)

In [9]:
full_pipeline.predict(X_train)

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [10]:
y_train_predict = full_pipeline.predict(X_train)
accuracy_score(y_train, y_train_predict)

1.0

In [11]:
# Test
y_test_predict = full_pipeline.predict(X_test)
accuracy_score(y_test, y_test_predict)

0.9287671232876712

In [12]:
param_grid = {
            #'columntransformer__numeric__knnimputer__n_neighbors': [10, 25, 40],
            'quantiletransformer__n_quantiles': [2],
            'histgradientboostingclassifier__max_depth': [6],
            'histgradientboostingclassifier__learning_rate': [0.25, 0.3, 0.35],
            }

grid_search = GridSearchCV(
    full_pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=1,
)

grid_search.fit(X_train_filtered, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters found:  {'histgradientboostingclassifier__learning_rate': 0.25, 'histgradientboostingclassifier__max_depth': 6, 'quantiletransformer__n_quantiles': 2}
Best cross-validation score:  0.9561643835616438


In [13]:
grid_search.best_estimator_.fit(X_train_filtered, y_train)

In [None]:
## Importing  test.csv

In [15]:
url = "https://drive.google.com/file/d/14gPDFgjp9Up3tQBetXVMhh0MQepvaj_c/view?usp=drive_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
test_data = pd.read_csv(path)

In [19]:
predictions = grid_search.best_estimator_.predict(test_data)

In [None]:
id_column = test_data.pop('Id')

In [21]:
results = pd.DataFrame({'Id':id_column,'Expensive':predictions})
results.to_csv('Bingo.csv',index=False)