### File organization

To keep our work organized, let's first create a space to keep all the files we will create for competition submissions.

In [None]:
import os

# Create the folder that will hold the competition submission files.
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the folder already exists.
os.makedirs('../comp_submissions/', exist_ok=True)

FileExistsError: [Errno 17] File exists: '../comp_submissions/'

### Data imports

In [None]:
import pandas as pd

# Each Drive share link is turned into a direct-download link by taking the
# second-to-last "/"-separated segment of the link and appending it to the
# "uc?export=download&id=" endpoint. Both CSVs are indexed by their "Id" column.

# Training data
url = "https://drive.google.com##############################WMKbUAEpwOrhbtRHn/view?usp=drive_link"
path = "https://drive.google.com/uc?export=download&id=" + url.rsplit('/', 2)[-2]
data = pd.read_csv(path, index_col="Id")

# Competition test data
test_url = "https://drive.google.com######################MOXHp/view?usp=drive_link"
test_path = "https://drive.google.com/uc?export=download&id=" + test_url.rsplit('/', 2)[-2]
test_data = pd.read_csv(test_path, index_col="Id")

In [None]:
# Preview the first five rows of the competition test set.
test_data.head(n=5)

Unnamed: 0_level_0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1462,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
1463,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
1464,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
1465,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [None]:
# Remove these six columns from the training frame (test_data is left as loaded).
data = data.drop(
    columns=[
        'FireplaceQu',
        'Alley',
        'MasVnrType',
        'PoolQC',
        'Fence',
        'MiscFeature',
    ]
)

In [None]:
# List the column labels currently present in the training frame.
print(data.columns)

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'Expensive',
       'MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal', 'MoSold',
       'YrSold', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'BsmtFinType2',
       'HeatingQC', 'Electrical

In [None]:
# Separate the label column from the features: keep it as `target` and
# remove it from `data` in place (so this cell can only be run once per load).
target = data['Expensive']
del data['Expensive']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=1234,
)

# Column labels grouped by dtype, derived from the training split only.
num_cols = X_train.select_dtypes(include=['number']).columns
cat_cols = X_train.select_dtypes(exclude=['number']).columns


In [None]:
#importing necessary libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
#models to try out
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB


To submit to the competition, predictions must be made on `test_data`.

In [None]:
# Preprocessing shared by the model pipelines below.
# Step names are the ones make_pipeline would generate automatically.

# Numeric columns: fill gaps with the column mean, then standardize.
num_pipe = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='mean')),
    ('standardscaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'none', then one-hot encode.
# Categories unseen during fit are encoded as all zeros instead of raising.
cat_pipe = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='constant', fill_value='none')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline.
preprocessor = make_column_transformer(
    (num_pipe, num_cols),
    (cat_pipe, cat_cols),
)
preprocessor

In [None]:
# Gradient boosting pipeline: shared preprocessing followed by the classifier.
# random_state is fixed, like for the other models in this notebook, so the
# fitted model and the accuracy printed below are reproducible across runs.
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=1234))
])

# Fit on the training split
gb_pipeline.fit(X_train, y_train)

# Predict the held-out split
y_pred = gb_pipeline.predict(X_test)

# Hold-out accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"gb_pipeline Accuracy: {accuracy:.2f}")



gb_pipeline Accuracy: 0.94


In [None]:
# Unrounded hold-out accuracy of the gradient boosting pipeline.
accuracy_score(y_test, gb_pipeline.predict(X_test))

0.9383561643835616

In [None]:
# Count how many training rows fall into each class of the target.
y_train.value_counts()

Unnamed: 0_level_0,count
Expensive,Unnamed: 1_level_1
0,991
1,177


In [None]:
# Show every parameter of the pipeline and its nested steps
# (useful for finding the names to use in a hyperparameter search).
gb_pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('pipeline-1',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
          'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
          'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
          'BsmtFinSF1...
          'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'LotShape',
          'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
          'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
          'Exterior1st', 'Exterior2nd', 'BsmtFinType2', 'HeatingQC', 'Electrical',
          '

Next: more classifiers. (Naive Bayes was planned here, but the following cells fit a Random Forest instead.)

In [None]:
# List the feature columns of the training split.
print(X_train.columns)


Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSZoning',
       'Condition1', 'Heating', 'Street', 'CentralAir', 'Foundation',
       'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'KitchenQual', 'MSSubClass', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal', 'MoSold',
       'YrSold', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'BsmtFinType2',
       'HeatingQC', 'Electrical', 'Functiona

In [None]:
#RandomForestClassifier
rf_pipeline = make_pipeline(
    preprocessor,
    RandomForestClassifier(random_state=1234)
)

rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f"RandomForestClassifier Accuracy: {rf_accuracy:.2f}")

RandomForestClassifier Accuracy: 0.95


In [None]:
# Attach the random forest predictions to a copy of the test set, then write
# only the Id index and the "Expensive" column as the submission file.
submission = test_data.assign(Expensive=rf_pipeline.predict(test_data))
submission['Expensive'].to_csv("../comp_submissions/classification_dummy5.csv")

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on top of the shared preprocessing.
logreg_pipeline = make_pipeline(preprocessor, LogisticRegression(random_state=1234))
logreg_pipeline.fit(X_train, y_train)

# Score on the held-out split.
y_pred_logreg = logreg_pipeline.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)

print(f"Logistic Regression Accuracy: {logreg_accuracy:.2f}")


Logistic Regression Accuracy: 0.94


In [None]:
# Attach the logistic regression predictions to a copy of the test set, then
# write only the Id index and the "Expensive" column as the submission file.
submission = test_data.assign(Expensive=logreg_pipeline.predict(test_data))
submission['Expensive'].to_csv("../comp_submissions/classification_dummy6.csv")

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default hyperparameters on the shared preprocessing.
svm_pipeline = make_pipeline(preprocessor, SVC(random_state=1234))
svm_pipeline.fit(X_train, y_train)

# Score on the held-out split.
y_pred_svm = svm_pipeline.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

print(f"SVM Accuracy: {svm_accuracy:.2f}")


SVM Accuracy: 0.95


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from scipy.stats import uniform
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Pipeline whose SVC hyperparameters are tuned below.
svm_pipeline = make_pipeline(preprocessor, SVC(random_state=1234))

# Search space for RandomizedSearchCV; "svc" is the step name that
# make_pipeline generates for the SVC estimator.
param_dist = {
    # regularization strength, drawn uniformly from [0.1, 10.1]
    'svc__C': uniform(loc=0.1, scale=10),
    # kernel coefficient
    'svc__gamma': ['scale', 'auto', 0.1, 1, 10],
    # kernel type
    'svc__kernel': ['linear', 'rbf', 'poly'],
}

# Randomized search: 10 sampled candidates, each scored with 10-fold CV,
# using all available cores.
svm_search = RandomizedSearchCV(
    svm_pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=10,
    n_jobs=-1,
    random_state=1234,
)
svm_search.fit(X_train, y_train)

# Best candidate, refit on the whole training split by the search.
best_svm_model = svm_search.best_estimator_
best_params = svm_search.best_params_

# Evaluate the best candidate on the held-out split.
y_pred_svm = best_svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

print("Classification Report:\n", classification_report(y_test, y_pred_svm))
print(f"SVM Best Accuracy: {svm_accuracy:.2f}")
print(f"Best Parameters: {best_params}")


Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       252
           1       0.85      0.72      0.78        40

    accuracy                           0.95       292
   macro avg       0.91      0.85      0.88       292
weighted avg       0.94      0.95      0.94       292

SVM Best Accuracy: 0.95
Best Parameters: {'svc__C': 7.953585837137692, 'svc__gamma': 'auto', 'svc__kernel': 'rbf'}


In [None]:
# Attach the tuned SVM's predictions to a copy of the test set, then write
# only the Id index and the "Expensive" column as the submission file.
submission = test_data.assign(Expensive=best_svm_model.predict(test_data))
submission['Expensive'].to_csv("../comp_submissions/classification_dummy_svm.csv")

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Stacked ensemble: a random forest and an SVC as base learners, with a
# logistic regression combining their outputs, on the shared preprocessing.
stacking_pipeline = make_pipeline(
    preprocessor,
    StackingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(random_state=1234)),
            ('svc', SVC(random_state=1234)),
        ],
        final_estimator=LogisticRegression(),
    ),
)
stacking_pipeline.fit(X_train, y_train)

# Score on the held-out split.
y_pred_stacking = stacking_pipeline.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)

print(f"Stacking Classifier Accuracy: {stacking_accuracy:.2f}")

Stacking Classifier Accuracy: 0.96


In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import uniform

# Search spaces for RandomizedSearchCV. Keys follow the pattern
# "<pipeline step>__<stacked estimator>__<parameter>", where "stackingclassifier"
# is the step name generated by make_pipeline and "rf" / "svc" /
# "final_estimator" identify the parts of the StackingClassifier built below.

# Random forest base learner: controls how the trees grow
# (bias-variance trade-off).
rf_param_dist = {
    'stackingclassifier__rf__n_estimators': [50, 100, 200],  # number of trees in the forest
    'stackingclassifier__rf__max_depth': [None, 10, 20, 30],  # maximum depth of each tree (None = unlimited)
    'stackingclassifier__rf__min_samples_split': [2, 5, 10],  # min samples required to split an internal node
}
# SVC base learner.
svc_param_dist = {
    'stackingclassifier__svc__C': uniform(0.1, 10),  # regularization parameter, drawn from [0.1, 10.1]; smaller C = stronger regularization
    'stackingclassifier__svc__gamma': ['scale', 'auto', 0.1, 1, 10],  # kernel coefficient, controls the influence of each data point
    'stackingclassifier__svc__kernel': ['linear', 'rbf', 'poly'],  # type of kernel used to transform the input space
}
# Final estimator: the logistic regression that combines the base predictions.
log_reg_param_dist = {
    'stackingclassifier__final_estimator__C': uniform(0.1, 10),  # inverse regularization strength, drawn from [0.1, 10.1]
    'stackingclassifier__final_estimator__penalty': ['l2', 'l1'],  # regularization type
    'stackingclassifier__final_estimator__solver': ['liblinear', 'saga'],  # optimizer; both support l1 and l2 penalties
}


# StratifiedKFold cross-validation keeps the class proportions in every fold.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

# Stacking classifier. The final estimator is seeded too: the 'liblinear' and
# 'saga' solvers searched above use randomness, so without random_state the
# search results would differ from run to run.
stacking_classifier = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(random_state=1234)),
        ('svc', SVC(random_state=1234))
    ],
    final_estimator=LogisticRegression(random_state=1234)
)

# Pipeline: preprocessing followed by the stacking classifier.
stacking_pipeline = make_pipeline(
    preprocessor,
    stacking_classifier
)

# Merge the three search spaces into the single dictionary that
# RandomizedSearchCV samples from.
param_dist = {**rf_param_dist, **svc_param_dist, **log_reg_param_dist}

stacking_search = RandomizedSearchCV(
    stacking_pipeline,
    param_distributions=param_dist,
    n_iter=20,  # number of sampled candidates
    cv=cv_strategy,  # stratified folds defined above
    n_jobs=-1,
    random_state=1234,
    verbose=1
)

# Run the hyperparameter search on the training split.
stacking_search.fit(X_train, y_train)

# Best candidate (refit on the whole training split) and its parameters.
best_stacking_model = stacking_search.best_estimator_
best_params = stacking_search.best_params_

# Predict the held-out split with the best candidate.
y_pred_stacking = best_stacking_model.predict(X_test)

# Per-class precision, recall and F1-score.
print("Classification Report:\n", classification_report(y_test, y_pred_stacking))

# Hold-out accuracy and the selected hyperparameters.
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)
print(f"Stacking Classifier Accuracy: {stacking_accuracy:.2f}")
print(f"Best Parameters: {best_params}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97       252
           1       0.85      0.70      0.77        40

    accuracy                           0.94       292
   macro avg       0.90      0.84      0.87       292
weighted avg       0.94      0.94      0.94       292

Stacking Classifier Accuracy: 0.94
Best Parameters: {'stackingclassifier__final_estimator__C': 6.233648175941091, 'stackingclassifier__final_estimator__penalty': 'l1', 'stackingclassifier__final_estimator__solver': 'liblinear', 'stackingclassifier__rf__max_depth': 10, 'stackingclassifier__rf__min_samples_split': 5, 'stackingclassifier__rf__n_estimators': 200, 'stackingclassifier__svc__C': 9.117960513709921, 'stackingclassifier__svc__gamma': 'auto', 'stackingclassifier__svc__kernel': 'linear'}


In [None]:
# Attach the tuned stacking model's predictions to a copy of the test set, then
# write only the Id index and the "Expensive" column as the submission file.
submission = test_data.assign(Expensive=best_stacking_model.predict(test_data))
submission['Expensive'].to_csv("../comp_submissions/classification_dummy_final.csv")

l









# Testing models

In [None]:
# Attach the gradient boosting predictions to a copy of the test set, then
# write only the Id index and the "Expensive" column as the submission file.
submission = test_data.assign(Expensive=gb_pipeline.predict(test_data))
submission['Expensive'].to_csv("../comp_submissions/classification_dummy2.csv")

### KNN

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn import set_config
# Global scikit-learn setting: transformers return pandas DataFrames instead of
# NumPy arrays from here on.
# NOTE(review): this is kernel-wide state. The earlier `cat_pipe` uses
# OneHotEncoder with its default sparse output, which pandas output does not
# support, so re-running the earlier model cells after this one is likely to
# fail - confirm before relying on out-of-order execution.
set_config(transform_output="pandas")

In [None]:
# Build the feature matrix X and the label vector y for the KNN section.
# An earlier cell removes the label with `data.pop('Expensive')`; indexing
# `data['Expensive']` afterwards raises KeyError on a top-to-bottom run, so
# fall back to the already separated `target` when the column is gone.
if 'Expensive' in data.columns:
    y = data['Expensive'].copy()
    X = data.drop('Expensive', axis=1).copy()
else:
    y = target.copy()
    X = data.copy()

# 80/20 hold-out split (note: this rebinds the X_train / X_test / y_train /
# y_test names used by the earlier model cells, with a different seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Numeric and non-numeric parts of the training split; only their column
# labels are used when the preprocessor is built.
X_num = X_train.select_dtypes(include='number')
X_cat = X_train.select_dtypes(exclude='number')

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns: mean-impute, then standardize. KNN is distance-based, so
# without scaling the features with the largest raw magnitudes dominate the
# neighbour search; this also matches the numeric pipeline used for the other
# models in this notebook.
num_pipe = make_pipeline(SimpleImputer(strategy='mean'),
                         StandardScaler())

# Categorical columns: fill gaps with the literal 'N_A', then one-hot encode.
# sparse_output=False gives a dense result, which pandas output requires.
cat_pipe = make_pipeline(SimpleImputer(strategy='constant',
                                       fill_value='N_A'),
                        OneHotEncoder(handle_unknown='infrequent_if_exist',
                                      sparse_output=False))

# Route each column group through its own pipeline.
# Note: this rebinds `num_pipe`, `cat_pipe` and `preprocessor` from the
# earlier model cells.
preprocessor = make_column_transformer((num_pipe, X_num.columns),
                                       (cat_pipe, X_cat.columns))

# Full KNN pipeline with default KNeighborsClassifier settings.
knn_pipe = make_pipeline(preprocessor, KNeighborsClassifier())

In [None]:
# Fit on the training split and report accuracy on that same split
# (training accuracy, not an estimate of generalization).
knn_pipe.fit(X_train, y_train)
knn_pipe.score(X_train, y_train)

0.797945205479452

In [None]:
# Show every parameter of the KNN pipeline and its nested steps; the keys are
# the names accepted by set_params in the next cell.
knn_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('pipeline-1',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer())]),
                                    Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
          'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
          'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
          'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF...
          'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
          'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
          'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtFinType2',
          'HeatingQC', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
          'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
          'MiscFeature', 'Sa

In [None]:
# Switch to 15 distance-weighted neighbours, refit, and report accuracy on the
# training split itself.
knn_pipe.set_params(**{
    'kneighborsclassifier__n_neighbors': 15,
    'kneighborsclassifier__weights': 'distance',
})
knn_pipe.fit(X_train, y_train)
knn_pipe.score(X_train, y_train)

1.0

In [None]:
# Accuracy of the KNN pipeline on the held-out split.
accuracy_score(y_true=y_test, y_pred=knn_pipe.predict(X_test))

0.7157534246575342

In [None]:
# Raw predicted labels for the held-out split.
knn_pipe.predict(X_test)

array([1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0], dtype=int64)

The output of `.predict()` is an array. This will have to be joined with `Id`s to create a suitable submission.

In [None]:
# Add the predictions as a column on a copy of X_test so that each one is
# lined up with the Id index of its row.
prediction = X_test.assign(Expensive=knn_pipe.predict(X_test))
prediction['Expensive']

Id
148     1
677     1
1305    0
1373    1
1428    1
       ..
1013    1
1060    1
891     0
629     1
353     0
Name: Expensive, Length: 292, dtype: int64

To maximize the use of our training data, we can `fit` one last time on the _full_ (unsplit) training data.

True prediction is then done on `test_data`.

In [None]:
# Refit the same pipeline on all labelled rows (train + hold-out) before
# predicting the competition test set.
knn_pipe.fit(X, y)

In [None]:
# Attach the KNN predictions to a copy of the test set, then write only the
# Id index and the "Expensive" column as the submission file.
knn_pred = test_data.assign(Expensive=knn_pipe.predict(test_data))
knn_pred['Expensive'].to_csv('../comp_submissions/classification_knn_untrimmed.csv')