<h1> <span style="color:#FFF9C1">Housing price: machine learning challenge</span> </h1>

In [None]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# <span style="color:#6FFFE9">1. Import Data and Preprocessing</span> 

In [None]:
# Labelled training set; preview its first two rows.
data = pd.read_csv("./housing-classification-iter6.csv")
data.head(n=2)

In [None]:
# (n_rows, n_columns) of the training frame
data.shape

In [None]:
# Separate test file (later used to build the submission files); preview two rows.
test = pd.read_csv("./test-housing-classification.csv")
test.head(n=2)

In [None]:
# (n_rows, n_columns) of the test frame
test.shape

## <span style="color:#ffadad">Define feature matrix (X) and target column (y)</span>

### <span style="color:#33FFCC">check train dataset</span> 

In [None]:
# Target label of the rows whose MiscFeature is "Shed".
data.loc[data["MiscFeature"].eq("Shed"), ["Expensive"]]

In [None]:
# Rows that have a Fence value and are labelled Expensive == 1.
has_fence = data["Fence"].notna()
is_expensive = data["Expensive"].eq(1)
data.loc[has_fence & is_expensive, ["Fence", "Expensive"]]

In [None]:
# Feature frame: `data` without the columns excluded from modelling.
# DataFrame.drop returns a new frame, so `data` is left untouched and no
# explicit copy is needed first.
X = data.drop(columns=["Alley", "PoolQC", "MiscFeature", "Id"])
# Target: pop removes "Expensive" from X and returns it as a Series.
y = X.pop("Expensive")

In [None]:
# Missing-value count per feature column.
X.isnull().sum()

## <span style="color:#ffadad">Data Splitting: Train vs. Test dataset</span>

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## <span style="color:#ffadad">Building pipeline</span>

In [None]:
# Column subsets by dtype; their column names route each column to a sub-pipeline.
X_num = X.select_dtypes(include="number").copy()
X_cat = X.select_dtypes(exclude="number").copy()

# Numeric columns: fill missing values with the column mean.
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# Non-numeric columns: fill missing values with the "N_A" placeholder, then one-hot encode.
cat_imputer = SimpleImputer(strategy="constant", fill_value="N_A")
cat_encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
categoric_pipe = make_pipeline(cat_imputer, cat_encoder)

# The "num_pipe" name is referenced by the grid-search parameter keys further down.
column_routes = [
    ("num_pipe", numeric_pipe, X_num.columns),
    ("cat_pipe", categoric_pipe, X_cat.columns),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# <span style="color:#6FFFE9">Modeling</span> 

## <span style="color:#ffadad">Decision Tree</span>

### <span style="color:#33FFCC">Round 1</span> 

In [None]:
# Decision tree on top of the shared preprocessing. A fixed random_state makes
# the fitted tree, and therefore the reported scores, repeatable across runs.
full_pipeline = make_pipeline(preprocessor,
                              DecisionTreeClassifier(random_state=42))

# Keys follow make_pipeline's auto-generated step names (lower-cased class names).
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "decisiontreeclassifier__max_depth": range(5, 41, 5),
    "decisiontreeclassifier__min_samples_leaf": range(3, 30, 2),
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

# Start the score board: best mean cross-validated score per experiment.
scores = {"dtree": search.best_score_}

scores

In [None]:
# Parameter combination with the best mean cross-validated score in this search
search.best_params_

### <span style="color:#33FFCC">Checking the Accuracy</span>

In [None]:
# Accuracy of the tuned pipeline on the training split and on the held-out split.
# accuracy_score expects (y_true, y_pred), in that order.
train_accuracy = accuracy_score(y_train, search.predict(X_train))
test_accuracy = accuracy_score(y_test, search.predict(X_test))

print("Train Accuracy: ", train_accuracy)
print("Test Accuracy: ", test_accuracy)

### <span style="color:#33FFCC">Round 2</span> 

In [None]:
# Second decision-tree search: deeper max_depth range (30..75 in steps of 5).
# A fixed random_state keeps the fitted tree repeatable across runs.
full_pipeline = make_pipeline(preprocessor,
                              DecisionTreeClassifier(random_state=42))

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "decisiontreeclassifier__max_depth": range(30, 80, 5),
    "decisiontreeclassifier__min_samples_leaf": range(3, 30, 2),
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

# Add to the existing score board; rebinding `scores` to a new dict here
# would throw away the "dtree" entry recorded by the first search.
scores["dtree1"] = search.best_score_

scores

In [None]:
# Parameter combination with the best mean cross-validated score in this search
search.best_params_

### <span style="color:#33FFCC">Checking the Accuracy</span>

In [None]:
# Accuracy of the tuned pipeline on the training split and on the held-out split.
# accuracy_score expects (y_true, y_pred), in that order.
train_accuracy = accuracy_score(y_train, search.predict(X_train))
test_accuracy = accuracy_score(y_test, search.predict(X_test))

print("Train Accuracy: ", train_accuracy)
print("Test Accuracy: ", test_accuracy)

### <span style="color:#33FFCC">Round 3</span> 

In [None]:
# Third decision-tree search: finer max_depth steps (20..48 in steps of 2)
# and a narrower min_samples_leaf range.
# A fixed random_state keeps the fitted tree repeatable across runs.
full_pipeline = make_pipeline(preprocessor,
                              DecisionTreeClassifier(random_state=42))

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "decisiontreeclassifier__max_depth": range(20, 50, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 20, 2),
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

# Record this experiment's best mean cross-validated score.
scores["dtree2"] = search.best_score_

scores

In [None]:
# Parameter combination with the best mean cross-validated score in this search
search.best_params_

In [None]:
# Accuracy of the tuned pipeline on the training split and on the held-out split.
# accuracy_score expects (y_true, y_pred), in that order.
train_accuracy = accuracy_score(y_train, search.predict(X_train))
test_accuracy = accuracy_score(y_test, search.predict(X_test))

print("Train Accuracy: ", train_accuracy)
print("Test Accuracy: ", test_accuracy)

## <span style="color:#ffadad">KNN</span>

### <span style="color:#33FFCC">Round 1</span> 

In [None]:
# KNN on top of the shared preprocessing; every preprocessed column is
# standardised before the neighbour search.
knn_full_pipeline = make_pipeline(preprocessor,
                                  StandardScaler(),
                                  KNeighborsClassifier()
                                 )

# Keys follow make_pipeline's auto-generated step names (lower-cased class names).
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "kneighborsclassifier__n_neighbors": range(2, 20),
    "kneighborsclassifier__weights": ["uniform", "distance"]
}

knn_search = GridSearchCV(knn_full_pipeline,
                          param_grid,
                          cv=5,
                          verbose=1)

knn_search.fit(X_train, y_train)

# Record this experiment's best mean cross-validated score.
scores["knn"] = knn_search.best_score_

scores

In [None]:
# Parameter combination with the best mean cross-validated score in this KNN search
knn_search.best_params_

### <span style="color:#33FFCC">Checking the Accuracy</span>

In [None]:
# Accuracy of the tuned KNN pipeline on the training split and on the held-out split.
# accuracy_score expects (y_true, y_pred), in that order.
train_accuracy = accuracy_score(y_train, knn_search.predict(X_train))
test_accuracy = accuracy_score(y_test, knn_search.predict(X_test))

print("Train Accuracy: ", train_accuracy)
print("Test Accuracy: ", test_accuracy)

### <span style="color:#33FFCC">Round 2</span> 

In [None]:
# Second KNN search: neighbour count widened to 2..39.
knn_full_pipeline = make_pipeline(
    preprocessor,
    StandardScaler(),
    KNeighborsClassifier(),
)

param_grid = dict(
    columntransformer__num_pipe__simpleimputer__strategy=["mean", "median"],
    kneighborsclassifier__n_neighbors=range(2, 40),
    kneighborsclassifier__weights=["uniform", "distance"],
)

knn_search = GridSearchCV(
    estimator=knn_full_pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=1,
)
knn_search.fit(X_train, y_train)

# Record this experiment's best mean cross-validated score.
scores.update(knn1=knn_search.best_score_)

scores

In [None]:
# Parameter combination with the best mean cross-validated score in this KNN search
knn_search.best_params_

### <span style="color:#33FFCC">Checking the Accuracy</span>

In [None]:
# Accuracy of the tuned KNN pipeline on the training split and on the held-out split.
# accuracy_score expects (y_true, y_pred), in that order.
train_accuracy = accuracy_score(y_train, knn_search.predict(X_train))
test_accuracy = accuracy_score(y_test, knn_search.predict(X_test))

print("Train Accuracy: ", train_accuracy)
print("Test Accuracy: ", test_accuracy)

## <span style="color:#ffadad">Applying model to the validation data</span>

### <span style="color:#33FFCC">Decision Tree Model</span> 

#### <span style="color:#F1C0E8">Create a DataFrame with "Id" and "Expensive" columns</span> 

In [None]:
# Work on a copy so that adding the prediction column later leaves `test` unchanged.
test_tree = test.copy()

In [None]:
# Predict labels for the rows loaded from the test CSV.
# NOTE: `search` is whichever decision-tree grid search was fitted last; when the
# notebook is run top to bottom that is the Round 3 search.
y_validation_pred = search.predict(test_tree)
y_validation_pred

In [None]:
# Attach the predictions as the "Expensive" column and preview two rows.
test_tree["Expensive"] = y_validation_pred
test_tree.head(2)

In [None]:
# Submission frame: only the identifier and the predicted label.
submission_tree = test_tree.loc[:, ["Id", "Expensive"]]
# Preview the first rows rather than dumping the whole frame into the output.
submission_tree.head()

#### <span style="color:#F1C0E8">Save the DataFrame as a CSV file</span> 

In [None]:
# Write the decision-tree predictions to disk; index=False leaves the row index out of the file.
submission_tree.to_csv("submission_tree.csv", index=False)

### <span style="color:#33FFCC">KNN Model</span> 

In [None]:
# Work on a copy so that adding the prediction column later leaves `test` unchanged.
test_knn = test.copy()

#### <span style="color:#F1C0E8">Create a DataFrame with "Id" and "Expensive" columns</span> 

In [None]:
# Predict labels for the rows loaded from the test CSV with the tuned KNN pipeline.
y_validation_pred = knn_search.predict(test_knn)

# Attach the predictions as the "Expensive" column.
test_knn["Expensive"] = y_validation_pred

# Keep only the two columns needed for the submission and preview them.
submission_knn = test_knn[["Id", "Expensive"]]
submission_knn.head(5)

#### <span style="color:#F1C0E8">Save the DataFrame as a CSV file</span> 

In [None]:
# Write the KNN predictions to disk; index=False leaves the row index out of the file.
submission_knn.to_csv("submission_knn.csv", index=False)