In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the pre-processed training and test CSVs (in that order) into
# DataFrames; both paths are relative to the notebook's working directory.
training_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./dataset/processed_training_data.csv",
        "./dataset/processed_test_data.csv",
    )
)



#### Preprocessing

In [3]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
import numpy as np

# Partition the training columns by dtype: numeric columns are standardised,
# object (string) columns are one-hot encoded.
numerical_features = training_data.select_dtypes(include='number').columns.tolist()
categorical_features = training_data.select_dtypes(include='object').columns.tolist()

# handle_unknown="ignore" lets the encoder accept categories at transform time
# that were not present when it was fitted.
scaling_step = ('scaler', StandardScaler(), numerical_features)
encoding_step = ('encoder', OneHotEncoder(handle_unknown="ignore"), categorical_features)
column_transformer = ColumnTransformer(
    transformers=[scaling_step, encoding_step],
    remainder='passthrough',
)

# The transformer output is converted to a dense array before being wrapped
# in a DataFrame.
processed_features = column_transformer.fit_transform(training_data).toarray()

# Get the list of all the features after transformation: the numeric column
# names first, followed by the names generated by the fitted one-hot encoder.
fitted_encoder = column_transformer.named_transformers_['encoder']
encoded_cat_columns = fitted_encoder.get_feature_names_out(input_features=categorical_features)
all_column_names = numerical_features + list(encoded_cat_columns)
processed_df = pd.DataFrame(processed_features, columns=all_column_names)

# Process the test data: add a placeholder "SalePrice" column filled with ones
# and remove "Id" before running the frame through the already-fitted
# transformer. test_data itself is left unmodified here.
processed_test_data = test_data.assign(SalePrice=np.ones(test_data.shape[0])).drop(columns="Id")

processed_test_features = column_transformer.transform(processed_test_data).toarray()
processed_test_df = pd.DataFrame(processed_test_features, columns=all_column_names)

#### Splitting the data into training and testing sets

In [4]:
from sklearn.model_selection import train_test_split

# Separate the target column from the model inputs. Both come from
# processed_df, so "SalePrice" here is in the transformed (scaled) space.
X = processed_df.drop("SalePrice", axis=1)
y = processed_df["SalePrice"]

# Hold out 20% of the rows. random_state is fixed so that the split, and
# therefore every cross-validation score and selected hyper-parameter set
# derived from X_train / y_train, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#### Create GridSearchCV Function

In [13]:
from sklearn.model_selection import GridSearchCV
def get_best_model(regressor, params):
    """Grid-search ``params`` for ``regressor`` and return the winning setting.

    Runs a 5-fold ``GridSearchCV`` scored with R^2, fitted on the module-level
    ``X_train`` / ``y_train``, and prints the best score found.

    Parameters
    ----------
    regressor : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    params : dict
        Parameter grid (parameter name -> candidate values).

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    search = GridSearchCV(estimator=regressor, param_grid=params, scoring='r2', cv=5)
    search.fit(X_train, y_train)

    best_score = search.best_score_
    print("Best cross-validation score: {:.2f}".format(best_score))

    return search.best_params_

#### Create Regression Models

##### Random Forest

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Random forest candidate: search tree depth, features considered per split,
# minimum samples needed to split a node, and ensemble size.
rf = RandomForestRegressor()
rf_params: dict = {
    "max_depth": [8, 15, None],
    "max_features": [5, 7],
    "min_samples_split": [15, 20],
    "n_estimators": [100, 200, 300],
}
# Keep only the best hyper-parameter dict; the score is printed by the helper.
rf_model_params = get_best_model(rf, rf_params)

Best cross-validation score: 0.80


##### Gradient Boosting

In [14]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting candidate: same tree-shape knobs as the forest search,
# plus the learning rate.
gb = GradientBoostingRegressor()
gb_params: dict = {
    "max_depth": [5, 8],
    "max_features": [5, 7],
    "min_samples_split": [15, 20],
    "learning_rate": [0.01, 0.1],
    "n_estimators": [100, 200, 300],
}

# Keep only the best hyper-parameter dict; the score is printed by the helper.
gb_model_params = get_best_model(gb, gb_params)

Best cross-validation score: 0.88


##### AdaBoost

In [11]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost candidate: search the learning rate and the number of estimators.
ada = AdaBoostRegressor()
ada_params: dict = {
    "learning_rate": [0.01, 0.1, 1],
    "n_estimators": [10, 30, 50, 70, 100, 300, 500],
}

# Keep only the best hyper-parameter dict; the score is printed by the helper.
ada_model_params = get_best_model(ada, ada_params)

Best cross-validation score: 0.81


##### Decision Tree

In [12]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree candidate: try an unbounded depth (None) and depths
# 1..9, combined with min_samples_split values 2..29.
dt = DecisionTreeRegressor()
dt_params: dict = {
    "max_depth": [None] + list(range(1, 10)),
    "min_samples_split": range(2, 30),
}

# Keep only the best hyper-parameter dict; the score is printed by the helper.
dt_model_params = get_best_model(dt, dt_params)

Best cross-validation score: 0.72


#### Train The Best Model With Whole Training Set

In [18]:
# Gradient Boosting gave the best results
# Build the feature matrix for the competition test file by removing the
# placeholder target column. Note the lowercase name: this is a different
# object from the X_test returned by train_test_split.
x_test = processed_test_df.drop(columns="SalePrice")

# Replace missing values in these three columns with the column's own mean.
for gap_column in ("BsmtUnfSF", "BsmtFullBath", "GarageArea"):
    x_test[gap_column] = x_test[gap_column].fillna(x_test[gap_column].mean())

In [20]:
# Refit gradient boosting with the selected hyper-parameters on every
# training row (X, y), not just the X_train subset used for the search.
gb = GradientBoostingRegressor(**gb_model_params)
gb.fit(X, y)

# The model was trained on the scaled target, so its predictions are in the
# scaler's standardised space.
pred_scaled = gb.predict(x_test)

# Convert the predictions back to the original units using only the
# "SalePrice" entry of the fitted StandardScaler (x * scale_ + mean_, which is
# what inverse_transform computes per column). Working on this single column
# leaves the remaining columns of test_data untouched, so the cell can be
# re-run safely.
scaler = column_transformer.named_transformers_['scaler']
sale_price_pos = numerical_features.index("SalePrice")
test_data["SalePrice"] = (
    pred_scaled.reshape(-1) * scaler.scale_[sale_price_pos]
    + scaler.mean_[sale_price_pos]
)

In [21]:
# Assemble the two-column submission frame from the test identifiers and the
# predicted prices.
submission = test_data[["Id", "SalePrice"]].copy()

# Replace any missing prediction with the mean predicted price. The result is
# assigned back to the column: calling fillna(..., inplace=True) on a column
# selection is a chained in-place update, which pandas deprecates and which
# has no effect on the frame under Copy-on-Write.
submission["SalePrice"] = submission["SalePrice"].fillna(submission["SalePrice"].mean())

# Write without the index so the file contains only "Id" and "SalePrice".
submission.to_csv("./dataset/submission_ml.csv", index=False)