In [None]:
import pandas as pd
import os
import numpy as np

# Read the training and test tables; both paths are relative to the
# notebook's working directory.
train_csv_path = "./datasets/train.csv"
test_csv_path = "./datasets/test.csv"
train_data = pd.read_csv(train_csv_path)
x_test_df = pd.read_csv(test_csv_path)

In [None]:
# Separate the target column from the predictor columns.
target_column = "SalePrice"
y_train_df_full = train_data[target_column]
x_train_df_full = train_data.drop(columns=target_column)

# Remember the row ids before the 'Id' column is dropped; the test ids are
# reused later as the 'Id' column of the prediction CSVs.
x_train_ids = x_train_df_full['Id']
x_test_ids = x_test_df['Id']

# Remove the same columns from both tables so their layouts stay aligned.
columns_to_drop = ['Id', 'PoolQC', 'Alley', 'MiscFeature']
x_train_df_full = x_train_df_full.drop(columns_to_drop, axis=1)
x_test_df = x_test_df.drop(columns_to_drop, axis=1)

print(x_test_df.shape)

In [None]:
# Partition the feature names by dtype: numeric columns go through the numeric
# pipeline, object (string) columns through the categorical one.
numeric_frame = x_train_df_full.select_dtypes(include=np.number)
object_frame = x_train_df_full.select_dtypes(include='object')
num_attribs = list(numeric_frame.columns)
cat_attribs = list(object_frame.columns)
# print(x_train_full[cat_attribs].info(), x_test[cat_attribs].info())

In [None]:
# x_train_full[cat_attribs] = x_train_full[cat_attribs].apply(lambda x: x.fillna(x.value_counts().index[0]))
# x_test[cat_attribs] = x_test[cat_attribs].apply(lambda x: x.fillna(x.value_counts().index[0]))

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

from sklearn.pipeline import Pipeline

from sklearn.base import BaseEstimator, TransformerMixin

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value.

    Works on pandas DataFrames: ``fit`` records one fill value per column and
    ``transform`` applies those values with ``DataFrame.fillna``.
    """

    def fit(self, X, y=None):
        """Learn the most frequent non-null value of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Data whose columns are inspected.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        fill_values = {}
        for column in X.columns:
            counts = X[column].value_counts()
            # value_counts() skips NaN, so a column with no observed values
            # gives an empty result. Leave such a column without a fill value
            # instead of failing with IndexError on counts.index[0].
            if not counts.empty:
                fill_values[column] = counts.index[0]
        # Explicit dtype keeps the (possibly empty) Series well-defined.
        self.most_frequent_ = pd.Series(fill_values, dtype=object)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing entries replaced by the learned values.

        Columns that have no learned fill value are returned unchanged.
        """
        return X.fillna(self.most_frequent_)

# Numeric features: replace missing values with the column median, then
# standardise every column with StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: fill missing values with the most frequent category,
# then map every category to an integer code.
# handle_unknown/unknown_value make the fitted encoder usable on data that
# contains categories it never saw during fit: those are encoded as -1
# instead of raising. (These options require scikit-learn >= 0.24.)
cat_pipeline = Pipeline([
    ('imputer', MostFrequentImputer()),
    ('labels', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    #('cat',  OneHotEncoder(sparse=False)) #one-hot encode the columns in the dataset
])

def get_prepared_data():
    """Build a new, unfitted ColumnTransformer for the current feature lists.

    The module-level ``num_attribs`` and ``cat_attribs`` are read at call
    time, so the returned transformer reflects whatever those lists hold when
    this function is invoked. Numeric columns are routed through
    ``num_pipeline`` and object columns through ``cat_pipeline`` (ordinal
    encoding); the numeric block comes first in the output.
    """
    column_steps = [
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs),
    ]
    return ColumnTransformer(transformers=column_steps)

In [None]:
# Fit the preprocessing on the training features only, then apply that same
# fitted transformer to the test features. Calling fit_transform on the test
# set would learn separate medians, scaling statistics and category codes, so
# the same raw value could be encoded differently in train and test.
# Note: a category that appears only in the test set makes OrdinalEncoder
# raise at transform time unless the encoder is configured with handle_unknown.
preprocessor = get_prepared_data()
x_train_full_prepared = preprocessor.fit_transform(x_train_df_full)
x_test = preprocessor.transform(x_test_df)

In [None]:
from sklearn.model_selection import train_test_split

def get_prepared_train_val(train_data, train_labels, test_size=0.25, random_state=None):
    """Split prepared features and labels into training and validation parts.

    Parameters
    ----------
    train_data : array-like
        Prepared feature matrix.
    train_labels : array-like
        Target values aligned with ``train_data``.
    test_size : float or int, default 0.25
        Size of the validation part, forwarded to ``train_test_split``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. Pass an integer to get the
        same split on every run; ``None`` keeps the split random.

    Returns
    -------
    tuple
        ``(x_train, x_val, y_train, y_val)`` with both label parts converted
        to plain Python lists.
    """
    x_train, x_val, y_train, y_val = train_test_split(
        train_data, train_labels, test_size=test_size, random_state=random_state
    )
    return x_train, x_val, list(y_train), list(y_val)

x_train, x_val, y_train, y_val = get_prepared_train_val(x_train_full_prepared, y_train_df_full)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: a random forest with default settings, fitted on the training
# split. fit() returns the estimator itself, so it can be chained.
randf_reg = RandomForestRegressor().fit(x_train, y_train)

# R^2 on the held-out validation split.
randf_reg.score(x_val, y_val)

In [None]:
# Predict with the baseline forest on the prepared test features; the result
# is only displayed here, it is not stored or written to disk.
randf_reg.predict(x_test)

Drop the unimportant features: rank the features by the random forest's importances and remove the lowest-ranked ones.

In [None]:
# Pair every importance with its feature name. The order num_attribs +
# cat_attribs matches the column order produced by the ColumnTransformer
# (numeric block first, then one ordinal column per categorical feature).
feature_names = num_attribs + cat_attribs
importances_cols = sorted(
    zip(randf_reg.feature_importances_, feature_names),
    reverse=True,
)

# Everything ranked below position len // 2 + 4 is treated as unimportant.
cutoff = len(importances_cols) // 2 + 4
unimportant_cols = [feature for _, feature in importances_cols[cutoff:]]
importances_cols

In [None]:
# Remove the low-importance columns from both tables.
# Reassigning instead of mutating in place, together with errors="ignore",
# keeps this cell safe to re-run: columns that are already gone are skipped
# instead of raising KeyError.
x_train_df_full = x_train_df_full.drop(columns=unimportant_cols, errors="ignore")
x_test_df = x_test_df.drop(columns=unimportant_cols, errors="ignore")

In [None]:
# Recompute the feature lists for the reduced tables; get_prepared_data()
# reads them when it is called.
num_attribs = x_train_df_full.select_dtypes(np.number).columns.tolist()
cat_attribs = x_train_df_full.select_dtypes('object').columns.tolist()

# Fit the preprocessing on the training features only and reuse the fitted
# transformer for the test features, so both are imputed, scaled and encoded
# with the same statistics and category codes.
preprocessor = get_prepared_data()
x_train_full_prepared = preprocessor.fit_transform(x_train_df_full)
x_test = preprocessor.transform(x_test_df)

x_train, x_val, y_train, y_val = get_prepared_train_val(x_train_full_prepared, y_train_df_full)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate forest sizes: 200, 205, ..., 295.
params = [
    dict(n_estimators=np.arange(200, 300, 5)),
]

# NOTE(review): the search space has 20 candidates but n_iter is 30 - confirm
# whether a smaller n_iter or a wider range was intended.
grid_searcher = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=params,
    n_iter=30,
    cv=3,
    verbose=2,
)

In [None]:
# grid_searcher.fit(x_train, y_train)

In [None]:
# grid_searcher.best_params_

In [None]:
# model = grid_searcher.best_estimator_
# Forest size is set by hand here instead of being taken from the search.
n_trees = 230
model = RandomForestRegressor(n_estimators=n_trees).fit(x_train, y_train)

# R^2 on the validation split.
model.score(x_val, y_val)

In [None]:
# Pair each test id with the random forest's predicted sale price.
rf_test_predictions = model.predict(x_test)
res = pd.DataFrame({'Id': x_test_ids, 'SalePrice': rf_test_predictions})

In [None]:
# Write the random-forest predictions to CSV, leaving out the DataFrame index.
res.to_csv('random_forest_out.csv', index=False)

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Support vector regressor with default settings, fitted on the training split.
svr = SVR().fit(x_train, y_train)

# Mean squared error of its predictions on the validation split.
svr_val_predictions = svr.predict(x_val)
mean_squared_error(y_val, svr_val_predictions)

In [None]:
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 asks PCA to keep as many components as
# are needed to explain that fraction of the variance (here 95%).
pca_reducer = PCA(n_components=0.95)

In [None]:
# Fit PCA on the training split and project it onto the kept components.
# x_train_new is not used by any of the later cells visible in this notebook.
x_train_new = pca_reducer.fit_transform(x_train)

In [None]:
# NOTE(review): this displays the shape of the un-reduced training split;
# x_train_new.shape may have been intended to inspect the PCA result - confirm.
x_train.shape

In [None]:
# Disabled experiment kept as a string literal, so none of it is executed.
# NOTE(review): LogisticRegression is a classifier, which is presumably why it
# was abandoned for the continuous SalePrice target - confirm or delete this cell.
'''
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(x_train, y_train)
log_reg.score(x_val, y_val)
'''

In [None]:
# res = pd.DataFrame({'Id': x_test_ids, 'SalePrice': log_reg.predict(x_test)})
# res.to_csv('log_reg_out.csv', index=False)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default settings, fitted on the training split.
gb_reg = GradientBoostingRegressor().fit(x_train, y_train)
gb_reg

In [None]:
# R^2 of the gradient-boosting model on the validation split.
gb_reg.score(x_val, y_val)

In [None]:
# Predict the test set with the gradient-boosting model and save id/price pairs.
gb_test_predictions = gb_reg.predict(x_test)
res = pd.DataFrame({'Id': x_test_ids, 'SalePrice': gb_test_predictions})
res.to_csv('gb_reg_out.csv', index=False)

In [None]:
from sklearn.ensemble import VotingRegressor

# Average the random forest and the gradient-boosting model. VotingRegressor
# fits its own clones of the listed estimators on the data passed to fit().
ensemble_members = [('rand forest', model), ('gradient boost reg', gb_reg)]
voting_reg = VotingRegressor(estimators=ensemble_members).fit(x_train, y_train)

# R^2 of the ensemble on the validation split.
voting_reg.score(x_val, y_val)

In [None]:
# Predict the test set with the voting ensemble and save id/price pairs.
voting_test_predictions = voting_reg.predict(x_test)
res = pd.DataFrame({'Id': x_test_ids, 'SalePrice': voting_test_predictions})
res.to_csv('voting_reg_out.csv', index=False)

Gradient Boosting Regression seems to give the best result, with a root-mean-squared log error (RMSLE) of 0.14485, so I'll tune its hyperparameters next.

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid of 3 x 3 candidate settings for the gradient-boosting model.
params = dict(
    learning_rate=[0.008, 0.01, 0.012],
    n_estimators=[550, 600, 650],
)

# subsample and max_depth stay fixed; only the grid above is searched.
base_gb_reg = GradientBoostingRegressor(subsample=0.1, max_depth=8)
grid_searcher = GridSearchCV(base_gb_reg, param_grid=params, cv=3, verbose=1)

# grid_searcher.fit(x_train, y_train)
# The search runs on the full prepared training set, not just the training split.
full_train_labels = list(y_train_df_full)
grid_searcher.fit(x_train_full_prepared, full_train_labels)

In [None]:
# Predict the test set with the best estimator found by the grid search and
# save id/price pairs.
tuned_gb_test_predictions = grid_searcher.best_estimator_.predict(x_test)
res = pd.DataFrame({'Id': x_test_ids, 'SalePrice': tuned_gb_test_predictions})
res.to_csv('gb_reg_tuned_out.csv', index=False)

In [None]:
# Display the parameter combination selected by the fitted search object.
grid_searcher.best_params_

In [None]:
# Hand-picked gradient-boosting settings, trained on the full prepared
# training set.
# NOTE(review): n_estimators=580 is not one of the grid values (550/600/650)
# searched earlier - confirm where this value comes from.
tuned_gb_params = dict(subsample=0.1, max_depth=8, n_estimators=580, learning_rate=0.01)
tuned_gb_reg = GradientBoostingRegressor(**tuned_gb_params)
tuned_gb_reg.fit(x_train_full_prepared, list(y_train_df_full))

In [None]:
# Random forest with 200 trees, trained on the full prepared training set.
full_train_labels = list(y_train_df_full)
tuned_randf_reg = RandomForestRegressor(n_estimators=200)
tuned_randf_reg.fit(x_train_full_prepared, full_train_labels)

In [None]:
from sklearn.ensemble import VotingRegressor

# Ensemble of the two tuned models, trained on the full prepared training set.
# VotingRegressor fits its own clones of the listed estimators.
tuned_members = [('rand forest', tuned_randf_reg), ('gradient boost reg', tuned_gb_reg)]
voting_reg = VotingRegressor(estimators=tuned_members)
voting_reg.fit(x_train_full_prepared, list(y_train_df_full))

In [None]:
# Predict the test set with the tuned voting ensemble and save id/price pairs.
tuned_voting_predictions = voting_reg.predict(x_test)
res = pd.DataFrame({'Id': x_test_ids, 'SalePrice': tuned_voting_predictions})
res.to_csv('voting_reg_tuned_rfreg_gbreg_out.csv', index=False)