In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model, svm
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, normalize
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import scipy.stats
import matplotlib.pyplot as plt
import xgboost
import seaborn as sns
from sklearn.neural_network import MLPRegressor
import tqdm
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
import optuna

import warnings # suppress warnings
# NOTE(review): with no category/module arguments this filter silences every
# warning for the rest of the session, so deprecation and data-handling
# warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')


In [2]:
# Read the train and test CSVs (paths are relative to the notebook's working
# directory) in a single pass; the training file is read first, as before.
raw_training_dataset, raw_test_dataset = map(
    pd.read_csv,
    ('../EDA/train.csv', '../EDA/test.csv'),
)

In [3]:
# Keep an independent array copy of the test-set 'Id' column; it is used later
# to label the rows of the submission file.
submission_id = np.array(raw_test_dataset.loc[:, 'Id'])

In [4]:
# Columns treated as numeric. Order matters: later cells iterate over this
# list, so the sequence below is exactly the original one.
numerical_features = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
]

In [5]:
# Every column that is neither a numeric feature, the 'Id' column, nor the
# 'SalePrice' target is treated as categorical. The exclusion set is built once
# instead of re-concatenating a list for every column; the resulting list (and
# its order, which follows the DataFrame's column order) is unchanged.
non_categorical_columns = set(numerical_features) | {'Id', 'SalePrice'}
categorical_features = [
    col for col in raw_training_dataset.columns
    if col not in non_categorical_columns
]

In [6]:
# Selection thresholds: keep a feature when |Pearson r| >= MIN_ABS_CORRELATION
# and its p-value <= MAX_P_VALUE (both comparisons are inclusive).
MIN_ABS_CORRELATION = 0.5
MAX_P_VALUE = 0.05


def _pearson_with_sale_price(values):
    """Return ``[r, p]`` for the Pearson correlation of ``values`` with SalePrice.

    Both numbers are rounded to 5 decimal places. The correlation is computed
    once per call and unpacked, instead of calling ``pearsonr`` twice.
    """
    r_value, p_value = scipy.stats.pearsonr(values, raw_training_dataset['SalePrice'])
    return [round(r_value, 5), round(p_value, 5)]


def _select_relevant(correlations):
    """Return the keys of ``correlations`` whose ``[r, p]`` entry passes both thresholds."""
    return [
        name for name, (r_value, p_value) in correlations.items()
        if abs(r_value) >= MIN_ABS_CORRELATION and p_value <= MAX_P_VALUE
    ]


# Missing numeric values are replaced by 0 in BOTH raw frames, in place.
# The feature-matrix cell further down reads these filled columns.
raw_training_dataset[numerical_features] = raw_training_dataset[numerical_features].fillna(0)
raw_test_dataset[numerical_features] = raw_test_dataset[numerical_features].fillna(0)

# Numeric features: correlate each column with the target.
correlation_dict = {
    col: _pearson_with_sale_price(raw_training_dataset[col])
    for col in numerical_features
}
relevant_numerical_features = _select_relevant(correlation_dict)

print(f"Relevant Numerical Features: {relevant_numerical_features}")


# Categorical features: one-hot encode each column and correlate every
# resulting indicator with the target. Keys have the form '<column>_<level>'.
correlation_dict_categorical = {}
for col in categorical_features:
    one_hot_df = pd.get_dummies(raw_training_dataset[col])
    for ohc in one_hot_df.columns:
        correlation_dict_categorical[f'{col}_{ohc}'] = _pearson_with_sale_price(one_hot_df[ohc])

relevant_categorical_features = _select_relevant(correlation_dict_categorical)


print(f"Relevant Categorical Features: {relevant_categorical_features}")




Relevant Numerical Features: ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea']
Relevant Categorical Features: ['ExterQual_TA', 'BsmtQual_Ex', 'KitchenQual_Ex', 'KitchenQual_TA']


In [7]:
# --- Build the train / test model matrices from the selected features --------

# `relevant_categorical_features` holds indicator names of the form
# '<column>_<level>'. Recover the source columns and drop repeats: when two
# levels of the same column are selected, the column would otherwise be pulled
# in twice and every one of its dummy columns duplicated in X.
# dict.fromkeys keeps the first-seen order.
relevant_categorical_columns = list(
    dict.fromkeys(name.split("_")[0] for name in relevant_categorical_features)
)

# dtype=float keeps the concatenated frame purely numeric, so `.values` below
# is a float array regardless of the default dummy dtype of the pandas version.
one_hot_df = pd.get_dummies(raw_training_dataset[relevant_categorical_columns], dtype=float)

# L2-scale each numeric column by its norm over the TRAINING rows. A new frame
# is built rather than assigning into a slice of the raw frame.
train_numeric = raw_training_dataset[relevant_numerical_features].astype(float)
column_norms = np.sqrt((train_numeric ** 2).sum(axis=0))
# An all-zero column has norm 0; divide by 1 instead so it stays all-zero
# (this is also what sklearn's `normalize` does for zero norms).
column_norms = column_norms.replace(0, 1.0)
normalized_df = train_numeric / column_norms

normalized_onehot_df = pd.concat([normalized_df, one_hot_df], axis=1)

X = normalized_onehot_df.values
# Target kept as an (n_rows, 1) column, selected by name rather than position.
y = raw_training_dataset[['SalePrice']].values

# The test columns are divided by the SAME training norms. Scaling the test set
# by its own norms would put train and test features on different scales.
normalized_test_df = raw_test_dataset[relevant_numerical_features].astype(float) / column_norms

# Align the test dummies to the training dummy columns: levels missing from the
# test set become all-zero columns, levels unseen in training are dropped, and
# the column order matches X.
one_hot_df_test = pd.get_dummies(
    raw_test_dataset[relevant_categorical_columns], dtype=float
).reindex(columns=one_hot_df.columns, fill_value=0.0)

normalized_onehot_df_test = pd.concat([normalized_test_df, one_hot_df_test], axis=1)

x_test = normalized_onehot_df_test.values

# The model is fitted on X and predicts on x_test, so the layouts must agree.
assert X.shape[1] == x_test.shape[1], "train/test feature matrices have different widths"

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=41)

In [None]:
def algorithm_objective(trial):
    """Optuna objective for tuning an ``xgboost.XGBRegressor``.

    Samples one hyper-parameter configuration from ``trial`` and scores it with
    5-fold cross-validation using the ``neg_mean_squared_log_error`` scorer.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample each hyper-parameter.

    Returns
    -------
    float
        Mean of the five fold scores (negated MSLE, so larger is better).
    """
    # Only the hold-out part of this split is used: cross-validation runs on
    # the 90% of rows that are NOT in the 10% `train_size` portion.
    # NOTE(review): `train_size = 0.1` looks unusual -- confirm whether
    # `test_size` (or cross-validating on all of X, y) was intended.
    _, x_val, _, y_val = train_test_split(X, y, train_size = 0.1, random_state=41)
    param = {
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1),
        'n_estimators': trial.suggest_int('n_estimators', 1, 2000),
        'max_depth': trial.suggest_int('max_depth', 5,100),
        'random_state': 41,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgboost.XGBRegressor(**param, eval_metric= 'rmsle')
    # No explicit model.fit()/predict() here: cross_val_score clones the
    # estimator and refits it on every fold, so a separate fit never influenced
    # the returned score and only added one full training run per trial.
    validations = cross_val_score(model, x_val, y_val, cv = 5, n_jobs = -1,scoring = 'neg_mean_squared_log_error')
    return np.mean(validations)

In [None]:
# The objective returns a negated error, so the study maximises it.
study = optuna.create_study(direction='maximize')
study.optimize(algorithm_objective, n_trials=10000)

# Report how many trials ran and the parameters of the best one.
finished_trial_count = len(study.trials)
best_trial_params = study.best_trial.params
print('Number of finished trials:', finished_trial_count)
print('Best trial:', best_trial_params)

In [None]:
# Hard-coded hyper-parameters for the final model (presumably copied from the
# best trial of a tuning run -- the keys match the tuned search space).
best_param = {
    'lambda': 8.821073920585857,
    'alpha': 8.612310788986797,
    'colsample_bytree': 0.5,
    'subsample': 0.4,
    'learning_rate': 0.005918228156202181,
    'n_estimators': 1479,
    'max_depth': 6,
    'min_child_weight': 1,
}

# Fit on the full training matrix (fit() returns the estimator itself), then
# predict the test rows.
xgboost_model = xgboost.XGBRegressor(eval_metric= 'rmse', **best_param).fit(X, y)
pred_test = xgboost_model.predict(x_test)

# Assemble the two-column submission frame in one step and write it out
# without the index column.
submission_df = pd.DataFrame({'Id': submission_id, 'SalePrice': pred_test})
submission_df.to_csv('submission.csv', index=False)
