In [None]:
import sys
src_path = "../src/"
if src_path not in sys.path:
    sys.path.append(src_path)

from helpers_module import helpers as hlp


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt


from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer, MissingIndicator

import xgboost as xgb
from xgboost import XGBRegressor, plot_importance
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error,mean_squared_log_error

# Notebook-wide display defaults: show every column, cap printed rows at 30.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 30)
# Use the ggplot look for all matplotlib figures.
plt.style.use('ggplot')

## Load data

In [None]:
# Seed passed to the train/validation split further down.
RANDOM_STATE = 27

# Raw train / test tables; paths are relative to the notebook directory.
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Keep the test ids aside: the 'Id' column is dropped during preprocessing
# but is needed again when the submission file is written.
test_id_col = test_df['Id']
n_train, n_test = len(train_df), len(test_df)

print(f"Train rows: {n_train}, Test rows: {n_test}")
train_df


## Common preprocessing

In [None]:
# Keep only train rows that have a target value.
train_df = train_df.dropna(subset=['SalePrice'])

# Refresh the train row count: it is used later to split the combined frame
# back into train / test, so it must reflect any rows dropped just above.
n_train = train_df.shape[0]

# Separate the target from the features.
target = train_df['SalePrice']
train_df = train_df.drop(columns=['SalePrice'])

# Stack train on top of test so both receive identical preprocessing;
# the first n_train rows of union_df are the train rows.
union_df = pd.concat([train_df, test_df]).reset_index(drop=True)

## Preparing data


### Hard drop columns

In [None]:
# hard list of columns to drop
columns_to_drop_hard = ['Id']
union_df = union_df.drop(columns_to_drop_hard, axis=1)


### Check missing values

In [None]:
def print_missing_values(df, top_n=10, figsize=(16, 12)):
    """Plot the columns of ``df`` with the highest share of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    top_n : int, default 10
        Number of columns (those with the most missing values) to show.
    figsize : tuple, default (16, 12)
        Figure size passed to ``plt.figure``.

    Returns
    -------
    matplotlib.axes.Axes
        Axes holding the horizontal bar plot (one bar per column, in percent).
    """
    # isna().mean() is the fraction of missing rows per column; scale to percent.
    missing_pct = (df.isna().mean() * 100).sort_values(ascending=False).head(top_n)

    plt.figure(figsize=figsize)
    ax = sns.barplot(y=missing_pct.index, x=missing_pct.values)
    ax.set_title("Missing values in %")
    return ax


# Trailing ';' keeps the returned Axes repr out of the cell output.
print_missing_values(union_df);

### Check unique values in categorical

In [None]:
# Summary produced by the project helper
# (presumably unique values per column - see helpers_module to confirm).
unique_values_df = hlp.get_unique_values_by_columns(union_df)
unique_values_df.head(20)

### Fill categorical missing values

In [None]:
# Columns where NA means the feature is absent (e.g. no basement or garage):
# fill with the literal category 'None'.
none_fill_cols = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu',
    'GarageFinish', 'GarageType', 'GarageQual', 'GarageCond',
    'Alley', 'PoolQC', 'Fence', 'MiscFeature',
    'MasVnrType',
]

# Assign the filled column back instead of calling fillna(inplace=True) on
# union_df[col]: the latter is chained assignment, which warns in pandas 2.x
# and leaves union_df unchanged when Copy-on-Write is enabled.
for col in none_fill_cols:
    union_df[col] = union_df[col].fillna('None')


# Columns where NA is a genuinely unknown value: fill with the most frequent one.
mode_fill_cols = [
    'MSZoning', 'Functional', 'SaleType', 'Exterior2nd', 'Exterior1st', 'KitchenQual', 'Electrical',
]

for col in mode_fill_cols:
    # mode() is sorted; [0] picks the first (most frequent) value.
    union_df[col] = union_df[col].fillna(union_df[col].mode()[0])


In [None]:
# Show how the 'Utilities' categories are distributed.
ax = union_df['Utilities'].value_counts().plot.bar()
ax.set_title('Utilities')

# non-informative - just drop
union_df = union_df.drop(columns=['Utilities'])

### Fill numerical missing values

In [None]:
# Fill missing LotFrontage with the median LotFrontage of the same Neighborhood.
union_df["LotFrontage"] = (
    union_df
    .groupby("Neighborhood")["LotFrontage"]
    .transform(lambda s: s.fillna(s.median()))
)
# A neighborhood with no known LotFrontage has a NaN median and would stay
# missing; fall back to the overall median for any such rows (no-op otherwise).
union_df["LotFrontage"] = union_df["LotFrontage"].fillna(union_df["LotFrontage"].median())


# Garage / masonry veneer / basement measurements: missing values become 0.
zero_fill_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'MasVnrArea',
    'BsmtFullBath', 'BsmtHalfBath', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1',
]

# Assign back rather than union_df[col].fillna(0, inplace=True): the in-place
# call on a selected column is chained assignment, which warns in pandas 2.x
# and leaves union_df unchanged when Copy-on-Write is enabled.
union_df[zero_fill_cols] = union_df[zero_fill_cols].fillna(0)


numeric_cols = hlp.get_numeric_cols(union_df)

###  Check missing values again

In [None]:
# Total number of NaN cells left in the combined frame.
n_missing_total = union_df.isna().sum().sum()
print("Count of missing values: ", n_missing_total)

### Encoding categorical columns

In [None]:
# Categorical handling strategy (also recorded in the submission file name):
#   1 - just drop all categorical columns
#   2 - encode by simple labels
#   3 - encode by one hot
cat = 2

categorical_strategies = {
    1: hlp.drop_str_cols,
    2: hlp.encode_with_labels,
    3: hlp.encode_with_one_hot,
}

# Any other value of `cat` leaves union_df untouched.
if cat in categorical_strategies:
    union_df = categorical_strategies[cat](union_df)


## Numerical columns processing

In [None]:
# min_max_scaler = MinMaxScaler()
# union_df = min_max_scaler.fit_transform()

## Split data back

In [None]:
# union_df was built as train rows followed by test rows,
# so a positional cut at n_train recovers both parts.
train_df = union_df.iloc[:n_train]
test_df = union_df.iloc[n_train:]

train_df


## Preparing for model

### Skew

Prices are right-skewed, so the target is log-transformed (`log1p`) to bring its distribution closer to normal.

In [None]:
print("Skew is:", target.skew())

# Left: raw target distribution. Right: the same target after log1p.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 6))

sns.histplot(target, kde=True, ax=ax_raw)
sns.histplot(np.log1p(target), kde=True, ax=ax_log, color="red")
plt.show()

### Split data

In [None]:
# Model inputs: train / test features and the log1p-transformed target.
X, X_test = train_df, test_df
y = np.log1p(target)

# Hold-out split; 0.25 is scikit-learn's default validation share, spelled out.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

## Research Model

### Find best hyperparameters for XGBoost

In [None]:
# Switch for the (slow) grid search; when False this cell does nothing.
need_find_best_params = False

if need_find_best_params:
    # Booster settings held fixed during the search.
    xgb_params = {
        'colsample_bytree': 0.3,
        'learning_rate': 0.05,
        'max_depth': 3,
        'min_child_weight': 1,
        'reg_alpha': 0,
        'reg_lambda': 2,
        'subsample': 0.5
    }

    # Only the number of trees is searched.
    param_grid = {
        "n_estimators": [500,600,700,800,900,1000,1200,1500,1700,2000,2500,3000,4000,5000]
    }

    reserch_model = XGBRegressor(**xgb_params)

    # 5-fold exhaustive search scored by negated MSE (higher is better).
    xgb_rscv = GridSearchCV(reserch_model, param_grid = param_grid,
                                  scoring='neg_mean_squared_error',
                                  n_jobs=4,
                                  cv=5,
                                  verbose = True)

    # Extra keyword arguments are forwarded to XGBRegressor.fit. No eval_set is
    # given, so an eval_metric would have nothing to score and is not passed;
    # fit-time eval_metric is also deprecated in newer XGBoost releases.
    model_xgboost = xgb_rscv.fit(X, y, verbose=False)

    best_params = model_xgboost.best_params_
    print(f"Best score: {model_xgboost.best_score_:.5f} \nBest params: {best_params}")


### Find optimal  n_estimators

In [None]:
# Hyperparameters for the early-stopping run; n_estimators is the upper
# bound on boosting rounds.
xgb_params = {
    'colsample_bytree': 0.3,
    'learning_rate': 0.05,
    'max_depth': 3,
    'min_child_weight': 1,
    'n_estimators': 700,
    'reg_alpha': 0,
    'reg_lambda': 2,
    'subsample': 0.5
}

reserch_model = XGBRegressor(**xgb_params)

# Stop once validation RMSE has not improved for 200 rounds.
# NOTE(review): fit-time early_stopping_rounds / eval_metric are deprecated in
# newer XGBoost releases (constructor arguments instead) - confirm against the
# installed version before upgrading.
reserch_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='rmse',
    early_stopping_rounds=200,
    verbose=False,
)

y_train_pred = reserch_model.predict(X_train)
y_valid_pred = reserch_model.predict(X_valid)

# All metrics are on the log1p(SalePrice) scale used for y.
train_rmse = sqrt(mean_squared_error(y_train, y_train_pred))
valid_rmse = sqrt(mean_squared_error(y_valid, y_valid_pred))
valid_r2 = r2_score(y_valid, y_valid_pred)

print(f"RMSE train: {train_rmse:.5f}")
print(f"RMSE: {valid_rmse:.5f}")
print(f"R2: {valid_r2:.5f}")
print(f"Best: {reserch_model.best_score:.5f}, iter={reserch_model.best_iteration:d}")



### Cross-validation check by RMSE and R2

In [None]:
%%time
# best_iteration is a 0-based round index, so the number of trees that
# reproduces the early-stopped model is one more than that.
xgb_params['n_estimators'] = reserch_model.best_iteration + 1

# 5-fold cross-validation of the tuned configuration on the full train set.
model_for_cross_val = XGBRegressor(**xgb_params)

scores = cross_validate(model_for_cross_val, X, y,
                        n_jobs=4, cv=5,
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True)

# The scorer reports *negated MSE* per fold; flip the sign and take the
# square root per fold so the printed numbers really are RMSE.
rmse_train_folds = np.sqrt(-scores['train_neg_mean_squared_error'])
rmse_test_folds = np.sqrt(-scores['test_neg_mean_squared_error'])

print("RMSE train mean: {:10.5f}".format(rmse_train_folds.mean()))
print("R2 train mean:   {:10.5f}".format(scores['train_r2'].mean()))
print("----")
print("RMSE test mean:  {:10.5f}".format(rmse_test_folds.mean()))
print("R2 test mean:    {:10.5f}".format(scores['test_r2'].mean()))

In [None]:
# Inspect the combined train+test frame after all preprocessing steps above.
union_df

## Train model

In [None]:
# Final model: current xgb_params, refit on every training row.
model_final = XGBRegressor(**xgb_params)
model_final.fit(X, y)

y_train_pred = model_final.predict(X)

# RMSE / R2 are reported on the log1p(SalePrice) scale the model was trained on.
print("RMSE train: {:.5f}".format(sqrt(mean_squared_error(y, y_train_pred))))
# RMSLE applies log1p itself, so it must receive prices, not the already
# log-transformed y (that would take the log twice). Predictions are mapped
# back with expm1; the result should match the log-scale RMSE above up to
# floating-point error.
print("RMSLE train: {:.5f}".format(sqrt(mean_squared_log_error(target, np.expm1(y_train_pred)))))
print("R2 train: {:.5f}".format(r2_score(y, y_train_pred)))

## Prediction and save result

In [None]:
def params_to_str(params):
    """Render a parameter mapping as a compact, underscore-joined tag.

    Each key is shortened to the first two characters of every
    '_'-separated part, immediately followed by ``str(value)``; the
    per-key pieces are joined with '_'. An empty mapping yields ''.
    """
    def _abbreviate(key):
        # 'max_depth' -> 'made', 'n_estimators' -> 'nes'
        return "".join(part[:2] for part in key.split('_'))

    return "_".join(_abbreviate(key) + str(value) for key, value in params.items())


In [None]:
# The model predicts log1p(SalePrice); map predictions back to prices.
y_test_pred = np.expm1(model_final.predict(X_test))

# Hold-out RMSE (log scale) of the early-stopped research model; it tags the
# submission file name below and has to be defined here for the cell to run.
rmse = sqrt(mean_squared_error(y_valid, reserch_model.predict(X_valid)))

output = pd.DataFrame({'Id': test_id_col, 'SalePrice': y_test_pred})
output.to_csv(f'../data/rmse_{rmse:.5f}_xgb{reserch_model.best_iteration}_r_Id_enc-cat{cat}_logy_{params_to_str(xgb_params)}.csv', index=False)