## Data Loading

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training set; the 'Id' column becomes the row index
X_full = pd.read_csv('./Datasets/train.csv', index_col='Id')

# Keep only rows that have a target value, then split the target off
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Look the dtypes up once and bucket the predictor columns by them
column_dtypes = X_full.dtypes

# Categorical columns: anything pandas stored with the generic "object" dtype
categorical_cols = [cname for cname, col_dtype in column_dtypes.items()
                    if col_dtype == "object"]

# Numerical columns: 64-bit integers and floats
numerical_cols = [cname for cname, col_dtype in column_dtypes.items()
                  if col_dtype in ['int64', 'float64']]

## Feature Selection and Cleaning

### Numerical Columns

**Imputation**

In [2]:
from sklearn.impute import SimpleImputer

# Replace missing numeric values with the median of their column.
# fit_transform returns a bare array, so the column labels are re-attached
# when the DataFrame is built; the row index is the default positional one.
num_features = X_full[numerical_cols]
imputer = SimpleImputer(strategy="median")
X_num_imputed = pd.DataFrame(
    imputer.fit_transform(num_features),
    columns=num_features.columns,
)

**Normalization**

In [3]:
# Disabled alternative to the StandardScaler cell: build X_num_norm with
# sklearn's normalize() instead.
# NOTE(review): normalize() rescales each sample (row) by default rather than
# each feature (column) — confirm that is why this variant was switched off.
# from sklearn.preprocessing import normalize 

# X_num_norm = pd.DataFrame(normalize(X_num_imputed))
# X_num_norm.columns = X_num_imputed.columns

In [4]:
from sklearn.preprocessing import StandardScaler

# Standardise the imputed numeric features with a freshly fitted scaler.
# The scaler hands back a bare array, so the column labels are restored
# while constructing the DataFrame.
scal = StandardScaler()
X_num_norm = pd.DataFrame(
    scal.fit_transform(X_num_imputed),
    columns=X_num_imputed.columns,
)


In [5]:
# Freeze the numeric branch of the preprocessing as an independent (deep) copy
X_num_final = X_num_norm.copy(deep=True)

### Categorical Columns

**Imputation**

In [6]:
from sklearn.impute import SimpleImputer

# Replace missing categorical values with the most frequent value of their
# column. As with the numeric branch, the imputer returns a bare array, so the
# column labels are re-attached when the DataFrame is built.
cat_features = X_full[categorical_cols]
imputer = SimpleImputer(strategy="most_frequent")
X_cat_imputed = pd.DataFrame(
    imputer.fit_transform(cat_features),
    columns=cat_features.columns,
)

**Encoding**

In [None]:
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

# Partition the imputed categorical columns by cardinality up front, while the
# frame still holds only the original features. Deriving the high-cardinality
# list after the one-hot columns have been joined in would also sweep up the
# encoder output.
low_card_cols = [cname for cname in X_cat_imputed.columns if
                    X_cat_imputed[cname].nunique() < 10]
high_card_cols = [cname for cname in X_cat_imputed.columns if
                    cname not in low_card_cols]

# Low Cardinality Encoding: one indicator column per (feature, category) pair.
enc = OneHotEncoder(handle_unknown='ignore')
# fit_transform returns a SciPy sparse matrix by default. pandas does not
# expand a sparse matrix into one numeric column per category, so densify it
# before building the DataFrame.
one_hot_values = enc.fit_transform(X_cat_imputed[low_card_cols]).toarray()
# Name every indicator "<feature>_<category>" so all column labels are strings
# and each encoded column can be traced back to its source feature.
one_hot_names = [f"{cname}_{category}"
                 for cname, categories in zip(low_card_cols, enc.categories_)
                 for category in categories]
X_cat_enc = pd.DataFrame(one_hot_values, columns=one_hot_names,
                         index=X_cat_imputed.index)
print("monsters done")

# High Cardinality Encoding: replace each category by how often it occurs.
enc = ce.CountEncoder()
high_card_enc = pd.DataFrame(enc.fit_transform(X_cat_imputed[high_card_cols]))
high_card_enc.columns = high_card_cols
high_card_enc.index = X_cat_imputed.index

# Downstream cells read the encoded features from X_cat_imputed, so rebind the
# name to the combined result: count-encoded columns first, indicators after.
# NOTE: because the name is reused, re-running this cell without re-running
# the imputation cell first would encode already-encoded data.
X_cat_imputed = high_card_enc.join(X_cat_enc)

monsters done


  keys, counts = _value_counts_arraylike(values, dropna)


**Normalization**

In [None]:
# Disabled alternative to the StandardScaler cell: build X_cat_norm with
# sklearn's normalize() instead. The only import of normalize in this notebook
# is itself commented out, so re-enabling this cell also needs that import.
# X_cat_norm = pd.DataFrame(normalize(X_cat_imputed))
# X_cat_norm.columns = X_cat_imputed.columns

In [None]:
from sklearn.preprocessing import StandardScaler 

# Standardise the encoded categorical features with a freshly fitted scaler.
scal = StandardScaler()

X_cat_norm = pd.DataFrame(scal.fit_transform(X_cat_imputed))
# The scaler returns a bare array, so take the labels from the frame that was
# scaled. Assigning X_cat_norm.columns to itself was a no-op that left the
# columns numbered 0..k-1.
X_cat_norm.columns = X_cat_imputed.columns

In [None]:
# Freeze the categorical branch of the preprocessing as an independent (deep) copy
X_cat_final = X_cat_norm.copy(deep=True)

**Final Data**

In [None]:
# Put the numeric and categorical feature blocks side by side. The join is a
# left join on the row index, so the rows of X_num_final define the result.
X_final = X_num_final.join(X_cat_final, how="left")

# Remember the full list of feature labels that make up the final matrix
final_cols = X_final.columns

### Drop columns that have very little correlation with the output

In [None]:
# Disabled experiment: drop every feature whose absolute correlation with the
# target is below 0.03 and keep final_cols in sync.
# NOTE(review) before re-enabling:
#   * Series.iteritems() was removed in pandas 2.0; use .items() instead.
#   * y keeps the 'Id' index from the CSV while X_final is built from bare
#     arrays and is positionally indexed. corrwith() aligns on index labels,
#     so it may pair the wrong rows; re-index y first, e.g.
#     pd.Series(y.values, index=X_final.index).
# to_del = []
# corr = X_final.corrwith(y).abs()
# for col_name, c in corr.iteritems():
#     if c < 0.03:
#         to_del.append(col_name)
# X_final.drop(to_del, axis=1, inplace=True)
# final_cols = [col for col in final_cols if col not in to_del]

### Drop columns that are very highly correlated with each other

In [None]:
# Disabled experiment: for each feature not already marked for deletion, mark
# every other feature whose pairwise correlation with it is above 0.96 or
# below -0.96, then drop the marked features and keep final_cols in sync.
# NOTE(review) before re-enabling: Series.iteritems() was removed in
# pandas 2.0; use .items() instead.
# all_corr = X_final.corr()
# to_del = []
# for col in all_corr:
#     if col not in to_del:
#         for col_name, c in all_corr[col].iteritems():
#             if col_name != col and (c > 0.96 or c < -0.96):
#                 to_del.append(col_name)
# X_final.drop(to_del, axis=1, inplace=True)
# final_cols = [col for col in final_cols if col not in to_del]

### Train XGB

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Hyperparameter grid for the regressor.
# The keys must be XGBRegressor's own parameter names. A "model__" prefix is
# the syntax for reaching into a Pipeline step called "model", but the
# estimator below is passed to GridSearchCV directly, so prefixed keys do not
# address n_estimators / learning_rate and the search would not tune them.
param_grid = {
    'n_estimators': list(range(1000, 1201, 50)),          # 1000, 1050, ..., 1200
    'learning_rate': [n/1000 for n in range(5, 10, 1)]    # 0.005, 0.006, ..., 0.009
}

# Exhaustive search over the grid with 5-fold cross-validation on all cores.
# Scores are negated mean absolute error, so higher (closer to 0) is better.
search = GridSearchCV(XGBRegressor(), param_grid, n_jobs=-1, cv=5, verbose=5,
                      scoring='neg_mean_absolute_error')
search.fit(X_final, y)

In [None]:
# Parameter combination from param_grid that achieved the best CV score
search.best_params_

In [None]:
# cv_results_ is a dict of parallel arrays (one entry per parameter
# combination). Tabulate it and list the best-ranked combinations first
# instead of dumping the raw dict.
pd.DataFrame(search.cv_results_).sort_values("rank_test_score")

In [None]:
# The estimator configured with the best parameter combination found
search.best_estimator_

In [None]:
# Best mean cross-validated score. The search is scored with
# 'neg_mean_absolute_error', so this value is the negated MAE.
search.best_score_