In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)
        
import seaborn as sns
import missingno as msno
import matplotlib.pylab as plt
%matplotlib inline
from sklearn import preprocessing
!pip3 install dataprep
from dataprep.eda import plot, plot_correlation, create_report, plot_missing

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train/test splits and discard the 'Id' column from each right away,
# then print the column/dtype summary of the training frame.
df_train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv").drop(columns=['Id'])
df_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv").drop(columns=['Id'])
df_train.info()

In [None]:
# Generate dataprep's EDA report for the training frame.
create_report(df_train)

In [None]:
# Duplicate removal
df_train.drop_duplicates(inplace=True)

# Filling missing values
# String-valued columns receive an explicit 'Missing' level.
string_fill = ['PoolQC','Alley','Fence','MiscFeature','FireplaceQu','GarageCond','GarageType','GarageFinish',
               'GarageQual','BsmtExposure','BsmtFinType2','BsmtCond','BsmtQual','BsmtFinType1','MasVnrType','Electrical']
# Numeric columns receive the sentinel -1. 'GarageYrBlt' holds a numeric year, so it is
# filled here: filling it with the string 'Missing' would turn it into a mixed
# str/float object column that is later treated as categorical.
numeric_fill = ['LotFrontage','MasVnrArea','GarageYrBlt']

# Apply exactly the same fill rules to both splits.
for frame in (df_train, df_test):
    frame[string_fill] = frame[string_fill].fillna(value='Missing')
    frame[numeric_fill] = frame[numeric_fill].fillna(value=-1)

In [None]:
# Overview plots (dataprep `plot`) for the whole training frame.
plot(df_train)

In [None]:
target = 'SalePrice'

# Partition the columns, in frame order, into three groups:
#   categorical_object  - columns stored with object dtype
#   categorical_numeric - non-object columns with fewer than 10 distinct values
#   continuous          - every remaining non-object column
# The target column is kept out of both numeric groups.
categorical_numeric, continuous, categorical_object = [], [], []
for name in df_train.columns:
    series = df_train[name]
    if series.dtype == 'O':
        categorical_object.append(name)
    elif name == target:
        continue
    elif series.nunique() < 10:
        categorical_numeric.append(name)
    else:
        continuous.append(name)

sorted_features = [target] + categorical_numeric + continuous + categorical_object
# Sanity check: both counts should match if every column landed in exactly one group.
print('Total columns: ', df_train.columns.size, '\nColumns after sorting: ', len(sorted_features), sep='')

# EDA

### Categorical features analysis

In [None]:
# Value-count plots for every categorical column: object-typed first, then low-cardinality numeric
categorical_features = categorical_object + categorical_numeric
plot(df_train[categorical_features])

### Continuous features analysis

In [None]:
# Distribution plots restricted to the continuous columns
continuous_frame = df_train[continuous]
plot(continuous_frame)

### Target variable analysis

In [None]:
# Single-column dataprep plot of the target variable.
plot(df_train,target)

# Feature selection

In [None]:
!pip3 install feature_engine

In [None]:
from feature_engine.selection import *

In [None]:
# Feature matrices contain every column except the target. `Index.difference`
# returns the remaining labels (sorted by pandas' default), which fixes the column order.
train_feature_names = df_train.columns.difference([target])
test_feature_names = df_test.columns.difference([target])

X_train = df_train[train_feature_names].copy()
y_train = df_train[target]
X_test = df_test[test_feature_names].copy()

### Drop duplicate features

In [None]:
# Learn the duplicated columns from the training features only,
# then remove the same columns from both splits.
duplicate_dropper = DropDuplicateFeatures(variables=None, missing_values='raise')
duplicate_dropper.fit(X_train)
X_train, X_test = duplicate_dropper.transform(X_train), duplicate_dropper.transform(X_test)
duplicate_dropper.features_to_drop_

### Constant/quasi-constant features

In [None]:
# Learn the constant / quasi-constant columns (tol=0.998) from the training
# features only, then remove the same columns from both splits.
constant_dropper = DropConstantFeatures(tol=0.998, variables=None, missing_values='raise')
constant_dropper.fit(X_train)
X_train, X_test = constant_dropper.transform(X_train), constant_dropper.transform(X_test)
constant_dropper.features_to_drop_

### Feature correlation

In [None]:
# Removal of correlated features, decided by model performance
from sklearn.ensemble import RandomForestRegressor

# Small random forest handed to the selector as its estimator
rf = RandomForestRegressor(n_estimators=10, random_state=20, n_jobs=4)

# Selector configuration, kept in one place for readability
correlation_settings = dict(
    variables=None,  # None -> the selector examines all numerical variables
    method="pearson",
    threshold=0.8,
    missing_values="raise",
    selection_method="model_performance",
    estimator=rf,
    scoring="r2",
    cv=3,
)
correlated_selector = SmartCorrelatedSelection(**correlation_settings)

# Slow step: a random forest is trained per correlation group
correlated_selector.fit(X_train, y_train)
X_train = correlated_selector.transform(X_train)
X_test = correlated_selector.transform(X_test)
correlated_selector.correlated_feature_sets_

### Continuous target

In [None]:
# Dataprep version: one feature-vs-target plot per column,
# categorical columns first and continuous columns after.
for column in categorical_object + categorical_numeric + continuous:
    plot(df_train, column, target).show()

### Correlation verification

In [None]:
# Dataprep version
# .show() is needed because this is not the last expression of the cell,
# so the returned object would otherwise never be rendered.
plot_correlation(df_train[continuous+[target]]).show()

# Univariate F-test (sklearn `f_regression`) of each numeric feature against the continuous target.
# TODO: feature-vs-feature check, and a suitable test for the string-valued (categorical) features
from sklearn.feature_selection import SelectKBest, f_regression

# f_regression only accepts numeric input, so string-valued columns are left out here.
X_train_numeric = X_train.select_dtypes(include='number')
# k may not exceed the number of candidate columns.
n_best = min(40, X_train_numeric.shape[1])
# Keep the fitted selector itself: fit_transform() would return a plain array
# that has no get_support() method.
selection_model = SelectKBest(f_regression, k=n_best).fit(X_train_numeric, y_train)
mask = selection_model.get_support()
print('Most important features: ', list(X_train_numeric.columns[mask]))

### Boruta feature selection

In [None]:
!pip3 install Boruta

In [None]:
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy
from sklearn.preprocessing import OrdinalEncoder

X_train_boruta = X_train.copy()

# Encode whichever string columns are still present. Earlier selection steps may have
# dropped some of the names in `categorical_object`, so the list is read from the frame itself.
boruta_categorical = X_train_boruta.select_dtypes(include='object').columns.tolist()
enc = OrdinalEncoder()
if boruta_categorical:
    # astype(str) gives the encoder uniformly typed input even if a column mixes strings and numbers
    X_train_boruta[boruta_categorical] = enc.fit_transform(X_train_boruta[boruta_categorical].astype(str))

# RandomForestRegressor has no `class_weight` parameter (it exists only on the classifier),
# so passing it raised a TypeError.
rf_b = RandomForestRegressor(n_jobs=-1, max_depth=5)
feat_selector = BorutaPy(rf_b, n_estimators='auto', verbose=2, random_state=1)
feat_selector.fit(X_train_boruta.values, y_train.values)
display(feat_selector.ranking_)
# X_filtered = feat_selector.transform(X_train)

# Report, per feature, whether Boruta kept it and its ranking.
print("\n------Support and Ranking for each feature------")
for feature, supported, rank in zip(X_train_boruta.columns, feat_selector.support_, feat_selector.ranking_):
    verdict = "Passes the test: " if supported else "Doesn't pass the test: "
    print(verdict, feature, " - Ranking: ", rank)

# Feature engineering

# Model optimization and selection with optuna

In [None]:
import optuna
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')



def _encode_object_columns(frame):
    """Return a copy of `frame` with every object-dtype column replaced by integer category codes.

    The models tuned below need numeric input. Values are cast to `str` first so
    that a column mixing strings and numbers still encodes cleanly.
    """
    encoded = frame.copy()
    for column in encoded.select_dtypes(include='object').columns:
        encoded[column] = encoded[column].astype(str).astype('category').cat.codes
    return encoded


def objective(trial):
    """Optuna objective: mean 3-fold cross-validated R^2 of one sampled regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the model family and its hyper-parameters.

    Returns
    -------
    float
        Average R^2 over the folds, computed on the notebook-level
        `X_train` / `y_train`.

    Notes
    -----
    Hyper-parameter names carry an `rf_` / `xgb_` prefix so the two branches never
    register the same name with different ranges inside one study.
    To optimise an error metric instead of R^2, pass a negated scorer such as
    `scoring='neg_root_mean_squared_error'` (still maximised), or create the
    study with `direction="minimize"`.
    """
    # The parameter keeps its original name "classifier" although both candidates are regressors.
    model_name = trial.suggest_categorical("classifier", ["random_forest", 'XGBoost'])

    if model_name == "random_forest":
        params = {
            'n_estimators': trial.suggest_int('rf_n_estimators', 100, 500),
            'min_samples_split': trial.suggest_int('rf_min_samples_split', 3, 10),
            'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 3, 10),
            # 1.0 means "use all features", which is what the former "auto"
            # option meant for regressors; "auto" is rejected by recent scikit-learn.
            'max_features': trial.suggest_categorical('rf_max_features', [1.0, 'sqrt']),
            'max_depth': trial.suggest_int('rf_max_depth', 3, 10),
            'bootstrap': True,
            'random_state': 42,
        }
        model = RandomForestRegressor(**params)
    else:
        # `eval_metric='r2_score'` was dropped: it is not an XGBoost metric name, and the
        # score is computed by cross_val_score anyway. `use_label_encoder` was dropped
        # because it does not apply to a regressor.
        params = {
            'max_depth': trial.suggest_int('xgb_max_depth', 1, 10),
            'learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 1.0, log=True),
            'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 500),
            'min_child_weight': trial.suggest_int('xgb_min_child_weight', 1, 10),
            'gamma': trial.suggest_float('xgb_gamma', 1e-8, 1.0, log=True),
            'subsample': trial.suggest_float('xgb_subsample', 0.01, 1.0, log=True),
            'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.01, 1.0, log=True),
            'reg_alpha': trial.suggest_float('xgb_reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('xgb_reg_lambda', 1e-8, 1.0, log=True),
        }
        model = XGBRegressor(**params)

    # String-valued columns are integer-encoded so both model families can be fitted.
    features = _encode_object_columns(X_train)
    scores = cross_val_score(model, features, y_train, cv=3, scoring='r2')
    return scores.mean()

# Seed the sampler so that repeated runs propose the same sequence of trials.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Best hyper-parameters found and the corresponding mean cross-validated score.
display(study.best_params)
display(study.best_value)

In [None]:
# from xgboost import XGBRegressor

# params = [

# ]

# for i, param in enumerate(params):
#     model = XGBRegressor(**param)

#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     output = pd.DataFrame({
#     "Id": pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")['Id'],
#     "SalePrice": y_pred
#     })

#     output.to_csv(f'{i}submission.csv', index=False)
#     display("Your submission was successfully saved!")