In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')

from scipy.special import boxcox1p, inv_boxcox1p

from sklearn import tree
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import skew
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LassoCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

### Init

In [None]:
def boxcox_transform_selective(df):
    """Apply a Box-Cox(1 + x) transform to the right-skewed float columns of ``df``.

    A float column is transformed only when its sample skewness
    (``scipy.stats.skew``) is greater than 0.5.  For such a column, lambda is
    searched on the grid -0.100, -0.099, ..., 0.099 and the value giving the
    smallest absolute skewness of the transformed column is used (the first
    such value wins on ties).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose float columns are inspected; transformed columns are
        overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in df.select_dtypes('float').columns:
        if skew(df[name]) > 0.5:
            # Candidate lambdas in ascending order, so ``min`` keeps the
            # earliest candidate when several give the same |skew|.
            candidates = [step / 1000 for step in range(-100, 100)]
            best = min(candidates, key=lambda lam: abs(boxcox1p(df[name], lam).skew()))
            df[name] = boxcox1p(df[name], best)
    return df

def dtypes_selection(df, id_col):
    """Build a per-column summary of ``df`` together with a suggested dtype name.

    The summary has one row per column of ``df`` (column name stored in
    ``'index'``) with the number of unique values (``'count_unique'``), the
    ``DataFrame.describe()`` statistics, and the rounded share of missing
    values (``'p_miss'``).  Missing cells in the summary are filled with -1.

    The ``'dtype'`` column is chosen by the first matching rule:

    1. ``count == -1`` (no ``describe()`` row for the column) -> ``'object'``
    2. exactly two unique values                              -> ``'bool'``
    3. the column is ``id_col``                               -> ``'int64'``
    4. otherwise                                              -> ``'float64'``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    id_col : str
        Name of the identifier column.

    Returns
    -------
    pandas.DataFrame
        The summary table described above.
    """
    unique_counts = df.nunique().reset_index(name='count_unique')
    describe_stats = df.describe().T.reset_index()
    missing_share = (df.isna().sum() / df.shape[0]).round(2).reset_index(name='p_miss')

    st = unique_counts.merge(describe_stats, on='index', how='left')
    st = st.merge(missing_share, on='index', how='left')
    st = st.fillna(-1)

    # Ordered (condition, dtype) rules; np.select takes the first match.
    rules = [
        (st['count'] == -1, 'object'),
        (st['count_unique'] == 2, 'bool'),
        (st['index'] == id_col, 'int64'),
    ]
    st['dtype'] = np.select(
        [condition for condition, _ in rules],
        [label for _, label in rules],
        default='float64',
    )
    return st

def fill_missing_basic(df):
    """Fill missing values in place: 0 for float columns, ``'none'`` for object columns.

    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Both column lists are resolved before anything is written back.
    float_cols = df.select_dtypes('float').columns.tolist()
    text_cols = df.select_dtypes('object').columns.tolist()

    df[float_cols] = df[float_cols].fillna(0)
    df[text_cols] = df[text_cols].fillna('none')
    return df

def outliers_replacing_basic(df):
    """Cap every float column of ``df`` at its own 99th percentile, in place.

    Values above the column's 0.99 quantile are replaced by that quantile;
    everything else (including missing values) is kept as is.  Non-float
    columns are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to cap; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in df.select_dtypes('float').columns:
        column = df[name]
        cap = column.quantile(0.99)
        df[name] = np.where(column > cap, cap, column)
    return df

def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    No logarithm is applied inside this function: it is a plain RMSE of
    whatever values are passed in.

    Parameters
    ----------
    y : array-like
        Reference values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        ``sqrt(mean_squared_error(y, y_pred))``.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def rmsle_cv(model, X, y, n_folds = 10):
    """Return the mean cross-validated RMSE of ``model`` over shuffled K folds.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X : pandas.DataFrame
        Feature frame; its ``.values`` array is what gets passed on.
    y : array-like
        Target values.
    n_folds : int, default 10
        Number of folds.

    Returns
    -------
    float
        Mean over the folds of ``sqrt(-neg_mean_squared_error)``.
    """
    # Hand the splitter object itself to cross_val_score.  The previous
    # ``KFold(...).get_n_splits(...)`` only returned the integer fold count,
    # so ``shuffle=True`` and ``random_state=7`` were silently discarded.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=7)
    neg_mse = cross_val_score(model, X.values, y, scoring="neg_mean_squared_error", cv=kf)
    rmse = np.sqrt(-neg_mse)
    return np.mean(rmse)

def boxcox_transform(df):
    """Apply a Box-Cox(1 + x) transform to every float column of ``df``.

    For each float column, lambda is searched on the grid
    -0.100, -0.099, ..., 0.099 and the value giving the smallest absolute
    skewness of the transformed column is used (the first such value wins on
    ties).  The column is then overwritten with its transformed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; its float columns are overwritten in place.

    Returns
    -------
    tuple
        ``(df, lmbdas)`` where ``df`` is the same object that was passed in
        and ``lmbdas`` is a list of ``[column_name, chosen_lambda]`` pairs in
        column order.
    """
    grid = [step / 1000 for step in range(-100, 100)]
    chosen = []
    for name in df.select_dtypes('float').columns:
        best = min(grid, key=lambda lam: abs(boxcox1p(df[name], lam).skew()))
        chosen.append([name, best])
        df[name] = boxcox1p(df[name], best)
    return df, chosen

def qqplot(X, y, model):
    """Scatter-plot ``y`` against the model's predictions for ``X``.

    Despite the name, this draws a plain scatter of ``y`` (x-axis) versus
    ``model.predict(X.values)`` (y-axis) on the current matplotlib axes.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; its ``.values`` array is fed to ``model.predict``.
    y : array-like
        Values plotted on the x-axis.
    model : object
        Anything exposing ``predict``.
    """
    plt.scatter(y, model.predict(X.values))
    
def transform_vars(df):
    """Add squared and cubed copies of every float column of ``df``, in place.

    For a float column ``c`` the new columns are named ``c_sqr`` and
    ``c_cube``.  The set of source columns is fixed before any column is
    added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in df.select_dtypes('float').columns.tolist():
        column = df[name]
        squared = column * column
        df[name + '_sqr'] = squared
        df[name + '_cube'] = squared * column
    return df

def data_preprocessing(X_train, X_test, outliers=True, trans_vars=True, boxcox=True):
    """Run the shared feature pipeline on train and test features together.

    The two frames are stacked, processed as one table, then split back at
    the original train length.  Steps, in order:

    1. fill missing ``LotFrontage`` with the median of the same ``Neighborhood``;
    2. ``fill_missing_basic`` on the whole table;
    3. drop ``Utilities``, ``Street`` and ``PoolQC`` and add the combined
       area / bathroom / porch features and the ``has*`` 0/1 flags;
    4. optionally ``outliers_replacing_basic``, ``transform_vars`` and
       ``boxcox_transform_selective``;
    5. one-hot encode with ``pd.get_dummies`` and print the resulting shape.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; they are concatenated into a new table.
    outliers, trans_vars, boxcox : bool, default True
        Switches for the three optional steps.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test)`` as positional slices of the processed table.
    """
    n_train = len(X_train)
    T = pd.concat([X_train, X_test]).reset_index(drop=True)

    ### Manual I ----------------------------------
    frontage_by_hood = T.groupby('Neighborhood')['LotFrontage']
    T['LotFrontage'] = frontage_by_hood.transform(lambda x: x.fillna(x.median()))

    ## Missing Values ----------------------------------
    T = fill_missing_basic(T)

    ### Manual II ----------------------------------
    T = T.drop(columns=['Utilities', 'Street', 'PoolQC'])

    T['YrBltAndRemod'] = T['YearBuilt'] + T['YearRemodAdd']
    T['TotalSF'] = T['TotalBsmtSF'] + T['1stFlrSF'] + T['2ndFlrSF']
    T['Total_sqr_footage'] = (
        T['BsmtFinSF1'] + T['BsmtFinSF2'] + T['1stFlrSF'] + T['2ndFlrSF']
    )
    T['Total_Bathrooms'] = (
        T['FullBath'] + (0.5 * T['HalfBath'])
        + T['BsmtFullBath'] + (0.5 * T['BsmtHalfBath'])
    )
    T['Total_porch_sf'] = (
        T['OpenPorchSF'] + T['3SsnPorch'] + T['EnclosedPorch']
        + T['ScreenPorch'] + T['WoodDeckSF']
    )

    # 0/1 indicator per amenity: 1 when the source measurement is positive.
    presence_flags = {
        'haspool': 'PoolArea',
        'has2ndFlrSF': '2ndFlrSF',
        'hasgarage': 'GarageArea',
        'hasbsmt': 'TotalBsmtSF',
        'hasfireplace': 'Fireplaces',
    }
    for flag, source in presence_flags.items():
        T[flag] = T[source].apply(lambda x: 1 if x > 0 else 0)

    # The switches are compared with ``== True`` (not truthiness) so that
    # non-boolean arguments behave exactly as before.
    ### Outliers ----------------------------------
    if outliers == True:
        T = outliers_replacing_basic(T)

    ### Square, Cube ----------------------------------
    if trans_vars == True:
        T = transform_vars(T)

    ### BoxCox ----------------------------------
    if boxcox == True:
        T = boxcox_transform_selective(T)

    ### Get dummies ----------------------------------
    T = pd.get_dummies(T)
    print(T.shape)

    train_part = T.iloc[:n_train, :]
    test_part = T.iloc[n_train:, :]
    return train_part, test_part

### Execution

In [None]:
### Define Types
# First pass over the training file with pandas' default inference, used only
# to derive a column -> dtype map via dtypes_selection.
df_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

st = dtypes_selection(df_train, 'Id')
dtypes = {col: dtype for col, dtype in st[['index', 'dtype']].values}

### Read with dtypes
df_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv', dtype=dtypes)
df_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv', dtype=dtypes)

### clean the data
df_train = df_train[df_train.GrLivArea < 4500]
# These are row POSITIONS in the already-filtered frame (df_train.index[...]
# is a positional lookup), not original index labels.
outliers = [30, 88, 462, 631, 1322]
df_train = df_train.drop(df_train.index[outliers])

### Label X and y
y_col = 'SalePrice'
X_train, y_train = df_train.drop(y_col, axis=1), df_train[y_col].to_frame()
# The model is trained on log1p(SalePrice); predictions must be mapped back
# with expm1 before they are written out (see below).
y_train = np.log1p(y_train)
X_test = df_test
# NOTE(review): only SalePrice is removed here, so 'Id' is still a feature
# column in X_train / X_test - confirm that is intended.

X_train, X_test = data_preprocessing(X_train, X_test)

gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42) 
# Fit on a 1-D target array instead of an (n, 1) frame.
gbr_model = gbr.fit(X_train, y_train.values.ravel())

# Undo the log1p applied to the target so the submission holds prices,
# not log-prices.
predicted_prices = np.expm1(gbr_model.predict(X_test))

submission = pd.DataFrame({'Id': df_test.Id, 'SalePrice': predicted_prices})
submission.to_csv('submission.csv', index=False)