In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import tqdm
import missingno as mn

# Visualization libs
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Eight-colour HLS palette (lightness 0.7, saturation 0.8) handed to seaborn as the default palette.
palette = sns.hls_palette(n_colors=8, l=0.7, s=0.8)
sns.set(palette=palette, rc={"figure.figsize": (6, 6)})
sns.set_style("whitegrid")
# Assigned last, so (10, 5) is the figure size in effect rather than the (6, 6) passed to sns.set above.
mpl.rcParams["figure.figsize"] = (10, 5)

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

In [4]:
# Load both splits from the working directory.
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')

# Detach the target: `pop` returns the SalePrice column and removes it from
# train_df in place, so train_df ends up holding features only.
y = train_df.pop('SalePrice')

In [5]:
# Keep the label information (one fitted encoder per column name) so that
# there is somewhere to restore the original labels from later.
label_encoders = dict()

def concat_datasets(train, test):
    """Stack the train and test frames into one frame, tagging each row's origin.

    An ``is_test`` column is added to a copy of each input: 0 for rows coming
    from ``train`` and 1 for rows coming from ``test``. The frames passed in
    are not modified, and the result is the two tagged copies concatenated
    row-wise (train rows first).
    """
    tagged = [
        frame.assign(is_test=flag)
        for flag, frame in enumerate((train, test))
    ]
    return pd.concat(tagged)

def split_datasetss(df):
    """Undo ``concat_datasets``: return ``(train, test)`` without the ``is_test`` column.

    Rows are selected by ``is_test == 0`` (train) and ``is_test == 1`` (test);
    rows carrying any other marker value end up in neither part. ``df`` itself
    is left unchanged.
    """
    origin = df['is_test']
    parts = [
        df[origin == flag].drop('is_test', axis=1)
        for flag in (0, 1)
    ]
    return tuple(parts)

def preprocess_data(df):
    """Label-encode the categorical columns, zero-fill selected gaps and drop unused columns.

    The work is done on a copy, so the frame passed in is left untouched
    (previously the encoded / filled columns were also written back into the
    caller's frame, leaving it half-processed).

    Side effect: the encoder fitted for each column in ``set_features`` is
    stored in the module-level ``label_encoders`` dict under the column name,
    replacing any earlier entry. An encoder is only registered once it has
    been fitted successfully.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``set_features``, ``fill_zeroes``
        and ``features_to_drop`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the ``set_features`` columns hold integer label
        codes, missing values in the ``fill_zeroes`` columns are replaced by 0,
        and the ``features_to_drop`` columns are removed. Missing values in
        any other column are left as they are.

    Raises
    ------
    KeyError
        If one of the required columns is absent from ``df``.
    """
    # Columns treated as categorical and replaced by label codes.
    set_features = (
        'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle',
        'OverallQual', 'OverallCond',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
        'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
        'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'
    )
    # Numeric columns whose missing values are replaced by 0.
    fill_zeroes = (
        'MasVnrArea', 'GarageCars',
        'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'TotalBsmtSF',
        'BsmtFullBath', 'BsmtHalfBath',
        'GarageArea'
    )
    features_to_drop = ['Id', 'LotFrontage', 'GarageYrBlt']

    # Never write into the caller's frame.
    df = df.copy()

    for col in set_features:
        encoder = LabelEncoder()
        # The values are handed over as a plain Python list, exactly as before.
        # NOTE(review): presumably this is what lets columns mixing strings and
        # NaN be encoded (NaN becoming its own class) - confirm before changing.
        values = list(df[col].values)
        df[col] = encoder.fit_transform(values)
        label_encoders[col] = encoder

    for col in fill_zeroes:
        df[col] = df[col].fillna(0)

    return df.drop(features_to_drop, axis=1)

In [6]:
# Preprocess train and test as one frame, so each encoder is fitted on the
# values of both, then separate them again by the is_test marker.
df = preprocess_data(concat_datasets(train_df, test_df))
X, X_test = split_datasetss(df)

In [7]:
# Hold out 30% of the training rows for validation; the seed is fixed at 42.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [38]:
# Baseline model: XGBRegressor constructed with no arguments (library defaults),
# fitted on the training part of the hold-out split only.
model = XGBRegressor()
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [39]:
# Score the fitted model on the held-out validation rows.
# NOTE(review): presumably this is R^2, the default `score` of
# scikit-learn-style regressors - confirm for the installed xgboost version.
model.score(X_val, y_val)

0.90165153860316472

In [40]:
from hyperopt import hp
from sklearn.model_selection import cross_val_score

In [42]:
# 10-fold cross-validation of the same estimator over all of X / y
# (not just X_train); the per-fold scores are shown as the cell output.
scores = cross_val_score(model, X, y, cv=10)
scores

array([ 0.88909635,  0.9096808 ,  0.92893512,  0.80255052,  0.90613016,
        0.88410713,  0.87280312,  0.90601343,  0.8946367 ,  0.87081541])