In [1]:
%load_ext autoreload
%autoreload 2

from pandas_profiling import ProfileReport
%pylab inline

import warnings
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb

Populating the interactive namespace from numpy and matplotlib


In [2]:
df_train = pd.read_csv('./data/train_2016_v2.csv')

In [3]:
dtypes = {
    'parcelid': int,
    'airconditioningtypeid': str,
    'architecturalstyletypeid': str,
    'buildingclasstypeid': str,
    'buildingqualitytypeid': str,
    'decktypeid': str,
    'heatingorsystemtypeid': str,
    'pooltypeid10': str,
    'pooltypeid2': str,
    'pooltypeid7': str,
    'regionidcity': str,
    'regionidcounty': str,
    'regionidneighborhood': str,
    'regionidzip': str,
    'typeconstructiontypeid': str,
    'hashottuborspa': str,
    'propertycountylandusecode': str,
    'propertylandusetypeid': str,
    'propertyzoningdesc': str,
    'rawcensustractandblock': str,
    'fireplaceflag': str,
    'taxdelinquencyflag': str,
    'censustractandblock': str,
}

props_df = pd.read_csv('./data/properties_2016.csv', dtype=dtypes)

dtypes.update({
    'fireplacecnt': int,
    'fullbathcnt': int,
    'garagecarcnt': int,
    'poolcnt': int,
    'threequarterbathnbr': int,
    'unitcnt': int,
    'yearbuilt': int,
    'numberofstories': int,
    'structuretaxvaluedollarcnt': int,
    'taxvaluedollarcnt': int,
    'landtaxvaluedollarcnt': int,
    'taxdelinquencyyear': int,
    'roomcnt': int,
    'bedroomcnt': int,
    'assessmentyear': int,
    'fips': int,
})

In [4]:
df_train = df_train.merge(props_df, on='parcelid', how='left')
df_train.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016-01-01,1.0,,,2.0,3.0,,4.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371066461001.0
1,14366692,-0.1684,2016-01-01,,,,3.5,4.0,,,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004,2016-01-01,1.0,,,3.0,2.0,,4.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374638003004.0
3,12643413,0.0218,2016-01-02,1.0,,,2.0,2.0,,4.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372963002002.0
4,14432541,-0.005,2016-01-02,,,,2.5,4.0,,,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590423381006.0


In [5]:
def log_transf(df):
    if 'structuretaxvaluedollarcnt' in df.columns:
        df.loc[:, 'structuretaxvaluedollarcnt'] = np.log1p(df.structuretaxvaluedollarcnt).copy()
    if 'taxvaluedollarcnt' in df.columns:
        df.loc[:, 'taxvaluedollarcnt'] = np.log1p(df.taxvaluedollarcnt).copy()
    if 'calculatedfinishedsquarefeet' in df.columns:
        df.loc[:, 'calculatedfinishedsquarefeet'] = np.log1p(df.calculatedfinishedsquarefeet).copy()
    if 'lotsizesquarefeet' in df.columns:
        df.loc[:, 'lotsizesquarefeet'] = np.log1p(df.lotsizesquarefeet).copy()
    if 'finishedsquarefeet12' in df.columns:
        df.loc[:, 'finishedsquarefeet12'] = np.log1p(df.finishedsquarefeet12).copy()
    return df

def preprocess(df, rejected_cols=[]):
    df = df.copy()
    df['latitude'] = df.latitude / 1e6
    df['longitude'] = df.longitude / 1e6
    
    # From https://www.kaggle.com/nikunjm88/creating-additional-features
    #life of property
    df['N-life'] = 2018 - df['yearbuilt']

    #error in calculation of the finished living area of home
    df['N-LivingAreaError'] = df['calculatedfinishedsquarefeet']/df['finishedsquarefeet12']

    #proportion of living area
    df['N-LivingAreaProp'] = df['calculatedfinishedsquarefeet']/df['lotsizesquarefeet']
    df['N-LivingAreaProp2'] = df['finishedsquarefeet12']/df['finishedsquarefeet15']

    #Amout of extra space
    df['N-ExtraSpace'] = df['lotsizesquarefeet'] - df['calculatedfinishedsquarefeet'] 
    df['N-ExtraSpace-2'] = df['finishedsquarefeet15'] - df['finishedsquarefeet12'] 

    #Total number of rooms
    df['N-TotalRooms'] = df['bathroomcnt']*df['bedroomcnt']

    #Average room size
    df['N-AvRoomSize'] = df['calculatedfinishedsquarefeet']/df['roomcnt'] 

    # Number of Extra rooms
    df['N-ExtraRooms'] = df['roomcnt'] - df['N-TotalRooms'] 

    #Ratio of the built structure value to land area
    df['N-ValueProp'] = df['structuretaxvaluedollarcnt']/df['landtaxvaluedollarcnt']

    #Does property have a garage, pool or hot tub and AC?
    df['N-GarPoolAC'] = ((df['garagecarcnt'] > 0) &
                         (df['pooltypeid10'] != '0') &
                         (df['airconditioningtypeid'] != '5'))*1 

    df["N-location"] = df["latitude"] + df["longitude"]
    df["N-location-2"] = df["latitude"] * df["longitude"]
    df["N-location-2round"] = df["N-location-2"].round(-4)

    df["N-latitude-round"] = df["latitude"].round(-4)
    df["N-longitude-round"] = df["longitude"].round(-4)

    #Ratio of tax of property over parcel
    df['N-ValueRatio'] = df['taxvaluedollarcnt']/df['taxamount']

    #TotalTaxScore
    df['N-TaxScore'] = df['taxvaluedollarcnt']*df['taxamount']

    #polnomials of tax delinquency year
    df["N-taxdelinquencyyear-2"] = df["taxdelinquencyyear"] ** 2
    df["N-taxdelinquencyyear-3"] = df["taxdelinquencyyear"] ** 3

    #Length of time since unpaid taxes
    df['N-life'] = 2018 - df['taxdelinquencyyear']

    #Number of properties in the zip
    zip_count = df['regionidzip'].value_counts().to_dict()
    df['N-zip_count'] = df['regionidzip'].map(zip_count)

    #Number of properties in the city
    city_count = df['regionidcity'].value_counts().to_dict()
    df['N-city_count'] = df['regionidcity'].map(city_count)

    #Number of properties in the city
    region_count = df['regionidcounty'].value_counts().to_dict()
    df['N-county_count'] = df['regionidcounty'].map(city_count)

    #Indicator whether it has AC or not
    df['N-ACInd'] = (df['airconditioningtypeid']!=5)*1

    #Indicator whether it has Heating or not 
    df['N-HeatInd'] = (df['heatingorsystemtypeid']!=13)*1

    #There's 25 different property uses - let's compress them down to 4 categories
    df['N-PropType'] = df.propertylandusetypeid.replace({
        31 : "Mixed",
        46 : "Other",
        47 : "Mixed",
        246 : "Mixed",
        247 : "Mixed",
        248 : "Mixed",
        260 : "Home",
        261 : "Home",
        262 : "Home",
        263 : "Home",
        264 : "Home",
        265 : "Home",
        266 : "Home",
        267 : "Home",
        268 : "Home",
        269 : "Not Built",
        270 : "Home",
        271 : "Home",
        273 : "Home",
        274 : "Other",
        275 : "Home",
        276 : "Home",
        279 : "Home",
        290 : "Not Built",
        291 : "Not Built"
    })

    #polnomials of the variable
    df["N-structuretaxvaluedollarcnt-2"] = df["structuretaxvaluedollarcnt"] ** 2
    df["N-structuretaxvaluedollarcnt-3"] = df["structuretaxvaluedollarcnt"] ** 3

    #Average structuretaxvaluedollarcnt by city
    group = df.groupby('regionidcity')['structuretaxvaluedollarcnt'].aggregate('mean').to_dict()
    df['N-Avg-structuretaxvaluedollarcnt'] = df['regionidcity'].map(group)

    #Deviation away from average
    df['N-Dev-structuretaxvaluedollarcnt'] = (abs((df['structuretaxvaluedollarcnt'] -
                                                   df['N-Avg-structuretaxvaluedollarcnt'])) /
                                              df['N-Avg-structuretaxvaluedollarcnt'])

    df['fireplaceflag'] = df.fireplaceflag == 'True'
    df['taxdelinquencyflag'] = df.taxdelinquencyflag == 'Y'
    df['hashottuborspa'] = df.hashottuborspa == 'true'

    df['fireplacecnt'] = df.fireplacecnt.fillna(0).astype(int)
    df['fullbathcnt'] = df.fullbathcnt.fillna(0).astype(int)
    df['garagecarcnt'] = df.garagecarcnt.fillna(0).astype(int)
    df['poolcnt'] = df.poolcnt.fillna(0).astype(int)
    df['threequarterbathnbr'] = df.threequarterbathnbr.fillna(0).astype(int)
    df['unitcnt'] = df.threequarterbathnbr.fillna(0).astype(int)
    df['numberofstories'] = df.numberofstories.fillna(0).astype(int)
    df['numberofstories'] = df.numberofstories.fillna(0).astype(int)
    df['roomcnt'] = df.roomcnt.fillna(0).astype(int)
    df['bedroomcnt'] = df.bedroomcnt.fillna(0).astype(int)
    df['bedroomcnt'] = df.bedroomcnt.fillna(0).astype(int)

    df['fips'] = df.fips.fillna(-1).astype(int)

    df['yearbuilt'] = df.yearbuilt.fillna(np.round(df.yearbuilt.mean())).astype(int)
    df['taxdelinquencyyear'] = df.taxdelinquencyyear.fillna(
        np.round(df.taxdelinquencyyear.mean())
    ).astype(int)
    df['assessmentyear'] = df.assessmentyear.fillna(
        np.round(df.assessmentyear.mean())
    ).astype(int)
    df['structuretaxvaluedollarcnt'] = df.structuretaxvaluedollarcnt.fillna(
        np.round(df.structuretaxvaluedollarcnt.mean())
    ).astype(int)
    df['taxvaluedollarcnt'] = df.taxvaluedollarcnt.fillna(
        np.round(df.taxvaluedollarcnt.mean())
    ).astype(int)
    df['landtaxvaluedollarcnt'] = df.landtaxvaluedollarcnt.fillna(
        np.round(df.landtaxvaluedollarcnt.mean())
    ).astype(int)
    
    # drop columns with std 0
    df = df.drop(['buildingclasstypeid', 'pooltypeid10', 'pooltypeid7',
                  'storytypeid', 'poolcnt', 'assessmentyear'], axis=1, errors='ignore')

    df_cols = df.columns
    df.loc[:, df_cols.str.contains('sqft')] = df.loc[:, df_cols.str.contains('sqft')].fillna(0)
    
    # set nan values in *squarefeet* columns to 0
    df.loc[:, df_cols.str.contains('squarefeet')] = df.loc[:, df_cols.str.contains('squarefeet')].fillna(0)

    
    # fill NaNs in *typeid* columns to -1
    df.loc[:, df_cols.str.contains('typeid')] = df.loc[:, df_cols.str.contains('typeid')].fillna(-1)
    
    # set nan values in *cnt* columns to 0
    df.loc[:, df_cols.str.contains('cnt')] = df.loc[:, df_cols.str.contains('cnt')].fillna(0)
    
    # set NaNs in rest of counts/sums columns to 0
    cols = ['calculatedbathnbr', 'poolsizesum', 'threequarterbathnbr', 'numberofstories']
    df.loc[:, cols] = df.loc[:, cols].fillna(0)

    # set NaNs in rest of id/code columns to 0
    cols = ['regionidcity', 'regionidneighborhood', 'regionidzip', 'censustractandblock']
    df.loc[:, cols] = df.loc[:, cols].fillna(-1)

    # set NaNs in rest of dates columns to mean
    cols = ['yearbuilt', 'taxamount', 'taxdelinquencyyear']
    df.loc[:, cols] = df.loc[:, cols].fillna(df.loc[:, cols].mean())

    drop_cols = set(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
                     'propertycountylandusecode']+rejected_cols)
    drop_cols &= set(df.columns)
    X = df.drop(drop_cols, axis=1).copy()
    y = None
    if 'logerror' in df.columns:
        y = df['logerror'].values.copy()

    for c in X.columns:
        if X[c].dtype == 'object':
            lbl = LabelEncoder()
            X[c] = lbl.fit_transform(list(X[c].values))

    X = log_transf(X)
    
    return X, y

In [6]:
df_train1, df_test =\
    train_test_split(df_train, test_size=0.2, random_state=0, stratify=np.sign(df_train.logerror))

In [7]:
rejected_cols1 = [
    'calculatedbathnbr',
    'finishedsquarefeet12',
    'finishedsquarefeet13',
    'finishedsquarefeet15',
    'finishedsquarefeet50',
    'finishedsquarefeet6',
    'fullbathcnt',
    'landtaxvaluedollarcnt',
    'roomcnt',
    'taxamount'
]
X_train, y_train = preprocess(df_train1, rejected_cols1)
X_test, y_test = preprocess(df_test, rejected_cols1)

In [8]:
rejected_cols2 = [
    'N-GarPoolAC',
    'N-PropType',
    'N-structuretaxvaluedollarcnt-3',
    'N-taxdelinquencyyear-3',
    'censustractandblock',
    'typeconstructiontypeid',
    'unitcnt'
]
X_train = X_train.drop(rejected_cols2, axis=1)
X_test = X_test.drop(rejected_cols2, axis=1)

# Hyperparameter optimization

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_uniform, randint as sp_randint

In [16]:
param_space = {
    'boosting_type': ['dart'],
    'num_leaves': sp_randint(20, 200),
    'learning_rate': sp_uniform(0.0001, 0.2),
    'n_estimators': sp_randint(90, 300),
    'objective': ['regression_l1'],
    'subsample': sp_uniform(0.7, 0.3),
    'colsample_bytree': sp_uniform(0.7, 0.3),
    'nthread': [-1],
    'silent': [False],
    'seed ' : [0],
    'max_bin': [500],
    
}


rs = RandomizedSearchCV(lgb.sklearn.LGBMRegressor(), param_space, n_jobs=1,
                        verbose=2, random_state=0, scoring='neg_mean_absolute_error',
                        n_iter=20, cv=5)

rs.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] boosting_type=dart, colsample_bytree=0.864644051178, learning_rate=0.143137873274, max_bin=500, n_estimators=157, nthread=-1, num_leaves=123, objective=regression_l1, seed =0, silent=False, subsample=0.827096439802 
[CV]  boosting_type=dart, colsample_bytree=0.864644051178, learning_rate=0.143137873274, max_bin=500, n_estimators=157, nthread=-1, num_leaves=123, objective=regression_l1, seed =0, silent=False, subsample=0.827096439802, total=   5.3s
[CV] boosting_type=dart, colsample_bytree=0.864644051178, learning_rate=0.143137873274, max_bin=500, n_estimators=157, nthread=-1, num_leaves=123, objective=regression_l1, seed =0, silent=False, subsample=0.827096439802 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.8s remaining:    0.0s


[CV]  boosting_type=dart, colsample_bytree=0.864644051178, learning_rate=0.143137873274, max_bin=500, n_estimators=157, nthread=-1, num_leaves=123, objective=regression_l1, seed =0, silent=False, subsample=0.827096439802, total=   5.5s
[CV] boosting_type=dart, colsample_bytree=0.864644051178, learning_rate=0.143137873274, max_bin=500, n_estimators=157, nthread=-1, num_leaves=123, objective=regression_l1, seed =0, silent=False, subsample=0.827096439802 
[CV]  boosting_type=dart, colsample_bytree=0.864644051178, learning_rate=0.143137873274, max_bin=500, n_estimators=157, nthread=-1, num_leaves=123, objective=regression_l1, seed =0, silent=False, subsample=0.827096439802, total=   5.3s
[CV] boosting_type=dart, colsample_bytree=0.864644051178, learning_rate=0.143137873274, max_bin=500, n_estimators=157, nthread=-1, num_leaves=123, objective=regression_l1, seed =0, silent=False, subsample=0.827096439802 
[CV]  boosting_type=dart, colsample_bytree=0.864644051178, learning_rate=0.14313787327

[CV]  boosting_type=dart, colsample_bytree=0.950823629061, learning_rate=0.0675792320835, max_bin=500, n_estimators=238, nthread=-1, num_leaves=135, objective=regression_l1, seed =0, silent=False, subsample=0.810472461952, total=  11.7s
[CV] boosting_type=dart, colsample_bytree=0.987146547686, learning_rate=0.0281701560825, max_bin=500, n_estimators=282, nthread=-1, num_leaves=102, objective=regression_l1, seed =0, silent=False, subsample=0.842082413582 
[CV]  boosting_type=dart, colsample_bytree=0.987146547686, learning_rate=0.0281701560825, max_bin=500, n_estimators=282, nthread=-1, num_leaves=102, objective=regression_l1, seed =0, silent=False, subsample=0.842082413582, total=  10.3s
[CV] boosting_type=dart, colsample_bytree=0.987146547686, learning_rate=0.0281701560825, max_bin=500, n_estimators=282, nthread=-1, num_leaves=102, objective=regression_l1, seed =0, silent=False, subsample=0.842082413582 
[CV]  boosting_type=dart, colsample_bytree=0.987146547686, learning_rate=0.0281701

[CV]  boosting_type=dart, colsample_bytree=0.842080125804, learning_rate=0.0373664686654, max_bin=500, n_estimators=204, nthread=-1, num_leaves=48, objective=regression_l1, seed =0, silent=False, subsample=0.870530184661, total=   5.1s
[CV] boosting_type=dart, colsample_bytree=0.842080125804, learning_rate=0.0373664686654, max_bin=500, n_estimators=204, nthread=-1, num_leaves=48, objective=regression_l1, seed =0, silent=False, subsample=0.870530184661 
[CV]  boosting_type=dart, colsample_bytree=0.842080125804, learning_rate=0.0373664686654, max_bin=500, n_estimators=204, nthread=-1, num_leaves=48, objective=regression_l1, seed =0, silent=False, subsample=0.870530184661, total=   5.8s
[CV] boosting_type=dart, colsample_bytree=0.842080125804, learning_rate=0.0373664686654, max_bin=500, n_estimators=204, nthread=-1, num_leaves=48, objective=regression_l1, seed =0, silent=False, subsample=0.870530184661 
[CV]  boosting_type=dart, colsample_bytree=0.842080125804, learning_rate=0.03736646866

[CV]  boosting_type=dart, colsample_bytree=0.738677889296, learning_rate=0.0631856701848, max_bin=500, n_estimators=143, nthread=-1, num_leaves=160, objective=regression_l1, seed =0, silent=False, subsample=0.871059031125, total=   6.6s
[CV] boosting_type=dart, colsample_bytree=0.738677889296, learning_rate=0.0631856701848, max_bin=500, n_estimators=143, nthread=-1, num_leaves=160, objective=regression_l1, seed =0, silent=False, subsample=0.871059031125 
[CV]  boosting_type=dart, colsample_bytree=0.738677889296, learning_rate=0.0631856701848, max_bin=500, n_estimators=143, nthread=-1, num_leaves=160, objective=regression_l1, seed =0, silent=False, subsample=0.871059031125, total=   6.5s
[CV] boosting_type=dart, colsample_bytree=0.738677889296, learning_rate=0.0631856701848, max_bin=500, n_estimators=143, nthread=-1, num_leaves=160, objective=regression_l1, seed =0, silent=False, subsample=0.871059031125 
[CV]  boosting_type=dart, colsample_bytree=0.738677889296, learning_rate=0.0631856

[CV]  boosting_type=dart, colsample_bytree=0.88705303034, learning_rate=0.0677015229678, max_bin=500, n_estimators=207, nthread=-1, num_leaves=105, objective=regression_l1, seed =0, silent=False, subsample=0.795160522621, total=   8.4s
[CV] boosting_type=dart, colsample_bytree=0.88705303034, learning_rate=0.0677015229678, max_bin=500, n_estimators=207, nthread=-1, num_leaves=105, objective=regression_l1, seed =0, silent=False, subsample=0.795160522621 
[CV]  boosting_type=dart, colsample_bytree=0.88705303034, learning_rate=0.0677015229678, max_bin=500, n_estimators=207, nthread=-1, num_leaves=105, objective=regression_l1, seed =0, silent=False, subsample=0.795160522621, total=   8.5s
[CV] boosting_type=dart, colsample_bytree=0.933503644608, learning_rate=0.19001421069, max_bin=500, n_estimators=185, nthread=-1, num_leaves=114, objective=regression_l1, seed =0, silent=False, subsample=0.729130382738 
[CV]  boosting_type=dart, colsample_bytree=0.933503644608, learning_rate=0.19001421069,

[CV]  boosting_type=dart, colsample_bytree=0.784842088773, learning_rate=0.0241393122426, max_bin=500, n_estimators=113, nthread=-1, num_leaves=150, objective=regression_l1, seed =0, silent=False, subsample=0.906598354842, total=   4.0s
[CV] boosting_type=dart, colsample_bytree=0.784842088773, learning_rate=0.0241393122426, max_bin=500, n_estimators=113, nthread=-1, num_leaves=150, objective=regression_l1, seed =0, silent=False, subsample=0.906598354842 
[CV]  boosting_type=dart, colsample_bytree=0.784842088773, learning_rate=0.0241393122426, max_bin=500, n_estimators=113, nthread=-1, num_leaves=150, objective=regression_l1, seed =0, silent=False, subsample=0.906598354842, total=   4.0s
[CV] boosting_type=dart, colsample_bytree=0.784842088773, learning_rate=0.0241393122426, max_bin=500, n_estimators=113, nthread=-1, num_leaves=150, objective=regression_l1, seed =0, silent=False, subsample=0.906598354842 
[CV]  boosting_type=dart, colsample_bytree=0.784842088773, learning_rate=0.0241393

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 13.4min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=LGBMRegressor(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.1,
       max_bin=255, max_depth=-1, min_child_samples=10, min_child_weight=5,
       min_split_gain=0, n_estimators=10, nthread=-1, num_leaves=31,
       objective='regression', reg_alpha=0, reg_lambda=0, seed=0,
       silent=True, subsample=1, subsample_for_bin=50000, subsample_freq=1),
          fit_params=None, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'boosting_type': ['dart'], 'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe142586b38>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe1425867f0>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe1...rozen object at 0x7fe142581668>, 'nthread': [-1], 'silent': [False], 'seed ': [0], 'max_bin': [500]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score=True, 

In [17]:
rs.best_score_

-0.067217490306621136

In [14]:
rs.best_score_

-0.067208155795749638

In [15]:
rs.best_params_

{'boosting_type': 'gbdt',
 'colsample_bytree': 0.79724230233796423,
 'learning_rate': 0.030034973436736636,
 'max_bin': 500,
 'n_estimators': 107,
 'nthread': -1,
 'num_leaves': 99,
 'objective': 'regression_l1',
 'seed ': 0,
 'silent': False,
 'subsample': 0.97077954265882127}

In [18]:
rs.best_params_

{'boosting_type': 'dart',
 'colsample_bytree': 0.98714654768591381,
 'learning_rate': 0.028170156082529031,
 'max_bin': 500,
 'n_estimators': 282,
 'nthread': -1,
 'num_leaves': 102,
 'objective': 'regression_l1',
 'seed ': 0,
 'silent': False,
 'subsample': 0.84208241358211311}

In [14]:
cv_lgb_params = {
    'boosting_type': 'dart',
    'colsample_bytree': 1,
    'learning_rate': 0.03,
    'max_bin': 500,
    'n_estimators': 300,
    'nthread': -1,
    'num_leaves': 100,
    'objective': 'regression_l1',
    'seed ': 0,
    'silent': False,
    'subsample': 1
}
cv_res = cross_val_score(lgb.sklearn.LGBMRegressor(**cv_lgb_params), X_train, y_train,
                         scoring='neg_mean_absolute_error', cv=10)
cv_res.mean(), cv_res.std()*2

(-0.067207517688343252, 0.0023791918647937398)

In [18]:
cv_lgb_params = {
    'boosting_type': 'dart',
    'colsample_bytree': 0.8,
    'learning_rate': 0.03,
    'max_bin': 500,
    'n_estimators': 100,
    'nthread': -1,
    'num_leaves': 100,
    'objective': 'regression_l1',
    'seed ': 0,
    'silent': False,
    'subsample': 0.9
}
cv_res = cross_val_score(lgb.sklearn.LGBMRegressor(**cv_lgb_params), X_train, y_train,
                         scoring='neg_mean_absolute_error', cv=10)
cv_res.mean(), cv_res.std()*2

(-0.067370749230678123, 0.0024195444722896779)

In [15]:
cv_lgb_params = {'boosting_type': 'gbdt',
 'colsample_bytree': 0.79724230233796423,
 'learning_rate': 0.030034973436736636,
 'max_bin': 500,
 'n_estimators': 107,
 'nthread': -1,
 'num_leaves': 99,
 'objective': 'regression_l1',
 'seed ': 0,
 'silent': False,
 'subsample': 0.97077954265882127}
cv_res = cross_val_score(lgb.sklearn.LGBMRegressor(**cv_lgb_params), X_train, y_train,
                         scoring='neg_mean_absolute_error', cv=10)
cv_res.mean(), cv_res.std()*2

(-0.067166672928227117, 0.0023895877713070923)

In [17]:
cv_lgb_params = {'boosting_type': 'dart',
 'colsample_bytree': 0.98714654768591381,
 'learning_rate': 0.028170156082529031,
 'max_bin': 500,
 'n_estimators': 282,
 'nthread': -1,
 'num_leaves': 102,
 'objective': 'regression_l1',
 'seed ': 0,
 'silent': False,
 'subsample': 0.84208241358211311}

cv_res = cross_val_score(lgb.sklearn.LGBMRegressor(**cv_lgb_params), X_train, y_train,
                         scoring='neg_mean_absolute_error', cv=10)
cv_res.mean(), cv_res.std()*2

(-0.067214367267048741, 0.0023853885812529516)