<a href="https://colab.research.google.com/github/mariuszkr33/dw_matrix/blob/master/KonkursDW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
np.random.seed(0)

from sklearn.metrics import mean_absolute_error

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb

from sklearn.model_selection import KFold

import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

from sklearn.model_selection import train_test_split, StratifiedKFold

In [None]:
# Competition data: the labelled training set and the test set.
train = pd.read_hdf('../input/property.train.h5')
test = pd.read_hdf('../input/property.test.h5')  # test set without answers

# Stack both sets row-wise so the feature engineering below is applied to
# train and test together.
df_all = pd.concat([train, test])
print("train & test: ", train.shape, test.shape)

In [None]:
# Accumulates every distinct breadcrumb value encountered by extract_b.
all_breadcrumbs = set()


def extract_b(b):
    """Add every element of ``b`` to the module-level ``all_breadcrumbs`` set.

    ``b`` is any iterable of hashable values. The function is called only
    for its side effect and returns ``None``.
    """
    all_breadcrumbs.update(b)
    
# Run extract_b over every row of the combined frame purely for its side
# effect of filling all_breadcrumbs; the mapped result is discarded via `_`.
_ = df_all['breadcrumbs'].map(extract_b)

# Notebook display of the collected set.
all_breadcrumbs

In [None]:
# Sort the labels before factorizing. Iterating a set of strings directly
# gives an order that can change between interpreter sessions (string hashing
# is randomized), which would reshuffle the generated columns from run to run.
idx_breadcrumbs, labels_breadcrumbs = pd.factorize(sorted(all_breadcrumbs))

# One 0/1 indicator column per breadcrumb label: 1 when the label occurs in
# the row's 'breadcrumbs' value, 0 otherwise.
for idx_breadcrumb in idx_breadcrumbs:
    ben_label = labels_breadcrumbs[idx_breadcrumb]
    # Spaces are replaced so the label forms a single-token column name.
    cat_name = '_cat_breadcrumb_{0}'.format(ben_label.replace(' ', '_'))

    # The lambda is consumed immediately by map(), so capturing the loop
    # variable here is safe.
    df_all[cat_name] = df_all['breadcrumbs'].map(lambda x: int(ben_label in x))

In [None]:
print("extract geo_block")


def extract_geoblock(vals):
    """Turn the first half of ``vals`` into a ``geo_block_<i>`` dictionary.

    Only the first ``len(vals) // 2`` entries are kept. Each kept entry is
    stripped of surrounding whitespace and lower-cased, then stored under the
    key ``"geo_block_<position>"``.
    """
    half = len(vals) // 2
    columns = {}
    for idx, val in enumerate(vals[:half]):
        columns[f"geo_block_{idx}"] = val.strip().lower()
    return columns

# Expand each row's geo_block dictionary into its own geo_block_<i> columns
# and attach them to the combined frame.
df_geo_block = df_all["geo_block"].map(extract_geoblock).apply(pd.Series)
df_all = pd.concat([df_all, df_geo_block], axis=1)

# Label-encode every new text column into an integer "<name>_cat" column.
feats_geo_block = list(df_geo_block.columns)
for f in feats_geo_block:
    df_all[f"{f}_cat"] = pd.factorize(df_all[f])[0]

In [None]:
# Label-encode every training column whose name contains ":" into an
# integer "<name>_cat" column on the combined frame.
factorize_feats = [col for col in train.columns if ":" in col]
for feat in factorize_feats:
    df_all[f"{feat}_cat"] = pd.factorize(df_all[feat])[0]

# Every column whose name contains "_cat" is used as a model feature.
feats = [col for col in df_all.columns if "_cat" in col]
print("feats: ", feats)

# Rows with a price form the training set; rows without one are the rows to
# predict.
price_missing = df_all["price"].isnull()
df_train = df_all[~price_missing]
df_test = df_all[price_missing]

X_train = df_train[feats].values
X_test = df_test[feats].values
y_train = df_train["price"].values
# The model is fitted on log(price).
y_log_train = np.log(y_train)

print("train model")
# Hyper-parameters of the submitted model, fitted on log(price).
xgb_settings = dict(
    colsample_bytree=0.86,
    learning_rate=0.14,
    max_depth=19,
    min_child_weight=3.0,
    random_state=1200,
    reg_alpha=1.62,
    reg_lambda=0.8948,
    subsample=0.8697,
)
model = xgb.XGBRegressor(**xgb_settings)
model.fit(X_train, y_log_train)

print("predict")
y_log_pred = model.predict(X_test)
# Undo the log transform to get back to the price scale.
y_pred = np.exp(y_log_pred)

print("save file")
test['price'] = y_pred
submission_columns = ['id', 'price']
test[submission_columns].to_csv('simple_xgboost.csv', index=False)

In [None]:
import scikitplot as skplt

In [None]:
# Bar chart of the fitted model's feature importances, labelled with the
# feature column names. The trailing semicolon keeps the notebook from echoing
# the call's return value.
skplt.estimators.plot_feature_importances(
    model,
    feature_names=feats,
    x_tick_rotation=90,
    figsize=(15, 5),
);

In [None]:
# Learning curve for the regressor. The data passed in is the training matrix
# and the log-price target the model was fitted on (the names X and y were
# never defined in this notebook). 'accuracy' is a classification metric and
# cannot score continuous targets, so a regression scorer is used instead.
# The trailing semicolon keeps the notebook from echoing the return value.
skplt.estimators.plot_learning_curve(
    model,
    X_train,
    y_log_train,
    figsize=(15, 5),
    cv=3,
    scoring='neg_mean_absolute_error',
);

In [None]:
from functools import partial
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

In [None]:
# Hold-out split used by the hyper-parameter search below.
# The split is taken from df_train rather than from X_train / y_train
# themselves: feeding the split its own previous output made every re-run of
# this cell shrink the data to 70% of the prior run. A fixed random_state keeps
# the split identical across runs.
# NOTE: from here on X_test holds the validation part of the training data,
# not the feature matrix of the competition test set.
X_train, X_test, y_train, y_test = train_test_split(
    df_train[feats].values,
    df_train["price"].values,
    test_size=0.3,
    random_state=0,
)

In [None]:
def objective(space):
    """Hyperopt objective: fit an XGBoost regressor and return its hold-out MAE.

    ``space`` is a mapping of sampled hyper-parameters. The values for
    ``max_depth``, ``random_state`` and ``min_child_weight`` are cast to int;
    the remaining ones are passed through unchanged. The model always uses
    100 estimators and the ``reg:squarederror`` objective.

    The model is fitted on the module-level ``X_train`` / ``y_train`` and
    scored on ``X_test`` / ``y_test``, so the loss is on whatever scale
    those targets have when this is called.

    Returns a dict with the mean absolute error under ``'loss'`` and
    ``STATUS_OK`` under ``'status'``.
    """
    integer_keys = ('max_depth', 'random_state', 'min_child_weight')
    passthrough_keys = (
        'colsample_bytree',
        'learning_rate',
        'subsample',
        'reg_alpha',
        'reg_lambda',
    )

    xgb_params = {key: int(space[key]) for key in integer_keys}
    xgb_params.update((key, space[key]) for key in passthrough_keys)
    xgb_params.update(n_estimators=100, objective='reg:squarederror')

    regressor = xgb.XGBRegressor(**xgb_params)
    regressor.fit(X_train, y_train)
    loss = mean_absolute_error(y_test, regressor.predict(X_test))

    return {'loss': loss, 'status': STATUS_OK}
    
# Search space: one hyperopt distribution per hyper-parameter read by
# objective().
# NOTE(review): the hp.loguniform bounds are presumably exponents rather than
# raw values - confirm against the hyperopt documentation.
space = dict(
    max_depth=hp.quniform('x_max_depth', 5, 20, 1),
    colsample_bytree=hp.uniform('x_colsample_bytree', 0.8, 1.0),
    learning_rate=hp.uniform('x_learning_rate', 0.05, 0.2),
    subsample=hp.uniform('x_subsample', 0.7, 1.0),
    random_state=hp.quniform('x_random_state', 0, 10000, 50),
    min_child_weight=hp.quniform('x_min_child_weight', 1, 10, 1),
    reg_alpha=hp.loguniform('x_reg_alpha', 0.0, 1.0),
    reg_lambda=hp.uniform('x_reg_lambda', 0.7, 1.0),
)

# Run 30 evaluations of objective() with TPE, keeping every trial in `trials`.
trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=partial(tpe.suggest, n_startup_jobs=1),
    max_evals=30,
    trials=trials,
)

print("The best params: ", best_params)

In [None]:
# Notebook display of the best trial recorded by the search above.
trials.best_trial

In [None]:
# Accumulates every distinct geo_block value encountered by extract_g.
all_geo_block = set()


def extract_g(g):
    """Add every element of ``g`` to the module-level ``all_geo_block`` set.

    ``g`` is any iterable of hashable values. The function is called only
    for its side effect and returns ``None``.
    """
    all_geo_block.update(g)
    
# Run extract_g over every row of the combined frame purely for its side
# effect of filling all_geo_block; the mapped result is discarded via `__`.
__ = df_all['geo_block'].map(extract_g)

# Notebook display of the collected set.
all_geo_block

In [None]:
# Number of distinct geo_block values collected in all_geo_block.
len(all_geo_block)

In [None]:
# Sort the labels before factorizing. Iterating a set of strings directly
# gives an order that can change between interpreter sessions (string hashing
# is randomized), which would reshuffle the generated columns from run to run.
idx_geo_blocks, labels_geo_blocks = pd.factorize(sorted(all_geo_block))

# One 0/1 indicator column per geo_block label: 1 when the label occurs in
# the row's 'geo_block' value, 0 otherwise.
for idx_geo_block in idx_geo_blocks:
    ben_label = labels_geo_blocks[idx_geo_block]
    # Spaces are replaced so the label forms a single-token column name.
    cat_name = '_cat_geo_block_{0}'.format(ben_label.replace(' ', '_'))

    # The lambda is consumed immediately by map(), so capturing the loop
    # variable here is safe.
    df_all[cat_name] = df_all['geo_block'].map(lambda x: int(ben_label in x))

In [None]:
# Notebook preview of the first rows of the combined frame.
df_all.head()

In [None]:
# Start a fresh breadcrumb set; it is refilled by extract_b below.
all_breadcrumbs = set()


def extract_b(b):
    """Add every element of ``b`` to the module-level ``all_breadcrumbs`` set.

    ``b`` is any iterable of hashable values. The function is called only
    for its side effect and returns ``None``.
    """
    all_breadcrumbs.update(b)
    
# Run extract_b over the training rows only, purely for its side effect of
# filling all_breadcrumbs; the mapped result is discarded via `_`.
_ = train['breadcrumbs'].map(extract_b)

# Notebook display of the collected set.
all_breadcrumbs

In [None]:
# Number of distinct breadcrumb values collected in all_breadcrumbs.
len(all_breadcrumbs)

In [None]:
# Sort the labels before factorizing. Iterating a set of strings directly
# gives an order that can change between interpreter sessions (string hashing
# is randomized), which would reshuffle the generated columns from run to run.
idx_breadcrumbs, labels_breadcrumbs = pd.factorize(sorted(all_breadcrumbs))

# One 0/1 indicator column per breadcrumb label, added to the training frame:
# 1 when the label occurs in the row's 'breadcrumbs' value, 0 otherwise.
for idx_breadcrumb in idx_breadcrumbs:
    ben_label = labels_breadcrumbs[idx_breadcrumb]
    # Spaces are replaced so the label forms a single-token column name.
    cat_name = 'breadcrumb_{0}'.format(ben_label.replace(' ', '_'))

    # The lambda is consumed immediately by map(), so capturing the loop
    # variable here is safe.
    train[cat_name] = train['breadcrumbs'].map(lambda x: int(ben_label in x))

In [None]:
# Notebook preview of the first rows of the training frame.
train.head()

In [None]:
# Add the log-transformed target as a 'price_log' column.
# df_train was produced by boolean indexing of df_all, so assigning a column
# to it in place triggers pandas' SettingWithCopyWarning. assign() builds a new
# frame with the extra column instead, and df_train is rebound to it.
df_train = df_train.assign(price_log=np.log(df_train['price']))

In [None]:
# Hold out part of the labelled rows and fit a smaller model on log(price).
# NOTE(review): the unusual test_size looks hand-tuned - confirm why this
# exact fraction is needed.
X_train, X_test, y_train, y_test = train_test_split(
    df_train[feats],
    df_train[['price_log']],
    test_size=0.3315779464899577,
)
model = xgb.XGBRegressor(
    n_estimators=70,
    learning_rate=0.2,
    max_depth=10,
    random_state=0,
)
model.fit(X_train, y_train)

# Mean absolute error on the held-out rows, measured on the log-price scale.
y_pred = model.predict(X_test)
mean_absolute_error(y_test['price_log'], y_pred)

In [None]:
# Predict prices for the unlabelled rows (df_test) and write the submission.
# Before this fix, y_pred held the predictions for the validation split of the
# training data, still on the log scale, and those were written as the test
# prices. The model is fitted on log(price), so np.exp converts its output
# back to prices; the result is always positive, which makes the former
# clamp of negative values unnecessary.
y_pred = np.exp(model.predict(df_test[feats]))
print("save file")
test['price'] = y_pred
test[['id', 'price']].to_csv('no_simple_xgboost.csv', index=False)