In [61]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn import model_selection, preprocessing
import matplotlib.pyplot as plt
import glob
import re
import time
import seaborn as sns
import xgboost as xgb
import csv
import pickle
import uuid
import math
import tensorflow as tf
from tqdm import tqdm
color = sns.color_palette()

In [62]:
DATA_PATH = '/kaggle/dev/sberbank-russian-housing-market-data/'
RAW_DATA_PATH = DATA_PATH + 'raw_data/'
PRE_PROCESSED_DATA_PATH = DATA_PATH + 'pre_processed_data/'
TRAIN_DATA = PRE_PROCESSED_DATA_PATH + 'train_pre_processed_1495598960.csv'
TEST_DATA = PRE_PROCESSED_DATA_PATH + 'test_pre_processed_1495598960.csv'
MACRO_DATA = PRE_PROCESSED_DATA_PATH + 'macro_pre_processed_1495598960.csv'
MODELS_PATH = '/kaggle/dev/ashish/sberbank-russian-housing-market/models/tensorflow/'
SUBMISSIONS_PATH = '/kaggle/dev/sberbank-russian-housing-market-data/submissions/'
TENSORBOARD_SUMMARIES_PATH = DATA_PATH + 'tensorboard_summaries/'

In [63]:
# Prep
train_df = pd.read_csv(TRAIN_DATA, parse_dates=['timestamp'])
macro_df = pd.read_csv(MACRO_DATA, parse_dates=['timestamp'])
test_df = pd.read_csv(TEST_DATA, parse_dates=['timestamp'])

print('Train data', train_df.shape)
print('Test data', test_df.shape)
print('Macro data', macro_df.shape)

Train data (30471, 295)
Test data (7662, 294)
Macro data (2484, 100)


In [12]:
print('Merging with macro data..')
# Merge train/test data with macro data
train_macro_df =  pd.merge(train_df, macro_df, left_on='timestamp', right_on='timestamp', how='inner')
assert(len(train_macro_df) == len(train_df))
test_macro_df =  pd.merge(test_df, macro_df, left_on='timestamp', right_on='timestamp', how='inner')
assert(len(test_macro_df) == len(test_df))

print('Train + Macro data', train_macro_df.shape)
print('Test + Macro data', test_macro_df.shape)

Merging with macro data..
Train + Macro data (30471, 394)
Test + Macro data (7662, 393)


In [64]:
def rmsle(y_predicted, y_true):
    y_pred = y_predicted
    y_label = y_true.get_label()
    temp = np.square(np.log(y_pred + 1.0) - np.log(y_label + 1.0))
    error = np.sqrt(np.mean(temp))
    error_std = np.sqrt(np.std(temp))
    print('rmsle:', error, '; std:', error_std)
    return ("rmsle", error)

def rmse(y_predicted, y_true):
    y_pred = y_predicted
    y_label = y_true.get_label()
    temp = np.square(y_pred - y_label)
    error = np.sqrt(np.mean(temp))
    error_std = np.sqrt(np.std(temp))
    print('rmse:', error, '; std:', error_std)
    return ("rmse", error)

X = train_macro_df[list(train_columns)]
Y = train_macro_df.price_doc.values

train_X, val_X, train_Y, val_Y = model_selection.train_test_split(X, Y, train_size=0.8, random_state=42)

print('train_X.shape', train_X.shape)
print('train_Y.shape', train_Y.shape)
print('val_X.shape', val_X.shape)
print('val_Y.shape', val_Y.shape)

train_X.shape (24376, 390)
train_Y.shape (24376,)
val_X.shape (6095, 390)
val_Y.shape (6095,)


In [66]:
train_df.columns.values

array(['id', 'timestamp', 'full_sq', 'life_sq', 'floor', 'max_floor',
       'material', 'build_year', 'num_room', 'kitch_sq', 'state',
       'product_type', 'sub_area', 'area_m', 'raion_popul',
       'green_zone_part', 'indust_part', 'children_preschool',
       'preschool_quota', 'preschool_education_centers_raion',
       'children_school', 'school_quota', 'school_education_centers_raion',
       'school_education_centers_top_20_raion', 'hospital_beds_raion',
       'healthcare_centers_raion', 'university_top_20_raion',
       'sport_objects_raion', 'additional_education_raion',
       'culture_objects_top_25', 'culture_objects_top_25_raion',
       'shopping_centers_raion', 'office_raion',
       'thermal_power_plant_raion', 'incineration_raion',
       'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion',
       'big_market_raion', 'nuclear_reactor_raion',
       'detention_facility_raion', 'full_all', 'male_f', 'female_f',
       'young_all', 'young_male', 'young

In [68]:
print('Loading data..')
# Prep
train_df = pd.read_csv(TRAIN_DATA, parse_dates=['timestamp'])
train_df['life_by_full_sq'] = train_df['life_sq']/train_df['full_sq']
train_df['floor_by_maxfloor_sq'] = train_df['floor']/train_df['max_floor']
train_df['kitch_by_full_sq'] = train_df['kitch_sq']/train_df['full_sq']

macro_df = pd.read_csv(MACRO_DATA, parse_dates=['timestamp'])
test_df = pd.read_csv(TEST_DATA, parse_dates=['timestamp'])

print('  Train data', train_df.shape)
print('  Test data', test_df.shape)
print('  Macro data', macro_df.shape)

# Fix child_on_acc_pre_school column
# macro_df.loc[macro_df['child_on_acc_pre_school'] == '#!', 'child_on_acc_pre_school'] = 0 

print('Merging with macro data..')
# Merge train/test data with macro data
train_macro_df =  pd.merge(train_df, macro_df, left_on='timestamp', right_on='timestamp', how='inner')
assert(len(train_macro_df) == len(train_df))
test_macro_df =  pd.merge(test_df, macro_df, left_on='timestamp', right_on='timestamp', how='inner')
assert(len(test_macro_df) == len(test_df))

print('  Train + Macro data', train_macro_df.shape)
print('  Test + Macro data', test_macro_df.shape)

def rmsle(y_predicted, y_true):
    y_pred = y_predicted
    y_label = y_true.get_label()
    temp = np.square(np.log(y_pred + 1.0) - np.log(y_label + 1.0))
    error = np.sqrt(np.mean(temp))
    error_std = np.sqrt(np.std(temp))
    #print('rmsle:', error, '; std:', error_std)
    return ("rmsle", error)

def rmse(y_predicted, y_true):
    y_pred = y_predicted
    y_label = y_true.get_label()
    temp = np.square(y_pred - y_label)
    error = np.sqrt(np.mean(temp))
    error_std = np.sqrt(np.std(temp))
    #print('rmse:', error, '; std:', error_std)
    return ("rmse", error)

X = train_macro_df[list(train_columns)]
Y = train_macro_df.price_doc.values

Loading data..
  Train data (30471, 298)
  Test data (7662, 294)
  Macro data (2484, 100)
Merging with macro data..
  Train + Macro data (30471, 397)
  Test + Macro data (7662, 393)


In [70]:
NUM_FOLDS = 5
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
best_errors = {}

for fold, (train_idxs, val_idxs) in enumerate(kf.split(X)):
    print('Fold', fold)
    
    train_X, val_X = X.iloc[train_idxs], X.iloc[val_idxs]
    train_Y, val_Y = Y[train_idxs], Y[val_idxs]

#     print('  train_X.shape', train_X.shape)
#     print('  train_Y.shape', train_Y.shape)
#     print('  val_X.shape', val_X.shape)
#     print('  val_Y.shape', val_Y.shape)

    print('  Training', model_id)
    model = xgb.XGBRegressor(max_depth = 10,
                            gamma=0.5,
                            objective="reg:linear",
                            n_estimators=10000,
                            learning_rate=0.005,
                            nthread=12,
                            subsample=0.8,
                            colsample_bytree=0.70,
                            colsample_bylevel=0.70,
                            seed=42,
                            silent=True)

    model.fit(train_X, train_Y, eval_set=[(train_X, train_Y), (val_X, val_Y)], verbose=False, eval_metric=rmsle, early_stopping_rounds=50)
    model_id = "model-" + str(int(time.time()))
    pickle.dump(model, open(MODELS_PATH + model_id + ".xgb", "wb"))
    evals_result = model.evals_result()
    #pickle.dump(evals_result, open(MODELS_PATH + model_id + "-evals-result.pk", "wb"))
    best_val_error_idx = np.argmin(evals_result['validation_1']['rmsle'])
    best_val_error = evals_result['validation_1']['rmsle'][best_val_error_idx]
    best_train_error = evals_result['validation_0']['rmsle'][best_val_error_idx]
    best_errors[model_id] = (best_train_error, best_val_error)
    print('  Best train err', best_train_error, 'val err', best_val_error)
    print('  Saved', model_id)
    
min_error_model_id = None
min_val_error = 1e15
sum_train_error = 0.0
sum_val_error = 0.0
for key, val in best_errors.items():
    sum_train_error += val[0]
    sum_val_error += val[1]
    if val[1] < min_val_error:
        min_val_error = val[1]
        min_error_model_id = key

print('Avg model', min_error_model_id,
      'train err', sum_train_error/5.0,
      'val err', sum_val_error/5.0)
print('Best model', min_error_model_id,
      'train err', best_errors[min_error_model_id][0],
      'val err', best_errors[min_error_model_id][1])

Fold 0
  Training model-1496053360


KeyboardInterrupt: 