In [129]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn import model_selection, preprocessing
import matplotlib.pyplot as plt
import glob
import re
import time
import seaborn as sns
import xgboost as xgb
import csv
import pickle
import uuid
import math
import tensorflow as tf
from tqdm import tqdm
color = sns.color_palette()

In [125]:
DATA_PATH = '/kaggle/dev/sberbank-russian-housing-market-data/'
RAW_DATA_PATH = DATA_PATH + 'raw_data/'
PRE_PROCESSED_DATA_PATH = DATA_PATH + 'pre_processed_data/'
TRAIN_DATA = PRE_PROCESSED_DATA_PATH + 'train_pre_processed_1495598960.csv'
TEST_DATA = PRE_PROCESSED_DATA_PATH + 'test_pre_processed_1495598960.csv'
MACRO_DATA = PRE_PROCESSED_DATA_PATH + 'macro_pre_processed_1495598960.csv'
MODELS_PATH = '/kaggle/dev/ashish/sberbank-russian-housing-market/models/tensorflow/'
SUBMISSIONS_PATH = '/kaggle/dev/sberbank-russian-housing-market-data/submissions/'
TENSORBOARD_SUMMARIES_PATH = DATA_PATH + 'tensorboard_summaries/'

In [126]:
# Prep
train_df = pd.read_csv(TRAIN_DATA, parse_dates=['timestamp'])
macro_df = pd.read_csv(MACRO_DATA, parse_dates=['timestamp'])
test_df = pd.read_csv(TEST_DATA, parse_dates=['timestamp'])

print('Train data', train_df.shape)
print('Test data', test_df.shape)
print('Macro data', macro_df.shape)

Train data (30471, 295)
Test data (7662, 294)
Macro data (2484, 100)


In [127]:
print('Merging with macro data..')
# Merge train/test data with macro data
train_macro_df =  pd.merge(train_df, macro_df, left_on='timestamp', right_on='timestamp', how='inner')
assert(len(train_macro_df) == len(train_df))
test_macro_df =  pd.merge(test_df, macro_df, left_on='timestamp', right_on='timestamp', how='inner')
assert(len(test_macro_df) == len(test_df))

print('Train + Macro data', train_macro_df.shape)
print('Test + Macro data', test_macro_df.shape)

Merging with macro data..
Train + Macro data (30471, 394)
Test + Macro data (7662, 393)


In [128]:
def rmsle(y_predicted, y_true):
    y_pred = y_predicted
    y_label = y_true.get_label()
    temp = np.square(np.log(y_pred + 1.0) - np.log(y_label + 1.0))
    error = np.sqrt(np.mean(temp))
    error_std = np.sqrt(np.std(temp))
    print('rmsle:', error, '; std:', error_std)
    return ("rmsle", error)

def rmse(y_predicted, y_true):
    y_pred = y_predicted
    y_label = y_true.get_label()
    temp = np.square(y_pred - y_label)
    error = np.sqrt(np.mean(temp))
    error_std = np.sqrt(np.std(temp))
    print('rmse:', error, '; std:', error_std)
    return ("rmse", error)

X = train_macro_df[list(train_columns)]
Y = train_macro_df.price_doc.values

train_X, val_X, train_Y, val_Y = model_selection.train_test_split(X, Y, train_size=0.8, random_state=42)

print('train_X.shape', train_X.shape)
print('train_Y.shape', train_Y.shape)
print('val_X.shape', val_X.shape)
print('val_Y.shape', val_Y.shape)

KeyError: "['kitch_by_full_sq' 'floor_by_maxfloor_sq' 'life_by_full_sq'] not in index"

In [106]:
core_features = set([
"price_doc",
"id",
"timestamp",
"full_sq",
"life_sq",
"floor",
"max_floor",
"material",
"build_year",
"num_room",
"kitch_sq",
"state",
"product_type",
"sub_area"])

macro_features = set([
"timestamp",
"oil_urals",
"gdp_quart",
"gdp_quart_growth",
"cpi",
"ppi",
"gdp_deflator",
"balance_trade",
"balance_trade_growth",
"usdrub",
"eurrub",
"brent",
"net_capital_export",
"gdp_annual",
"gdp_annual_growth",
"average_provision_of_build_contract",
"average_provision_of_build_contract_moscow",
"rts",
"micex",
"micex_rgbi_tr",
"micex_cbi_tr",
"deposits_value",
"deposits_growth",
"deposits_rate",
"mortgage_value",
"mortgage_growth",
"mortgage_rate",
"grp",
"grp_growth",
"income_per_cap",
"real_dispos_income_per_cap_growth",
"salary",
"salary_growth",
"fixed_basket",
"retail_trade_turnover",
"retail_trade_turnover_per_cap",
"retail_trade_turnover_growth",
"labor_force",
"unemployment",
"employment",
"invest_fixed_capital_per_cap",
"invest_fixed_assets",
"profitable_enterpr_share",
"unprofitable_enterpr_share",
"share_own_revenues",
"overdue_wages_per_cap",
"fin_res_per_cap",
"marriages_per_1000_cap",
"divorce_rate",
"construction_value",
"invest_fixed_assets_phys",
"pop_natural_increase",
"pop_migration",
"pop_total_inc",
"childbirth",
"mortality",
"housing_fund_sqm",
"lodging_sqm_per_cap",
"water_pipes_share",
"baths_share",
"sewerage_share",
"gas_share",
"hot_water_share",
"electric_stove_share",
"heating_share",
"old_house_share",
"average_life_exp",
"infant_mortarity_per_1000_cap",
"perinatal_mort_per_1000_cap",
"incidence_population",
"rent_price_4+room_bus",
"rent_price_3room_bus",
"rent_price_2room_bus",
"rent_price_1room_bus",
"rent_price_3room_eco",
"rent_price_2room_eco",
"rent_price_1room_eco",
"load_of_teachers_preschool_per_teacher",
"child_on_acc_pre_school",
"load_of_teachers_school_per_teacher",
"students_state_oneshift",
"modern_education_share",
"old_education_build_share",
"provision_doctors",
"provision_nurse",
"load_on_doctors",
"power_clinics",
"hospital_beds_available_per_cap",
"hospital_bed_occupancy_per_year",
"provision_retail_space_sqm",
"provision_retail_space_modern_sqm",
"retail_trade_turnover_per_cap",
"turnover_catering_per_cap",
"theaters_viewers_per_1000_cap",
"seats_theather_rfmin_per_100000_cap",
"museum_visitis_per_100_cap",
"bandwidth_sports",
"population_reg_sports_share",
"students_reg_sports_share",
"apartment_build",
"apartment_fund_sqm"])

neighbourhood_features = set([
"area_m",
"raion_popul",
"green_zone_part",
"indust_part",
"children_preschool",
"preschool_quota",
"preschool_education_centers_raion",
"children_school",
"school_quota",
"school_education_centers_raion",
"school_education_centers_top_20_raion",
"hospital_beds_raion",
"healthcare_centers_raion",
"university_top_20_raion",
"sport_objects_raion",
"additional_education_raion",
"culture_objects_top_25",
"culture_objects_top_25_raion",
"shopping_centers_raion",
"office_raion",
"thermal_power_plant_raion",
"incineration_raion",
"oil_chemistry_raion",
"radiation_raion",
"railroad_terminal_raion",
"big_market_raion",
"nuclear_reactor_raion",
"detention_facility_raion",
"full_all",
"male_f",
"female_f",
"young_all",
"young_male",
"young_female",
"work_all",
"work_male",
"work_female",
"ekder_all",
"ekder_male",
"ekder_female",
"0_6_all",
"0_6_male",
"0_6_female",
"7_14_all",
"7_14_male",
"7_14_female",
"0_17_all",
"0_17_male",
"0_17_female",
"16_29_all",
"16_29_male",
"16_29_female",
"0_13_all",
"0_13_male",
"0_13_female",
"raion_build_count_with_material_info",
"build_count_block",
"build_count_wood",
"build_count_frame",
"build_count_brick",
"build_count_monolith",
"build_count_panel",
"build_count_foam",
"build_count_slag",
"build_count_mix",
"raion_build_count_with_builddate_info",
"build_count_before_1920",
"build_count_1921-1945",
"build_count_1946-1970",
"build_count_1971-1995",
"build_count_after_1995",
"7_14_male",
"7_14_female",
"0_17_all",
"0_17_male",
"0_17_female",
"16_29_all",
"16_29_male",
"16_29_female",
"0_13_all",
"0_13_male",
"0_13_female",
"metro_min_avto",
"metro_km_avto",
"metro_min_walk",
"metro_km_walk",
"kindergarten_km",
"school_km",
"park_km",
"green_zone_km",
"industrial_zone_km",
"water_treatment_km",
"cemetery_km",
"incineration_km",
"railroad_station_walk_km",
"railroad_station_walk_min",
"ID_railroad_station_walk",
"railroad_station_avto_km",
"railroad_station_avto_min",
"ID_railroad_station_avto",
"public_transport_station_km",
"public_transport_station_min_walk",
"water_km",
"water_1line",
"mkad_km",
"ttk_km",
"sadovoe_km",
"bulvar_ring_km",
"kremlin_km",
"big_road1_km",
"ID_big_road1",
"big_road1_1line",
"big_road2_km",
"ID_big_road2",
"railroad_km",
"railroad_1line",
"zd_vokzaly_avto_km",
"ID_railroad_terminal",
"bus_terminal_avto_km",
"ID_bus_terminal",
"oil_chemistry_km",
"nuclear_reactor_km",
"radiation_km",
"power_transmission_line_km",
"thermal_power_plant_km",
"ts_km",
"big_market_km",
"market_shop_km",
"fitness_km",
"swim_pool_km",
"ice_rink_km",
"stadium_km",
"basketball_km",
"hospice_morgue_km",
"detention_facility_km",
"public_healthcare_km",
"university_km",
"workplaces_km",
"shopping_centers_km",
"office_km",
"additional_education_km",
"preschool_km",
"big_church_km",
"church_synagogue_km",
"mosque_km",
"theater_km",
"museum_km",
"exhibition_km",
"catering_km",
"ecology"])

neighbourhood_count_features = set([
"green_part_500",
"prom_part_500",
"office_count_500",
"office_sqm_500",
"trc_count_500",
"trc_sqm_500",
"cafe_count_500",
"cafe_sum_500_min_price_avg",
"cafe_sum_500_max_price_avg",
"cafe_avg_price_500",
"cafe_count_500_na_price",
"cafe_count_500_price_500",
"cafe_count_500_price_1000",
"cafe_count_500_price_1500",
"cafe_count_500_price_2500",
"cafe_count_500_price_4000",
"cafe_count_500_price_high",
"big_church_count_500",
"church_count_500",
"mosque_count_500",
"leisure_count_500",
"sport_count_500",
"market_count_500",
"green_part_1000",
"prom_part_1000",
"office_count_1000",
"office_sqm_1000",
"trc_count_1000",
"trc_sqm_1000",
"cafe_count_1000",
"cafe_sum_1000_min_price_avg",
"cafe_sum_1000_max_price_avg",
"cafe_avg_price_1000",
"cafe_count_1000_na_price",
"cafe_count_1000_price_500",
"cafe_count_1000_price_1000",
"cafe_count_1000_price_1500",
"cafe_count_1000_price_2500",
"cafe_count_1000_price_4000",
"cafe_count_1000_price_high",
"big_church_count_1000",
"church_count_1000",
"mosque_count_1000",
"leisure_count_1000",
"sport_count_1000",
"market_count_1000",
"green_part_1500",
"prom_part_1500",
"office_count_1500",
"office_sqm_1500",
"trc_count_1500",
"trc_sqm_1500",
"cafe_count_1500",
"cafe_sum_1500_min_price_avg",
"cafe_sum_1500_max_price_avg",
"cafe_avg_price_1500",
"cafe_count_1500_na_price",
"cafe_count_1500_price_500",
"cafe_count_1500_price_1000",
"cafe_count_1500_price_1500",
"cafe_count_1500_price_2500",
"cafe_count_1500_price_4000",
"cafe_count_1500_price_high",
"big_church_count_1500",
"church_count_1500",
"mosque_count_1500",
"leisure_count_1500",
"sport_count_1500",
"market_count_1500",
"green_part_2000",
"prom_part_2000",
"office_count_2000",
"office_sqm_2000",
"trc_count_2000",
"trc_sqm_2000",
"cafe_count_2000",
"cafe_sum_2000_min_price_avg",
"cafe_sum_2000_max_price_avg",
"cafe_avg_price_2000",
"cafe_count_2000_na_price",
"cafe_count_2000_price_500",
"cafe_count_2000_price_1000",
"cafe_count_2000_price_1500",
"cafe_count_2000_price_2500",
"cafe_count_2000_price_4000",
"cafe_count_2000_price_high",
"big_church_count_2000",
"church_count_2000",
"mosque_count_2000",
"leisure_count_2000",
"sport_count_2000",
"market_count_2000",
"green_part_3000",
"prom_part_3000",
"office_count_3000",
"office_sqm_3000",
"trc_count_3000",
"trc_sqm_3000",
"cafe_count_3000",
"cafe_sum_3000_min_price_avg",
"cafe_sum_3000_max_price_avg",
"cafe_avg_price_3000",
"cafe_count_3000_na_price",
"cafe_count_3000_price_500",
"cafe_count_3000_price_1000",
"cafe_count_3000_price_1500",
"cafe_count_3000_price_2500",
"cafe_count_3000_price_4000",
"cafe_count_3000_price_high",
"big_church_count_3000",
"church_count_3000",
"mosque_count_3000",
"leisure_count_3000",
"sport_count_3000",
"market_count_3000",
"green_part_5000",
"prom_part_5000",
"office_count_5000",
"office_sqm_5000",
"trc_count_5000",
"trc_sqm_5000",
"cafe_count_5000",
"cafe_sum_5000_min_price_avg",
"cafe_sum_5000_max_price_avg",
"cafe_avg_price_5000",
"cafe_count_5000_na_price",
"cafe_count_5000_price_500",
"cafe_count_5000_price_1000",
"cafe_count_5000_price_1500",
"cafe_count_5000_price_2500",
"cafe_count_5000_price_4000",
"cafe_count_5000_price_high",
"big_church_count_5000",
"church_count_5000",
"mosque_count_5000",
"leisure_count_5000",
"sport_count_5000",
"market_count_5000"])

In [118]:
print('Loading data..')
# Prep
train_df = pd.read_csv(TRAIN_DATA, parse_dates=['timestamp'])
macro_df = pd.read_csv(MACRO_DATA, parse_dates=['timestamp'])
test_df = pd.read_csv(TEST_DATA, parse_dates=['timestamp'])

# Create custom features
train_df['life_by_full_sq'] = train_df['life_sq']/train_df['full_sq']
train_df['floor_by_maxfloor_sq'] = train_df['floor']/train_df['max_floor']
train_df['kitch_by_full_sq'] = train_df['kitch_sq']/train_df['full_sq']

print('  Train data', train_df.shape)
print('  Test data', test_df.shape)
print('  Macro data', macro_df.shape)

# Fix child_on_acc_pre_school column
# macro_df.loc[macro_df['child_on_acc_pre_school'] == '#!', 'child_on_acc_pre_school'] = 0 

print('Merging with macro data..')
# Merge train/test data with macro data
train_macro_df =  pd.merge(train_df, macro_df, left_on='timestamp', right_on='timestamp', how='inner')
assert(len(train_macro_df) == len(train_df))
test_macro_df =  pd.merge(test_df, macro_df, left_on='timestamp', right_on='timestamp', how='inner')
assert(len(test_macro_df) == len(test_df))

print('  Train + Macro data', train_macro_df.shape)
print('  Test + Macro data', test_macro_df.shape)

def rmsle(y_predicted, y_true):
    y_pred = y_predicted
    y_label = y_true.get_label()
    temp = np.square(np.log(y_pred + 1.0) - np.log(y_label + 1.0))
    error = np.sqrt(np.mean(temp))
    error_std = np.sqrt(np.std(temp))
    #print('rmsle:', error, '; std:', error_std)
    return ("rmsle", error)

def rmse(y_predicted, y_true):
    y_pred = y_predicted
    y_label = y_true.get_label()
    temp = np.square(y_pred - y_label)
    error = np.sqrt(np.mean(temp))
    error_std = np.sqrt(np.std(temp))
    #print('rmse:', error, '; std:', error_std)
    return ("rmse", error)

print("Prepping training set")
train_columns = set(train_macro_df.columns)
drop_columns = set(['id', 'timestamp', 'price_doc', 'child_on_acc_pre_school'])
#drop_columns.update(macro_features)
drop_columns.update(neighbourhood_count_features)

for col in drop_columns:
    if col in train_columns:
        train_columns.remove(col)

X = train_macro_df[list(train_columns)]
Y = train_macro_df.price_doc.values

print("  X.shape", X.shape)
print("  Y.shape", Y.shape)

Loading data..
  Train data (30471, 298)
  Test data (7662, 294)
  Macro data (2484, 100)
Merging with macro data..
  Train + Macro data (30471, 397)
  Test + Macro data (7662, 393)
Prepping training set
  X.shape (30471, 255)
  Y.shape (30471,)


In [119]:
# Normalize km columns
# for col in X.columns.values:
#     if col.find('km') != -1:
#         mean = np.mean(X[col].values)
#         amin = np.amin(X[col].values)
#         amax = np.amax(X[col].values)
#         #stddev = np.std(X[col].values)
#         X[col] = (X[col] - amin)/ (amax - amin)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [123]:
NUM_FOLDS = 5
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
best_errors = {}

for fold, (train_idxs, val_idxs) in enumerate(kf.split(X)):
    print('Fold', fold)
    
    train_X, val_X = X.iloc[train_idxs], X.iloc[val_idxs]
    train_Y, val_Y = Y[train_idxs], Y[val_idxs]

#     print('  train_X.shape', train_X.shape)
#     print('  train_Y.shape', train_Y.shape)
#     print('  val_X.shape', val_X.shape)
#     print('  val_Y.shape', val_Y.shape)

    print('  Training', model_id)
    model = xgb.XGBRegressor(max_depth = 10,
                            gamma=0.5,
                            objective="reg:linear",
                            n_estimators=10000,
                            learning_rate=0.005,
                            nthread=12,
                            subsample=0.8,
                            colsample_bytree=0.70,
                            colsample_bylevel=0.70,
                            seed=42,
                            silent=True)

    model.fit(train_X, train_Y, eval_set=[(train_X, train_Y), (val_X, val_Y)], verbose=False, eval_metric=rmsle, early_stopping_rounds=50)
    model_id = "model-" + str(int(time.time()))
    pickle.dump(model, open(MODELS_PATH + model_id + ".xgb", "wb"))
    evals_result = model.evals_result()
    #pickle.dump(evals_result, open(MODELS_PATH + model_id + "-evals-result.pk", "wb"))
    best_val_error_idx = np.argmin(evals_result['validation_1']['rmsle'])
    best_val_error = evals_result['validation_1']['rmsle'][best_val_error_idx]
    best_train_error = evals_result['validation_0']['rmsle'][best_val_error_idx]
    best_errors[model_id] = (best_train_error, best_val_error)
    print('  Best train err', best_train_error, 'val err', best_val_error)
    print('  Saved', model_id)
    
min_error_model_id = None
min_val_error = 1e15
sum_train_error = 0.0
sum_val_error = 0.0
for key, val in best_errors.items():
    sum_train_error += val[0]
    sum_val_error += val[1]
    if val[1] < min_val_error:
        min_val_error = val[1]
        min_error_model_id = key

print('Avg train err {0:.5f} val err {1:.5f}'.format(sum_train_error/float(NUM_FOLDS), sum_val_error/float(NUM_FOLDS)))
print('Best model', min_error_model_id,
      'train err', best_errors[min_error_model_id][0],
      'val err', best_errors[min_error_model_id][1])

Fold 0
  Training model-1496056266
  Best train err 0.370839 val err 0.457439
  Saved model-1496059206
Fold 1
  Training model-1496059206
  Best train err 0.369211 val err 0.459755
  Saved model-1496059227
Fold 2
  Training model-1496059227
  Best train err 0.36692 val err 0.452171
  Saved model-1496059248
Fold 3
  Training model-1496059248
  Best train err 0.367006 val err 0.47085
  Saved model-1496059269
Fold 4
  Training model-1496059269
  Best train err 0.369765 val err 0.46519
  Saved model-1496059288
Avg train err 0.36875 val err 0.46108
Best model model-1496059248 train err 0.36692 val err 0.452171


In [110]:
feature_importances_df = pd.DataFrame.from_dict(model.booster().get_score(), orient="index").reset_index()
feature_importances_df.columns = ['feature_name', 'importance_score']
feature_importances_df = feature_importances_df.sort_values(['importance_score'], ascending=[False])
feature_importances_df

Unnamed: 0,feature_name,importance_score
42,full_sq,8959
13,fitness_km,8767
25,metro_km_walk,6517
31,mosque_km,5556
92,kitch_by_full_sq,5127
1,salary_growth,4362
32,radiation_km,4189
66,shopping_centers_km,3671
90,life_by_full_sq,3601
63,big_road2_km,3514
