In [191]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn import model_selection, preprocessing
import matplotlib.pyplot as plt
import glob
import re
import time
import seaborn as sns
import xgboost as xgb
import csv
import pickle
color = sns.color_palette()

In [192]:
DATA_PATH = '/kaggle/dev/sberbank-russian-housing-market-data/'
TRAIN_DATA = DATA_PATH + 'train.csv'
TEST_DATA = DATA_PATH + 'test.csv'
MACRO_DATA = DATA_PATH + 'macro.csv'
MODELS_PATH = '/kaggle/dev/ashish/sberbank-russian-housing-market/models/'
SUBMISSIONS_PATH = '/kaggle/dev/sberbank-russian-housing-market-data/submissions/'

In [193]:
train_df = pd.read_csv(TRAIN_DATA)
macro_df = pd.read_csv(MACRO_DATA)
test_df = pd.read_csv(TEST_DATA)

In [194]:
print(train_df.shape)
print(test_df.shape)
print(macro_df.shape)

(30471, 292)
(7662, 291)
(2484, 100)


In [195]:
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
macro_df['timestamp'] = pd.to_datetime(macro_df['timestamp'])

In [None]:
plt.figure(figsize=(10,10))
plt.scatter(range(train_df.shape[0]), np.sort(train_df.price_doc.values))
plt.xlabel('index', fontsize=12)
plt.ylabel('price', fontsize=12)
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.distplot(train_df.price_doc.values, bins=50, kde=True)
plt.xlabel('price', fontsize=12)
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.distplot(np.log(train_df.price_doc.values), bins=50, kde=True)
plt.xlabel('price', fontsize=12)
plt.show()

In [74]:
train_df['yearmonth'] = train_df['timestamp'].apply(lambda x: x[:4] + x[5:7])
grouped_df = train_df.groupby('yearmonth')['price_doc'].aggregate(np.median).reset_index()

TypeError: 'Timestamp' object is not subscriptable

In [None]:
plt.figure(figsize=(16,8))
sns.barplot(grouped_df.yearmonth.values, grouped_df.price_doc.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=16)
plt.xlabel('Year Month', fontsize=16)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
missing_df = train_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = missing_df.ix[missing_df['missing_count']>0]
missing_df = missing_df.sort_values(['column_name'], ascending=[True])
missing_df

In [None]:
ind = np.arange(missing_df.shape[0])
plt.figure(figsize=(12,18))
fig, ax = plt.subplots(figsize=(12,18))
rects = ax.barh(ind, missing_df.missing_count.values, color='y')
ax.set_yticks(ind)
ax.set_yticklabels(missing_df.column_name.values, rotation='horizontal')
plt.show()

In [196]:
train_macro_df =  pd.merge(train_df, macro_df, left_on='timestamp', right_on='timestamp', how='inner')
assert(len(train_macro_df) == len(train_df))

In [197]:
test_macro_df =  pd.merge(test_df, macro_df, left_on='timestamp', right_on='timestamp', how='inner')
assert(len(test_macro_df) == len(test_df))

In [200]:
for i in train_macro_df.columns:
    try:
        if not(train_macro_df[i].dtype == test_macro_df[i].dtype):
            print('train:', i, '-', train_macro_df[i].dtype, '; test:', i, '-', test_macro_df[i].dtype)
    except KeyError:
        print('KeyError- skipping for:', i)

KeyError- skipping for: price_doc


In [199]:
train_macro_df['full_sq'] = train_macro_df['full_sq'].astype('float64')
test_macro_df['floor'] = test_macro_df['floor'].astype('float64')
test_macro_df['max_floor'] = test_macro_df['max_floor'].astype('float64')
test_macro_df['material'] = test_macro_df['material'].astype('float64')
test_macro_df['num_room'] = test_macro_df['num_room'].astype('float64')

In [201]:
train_columns = set(train_macro_df.columns)
drop_columns = ['id', 'timestamp', 'price_doc', 'child_on_acc_pre_school']

for col in drop_columns:
    train_columns.remove(col)

for f in train_columns:
    if train_macro_df[f].dtype=='object':
        print('encoding column:', f, '(original dtype:', train_macro_df[f].dtype, ')')
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(set(train_macro_df[f].values) | set(test_macro_df[f].values)))
        try:
            train_macro_df[f] = lbl.transform(list(train_macro_df[f].values))
            test_macro_df[f] = lbl.transform(list(test_macro_df[f].values))
        except TypeError as err:
            print(f)
            print(err)

train_macro_df.to_csv(DATA_PATH + 'train_preprocessed.csv')
test_macro_df.to_csv(DATA_PATH + 'test_preprocessed.csv')

encoding column: sub_area (original dtype: object )
encoding column: detention_facility_raion (original dtype: object )
encoding column: product_type (original dtype: object )
encoding column: railroad_terminal_raion (original dtype: object )
encoding column: old_education_build_share (original dtype: object )
encoding column: thermal_power_plant_raion (original dtype: object )
encoding column: radiation_raion (original dtype: object )
encoding column: modern_education_share (original dtype: object )
encoding column: railroad_1line (original dtype: object )
encoding column: culture_objects_top_25 (original dtype: object )
encoding column: oil_chemistry_raion (original dtype: object )
encoding column: big_road1_1line (original dtype: object )
encoding column: nuclear_reactor_raion (original dtype: object )
encoding column: big_market_raion (original dtype: object )
encoding column: water_1line (original dtype: object )
encoding column: ecology (original dtype: object )
encoding column: 

In [202]:
train_df = pd.read_csv(DATA_PATH + 'train_preprocessed.csv')
test_df = pd.read_csv(DATA_PATH + 'test_preprocessed.csv')
print('train_df.shape', train_df.shape)
print('test_df.shape', test_df.shape)

  interactivity=interactivity, compiler=compiler, result=result)


train_df.shape (30471, 392)
test_df.shape (7662, 391)


In [203]:
def rmsle(y_predicted, y_true):
    y_pred = y_predicted
    y_label = y_true.get_label()
    temp = np.square(np.log(y_pred + 1.0) - np.log(y_label + 1.0))
    error = np.sqrt(np.mean(temp))
    error_std = np.sqrt(np.std(temp))
    print('rmsle:', error, '; std:', error_std)
    return ("", error)

X = train_df[list(train_columns)]
Y = train_df.price_doc.values

train_X, val_X, train_Y, val_Y = model_selection.train_test_split(X, Y, train_size=0.8, random_state=42)

print('train_X.shape', train_X.shape)
print('train_Y.shape', train_Y.shape)
print('val_X.shape', val_X.shape)
print('val_Y.shape', val_Y.shape)

train_X.shape (24376, 387)
train_Y.shape (24376,)
val_X.shape (6095, 387)
val_Y.shape (6095,)


In [207]:
model = xgb.XGBRegressor(max_depth=10,
                        gamma=0.5,
                        objective="reg:linear",
                        n_estimators=10000,
                        #min_child_weight=6,
                        learning_rate=0.01,
                        nthread=12,
                        #subsample=0.70,
                        #colsample_bytree=0.70,
                        seed=43,
                        #max_delta_step=1,
                        reg_alpha=0.5,
                        reg_lambda=1)

model.fit(train_X, train_Y, eval_set=[(val_X, val_Y)], verbose=True, eval_metric=rmsle, early_stopping_rounds=50)

pickle.dump(model, open(MODELS_PATH + "model-" + str(int(time.time())) + ".xgb", "wb"))

rmsle: 4.58666 ; std: 2.16646
[0]	validation_0-:4.58666
Will train until validation_0- hasn't improved in 50 rounds.
rmsle: 3.90185 ; std: 1.97704
[1]	validation_0-:3.90185
rmsle: 3.50316 ; std: 1.85185
[2]	validation_0-:3.50316
rmsle: 3.22258 ; std: 1.76488
[3]	validation_0-:3.22257
rmsle: 3.00529 ; std: 1.67929
[4]	validation_0-:3.00529
rmsle: 2.82921 ; std: 1.61416
[5]	validation_0-:2.82921
rmsle: 2.68054 ; std: 1.55175
[6]	validation_0-:2.68054
rmsle: 2.55335 ; std: 1.49956
[7]	validation_0-:2.55335
rmsle: 2.4412 ; std: 1.4452
[8]	validation_0-:2.4412
rmsle: 2.34192 ; std: 1.38943
[9]	validation_0-:2.34192
rmsle: 2.25166 ; std: 1.33787
[10]	validation_0-:2.25166
rmsle: 2.17064 ; std: 1.30151
[11]	validation_0-:2.17064
rmsle: 2.09726 ; std: 1.27264
[12]	validation_0-:2.09726
rmsle: 2.02968 ; std: 1.2447
[13]	validation_0-:2.02968
rmsle: 1.96754 ; std: 1.22076
[14]	validation_0-:1.96754
rmsle: 1.90876 ; std: 1.18572
[15]	validation_0-:1.90876
rmsle: 1.85449 ; std: 1.16119
[16]	valida

rmsle: 0.511147 ; std: 0.753018
[138]	validation_0-:0.511147
rmsle: 0.509721 ; std: 0.754424
[139]	validation_0-:0.509721
rmsle: 0.508378 ; std: 0.755825
[140]	validation_0-:0.508378
rmsle: 0.507054 ; std: 0.757196
[141]	validation_0-:0.507054
rmsle: 0.505752 ; std: 0.758587
[142]	validation_0-:0.505752
rmsle: 0.504463 ; std: 0.759932
[143]	validation_0-:0.504463
rmsle: 0.503214 ; std: 0.761211
[144]	validation_0-:0.503214
rmsle: 0.502035 ; std: 0.762549
[145]	validation_0-:0.502035
rmsle: 0.50084 ; std: 0.763866
[146]	validation_0-:0.50084
rmsle: 0.499712 ; std: 0.765185
[147]	validation_0-:0.499712
rmsle: 0.498576 ; std: 0.766457
[148]	validation_0-:0.498576
rmsle: 0.497502 ; std: 0.767724
[149]	validation_0-:0.497502
rmsle: 0.496452 ; std: 0.768966
[150]	validation_0-:0.496452
rmsle: 0.495419 ; std: 0.770206
[151]	validation_0-:0.495419
rmsle: 0.494405 ; std: 0.771361
[152]	validation_0-:0.494405
rmsle: 0.493422 ; std: 0.772556
[153]	validation_0-:0.493422
rmsle: 0.49245 ; std: 0.77

rmsle: 0.4604 ; std: 0.848994
[274]	validation_0-:0.4604
rmsle: 0.460393 ; std: 0.849269
[275]	validation_0-:0.460393
rmsle: 0.460396 ; std: 0.849577
[276]	validation_0-:0.460396
rmsle: 0.460369 ; std: 0.849846
[277]	validation_0-:0.460369
rmsle: 0.46036 ; std: 0.850128
[278]	validation_0-:0.46036
rmsle: 0.460348 ; std: 0.850316
[279]	validation_0-:0.460348
rmsle: 0.460327 ; std: 0.850587
[280]	validation_0-:0.460327
rmsle: 0.460336 ; std: 0.85089
[281]	validation_0-:0.460336
rmsle: 0.460348 ; std: 0.85118
[282]	validation_0-:0.460348
rmsle: 0.46035 ; std: 0.851457
[283]	validation_0-:0.46035
rmsle: 0.46036 ; std: 0.851744
[284]	validation_0-:0.46036
rmsle: 0.460374 ; std: 0.852028
[285]	validation_0-:0.460374
rmsle: 0.460375 ; std: 0.85232
[286]	validation_0-:0.460375
rmsle: 0.460378 ; std: 0.852576
[287]	validation_0-:0.460378
rmsle: 0.460377 ; std: 0.852799
[288]	validation_0-:0.460377
rmsle: 0.460383 ; std: 0.853053
[289]	validation_0-:0.460383
rmsle: 0.460381 ; std: 0.853287
[290]

In [16]:
train_X

array([[46, nan, 4.0, ..., 0, 6, 1],
       [134, nan, 2.0, ..., 0, 0, 0],
       [44, 28.0, 6.0, ..., 8, 82, 12],
       ..., 
       [59, 41.0, 4.0, ..., 7, 44, 7],
       [64, 33.0, 2.0, ..., 0, 6, 1],
       [33, 13.0, 15.0, ..., 0, 13, 1]], dtype=object)

In [208]:
test_columns = list(train_columns)
test_columns.append('id')

test_X =  test_df[test_columns]
print(test_X.shape)

(7662, 388)


In [209]:
model = pickle.load(open(MODELS_PATH + 'model-1494950650.xgb', 'rb'))
filename = SUBMISSIONS_PATH + 'submission-' + str(int(time.time())) + ".csv"
test_X['predicted_price_doc'] = model.predict(test_X.loc[:, train_columns])
predicted_submission_df = test_X[['id', 'predicted_price_doc']]
predicted_submission_df.columns = ['id', 'price_doc']
predicted_submission_df.to_csv(filename, index=False)
print('Generated submission file: {}'.format(filename))

Generated submission file: /kaggle/dev/sberbank-russian-housing-market-data/submissions/submission-1494950723.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
