###### %matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn import model_selection, preprocessing
import matplotlib.pyplot as plt
import glob
import re
import time
import seaborn as sns
import xgboost as xgb
import csv
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
color = sns.color_palette()

In [28]:
RAW_DATA_PATH = '/kaggle/dev/sberbank-russian-housing-market-data/raw_data/'
DATA_PATH = '/kaggle/dev/sberbank-russian-housing-market-data/'
TRAIN_DATA = RAW_DATA_PATH + 'train.csv'
TEST_DATA = RAW_DATA_PATH + 'test.csv'
MACRO_DATA = RAW_DATA_PATH + 'macro.csv'
MODELS_PATH = '/kaggle/dev/ashish/sberbank-russian-housing-market/models/'
SUBMISSIONS_PATH = '/kaggle/dev/sberbank-russian-housing-market-data/submissions/'

In [31]:
train_df = pd.read_csv(TRAIN_DATA, parse_dates=['timestamp'])
macro_df = pd.read_csv(MACRO_DATA, parse_dates=['timestamp'])
test_df = pd.read_csv(TEST_DATA, parse_dates=['timestamp'])
print(train_df.shape)
print(test_df.shape)
print(macro_df.shape)

(30471, 292)
(7662, 291)
(2484, 100)


In [5]:
class ReduceVIF(BaseEstimator, TransformerMixin):
    def __init__(self, thresh=5.0, impute=True, impute_strategy='median'):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        
        # The statsmodel function will fail with NaN values, as such we have to impute them.
        # By default we impute using the median value.
        # This imputation could be taken out and added as part of an sklearn Pipeline.
        if impute:
            self.imputer = Imputer(strategy=impute_strategy)

    def fit(self, X, y=None):
        print('ReduceVIF fit')
        if hasattr(self, 'imputer'):
            self.imputer.fit(X)
        return self

    def transform(self, X, y=None):
        print('ReduceVIF transform')
        columns = X.columns.tolist()
        if hasattr(self, 'imputer'):
            X = pd.DataFrame(self.imputer.transform(X), columns=columns)
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=5.0):
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        dropped=True
        while dropped:
            variables = X.columns
            dropped = False
            vif = [variance_inflation_factor(X[variables].values, X.columns.get_loc(var)) for var in X.columns]
            
            max_vif = max(vif)
            if max_vif > thresh:
                maxloc = vif.index(max_vif)
                print('Dropping {0:} with vif={1:}'.format(X.columns[maxloc], max_vif))
                X = X.drop([X.columns.tolist()[maxloc]], axis=1)
                dropped=True
        return X

In [6]:
train_columns = set(train_df.columns)
drop_columns = ['id', 'timestamp', 'price_doc']

for col in drop_columns:
    if col in train_columns:
        train_columns.remove(col)

for f in train_columns:
    if train_df[f].dtype=='object':
        print('encoding column:', f, '(original dtype:', train_df[f].dtype, ')')
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(set(train_df[f].values)))
        try:
            train_df[f] = lbl.transform(list(train_df[f].values))
            #test_macro_df[f] = lbl.transform(list(test_macro_df[f].values))
        except TypeError as err:
            print(f)
            print(err)

X = train_df[list(train_columns)]
Y = train_df.price_doc.values
transformer = ReduceVIF()

# Only use 10 columns for speed in this example
X = transformer.fit_transform(X, Y)

X.to_csv(DATA_PATH + 'train_vif_preprocessed.csv')

X.head()

encoding column: railroad_terminal_raion (original dtype: object )
encoding column: big_market_raion (original dtype: object )
encoding column: oil_chemistry_raion (original dtype: object )
encoding column: sub_area (original dtype: object )
encoding column: water_1line (original dtype: object )
encoding column: radiation_raion (original dtype: object )
encoding column: detention_facility_raion (original dtype: object )
encoding column: nuclear_reactor_raion (original dtype: object )
encoding column: ecology (original dtype: object )
encoding column: railroad_1line (original dtype: object )
encoding column: culture_objects_top_25 (original dtype: object )
encoding column: thermal_power_plant_raion (original dtype: object )
encoding column: incineration_raion (original dtype: object )
encoding column: product_type (original dtype: object )
encoding column: big_road1_1line (original dtype: object )
ReduceVIF fit
ReduceVIF transform


  vif = 1. / (1. - r_squared_i)


Dropping cafe_count_1000 with vif=inf
Dropping build_count_1946-1970 with vif=inf
Dropping public_transport_station_min_walk with vif=inf
Dropping cafe_count_1500 with vif=inf
Dropping cafe_count_500_price_4000 with vif=inf
Dropping children_preschool with vif=inf
Dropping 7_14_all with vif=inf
Dropping cafe_count_2000_price_500 with vif=inf
Dropping railroad_station_walk_km with vif=inf
Dropping raion_popul with vif=inf
Dropping cafe_count_3000_price_1500 with vif=inf
Dropping cafe_count_5000_na_price with vif=inf
Dropping metro_min_walk with vif=inf
Dropping full_all with vif=35322350018592.125
Dropping work_all with vif=6823635799046.206
Dropping 16_29_all with vif=3476340893377.4575
Dropping cafe_avg_price_500 with vif=810656039487.0841
Dropping ekder_all with vif=327344063626.28986
Dropping young_all with vif=155680371514.95917
Dropping 0_13_all with vif=132456864674.65172
Dropping 0_17_all with vif=91848339432.02528
Dropping cafe_avg_price_5000 with vif=36410672148.456985
Droppin

Dropping swim_pool_km with vif=34.200475823304195
Dropping big_market_km with vif=32.689631617682856
Dropping cafe_count_3000_price_high with vif=32.11059525574277
Dropping shopping_centers_km with vif=31.346348527398288
Dropping big_church_count_5000 with vif=30.80531256339312
Dropping bus_terminal_avto_km with vif=27.413377075436564
Dropping cafe_sum_2000_min_price_avg with vif=25.96119890981726
Dropping sport_objects_raion with vif=25.737822998338498
Dropping green_part_3000 with vif=24.80423249210087
Dropping build_count_panel with vif=23.64369435795179
Dropping leisure_count_1500 with vif=23.225656483603498
Dropping office_sqm_2000 with vif=22.98390215589573
Dropping cafe_sum_1000_min_price_avg with vif=22.836925831094394
Dropping ts_km with vif=22.726525878058084
Dropping church_count_500 with vif=22.51198207759561
Dropping prom_part_2000 with vif=22.243410839960177
Dropping theater_km with vif=21.869870395458545
Dropping trc_count_2000 with vif=21.761905082505375
Dropping big_ro

Unnamed: 0,build_count_1971-1995,ID_big_road2,catering_km,additional_education_raion,public_transport_station_km,mosque_count_1500,railroad_terminal_raion,big_market_raion,hospital_beds_raion,ID_big_road1,...,material,incineration_raion,build_count_foam,school_education_centers_top_20_raion,product_type,cafe_count_500_price_1000,trc_count_500,big_road1_1line,life_sq,mosque_count_2000
0,206.0,5.0,0.516838,3.0,0.274985,0.0,0.0,0.0,240.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,0.0
1,84.0,4.0,0.230287,1.0,0.065263,0.0,0.0,0.0,229.0,2.0,...,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,19.0,0.0
2,63.0,4.0,0.190462,1.0,0.328756,0.0,0.0,0.0,1183.0,3.0,...,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,29.0,0.0
3,130.0,17.0,0.46582,6.0,0.131597,0.0,0.0,0.0,990.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0
4,62.0,10.0,0.026102,2.0,0.07148,0.0,1.0,0.0,562.0,4.0,...,1.0,0.0,0.0,0.0,0.0,10.0,1.0,0.0,77.0,0.0


In [22]:
new_train_columns = list(X.columns.values)

with open(DATA_PATH + 'train_vif_columns.pk', 'wb') as f:
    pickle.dump(new_train_columns, f, protocol=pickle.HIGHEST_PROTOCOL)

In [32]:
# **** START HERE *******
train_columns = set(pickle.load(open(DATA_PATH + 'train_vif_columns.pk', 'rb')))
drop_columns = ['id', 'timestamp', 'price_doc']

for col in drop_columns:
    if col in train_columns:
        train_columns.remove(col)

for f in train_columns:
    if train_df[f].dtype=='object':
        print('encoding column:', f, '(original dtype:', train_df[f].dtype, ')')
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(set(train_df[f].values) | set(test_df[f].values)))
        try:
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
        except TypeError as err:
            print(f)
            print(err)

X = train_df[list(train_columns)]
Y = train_df.price_doc.values


encoding column: railroad_terminal_raion (original dtype: object )
encoding column: culture_objects_top_25 (original dtype: object )
encoding column: big_market_raion (original dtype: object )
encoding column: nuclear_reactor_raion (original dtype: object )
encoding column: detention_facility_raion (original dtype: object )
encoding column: thermal_power_plant_raion (original dtype: object )
encoding column: incineration_raion (original dtype: object )
encoding column: product_type (original dtype: object )
encoding column: railroad_1line (original dtype: object )
encoding column: oil_chemistry_raion (original dtype: object )
encoding column: big_road1_1line (original dtype: object )
encoding column: water_1line (original dtype: object )
encoding column: radiation_raion (original dtype: object )


In [33]:
def rmsle(y_predicted, y_true):
    y_pred = y_predicted
    y_label = y_true.get_label()
    temp = np.square(np.log(y_pred + 1.0) - np.log(y_label + 1.0))
    error = np.sqrt(np.mean(temp))
    error_std = np.sqrt(np.std(temp))
    print('rmsle:', error, '; std:', error_std)
    return ("", error)

#X = train_df[list(train_columns)]
#Y = train_df.price_doc.values

train_X, val_X, train_Y, val_Y = model_selection.train_test_split(X, Y, train_size=0.7, random_state=42)

print('train_X.shape', train_X.shape)
print('train_Y.shape', train_Y.shape)
print('val_X.shape', val_X.shape)
print('val_Y.shape', val_Y.shape)

train_X.shape (21329, 66)
train_Y.shape (21329,)
val_X.shape (9142, 66)
val_Y.shape (9142,)


In [34]:
model = xgb.XGBRegressor(max_depth=8,
                        #gamma=0.5,
                        objective="reg:linear",
                        n_estimators=10000,
                        #min_child_weight=6,
                        learning_rate=0.05,
                        nthread=12,
                        subsample=0.70,
                        colsample_bytree=0.70,
                        seed=42,
                        #max_delta_step=1,
                        reg_alpha=0.5,
                        reg_lambda=0.5)

model.fit(train_X, train_Y, eval_set=[(val_X, val_Y)], verbose=True, eval_metric=rmsle, early_stopping_rounds=50)

model_id = "model-" + str(int(time.time()))
pickle.dump(model, open(MODELS_PATH + model_id + ".xgb", "wb"))

print("Saved model ", model_id)

rmsle: 2.95956 ; std: 1.54846
[0]	validation_0-:2.95956
Will train until validation_0- hasn't improved in 50 rounds.
rmsle: 2.27857 ; std: 1.29584
[1]	validation_0-:2.27857
rmsle: 1.91318 ; std: 1.12471
[2]	validation_0-:1.91318
rmsle: 1.66288 ; std: 1.00544
[3]	validation_0-:1.66288
rmsle: 1.46637 ; std: 0.923889
[4]	validation_0-:1.46637
rmsle: 1.32042 ; std: 0.845294
[5]	validation_0-:1.32042
rmsle: 1.20159 ; std: 0.780877
[6]	validation_0-:1.20159
rmsle: 1.10063 ; std: 0.731503
[7]	validation_0-:1.10063
rmsle: 1.01769 ; std: 0.690887
[8]	validation_0-:1.01769
rmsle: 0.947229 ; std: 0.659963
[9]	validation_0-:0.947229
rmsle: 0.886784 ; std: 0.640725
[10]	validation_0-:0.886784
rmsle: 0.833085 ; std: 0.631186
[11]	validation_0-:0.833085
rmsle: 0.786128 ; std: 0.630487
[12]	validation_0-:0.786128
rmsle: 0.745577 ; std: 0.634609
[13]	validation_0-:0.745577
rmsle: 0.710381 ; std: 0.641321
[14]	validation_0-:0.710381
rmsle: 0.679704 ; std: 0.648639
[15]	validation_0-:0.679704
rmsle: 0.65

In [35]:
new_train_columns = list(X.columns.values)
test_columns = list(new_train_columns)
test_columns.append('id')

test_X =  test_df[test_columns]
print(test_X.shape)

(7662, 67)


In [36]:
model = pickle.load(open(MODELS_PATH + 'model-1495170292.xgb', 'rb'))
filename = SUBMISSIONS_PATH + 'submission-' + str(int(time.time())) + ".csv"
test_X['predicted_price_doc'] = model.predict(test_X.loc[:, new_train_columns])
predicted_submission_df = test_X[['id', 'predicted_price_doc']]
predicted_submission_df.columns = ['id', 'price_doc']
predicted_submission_df.to_csv(filename, index=False)
print('Generated submission file: {}'.format(filename))

Generated submission file: /kaggle/dev/sberbank-russian-housing-market-data/submissions/submission-1495170322.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
