In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

## settings

In [2]:
path2traindata='../../data/train_2016.csv'
path2property='../../data/properties_2016.csv'
path2sample='../../data/sample_submission.csv'

## utility 

In [3]:
def array_stats(X):
    X=np.asarray(X)
    print ('array shape: ',X.shape, X.dtype)
    #print 'min: %.3f, max:%.3f, avg: %.3f, std:%.3f' %(np.min(X),np.max(X),np.mean(X),np.std(X))
    print ('min: {}, max: {}, avg: {:.3}, std:{:.3}'.format( np.min(X),np.max(X),np.mean(X),np.std(X)))

### load data

In [4]:
print('Loading data ...')

# transactions file: contains parcelid,log_error,transaction data
train = pd.read_csv(path2traindata)

# property file, parcelid and all features
prop = pd.read_csv(path2property)

Loading data ...


  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
print ('train: ', train.shape)
print ('prop: ', prop.shape)

('train: ', (90811, 3))
('prop: ', (2985217, 58))


In [6]:
train.head
prop.head

<bound method DataFrame.head of           parcelid  airconditioningtypeid  architecturalstyletypeid  \
0         10754147                    NaN                       NaN   
1         10759547                    NaN                       NaN   
2         10843547                    NaN                       NaN   
3         10859147                    NaN                       NaN   
4         10879947                    NaN                       NaN   
5         10898347                    NaN                       NaN   
6         10933547                    NaN                       NaN   
7         10940747                    NaN                       NaN   
8         10954547                    NaN                       NaN   
9         10976347                    NaN                       NaN   
10        11073947                    NaN                       NaN   
11        11114347                    NaN                       NaN   
12        11116947                    NaN    

In [7]:

# read features and convert to float32
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

# merge transactions and properties
df_train = train.merge(prop, how='left', on='parcelid')
print ('merged train prop:', df_train.shape)


# separate features as X and log-error as y for training-validation
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print('X and y shape: ', x_train.shape, y_train.shape)

# list of features
train_columns = x_train.columns
print 'number of features:', len(train_columns)


# find object features
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
    
del df_train; gc.collect()
array_stats(x_train)
array_stats(y_train)


('merged train prop:', (90811, 60))
('X and y shape: ', (90811, 55), (90811,))
number of features: 55
('array shape: ', (90811, 55), dtype('O'))
min: 6.03710333911e+13, max: 6.11100823388e+13, avg: nan, std:nan
('array shape: ', (90811,), dtype('float64'))
min: -4.605, max: 4.737, avg: 0.0108, std:0.163


## split data into train and test

In [8]:
split = 90000
x_train1, y_train1, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

x_train1 = x_train1.values.astype(np.float32, copy=False)
x_valid = x_valid.values.astype(np.float32, copy=False)

array_stats(x_train1)
array_stats(y_train1)
array_stats(x_valid)
array_stats(y_valid)



('array shape: ', (90000, 55), dtype('float32'))
min: nan, max: nan, avg: nan, std:nan
('array shape: ', (90000,), dtype('float64'))
min: -4.605, max: 4.737, avg: 0.0108, std:0.163
('array shape: ', (811, 55), dtype('float32'))
min: nan, max: nan, avg: nan, std:nan
('array shape: ', (811,), dtype('float64'))
min: -1.05, max: 2.387, avg: 0.0177, std:0.164


### LGB classifier

In [9]:
d_train = lgb.Dataset(x_train1, label=y_train1)
d_valid = lgb.Dataset(x_valid, label=y_valid)

params = {}
params['learning_rate'] = 0.002
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'mae'
params['sub_feature'] = 0.5
params['num_leaves'] = 60
params['min_data'] = 500
params['min_hessian'] = 1

watchlist = [d_valid]
clf = lgb.train(params, d_train, 500, watchlist)

del d_train, d_valid; gc.collect()
del x_train, x_valid; gc.collect()

[1]	valid_0's l1: 0.0723926
[2]	valid_0's l1: 0.072389
[3]	valid_0's l1: 0.0723882
[4]	valid_0's l1: 0.0723865
[5]	valid_0's l1: 0.072386
[6]	valid_0's l1: 0.072381
[7]	valid_0's l1: 0.0723768
[8]	valid_0's l1: 0.0723758
[9]	valid_0's l1: 0.0723734
[10]	valid_0's l1: 0.0723721
[11]	valid_0's l1: 0.0723682
[12]	valid_0's l1: 0.0723673
[13]	valid_0's l1: 0.0723654
[14]	valid_0's l1: 0.0723637
[15]	valid_0's l1: 0.0723616
[16]	valid_0's l1: 0.0723593
[17]	valid_0's l1: 0.0723579
[18]	valid_0's l1: 0.0723557
[19]	valid_0's l1: 0.0723541
[20]	valid_0's l1: 0.0723529
[21]	valid_0's l1: 0.072347
[22]	valid_0's l1: 0.0723435
[23]	valid_0's l1: 0.0723447
[24]	valid_0's l1: 0.0723426
[25]	valid_0's l1: 0.0723369
[26]	valid_0's l1: 0.0723355
[27]	valid_0's l1: 0.0723307
[28]	valid_0's l1: 0.0723284
[29]	valid_0's l1: 0.0723244
[30]	valid_0's l1: 0.0723219
[31]	valid_0's l1: 0.0723201
[32]	valid_0's l1: 0.0723184
[33]	valid_0's l1: 0.0723165
[34]	valid_0's l1: 0.0723129
[35]	valid_0's l1: 0.072311

0

### loading test data 

In [10]:
# load test data
print("Prepare for the prediction ...")
sample = pd.read_csv(path2sample)
print 'sample submission', sample.shape
#sample.head

Prepare for the prediction ...
sample submission (2985217, 7)


### prepare test data features

In [11]:
# convert parcelid to ParcelId
sample['parcelid'] = sample['ParcelId']

# merge prop features to test data
df_test = sample.merge(prop, on='parcelid', how='left')
#del sample, prop; gc.collect()

# separate test features
x_test = df_test[train_columns]
print 'x test shape', x_test.shape
del df_test; gc.collect()

for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
x_test = x_test.values.astype(np.float32, copy=False)

x test shape (2985217, 55)


### prediction

In [12]:
print("prediction ...")
start_time=time.time()
# num_threads > 1 will predict very slow in kernal
clf.reset_parameter({"num_threads":1})
p_test = clf.predict(x_test)
print 'elapsed time: seconds', (time.time()-start_time)/60.
#del x_test; gc.collect()


prediction ...
