In [1]:
import os
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

In [None]:
## Parallelization functions
import numpy as np
from multiprocessing import Pool, cpu_count

nCores = cpu_count() # Number of CPU cores on your system
partitions = nCores  # Number of partitions

def parallelize(data, func, n_partitions=None, n_workers=None):
    """Apply ``func`` to row-wise chunks of ``data`` and concatenate the results.

    Parameters
    ----------
    data : pandas object or array-like
        Input that is split into ``n_partitions`` chunks along its first axis.
    func : callable
        Called once per chunk; must return something ``pd.concat`` accepts.
        When worker processes are used it has to be picklable.
    n_partitions : int, optional
        Number of chunks; defaults to the module-level ``partitions``.
    n_workers : int, optional
        Number of worker processes; defaults to the module-level ``nCores``.
        With one worker (or fewer) the chunks are processed in this process,
        without creating a pool.

    Returns
    -------
    The ``pd.concat`` of the per-chunk results, in chunk order.
    """
    n_partitions = partitions if n_partitions is None else n_partitions
    n_workers = nCores if n_workers is None else n_workers

    if hasattr(data, 'iloc'):
        # Split row positions rather than the frame itself: np.array_split on a
        # DataFrame relies on DataFrame.swapaxes, which pandas has deprecated.
        # The chunk sizes are the same ones np.array_split would produce.
        positions = np.array_split(np.arange(len(data)), n_partitions)
        data_split = [data.iloc[idx] for idx in positions]
    else:
        data_split = np.array_split(data, n_partitions)

    if n_workers <= 1:
        return pd.concat([func(chunk) for chunk in data_split])

    pool = Pool(n_workers)
    try:
        results = pool.map(func, data_split)
    finally:
        # Always release the worker processes, even when ``func`` raised.
        pool.close()
        pool.join()
    return pd.concat(results)

In [2]:
def load_data(conf):
    """Load the table called ``conf``, using a feather cache when one exists.

    If ``{conf}.feather`` is present in the working directory it is read
    directly. Otherwise ``./{conf}.csv.zip`` is parsed and the result is
    written to ``{conf}.feather`` for later calls. Elapsed time since the
    start of the call is printed after each step.

    Parameters
    ----------
    conf : str
        Base name of the data files (without extension).

    Returns
    -------
    pandas.DataFrame
        The loaded table.
    """
    started = time.time()
    feather_path = f'{conf}.feather'

    # Fast path: the cached copy is already there.
    if os.path.exists(feather_path):
        frame = pd.read_feather(feather_path)
        print(f'{conf} data loaded from feather {time.time()-started:.2f} s!')
        return frame

    # Slow path: parse the zipped CSV, then write the cache for next time.
    frame = pd.read_csv(f'./{conf}.csv.zip')
    print(f'{conf} data loaded from csv {time.time()-started:.2f} s!')
    frame.to_feather(feather_path)
    print(f'{conf} data save to feather {time.time()-started:.2f} s!')
    return frame
# Load both tables; 'train' is read first, then 'test'.
train, test = (load_data(split) for split in ('train', 'test'))

train data loaded from feather 0.34 s!
test data loaded from feather 1.30 s!


In [4]:
# Report the (rows, columns) of both tables, then show the first training rows.
print(*(frame.shape for frame in (train, test)))
train.head(n=5)

(4459, 4993) (49342, 4992)


Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [5]:
## Drop columns with only one unique value (usually zero)
# Constant columns are detected on train only and then removed, in place,
# from both train and test so the two frames keep the same feature set.
n_distinct = train.nunique()
only_one = train.columns[n_distinct == 1]
for frame in (train, test):
    frame.drop(columns=only_one, inplace=True)
print(*(frame.shape for frame in (train, test)))

(4459, 4737) (49342, 4736)


In [6]:
## Copy training features and target out
# Everything except the identifier and the label is treated as a feature.
non_feature = ('ID', 'target')
dataCols = [name for name in train.columns if name not in non_feature]
# Independent numpy copies, so later edits to `train` do not affect them.
xtrain = train.loc[:, dataCols].to_numpy(copy=True)
target = train['target'].to_numpy(copy=True)

In [18]:
from PIL import Image, ImageDraw, ImageColor

# Render the training matrix as a bitmap to eyeball target leakage.
# Each cell is classified, in priority order, as:
#   0 white - the value is zero
#   1 red   - the value equals the row's target exactly
#   2 green - the value is within 5 % of the row's target
#   3 gray  - anything else
# The classification is done with whole-array numpy operations instead of one
# putpixel call per cell; the resulting image is the same.
palette = np.array(
    [ImageColor.getrgb(name) for name in ('white', 'red', 'green', 'gray')],
    dtype=np.uint8,
)

row_target = target[:, None]  # column vector so it broadcasts across each row
# The ratio is computed for every cell (including zeros), so silence the
# divide-by-zero / invalid warnings that only affect cells overridden below.
with np.errstate(divide='ignore', invalid='ignore'):
    rel_err = np.abs(xtrain - row_target) / row_target

# Fill from lowest to highest priority so later assignments win.
codes = np.full(xtrain.shape, 3, dtype=np.uint8)
codes[rel_err < 0.05] = 2
codes[xtrain == row_target] = 1
codes[xtrain == 0] = 0

# As before, the image is xtrain.shape[0] pixels wide and xtrain.shape[1]
# pixels high (pixel (x, y) is data cell [x, y]); fromarray expects
# (height, width, 3), hence the transpose.
im = Image.fromarray(palette[codes.T])
im.save('leak.bmp')

In [7]:
# Collect every feature whose value is within 5 % of the target (relative
# error) on more than 30 training rows.
target_col = train['target']
leak_col = [
    c for c in dataCols
    if ((train[c] - target_col) / target_col).abs().lt(0.05).sum() > 30
]
print(len(leak_col))

114


In [8]:
# Keep only the leak-candidate features plus the identifier (and the label for
# train). The explicit .copy() makes each result an independent frame, so the
# column assignments made on train/test afterwards do not write into a slice of
# the original frame (which raises SettingWithCopyWarning).
leak_col = list(leak_col)
train = train[leak_col + ['ID', 'target']].copy()
test = test[leak_col + ['ID']].copy()

In [9]:
def add_row_stats(frame, cols):
    """Return ``frame`` with seven per-row summary columns computed over ``cols``.

    Added columns:
      nz_mean, nz_max, nz_min - mean / max / min of the row's non-zero values
                                (NaN when the row has no non-zero value)
      cntZero                 - number of values in the row equal to zero
      mean, max, min          - mean / max / min over all values of the row

    The statistics are computed column-wise on the whole frame rather than with
    a Python-level ``apply(..., axis=1)`` per row. ``assign`` returns a new
    frame, so the input frame is left unmodified.
    """
    values = frame[cols]
    # Mask zeros as NaN; pandas reductions skip NaN by default.
    nonzero = values.where(values != 0)
    return frame.assign(
        nz_mean=nonzero.mean(axis=1),
        nz_max=nonzero.max(axis=1),
        nz_min=nonzero.min(axis=1),
        cntZero=(values == 0).sum(axis=1),
        mean=values.mean(axis=1),
        max=values.max(axis=1),
        min=values.min(axis=1),
    )

# Same feature engineering for both frames; leak_col still holds only the raw
# leak columns here, so every statistic is computed from the raw values.
train = add_row_stats(train, leak_col)
test = add_row_stats(test, leak_col)
leak_col += ['nz_mean', 'nz_max', 'nz_min', 'cntZero', 'mean', 'max', 'min']

In [10]:
# Sanity check: frame shapes after feature selection, then the feature names.
print(*(frame.shape for frame in (train, test)))
print(leak_col)

(4459, 123) (49342, 122)
['20aa07010', '87ffda550', '963a49cdc', '68a945b18', '63c094ba4', '935ca66a9', 'e078302ef', '861076e21', 'bee629024', '26fc93eb7', '0572565c2', '66ace2992', '350473311', 'fb49e4212', '6619d81fc', '8337d1adc', '6eef030c1', 'fc99f9426', '1c71183bb', 'bd6da0cca', '956d228b9', 'df838756c', 'f3cf9341c', '2eeadde2b', '1db387535', 'ce3d7595b', 'b43a7cfd5', '024c577b9', 'aac52d8d9', 'ea772e115', 'ad009c8b9', 'b4cfe861f', '51d4053c7', '2ec5b290f', '44d5b820f', 'cd24eae8a', '0ff32eb98', '166008929', '58e056e12', 'e16a20511', 'b7c931383', '241f0f867', 'cbb673163', '1931ccfdd', '8f57141ec', 'f02ecb19c', '58e2e02e6', '1fe5d56b9', '5e645a169', '031490e77', 'a6b6bc34a', 'a9f61cf27', '9fd594eec', 'fb0f5dbfe', 'd5fa73ead', '99e779ee0', 'f6eba969e', '91f701ba2', '6b795a2bc', 'ca2b906e8', '8e4d0fe45', '703885424', '6c5c8869c', '2e103d632', 'f97d9431e', '191e21b5f', 'c928b4b74', '122c135ed', '62fb56487', 'eeb9cd3aa', '324921c7b', '58232a6fb', 'c0d2348b7', '1af4d24fa', '491b9ee45',

In [11]:
# Preview the first rows of the reduced training frame.
train.head(n=5)

Unnamed: 0,20aa07010,87ffda550,963a49cdc,68a945b18,63c094ba4,935ca66a9,e078302ef,861076e21,bee629024,26fc93eb7,...,8675bec0b,ID,target,nz_mean,nz_max,nz_min,cntZero,mean,max,min
0,0.0,1300000.0,13200000.0,0.0,7100000.0,0.0,1600000.0,0.0,0.0,3205000.0,...,0.0,000d6aaf2,38000000.0,4156952.0,28000000.0,250000.0,79,1276257.0,28000000.0,0.0
1,2200000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,000fbd867,600000.0,4342361.0,16000000.0,800000.0,102,457090.6,16000000.0,0.0
2,0.0,0.0,12000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0027d6b71,10000000.0,8875000.0,12000000.0,5500000.0,110,311403.5,12000000.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0028cbf45,2000000.0,2775000.0,5800000.0,1300000.0,110,97368.42,5800000.0,0.0
4,2000000.0,0.0,0.0,0.0,0.0,0.0,0.0,8000000.0,0.0,0.0,...,0.0,002a68644,14400000.0,9888769.0,37662000.0,400000.0,101,1127667.0,37662000.0,0.0


In [39]:
# Feature list used for modelling. `col` was appended to below without being
# defined anywhere in this notebook, which is a NameError on a fresh kernel.
# NOTE(review): it is assumed to start from the leak features plus the row
# statistics held in leak_col -- confirm against the intended feature set.
col = list(leak_col)

# Add 0/1 indicator features: index{i} is 1 where (row position + 2) is
# divisible by i, for i = 2..99.
for i in range(2, 100):
    name = 'index' + str(i)
    train[name] = ((train.index + 2) % i == 0).astype(int)
    test[name] = ((test.index + 2) % i == 0).astype(int)
    col.append(name)

In [41]:
# Left-join the columns of leaky_submission.csv onto the test rows by ID;
# every test row is kept, unmatched IDs get NaN in the joined columns.
sub3 = pd.read_csv('./leaky_submission.csv')
test = test.merge(sub3, on='ID', how='left')

In [42]:
from scipy.sparse import csr_matrix, vstack
# Treat zeros as missing values in both frames, then stack the test rows
# underneath the training rows with a fresh 0..n-1 index.
train, test = (frame.replace(0, np.nan) for frame in (train, test))
train = pd.concat([train, test], axis=0, ignore_index=True)

In [43]:
# Train `folds` LightGBM regressors on log1p(target), each on a different
# random 80/20 split (and seed), and average their back-transformed test
# predictions into the submission file.
# NOTE(review): `train` at this point also contains the stacked test rows; any
# row whose target is NaN ends up with a NaN label -- confirm this is intended.
test['target'] = 0.0
folds = 5
log_target = np.log1p(train.target.values)  # loop-invariant, computed once
for fold in range(folds):
    # train_test_split is imported explicitly; the bare `model_selection`
    # module name used previously was never imported.
    x1, x2, y1, y2 = train_test_split(train[col], log_target, test_size=0.20, random_state=fold)
    params = {
        'learning_rate': 0.02,
        'max_depth': 7,
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'is_training_metric': True,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': fold,
    }
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases; LightGBM 4.x expects
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)] instead.
    model = lgb.train(params, lgb.Dataset(x1, label=y1), 3000, lgb.Dataset(x2, label=y2), verbose_eval=200, early_stopping_rounds=100)
    # Predictions are in log1p space; map them back before accumulating.
    test['target'] += np.expm1(model.predict(test[col], num_iteration=model.best_iteration))
test['target'] /= folds
test[['ID', 'target']].to_csv('submission.csv', index=False)


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 0.850041
[400]	valid_0's rmse: 0.847941
Early stopping, best iteration is:
[442]	valid_0's rmse: 0.847622
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 0.82896
[400]	valid_0's rmse: 0.827591
Early stopping, best iteration is:
[336]	valid_0's rmse: 0.827404
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 0.832559
[400]	valid_0's rmse: 0.830785
Early stopping, best iteration is:
[435]	valid_0's rmse: 0.830492
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 0.825973
[400]	valid_0's rmse: 0.82456
Early stopping, best iteration is:
[432]	valid_0's rmse: 0.82433
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 0.832116
[400]	valid_0's rmse: 0.830204
Early stopping, best iteration is:
[454]	valid_0's rmse: 0.830111


In [61]:
        # Map the numbers 1..26 to the uppercase letters 'A'..'Z'.
        keys = [*range(1, 27)]
        values = list(map(chr, range(65, 91)))
        mydict = {number: letter for number, letter in zip(keys, values)}

In [76]:
# True division always produces a float; check whether it holds a whole number.
a = 4 / 2
a.is_integer()

True