In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
import os
# UTC Timestamp and timing
import time
from datetime import datetime

In [2]:
## Parallelization functions
import numpy as np
from multiprocessing import cpu_count
 
nCores = cpu_count() # Number of CPU cores on your system
partitions = nCores  # Number of partitions
 
def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(nCores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

In [7]:
def load_data(conf):
    st = time.time()
    if os.path.exists(f'{conf}_raw.feather'):
        out = pd.read_feather(f'{conf}_raw.feather')
        print(f'{conf} data loaded from feather {time.time()-st:.2f} s!')
    else:
        out = pd.read_csv(f'./{conf}.csv.zip')
        print(f'{conf} data loaded from csv {time.time()-st:.2f} s!')
        out.to_feather(f'{conf}_raw.feather')
        print(f'{conf} data save to feather {time.time()-st:.2f} s!')  
    return out 
train = load_data('train')
test = load_data('test')

train data loaded from csv 3.45 s!
train data save to feather 4.02 s!
test data loaded from csv 49.01 s!
test data save to feather 62.70 s!


In [8]:
print(train.shape, test.shape)
train.head()

(4459, 4993) (49342, 4992)


Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [9]:
## Drop columns with only one unique value (usually zero)
only_one = train.columns[train.nunique() == 1]
train.drop(only_one, axis = 1, inplace = True)
test.drop(only_one, axis = 1, inplace = True)
print(train.shape, test.shape)

(4459, 4737) (49342, 4736)


In [10]:
## Copy training features and target out
dataCols = [c for c in train.columns if c not in ['ID', 'target']]
xtrain = train[dataCols].copy().values
target = train['target'].copy().values

In [18]:
from PIL import Image, ImageDraw, ImageColor
im = Image.new('RGB', xtrain.shape)
wh = ImageColor.getrgb('white')
re = ImageColor.getrgb('red')
gr = ImageColor.getrgb('green')
ga = ImageColor.getrgb('gray')

for x in range(xtrain.shape[0]):
    for y in range(xtrain.shape[1]):
        if xtrain[x][y] == 0:
            im.putpixel((x,y), wh)
        elif xtrain[x][y] == target[x]:
            im.putpixel((x,y), re)
        elif (np.abs(xtrain[x][y] - target[x]) / target[x]) < 0.05:
            im.putpixel((x,y), gr)
        else:
            im.putpixel((x,y), ga)
im.save('leak.bmp')

In [11]:
leak_col = []
for c in dataCols:
    leak2 = np.sum(( np.abs((train[c] - train['target']) / train['target']) < 0.05).astype(int))
    if leak2 > 30:
        leak_col.append(c)
print(len(leak_col))

114


In [12]:
leak_col = list(leak_col)
train = train[leak_col +  ['ID', 'target']]
test = test[leak_col +  ['ID']]

In [13]:
train.loc[:,"nz_mean"] = train[leak_col].apply(lambda x: x[x!=0].mean(), axis=1)
train.loc[:,"nz_max"] = train[leak_col].apply(lambda x: x[x!=0].max(), axis=1)
train.loc[:,"nz_min"] = train[leak_col].apply(lambda x: x[x!=0].min(), axis=1)
train.loc[:,"cntZero"] = train[leak_col].apply(lambda x: len(x[x==0]), axis=1)
train.loc[:,"mean"] = train[leak_col].apply(lambda x: x.mean(), axis=1)
train.loc[:,"max"] = train[leak_col].apply(lambda x: x.max(), axis=1)
train.loc[:,"min"] = train[leak_col].apply(lambda x: x.min(), axis=1)

test.loc[:,"nz_mean"] = test[leak_col].apply(lambda x: x[x!=0].mean(), axis=1)
test.loc[:,"nz_max"] = test[leak_col].apply(lambda x: x[x!=0].max(), axis=1)
test.loc[:,"nz_min"] = test[leak_col].apply(lambda x: x[x!=0].min(), axis=1)
test.loc[:,"cntZero"] = test[leak_col].apply(lambda x: len(x[x==0]), axis=1)
test.loc[:,"mean"] = test[leak_col].apply(lambda x: x.mean(), axis=1)
test.loc[:,"max"] = test[leak_col].apply(lambda x: x.max(), axis=1)
test.loc[:,"min"] = test[leak_col].apply(lambda x: x.min(), axis=1)
leak_col += ['nz_mean', 'nz_max', 'nz_min', 'cntZero', 'mean', 'max', 'min']

In [13]:
## Save to feather
train.to_feather('./train_processed.feather')
test.to_feather('./test_processed.feather')
print(train.shape, test.shape)
print(leak_col)

(4459, 123) (49342, 122)
['20aa07010', '87ffda550', '963a49cdc', '68a945b18', '63c094ba4', '935ca66a9', 'e078302ef', '861076e21', 'bee629024', '26fc93eb7', '0572565c2', '66ace2992', '350473311', 'fb49e4212', '6619d81fc', '8337d1adc', '6eef030c1', 'fc99f9426', '1c71183bb', 'bd6da0cca', '956d228b9', 'df838756c', 'f3cf9341c', '2eeadde2b', '1db387535', 'ce3d7595b', 'b43a7cfd5', '024c577b9', 'aac52d8d9', 'ea772e115', 'ad009c8b9', 'b4cfe861f', '51d4053c7', '2ec5b290f', '44d5b820f', 'cd24eae8a', '0ff32eb98', '166008929', '58e056e12', 'e16a20511', 'b7c931383', '241f0f867', 'cbb673163', '1931ccfdd', '8f57141ec', 'f02ecb19c', '58e2e02e6', '1fe5d56b9', '5e645a169', '031490e77', 'a6b6bc34a', 'a9f61cf27', '9fd594eec', 'fb0f5dbfe', 'd5fa73ead', '99e779ee0', 'f6eba969e', '91f701ba2', '6b795a2bc', 'ca2b906e8', '8e4d0fe45', '703885424', '6c5c8869c', '2e103d632', 'f97d9431e', '191e21b5f', 'c928b4b74', '122c135ed', '62fb56487', 'eeb9cd3aa', '324921c7b', '58232a6fb', 'c0d2348b7', '1af4d24fa', '491b9ee45',

In [14]:
## Load from feather
if os.path.exists('./train_processed.feather'):
    train = pd.read_feather(f'./train_processed.feather')
if os.path.exists('./test_processed.feather'):
    test = pd.read_feather(f'./test_processed.feather')

In [18]:
baseline = pd.read_csv('./baseline-0.63.csv')
test = pd.merge(test, baseline, how='left', on='ID')
train = train.replace(0, np.nan)
test = test.replace(0, np.nan)
model_train = pd.concat((train, test), axis=0, ignore_index=True, sort=False)

In [39]:
for i in range(2, 100):
    train['index'+str(i)] = ((train.index + 2) % i == 0).astype(int)
    test['index'+str(i)] = ((test.index + 2) % i == 0).astype(int)
    col.append('index'+str(i))

In [19]:
# test['target'] = 0.0
testRatio = 0.20
nFolds = 5
params = {
    'nthread': cpu_count(),
    'learning_rate': 0.002, 
    'max_depth': 7,
    'boosting': 'gbdt', 
    'objective': 'regression',
#     'tree_learner': 'feature',
    'metric': 'rmse',
    'is_training_metric': True,
    'feature_fraction': 0.9, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 5, 
    'seed': nFolds
}
rawScores = []
featureImportance = []
timeStampStart = datetime.fromtimestamp(time.time())
for fold in range(nFolds):
    x1, x2, y1, y2 = train_test_split(train[leak_col], np.log1p(train.target.values), test_size=testRatio, random_state=fold)
    model = lgb.train(params, lgb.Dataset(x1, label=y1), 5000, lgb.Dataset(x2, label=y2), verbose_eval=200, early_stopping_rounds=200)
    pred = np.expm1(model.predict(test[leak_col], num_iteration=model.best_iteration))
    rawScores.append(model.best_score['valid_0']['rmse'])
    featureImportance.append(model.feature_importance())
    test['target'] += pred
    
featureImportance /= np.sum(rawScores)
test['target'] /= folds + 1
timeStampEnd = datetime.fromtimestamp(time.time())
durationSecond = (timeStampEnd - timeStampStart).total_seconds()

Training until validation scores don't improve for 200 rounds.
[200]	valid_0's rmse: 1.55082
[400]	valid_0's rmse: 1.46941
[600]	valid_0's rmse: 1.4322
[800]	valid_0's rmse: 1.41596
[1000]	valid_0's rmse: 1.4081
[1200]	valid_0's rmse: 1.40616
[1400]	valid_0's rmse: 1.40389
[1600]	valid_0's rmse: 1.40344
Early stopping, best iteration is:
[1525]	valid_0's rmse: 1.40319
Training until validation scores don't improve for 200 rounds.
[200]	valid_0's rmse: 1.52021
[400]	valid_0's rmse: 1.42658
[600]	valid_0's rmse: 1.38172
[800]	valid_0's rmse: 1.36061
[1000]	valid_0's rmse: 1.35135
[1200]	valid_0's rmse: 1.34791
[1400]	valid_0's rmse: 1.34741
[1600]	valid_0's rmse: 1.3468
Early stopping, best iteration is:
[1555]	valid_0's rmse: 1.34647
Training until validation scores don't improve for 200 rounds.
[200]	valid_0's rmse: 1.56254
[400]	valid_0's rmse: 1.45347
[600]	valid_0's rmse: 1.39529
[800]	valid_0's rmse: 1.36222
[1000]	valid_0's rmse: 1.34479
[1200]	valid_0's rmse: 1.33469
[1400]	valid

KeyboardInterrupt: 

In [32]:
def getNewVersion():
    try:
        ld = [f for f in os.listdir('./submissions/') if '.json' in f]
    except:
        return 'v0.0'
    maxVer = 0
    for f in ld:
        ver = int(''.join([c for c in f if c.isnumeric()]))
        maxVer = max(maxVer, ver)
    maxVer = str(maxVer + 1)
    major = maxVer[0]
    minor = maxVer[1:]
    return f'v{major}.{minor}'

submissionVersion = getNewVersion()
submissionComment = ''
submissionFile = f'./submissions/{submissionVersion}.csv'
submissionNote = f'{submissionVersion} - GBDT+Ensemble, n={nFolds}, p={testRatio}, {submissionComment}'

dictSave = {}
dictSave['features'] = leak_col
dictSave['featureRank'] = featureImportance
dictSave['finalScore'] = np.mean(rawScores)
dictSave['modelParam'] = params
dictSave['testRatio'] = testRatio
dictSave['nEnsemble'] = nFolds
dictSave['bestScore'] = np.min(rawScores)
dictSave['rawAucScore'] = rawScores
dictSave['trainingTime'] = durationSecond
dictSave['trainingThreads'] = params['nthread']
# backup data
with open(f'./submission/{submissionVersion}.pkl', 'wb') as fp:
    pkl.dump(dictSave, fp)
with open(f'./submission/{submissionVersion}.json', 'w') as fp:
    json.dump(dictSave, fp, sort_keys=True, indent=4, cls=NumpyEncoder)
    
print(f'{submissionVersion} ready to submit')

NameError: name 'nFolds' is not defined

In [None]:
test[['ID', 'target']].to_csv('./submissions/submission.csv', index=False)

In [61]:
        keys = list(range(1,27))
        values = [chr(i) for i in range(65,91)]
        
        mydict = dict(zip(keys, values))

In [76]:
a = 4/2
(4/2).is_integer()

True