In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import lightgbm as lgb
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

import time

In [2]:
def load_data(conf):
    st = time.time()
    if os.path.exists(f'{conf}.feather'):
        out = pd.read_feather(f'{conf}.feather')
        print(f'{conf} data loaded from feather {time.time()-st:.2f} s!')
    else:
        out = pd.read_csv(f'./{conf}.csv.zip')
        print(f'{conf} data loaded from csv {time.time()-st:.2f} s!')
        out.to_feather(f'{conf}.feather')
        print(f'{conf} data save to feather {time.time()-st:.2f} s!')  
    return out 
train = load_data('train')
test = load_data('test')

  return feather.read_dataframe(path, nthreads=nthreads)


train data loaded from feather 2.27 s!
test data loaded from feather 21.80 s!


In [3]:
transact_cols = [f for f in train.columns if f not in ["ID", "target"]]
y = np.log1p(train["target"]).values

In [4]:
cols = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
       '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
       'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
       '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212',  '66ace2992',
       'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
       '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
       '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2',  '0572565c2',
       '190db8488',  'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98'] 

In [5]:
%%time
test["target"] = train["target"].mean()

all_df = pd.concat([train[["ID", "target"] + cols], test[["ID", "target"]+ cols]]).reset_index(drop=True)
all_df.head()
all_df.shape

CPU times: user 4.04 s, sys: 8.94 s, total: 13 s
Wall time: 14.2 s


In [6]:
from multiprocessing import Pool, cpu_count
CPU_CORES = 1
def _time_series_info(row):
    try:
        id_1st_nz = row.nonzero()[0][0]
        value_1st_nz = str(row[id_1st_nz])
    except:
        return '0_0.0'   
    return str(id_1st_nz)+'_'+value_1st_nz

def _join2str(df):
    return df.apply(lambda x: "_".join(x.round(2).astype(str)), axis=1)

def _get_leak(df, cols, lag=0, n_thread=2):
    """
    Get leaked data
    """
    st = time.time()
    
    df_split = np.array_split(df[cols[lag+2:]], n_thread)
    df_shift_split = np.array_split(df[cols].shift(lag+2, axis=1)[cols[lag+2:]], n_thread)
    
    print(f'Shift columns: {time.time()-st:.2f} seconds!')
    with Pool(processes=n_thread) as p:
        result1 = p.map(_join2str, df_split)
        result2 = p.map(_join2str, df_shift_split)
        
    series_str = pd.concat(list(result1), ignore_index=True)
    series_shifted_str = pd.concat(list(result2), ignore_index=True)
    print(f'Create time series strings before and after shift: {time.time()-st:.2f} seconds!')
    
    st = time.time()
    series_dict = {}
    for i in range(len(series_str)):
        key = series_str[i]
        if key in series_dict.keys():
            continue
        series_dict[key] = i
    print(f'Create dictionary for faster search: {time.time()-st:.2f} seconds!')
    
    st = time.time()
    target_vals = series_shifted_str.apply(lambda x: df.loc[series_dict[x], cols[lag]] 
                                                   if x in series_dict else 0)
    print(f'Matching process finished: {time.time()-st:.2f} seconds!')
    return target_vals

def get_all_leak(df, cols=None, nlags=15):
    """
    We just recursively fetch target value for different lags
    """
    df =  df.copy()
    
    for i in range(nlags):
        print("Processing lag {}".format(i))
        df["leaked_target_"+str(i)] = _get_leak(df, cols, i)
    return df

In [7]:
%%time
## test the speed of get leaked data of one lag value
d = _get_leak(all_df, cols, 0)
test_ = all_df
test_['predict'] = d

Shift columns: 0.11 seconds!
Create time series strings before and after shift: 18.39 seconds!
Create dictionary for faster search: 0.75 seconds!
Matching process finished: 0.67 seconds!
CPU times: user 1.73 s, sys: 342 ms, total: 2.07 s
Wall time: 19.8 s


In [8]:
%%time
NLAGS = 38 #Increasing this might help push score a bit
all_df = get_all_leak(all_df, cols=cols, nlags=NLAGS)

leaky_cols = ["leaked_target_"+str(i) for i in range(NLAGS)]
train = train.join(all_df.set_index("ID")[leaky_cols], on="ID", how="left")
test = test.join(all_df.set_index("ID")[leaky_cols], on="ID", how="left")

Processing lag 0
Shift columns: 0.12 seconds!
Create time series strings before and after shift: 23.21 seconds!
Create dictionary for faster search: 0.93 seconds!
Matching process finished: 0.50 seconds!
Processing lag 1
Shift columns: 0.14 seconds!
Create time series strings before and after shift: 15.73 seconds!
Create dictionary for faster search: 1.00 seconds!
Matching process finished: 0.48 seconds!
Processing lag 2
Shift columns: 0.14 seconds!
Create time series strings before and after shift: 15.55 seconds!
Create dictionary for faster search: 0.93 seconds!
Matching process finished: 0.48 seconds!
Processing lag 3
Shift columns: 0.14 seconds!
Create time series strings before and after shift: 18.76 seconds!
Create dictionary for faster search: 0.95 seconds!
Matching process finished: 0.51 seconds!
Processing lag 4
Shift columns: 0.15 seconds!
Create time series strings before and after shift: 22.86 seconds!
Create dictionary for faster search: 1.27 seconds!
Matching process fini

In [9]:
%%time
## post-processing
cnt = 0
for i in range(4300):
    tar = train['target'].iloc[i]
    nz_idx = train[leaky_cols].iloc[i].nonzero()[0]
    try:
        likely = train[leaky_cols].iloc[i][nz_idx].value_counts().reset_index().sort_values(by=i, ascending=False).iloc[0,0]
    except:
        likely = 0.0

    if tar == likely:
        cnt += 1

CPU times: user 36.6 s, sys: 1.31 s, total: 37.9 s
Wall time: 38.7 s


In [14]:
cnt

3179

In [10]:
%%time
train["nonzero_mean"] = train[transact_cols].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)
test["nonzero_mean"] = test[transact_cols].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)

CPU times: user 1min 20s, sys: 16 s, total: 1min 36s
Wall time: 1min 39s


In [11]:
%%time
#We start with 1st lag target and recusrsively fill zero's
train["compiled_leak"] = 0
test["compiled_leak"] = 0
for i in range(NLAGS):
    train.loc[train["compiled_leak"] == 0, "compiled_leak"] = train.loc[train["compiled_leak"] == 0, "leaked_target_"+str(i)]
    test.loc[test["compiled_leak"] == 0, "compiled_leak"] = test.loc[test["compiled_leak"] == 0, "leaked_target_"+str(i)]
    
print("Leak values found in train and test ", sum(train["compiled_leak"] > 0), sum(test["compiled_leak"] > 0))
print("% of correct leaks values in train ", sum(train["compiled_leak"] == train["target"])/sum(train["compiled_leak"] > 0))

train.loc[train["compiled_leak"] == 0, "compiled_leak"] = train.loc[train["compiled_leak"] == 0, "nonzero_mean"]
test.loc[test["compiled_leak"] == 0, "compiled_leak"] = test.loc[test["compiled_leak"] == 0, "nonzero_mean"]

Leak values found in train and test  4383 43394
% of correct leaks values in train  0.8188455395847593
CPU times: user 8.61 s, sys: 9.2 s, total: 17.8 s
Wall time: 15 s


In [12]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y, np.log1p(train["compiled_leak"]).fillna(14.49)))

0.8970406264797846

In [13]:
sub = test[["ID"]]
sub["target"] = test["compiled_leak"]

if not os.path.exists('submissions'):
    os.mkdir('submissions')
    
sub.to_csv('submissions/baseline_submission_with_leaks_'+'_'.join(time.ctime().split())+'.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
