In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import lightgbm as lgb
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

import time

In [2]:
def load_data(conf):
    st = time.time()
    if os.path.exists(f'{conf}.feather'):
        out = pd.read_feather(f'{conf}.feather')
        print(f'{conf} data loaded from feather {time.time()-st:.2f} s!')
    else:
        out = pd.read_csv(f'./{conf}.csv.zip')
        print(f'{conf} data loaded from csv {time.time()-st:.2f} s!')
        out.to_feather(f'{conf}.feather')
        print(f'{conf} data save to feather {time.time()-st:.2f} s!')  
    return out 
train = load_data('train')
test = load_data('test')

  return feather.read_dataframe(path, nthreads=nthreads)


train data loaded from feather 2.92 s!
test data loaded from feather 22.50 s!


In [3]:
transact_cols = [f for f in train.columns if f not in ["ID", "target"]]
y = np.log1p(train["target"]).values

In [4]:
cols = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
       '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
       'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
       '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212',  '66ace2992',
       'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
       '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
       '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2',  '0572565c2',
       '190db8488',  'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98'] 

In [14]:
%%time
test["target"] = train["target"].mean()

all_df = pd.concat([train[["ID", "target"] + cols], test[["ID", "target"]+ cols]]).reset_index(drop=True)
all_df.head()
all_df.shape

CPU times: user 53.2 ms, sys: 46.9 ms, total: 100 ms
Wall time: 86.1 ms


In [26]:
from multiprocessing import Pool, cpu_count
CPU_CORES = 1
def _time_series_info(row):
    try:
        id_1st_nz = row.nonzero()[0][0]
        value_1st_nz = str(row[id_1st_nz])
    except:
        return '0_0.0'   
    return str(id_1st_nz)+'_'+value_1st_nz

def _join2str(df):
    return df.apply(lambda x: "_".join(x.round(2).astype(str)), axis=1)

def _get_leak(df, cols, lag=0, n_thread=2):
    """
    Get leaked data
    """
    st = time.time()
    
    df_split = np.array_split(df[cols[lag+2:]], n_thread)
    df_shift_split = np.array_split(df[cols].shift(lag+2, axis=1)[cols[lag+2:]], n_thread)
    
    print(f'Shift columns: {time.time()-st:.2f} seconds!')
    with Pool(processes=n_thread) as p:
        result1 = p.map(_join2str, df_split)
        result2 = p.map(_join2str, df_shift_split)
        
    series_str = pd.concat(list(result1), ignore_index=True)
    series_shifted_str = pd.concat(list(result2), ignore_index=True)
#     series_str = df[cols[lag+2:]].apply(lambda x: "_".join(x.round(2).astype(str)), axis=1)
#     series_shifted_str = df[cols].shift(lag+2, axis=1)[cols[lag+2:]].apply(lambda x: 
#                                                                            "_".join(x.round(2).astype(str)), 
#                                                                            axis=1)   
    print(f'Create time series strings before and after shift: {time.time()-st:.2f} seconds!')
    
    st = time.time()
    series_dict = {}
    for i in range(len(series_str)):
        key = series_str[i]
        if key in series_dict.keys():
            continue
        series_dict[key] = i

    print(f'Create dictionary for faster search: {time.time()-st:.2f} seconds!')
    
    st = time.time()
    target_vals = series_shifted_str.apply(lambda x: df.loc[series_dict[x], cols[lag]] 
                                                   if x in series_dict else 0)
    print(f'Matching process finished: {time.time()-st:.2f} seconds!')
#     target_rows = series_shifted_str.progress_apply(lambda x: series_dict[x])
#     target_vals = target_rows.apply(lambda x: df.loc[x, cols[lag]] if len(x)==1 else 0)
    return target_vals

#     target_rows = series_shifted_str.progress_apply(lambda x: np.where(x == series_str)[0])
#     target_vals = target_rows.apply(lambda x: df.loc[x[0], cols[lag]] if len(x)==1 else 0)
#     return target_vals

def get_all_leak(df, cols=None, nlags=15):
    """
    We just recursively fetch target value for different lags
    """
    df =  df.copy()
    
    for i in range(nlags):
        print("Processing lag {}".format(i))
        df["leaked_target_"+str(i)] = _get_leak(df, cols, i)
    return df

In [27]:
%%time
## test the speed of get leaked data of one lag value
d = _get_leak(all_df, cols, 0)
test_ = all_df
test_['predict'] = d

Shift columns: 0.15 seconds!
Create time series strings before and after shift: 13.63 seconds!
Create dictionary for faster search: 0.78 seconds!
Matching process finished: 0.41 seconds!
CPU times: user 1.5 s, sys: 302 ms, total: 1.8 s
Wall time: 14.9 s


In [17]:
%%time
NLAGS = 25 #Increasing this might help push score a bit
all_df = get_all_leak(all_df, cols=cols, nlags=NLAGS)

Processing lag 0
Create time series strings before and after shift: 21.74 seconds!
Create dictionary for faster search: 0.68 seconds!
Matching process finished: 0.36 seconds!
Processing lag 1
Create time series strings before and after shift: 21.55 seconds!
Create dictionary for faster search: 0.67 seconds!
Matching process finished: 0.36 seconds!
Processing lag 2
Create time series strings before and after shift: 24.84 seconds!
Create dictionary for faster search: 0.75 seconds!
Matching process finished: 0.39 seconds!
Processing lag 3
Create time series strings before and after shift: 20.86 seconds!
Create dictionary for faster search: 0.73 seconds!
Matching process finished: 0.43 seconds!
Processing lag 4
Create time series strings before and after shift: 22.76 seconds!
Create dictionary for faster search: 0.70 seconds!
Matching process finished: 0.38 seconds!
Processing lag 5
Create time series strings before and after shift: 21.91 seconds!
Create dictionary for faster search: 0.83 

In [30]:
%%time
leaky_cols = ["leaked_target_"+str(i) for i in range(NLAGS)]
train = train.join(all_df.set_index("ID")[leaky_cols], on="ID", how="left")
test = test.join(all_df.set_index("ID")[leaky_cols], on="ID", how="left")

ValueError: columns overlap but no suffix specified: Index(['leaked_target_0', 'leaked_target_1', 'leaked_target_2',
       'leaked_target_3', 'leaked_target_4', 'leaked_target_5',
       'leaked_target_6', 'leaked_target_7', 'leaked_target_8',
       'leaked_target_9', 'leaked_target_10', 'leaked_target_11',
       'leaked_target_12', 'leaked_target_13', 'leaked_target_14',
       'leaked_target_15', 'leaked_target_16', 'leaked_target_17',
       'leaked_target_18', 'leaked_target_19', 'leaked_target_20',
       'leaked_target_21', 'leaked_target_22', 'leaked_target_23',
       'leaked_target_24'],
      dtype='object')

In [161]:
%%time
## post-processing
cnt = 0
for i in range(4300):
    tar = train['target'].iloc[i]
#     lst = train[leaky_cols].iloc[i].unique()
#     if tar in lst:
#         print(f'True, {len(lst)}, {lst}')
#     else:
#         print(f'False, {len(lst)}, {lst}')
    nz_idx = train[leaky_cols].iloc[i].nonzero()[0]
    try:
        likely = train[leaky_cols].iloc[i][nz_idx].value_counts().reset_index().sort_values(by=i, ascending=False).iloc[0,0]
    except:
        likely = 0.0
#     print(likely)
    if tar == likely:
#         print(f'True')
        cnt += 1
#     else:
#         print(f'False')

In [162]:
print(cnt)

3366


In [163]:
3700*0.97

3589.0

In [31]:
%%time
train["nonzero_mean"] = train[transact_cols].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)
test["nonzero_mean"] = test[transact_cols].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)

In [33]:
%%time
#We start with 1st lag target and recusrsively fill zero's
train["compiled_leak"] = 0
test["compiled_leak"] = 0
for i in range(NLAGS):
    train.loc[train["compiled_leak"] == 0, "compiled_leak"] = train.loc[train["compiled_leak"] == 0, "leaked_target_"+str(i)]
    test.loc[test["compiled_leak"] == 0, "compiled_leak"] = test.loc[test["compiled_leak"] == 0, "leaked_target_"+str(i)]
    
print("Leak values found in train and test ", sum(train["compiled_leak"] > 0), sum(test["compiled_leak"] > 0))
print("% of correct leaks values in train ", sum(train["compiled_leak"] == train["target"])/sum(train["compiled_leak"] > 0))

train.loc[train["compiled_leak"] == 0, "compiled_leak"] = train.loc[train["compiled_leak"] == 0, "nonzero_mean"]
test.loc[test["compiled_leak"] == 0, "compiled_leak"] = test.loc[test["compiled_leak"] == 0, "nonzero_mean"]

Leak values found in train and test  4072 32698
% of correct leaks values in train  0.8747544204322201
CPU times: user 5.13 s, sys: 7.08 s, total: 12.2 s
Wall time: 9.51 s


In [34]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y, np.log1p(train["compiled_leak"]).fillna(14.49)))

0.8014511083501742

In [35]:
sub = test[["ID"]]
sub["target"] = test["compiled_leak"]
sub.to_csv("baseline_submission_with_leaks.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [36]:
result = pd.read_csv('./baseline_submission_with_leaks.csv')

In [37]:
result.shape

(49342, 2)