In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import lightgbm as lgb
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

import time

In [2]:
def load_data(conf):
    st = time.time()
    if os.path.exists(f'{conf}.feather'):
        out = pd.read_feather(f'{conf}.feather')
        print(f'{conf} data loaded from feather {time.time()-st:.2f} s!')
    else:
        out = pd.read_csv(f'./{conf}.csv.zip')
        print(f'{conf} data loaded from csv {time.time()-st:.2f} s!')
        out.to_feather(f'{conf}.feather')
        print(f'{conf} data save to feather {time.time()-st:.2f} s!')  
    return out 
train = load_data('train')
test = load_data('test')

train data loaded from csv 7.07 s!
train data save to feather 9.70 s!
test data loaded from csv 106.46 s!
test data save to feather 119.63 s!


In [3]:
transact_cols = [f for f in train.columns if f not in ["ID", "target"]]
y = np.log1p(train["target"]).values

In [4]:
cols = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
       '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
       'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
       '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212',  '66ace2992',
       'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
       '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
       '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2',  '0572565c2',
       '190db8488',  'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98'] 

In [5]:
df = train
lag = 0
t1 = df[cols[lag+2:]].apply(lambda x: "_".join(x.round(2).astype(str)), axis=1)
t2 = df[cols].shift(lag+2, axis=1)[cols[lag+2:]].apply(lambda x: "_".join(x.round(2).astype(str)), axis=1)
t3 = t2.apply(lambda x: np.where(x == t1)[0])
t4 = t3.apply(lambda x: df.loc[x[0], cols[lag]] if len(x)==1 else 0)

In [6]:
rr = train.iloc[:, 1]

In [7]:
rr[0]

38000000.0

In [8]:
test["target"] = train["target"].mean()

all_df = pd.concat([train[["ID", "target"] + cols], test[["ID", "target"]+ cols]]).reset_index(drop=True)
all_df.head()
all_df.shape

(53801, 42)

In [25]:
from multiprocessing import Pool
CPU_CORES = 1
def _time_series_info(row):
    try:
        id_1st_nz = row.nonzero()[0][0]
        value_1st_nz = str(row[id_1st_nz])
    except:
        return '0_0.0'   
    return str(id_1st_nz)+'_'+value_1st_nz


def _get_leak(df, cols, lag=0):
    st = time.time()
    series_str = df[cols[lag+2:]].apply(lambda x: "_".join(x.round(2).astype(str)), axis=1)
    series_shifted_str = df[cols].shift(lag+2, axis=1)[cols[lag+2:]].apply(lambda x: 
                                                                           "_".join(x.round(2).astype(str)), 
                                                                           axis=1)   
    print(f'Create time series strings before and after shift: {time.time()-st:.2f} seconds!')
    
    st = time.time()
    series_dict = {}
    for i in range(len(series_str)):
        key = series_str[i]
        if key in series_dict.keys():
            continue
        series_dict[key] = i

    print(f'Create dictionary for faster search: {time.time()-st:.2f} seconds!')
    
    st = time.time()
    target_vals = series_shifted_str.apply(lambda x: df.loc[series_dict[x], cols[lag]] 
                                                   if x in series_dict else 0)
    print(f'Matching process finished: {time.time()-st:.2f} seconds!')
#     target_rows = series_shifted_str.progress_apply(lambda x: series_dict[x])
#     target_vals = target_rows.apply(lambda x: df.loc[x, cols[lag]] if len(x)==1 else 0)
    return target_vals

#     target_rows = series_shifted_str.progress_apply(lambda x: np.where(x == series_str)[0])
#     target_vals = target_rows.apply(lambda x: df.loc[x[0], cols[lag]] if len(x)==1 else 0)
#     return target_vals

def get_all_leak(df, cols=None, nlags=15):
    """
    We just recursively fetch target value for different lags
    """
    df =  df.copy()
    #with Pool(processes=CPU_CORES) as p:
    #    res = [p.apply_async(_get_leak, args=(df, cols, i)) for i in range(nlags)]
    #    res = [r.get() for r in res]
    
    for i in range(nlags):
        print("Processing lag {}".format(i))
        df["leaked_target_"+str(i)] = _get_leak(df, cols, i)
    return df

In [26]:
%%time
d = _get_leak(all_df, cols, 0)

Create time series strings before and after shift: 19.47 seconds!
Create dictionary for faster search: 0.87 seconds!
Matching process finished: 0.44 seconds!
CPU times: user 21.3 s, sys: 134 ms, total: 21.4 s
Wall time: 20.8 s


In [27]:
%%time
NLAGS = 25 #Increasing this might help push score a bit
all_df = get_all_leak(all_df, cols=cols, nlags=NLAGS)

Processing lag 0
Create time series strings before and after shift: 19.52 seconds!
Create dictionary for faster search: 0.89 seconds!
Matching process finished: 0.45 seconds!
Processing lag 1
Create time series strings before and after shift: 19.55 seconds!
Create dictionary for faster search: 0.87 seconds!
Matching process finished: 0.44 seconds!
Processing lag 2
Create time series strings before and after shift: 19.30 seconds!
Create dictionary for faster search: 0.87 seconds!
Matching process finished: 0.44 seconds!
Processing lag 3
Create time series strings before and after shift: 19.25 seconds!
Create dictionary for faster search: 0.86 seconds!
Matching process finished: 0.46 seconds!
Processing lag 4
Create time series strings before and after shift: 19.48 seconds!
Create dictionary for faster search: 0.85 seconds!
Matching process finished: 0.45 seconds!
Processing lag 5
Create time series strings before and after shift: 19.54 seconds!
Create dictionary for faster search: 0.89 

In [28]:
leaky_cols = ["leaked_target_"+str(i) for i in range(NLAGS)]
train = train.join(all_df.set_index("ID")[leaky_cols], on="ID", how="left")
test = test.join(all_df.set_index("ID")[leaky_cols], on="ID", how="left")

In [30]:
train[["target"]+leaky_cols].head(30)

Unnamed: 0,target,leaked_target_0,leaked_target_1,leaked_target_2,leaked_target_3,leaked_target_4,leaked_target_5,leaked_target_6,leaked_target_7,leaked_target_8,...,leaked_target_15,leaked_target_16,leaked_target_17,leaked_target_18,leaked_target_19,leaked_target_20,leaked_target_21,leaked_target_22,leaked_target_23,leaked_target_24
0,38000000.0,38000000.0,38000000.0,38000000.0,0.0,38000000.0,0.0,38000000.0,0.0,0.0,...,0.0,0.0,38000000.0,0.0,38000000.0,0.0,0.0,0.0,0.0,0.0
1,600000.0,600000.0,0.0,0.0,0.0,0.0,0.0,600000.0,0.0,600000.0,...,600000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2000000.0,0.0,2800000.0,0.0,2000000.0,0.0,0.0,0.0,0.0,3644666.66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14400000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2800000.0,0.0,0.0,0.0,2800000.0,0.0,0.0,0.0,2800000.0,0.0,...,2800000.0,0.0,0.0,0.0,2800000.0,0.0,2800000.0,2800000.0,0.0,0.0
6,164000.0,0.0,0.0,0.0,164000.0,0.0,164000.0,0.0,0.0,0.0,...,0.0,164000.0,0.0,0.0,164000.0,0.0,0.0,0.0,0.0,0.0
7,600000.0,600000.0,0.0,0.0,0.0,0.0,0.0,600000.0,0.0,0.0,...,600000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600000.0,0.0
8,979000.0,979000.0,979000.0,979000.0,979000.0,979000.0,979000.0,0.0,0.0,0.0,...,979000.0,0.0,979000.0,979000.0,0.0,0.0,0.0,0.0,0.0,0.0
9,460000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,460000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,460000.0,460000.0,0.0,0.0,0.0


In [31]:
%%time
train["nonzero_mean"] = train[transact_cols].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)
test["nonzero_mean"] = test[transact_cols].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)

In [33]:
%%time
#We start with 1st lag target and recusrsively fill zero's
train["compiled_leak"] = 0
test["compiled_leak"] = 0
for i in range(NLAGS):
    train.loc[train["compiled_leak"] == 0, "compiled_leak"] = train.loc[train["compiled_leak"] == 0, "leaked_target_"+str(i)]
    test.loc[test["compiled_leak"] == 0, "compiled_leak"] = test.loc[test["compiled_leak"] == 0, "leaked_target_"+str(i)]
    
print("Leak values found in train and test ", sum(train["compiled_leak"] > 0), sum(test["compiled_leak"] > 0))
print("% of correct leaks values in train ", sum(train["compiled_leak"] == train["target"])/sum(train["compiled_leak"] > 0))

train.loc[train["compiled_leak"] == 0, "compiled_leak"] = train.loc[train["compiled_leak"] == 0, "nonzero_mean"]
test.loc[test["compiled_leak"] == 0, "compiled_leak"] = test.loc[test["compiled_leak"] == 0, "nonzero_mean"]

Leak values found in train and test  4072 32698
% of correct leaks values in train  0.8747544204322201
CPU times: user 5.13 s, sys: 7.08 s, total: 12.2 s
Wall time: 9.51 s


In [34]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y, np.log1p(train["compiled_leak"]).fillna(14.49)))

0.8014511083501742

In [35]:
sub = test[["ID"]]
sub["target"] = test["compiled_leak"]
sub.to_csv("baseline_submission_with_leaks.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [36]:
result = pd.read_csv('./baseline_submission_with_leaks.csv')

In [37]:
result.shape

(49342, 2)