In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)

import keggler as kg
from helpers import *
#import helpers

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm

import gc
gc.enable()

import warnings
warnings.simplefilter(action='ignore', category=Warning)

import os, psutil
import glob


import logging as logging
def ini_log(filename):
    """Return this module's logger, wiring up a file handler on first use.

    The logger is looked up by ``__name__``. If it already carries at least
    one handler it is returned untouched (``filename`` is then ignored), which
    keeps re-running the cell from duplicating every log line.

    Parameters
    ----------
    filename : str
        Path of the log file; it is opened in append mode.

    Returns
    -------
    logging.Logger
        The configured logger, with level DEBUG when configured here.
    """
    logger = logging.getLogger(__name__)

    # Guard clause: a logger that already has handlers was set up earlier.
    if logger.handlers:
        return logger

    logger.setLevel(logging.DEBUG)

    # Output goes to the file only; no stream handler is attached.
    file_handler = logging.FileHandler(filename, 'a')
    file_handler.setFormatter(
        logging.Formatter('%(asctime)-15s: %(levelname)s  %(message)s')
    )
    logger.addHandler(file_handler)
    return logger
        
# Notebook-wide file logger; records are appended to out.log.
log = ini_log('out.log')

# Reduced dataset for quick experiments (disabled):
#PATH='data_mini/'
#prefix='_mini'

# Data root and the sub-directory whose chunks are processed.
PATH = 'data/'
SUB_PATH = 'test_set/'  # test_set #training_set

# Version tags: v_data / v_xtra select the input .h5 files,
# v_model names the saved models that get averaged.
v_data = 'v2'
v_xtra = 'v4'
v_model = list(map('m05i{}_6f'.format, range(5)))

print(os.listdir(PATH))


['submissions', 'test_set', 'track_features', 'training_set']


In [2]:
def _list_chunks(stem, version):
    """Return the sorted paths matching PATH + SUB_PATH + stem + version + '*.h5'."""
    return sorted(glob.glob(PATH + SUB_PATH + stem + version + "*.h5"))

# Sorting keeps the four lists in the same chunk order so they can be zipped.
half_1_logs = _list_chunks("outDD_", v_data)
half_1_xtra = _list_chunks("outDD_", v_xtra)
half_2_logs = _list_chunks("subDD_", v_data)
half_2_xtra = _list_chunks("subDD_", v_xtra)

In [3]:
# Sanity check: show the first three paths matched for half_1_logs.
half_1_logs[:3]

['data/test_set/outDD_v2_00.h5',
 'data/test_set/outDD_v2_01.h5',
 'data/test_set/outDD_v2_02.h5']

In [4]:
# Sanity check: show the first three paths matched for half_2_logs.
half_2_logs[:3]

['data/test_set/subDD_v2_00.h5',
 'data/test_set/subDD_v2_01.h5',
 'data/test_set/subDD_v2_02.h5']

In [5]:
# Module-level n_trk.
# NOTE(review): no visible cell reads this global. The functions below take
# n_trk as a parameter and the submission loop passes n_trk=10 explicitly.
# Confirm whether this should be 10, be passed to the loop, or be removed.
n_trk = 5

# Read dataframes

In [6]:
from sklearn.preprocessing import LabelEncoder

def _read_half(log_files, xtr_files=None):
    """Load one half of the data: stacked log chunks plus optional extra columns.

    Parameters
    ----------
    log_files : list of str
        Paths passed to ``read_log``; the resulting frames are stacked row-wise
        with a fresh 0..n-1 index.
    xtr_files : list of str, optional
        Paths of the extra-feature chunks. When given (and non-empty) they are
        stacked the same way and appended column-wise.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the extra table does not have the same number of rows as the log
        table. ``pd.concat(axis=1)`` aligns on the index and would otherwise
        pad the shorter table with NaN rows without any error.
    """
    df = pd.concat([read_log(f) for f in log_files], axis=0, ignore_index=True)
    if xtr_files:
        df_xtr = pd.concat([read_log(f) for f in xtr_files], axis=0, ignore_index=True)
        if len(df_xtr) != len(df):
            raise ValueError(
                'extra-feature files have {} rows but log files have {} rows; '
                'cannot join them column-wise'.format(len(df_xtr), len(df))
            )
        df = pd.concat([df, df_xtr], axis=1)
        # Release the extra table right away to keep peak memory down.
        del df_xtr
    return df


def read_list_of_files(l1, l2, l1_xtr=None, l2_xtr=None, n_trk=1):
    """Read both halves from disk and turn them into model inputs via ``get_XY``.

    Parameters
    ----------
    l1, l2 : list of str
        Log-file paths for the first and the second half.
    l1_xtr, l2_xtr : list of str, optional
        Matching extra-feature file paths, joined column-wise to ``l1`` / ``l2``.
    n_trk : int, default 1
        ``get_XY`` is called with ``i_=list(range(n_trk))``.

    Returns
    -------
    The return value of ``get_XY``. ``make_predictions`` unpacks it as
    ``X_trn, [target_length], X_trk``.

    Raises
    ------
    ValueError
        If an extra table's row count differs from its log table (see
        ``_read_half``), or if the second half contains a ``session_id`` that
        is absent from the first half (raised by ``LabelEncoder.transform``).
    """
    df_1 = _read_half(l1, l1_xtr)
    df_2 = _read_half(l2, l2_xtr)

    # The encoder is fit on the first half only and then applied to both, so
    # equal session ids map to equal integer codes in the two frames.
    le = LabelEncoder().fit(df_1['session_id'])
    for df in (df_1, df_2):
        df['session_id'] = le.transform(df['session_id'])
        # In-place rename as in the original, so the frame is not rebound.
        df.rename({'not_skipped': 'skip_4'}, axis=1, inplace=True)
    del le

    # transform the data
    return get_XY((df_1, df_2), aggs, reset_index=True,
                  list_musik_qualities_=list_musik_qualities,
                  aggs_music_qualities_=aggs_music_qualities,
                  i_=list(range(n_trk)),
                  aggs_trkvec_=aggs_trkvec, list_trkvec_=list_trkvec
                 )

In [7]:
def pred_series_of_lists(list_preds, y_length, i_2fill=-2):
    """Turn per-position prediction arrays into one list of predictions per row.

    Parameters
    ----------
    list_preds : list of array-like
        One equal-length array per predicted position; values are cast to
        ``uint8`` (so boolean inputs become 0/1).
    y_length : pandas.Series
        Desired output length for each row; only ``.values`` is used.
    i_2fill : int, default -2
        Positional index into the row ``[pred_0, ..., pred_k, len]`` of the
        value used for padding. The default ``-2`` is the last prediction.

    Returns
    -------
    pandas.Series
        One list per row: the ``len(list_preds)`` predictions followed by
        ``y_length - len(list_preds)`` copies of the fill value.

    Notes
    -----
    NOTE(review): when ``y_length`` is smaller than ``len(list_preds)`` the
    repeat count is negative, so nothing is appended and the row keeps all
    ``len(list_preds)`` predictions. It is not truncated to ``y_length``.
    Confirm this is what the consumer of the output expects.
    """
    n_preds = len(list_preds)

    # One column per predicted position, stored compactly as 0/1.
    frame = pd.DataFrame(
        {'pred_{}'.format(pos): list_preds[pos] for pos in range(n_preds)}
    ).astype(np.uint8)
    # How many values are still missing to reach the desired length.
    frame['len'] = y_length.values - n_preds

    def _row_to_list(row):
        # Everything except the trailing 'len' column is a prediction.
        head = row.iloc[:-1].tolist()
        tail = [row.iloc[i_2fill]] * row['len']
        return head + tail

    return frame.apply(_row_to_list, axis=1)

def make_predictions(l1, l2, l1_xtr=None, l2_xtr=None, n_trk=1, threshold=0.50):
    """Predict the first ``n_trk`` positions for every row of one data chunk.

    For each position ``i_`` the models named in the module-level ``v_model``
    are loaded from ``models/model_<v_data>_<v>_<i_>.pkl`` and their
    ``predict_proba(X)[:, 1]`` outputs are averaged. The averaged
    probabilities of all earlier positions are appended to ``X`` as
    ``pred_trk<j>`` columns before the next position is predicted.

    Parameters
    ----------
    l1, l2 : list of str
        Log-file paths for the first and second half (see ``read_list_of_files``).
    l1_xtr, l2_xtr : list of str, optional
        Matching extra-feature file paths.
    n_trk : int, default 1
        Number of positions to predict.
    threshold : float, default 0.50
        An averaged probability strictly greater than this becomes 1, else 0.

    Returns
    -------
    pandas.Series
        One list of 0/1 predictions per row, built by ``pred_series_of_lists``
        (padded with the last prediction up to the target length).

    Raises
    ------
    ValueError
        If ``v_model`` is empty. With warnings silenced, the division by zero
        would otherwise yield NaN probabilities and all-zero predictions.
    """
    # Imported once per call instead of once per loop iteration. It stays
    # local because the notebook's import cell does not import joblib.
    import joblib

    if not v_model:
        raise ValueError('v_model is empty: there are no models to average')

    X_trn, [target_length], X_trk = read_list_of_files(l1, l2, l1_xtr=l1_xtr, l2_xtr=l2_xtr, n_trk=n_trk)

    l_prob = []
    for i_ in tqdm(range(n_trk)):
        X = pd.concat([X_trn, X_trk[i_]], axis=1)
        if l_prob:
            # Give the cascaded predictions X's own index so the column-wise
            # concat cannot misalign rows if X does not use a default index.
            prev_probs = pd.DataFrame(
                {'pred_trk{}'.format(j): p for j, p in enumerate(l_prob)},
                index=X.index,
            )
            X = pd.concat([X, prev_probs], axis=1)

        prob_pred = np.zeros(shape=(len(X),))
        for v in v_model:
            mfn = 'models/model_{}_{}_{}.pkl'.format(v_data, v, i_)
            # joblib.load unpickles the file: only load model files you trust.
            mdl = joblib.load(mfn)
            prob_pred += mdl.predict_proba(X)[:, 1]
            # Drop the model before loading the next one to limit peak memory.
            del mdl

        prob_pred /= len(v_model)

        l_prob.append(prob_pred)

    return pred_series_of_lists([p > threshold for p in l_prob], target_length, i_2fill=-2)

In [8]:
#!rm data/submissions/sub_v1.txt

In [None]:
# Number of chunks the file lists are split into. reshape(n_part, -1) requires
# the number of files in every list to be a multiple of n_part.
n_part = 66

# One submission file per (data version, extra version, first model name).
sub_fn = PATH+'submissions/sub_'+v_data+v_xtra+'_'+v_model[0]+'.txt'


def _log_rss():
    """Write the resident memory of this process (in MiB) to the log file."""
    log.info(psutil.Process(os.getpid()).memory_info().rss / 1024**2)


chunks = zip(np.array(half_1_logs).reshape(n_part,-1),
             np.array(half_2_logs).reshape(n_part,-1),
             np.array(half_1_xtra).reshape(n_part,-1),
             np.array(half_2_xtra).reshape(n_part,-1),
            )

# tqdm wraps the zip directly and is told the total, so the outer bar shows
# progress out of n_part instead of a bare iteration counter.
for i, (a1, a2, a1_xtr, a2_xtr) in enumerate(tqdm(chunks, total=n_part)):
    # n_trk=10 is passed explicitly; the module-level n_trk is not used here.
    x1 = make_predictions(a1.tolist(), a2.tolist(),
                          l1_xtr=a1_xtr.tolist(), l2_xtr=a2_xtr.tolist(),
                          n_trk=10)
    gc.collect()
    _log_rss()

    # header=False: since pandas 0.24 Series.to_csv writes a header line by
    # default, which would insert a stray line for every appended chunk.
    # The first chunk truncates the file so that re-running this cell does not
    # append to the output of a previous run; later chunks append.
    x1.apply(lambda l: ''.join(map(str,l))).to_csv(sub_fn, index=False, header=False,
                                                   mode='w' if i == 0 else 'a')
    # Drop the reference first, otherwise the collect below cannot free it.
    del x1
    gc.collect()
    _log_rss()

0it [00:00, ?it/s]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:13<02:04, 13.81s/it][A
 20%|██        | 2/10 [00:30<01:56, 14.58s/it][A
 30%|███       | 3/10 [00:43<01:39, 14.16s/it][A
 40%|████      | 4/10 [00:56<01:23, 13.85s/it][A
 50%|█████     | 5/10 [01:09<01:08, 13.64s/it][A
 60%|██████    | 6/10 [01:21<00:52, 13.25s/it][A
 70%|███████   | 7/10 [01:34<00:38, 12.89s/it][A
 80%|████████  | 8/10 [01:44<00:24, 12.08s/it][A
 90%|█████████ | 9/10 [01:55<00:11, 11.91s/it][A
100%|██████████| 10/10 [02:05<00:00, 11.29s/it][A
1it [05:18, 318.16s/it]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:14<02:13, 14.83s/it][A
 20%|██        | 2/10 [00:33<02:08, 16.01s/it][A
 30%|███       | 3/10 [00:48<01:49, 15.65s/it][A
 40%|████      | 4/10 [01:03<01:33, 15.59s/it][A
 50%|█████     | 5/10 [01:18<01:17, 15.43s/it][A
 60%|██████    | 6/10 [01:32<00:59, 14.86s/it][A
 70%|███████   | 7/10 [01:45<00:43, 14.43s/it][A
 80%|████████  | 8/10 

 30%|███       | 3/10 [00:28<01:04,  9.16s/it][A
 40%|████      | 4/10 [00:36<00:53,  8.95s/it][A
 50%|█████     | 5/10 [00:45<00:44,  8.85s/it][A
 60%|██████    | 6/10 [00:53<00:34,  8.58s/it][A
 70%|███████   | 7/10 [01:00<00:25,  8.34s/it][A
 80%|████████  | 8/10 [01:07<00:15,  7.85s/it][A
 90%|█████████ | 9/10 [01:15<00:07,  7.75s/it][A
100%|██████████| 10/10 [01:21<00:00,  7.37s/it][A
15it [3:12:53, 740.20s/it]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:09<01:24,  9.39s/it][A
 20%|██        | 2/10 [00:20<01:19,  9.97s/it][A
 30%|███       | 3/10 [00:29<01:07,  9.69s/it][A
 40%|████      | 4/10 [00:38<00:57,  9.53s/it][A
 50%|█████     | 5/10 [00:47<00:46,  9.39s/it][A
 60%|██████    | 6/10 [00:56<00:36,  9.13s/it][A
 70%|███████   | 7/10 [01:04<00:26,  8.93s/it][A
 80%|████████  | 8/10 [01:12<00:16,  8.43s/it][A
 90%|█████████ | 9/10 [01:20<00:08,  8.34s/it][A
100%|██████████| 10/10 [01:27<00:00,  7.96s/it][A
16it [3:16:52, 589.76s/it]
  

 60%|██████    | 6/10 [00:53<00:34,  8.55s/it][A
 70%|███████   | 7/10 [01:01<00:25,  8.40s/it][A
 80%|████████  | 8/10 [01:08<00:15,  7.98s/it][A
 90%|█████████ | 9/10 [01:15<00:07,  7.89s/it][A
100%|██████████| 10/10 [01:22<00:00,  7.49s/it][A
29it [5:18:41, 499.59s/it]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:09<01:28,  9.81s/it][A
 20%|██        | 2/10 [00:21<01:24, 10.52s/it][A
 30%|███       | 3/10 [00:31<01:11, 10.27s/it][A
 40%|████      | 4/10 [00:41<01:00, 10.11s/it][A
 50%|█████     | 5/10 [00:51<00:50, 10.00s/it][A
 60%|██████    | 6/10 [01:00<00:38,  9.72s/it][A
 70%|███████   | 7/10 [01:09<00:28,  9.49s/it][A
 80%|████████  | 8/10 [01:16<00:17,  8.94s/it][A
 90%|█████████ | 9/10 [01:25<00:08,  8.75s/it][A
100%|██████████| 10/10 [01:32<00:00,  8.33s/it][A
30it [5:22:01, 409.76s/it]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:10<01:31, 10.17s/it][A
 20%|██        | 2/10 [00:23<01:28, 11.10s/it][A
 30%|███  

100%|██████████| 10/10 [01:18<00:00,  7.26s/it][A
43it [6:06:10, 196.93s/it]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:09<01:25,  9.46s/it][A
 20%|██        | 2/10 [00:21<01:21, 10.20s/it][A
 30%|███       | 3/10 [00:30<01:09,  9.92s/it][A
 40%|████      | 4/10 [00:39<00:58,  9.73s/it][A
 50%|█████     | 5/10 [00:49<00:48,  9.63s/it][A
 60%|██████    | 6/10 [00:58<00:37,  9.34s/it][A
 70%|███████   | 7/10 [01:06<00:27,  9.23s/it][A
 80%|████████  | 8/10 [01:14<00:17,  8.73s/it][A
 90%|█████████ | 9/10 [01:22<00:08,  8.61s/it][A
100%|██████████| 10/10 [01:30<00:00,  8.23s/it][A
44it [6:09:25, 196.48s/it]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:10<01:30, 10.09s/it][A
 20%|██        | 2/10 [00:24<01:31, 11.38s/it][A
 30%|███       | 3/10 [00:35<01:18, 11.29s/it][A
 40%|████      | 4/10 [00:46<01:07, 11.19s/it][A
 50%|█████     | 5/10 [00:58<00:57, 11.46s/it][A
 60%|██████    | 6/10 [01:09<00:44, 11.24s/it][A
 70%|█████

In [None]:
# List the submissions directory in long format, oldest entry first.
!ls -ltr data/submissions/

In [None]:
!wc -l
