In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)

import keggler as kg
from helpers import *
#import helpers

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm

import gc
gc.enable()

import warnings
warnings.simplefilter(action='ignore', category=Warning)

import os, psutil
import glob


import logging as logging
def ini_log(filename):
    """Return this module's logger, attaching a file handler on first use.

    The logger is looked up by ``__name__``.  If it already carries at least
    one handler it is returned untouched, so calling this function again
    (e.g. when the cell is re-run) does not duplicate log lines -- and a
    different ``filename`` passed on such a later call is ignored.

    Parameters
    ----------
    filename : str
        Path of the log file; it is opened in append mode.

    Returns
    -------
    logging.Logger
        The (possibly freshly configured) logger.
    """
    logger = logging.getLogger(__name__)

    # Already configured by an earlier call: nothing to add.
    if logger.handlers:
        return logger

    logger.setLevel(logging.DEBUG)

    file_handler = logging.FileHandler(filename, 'a')
    file_handler.setFormatter(
        logging.Formatter('%(asctime)-15s: %(levelname)s  %(message)s')
    )
    logger.addHandler(file_handler)
    return logger
        
# Notebook-wide logger; records are appended to out.log.
log = ini_log("out.log")

# Disabled alternative configuration, kept for reference:
#   PATH='data_mini/'
#   prefix='_mini'

# Root data directory and the sub-directory whose files are processed.
PATH = "data/"
SUB_PATH = "test_set/"  # alternatives noted by the author: test_set, training_set

# Version tags; they are embedded in the input, model and submission file names.
v_data = "v2"
v_model = "m04_4f"

print(os.listdir(PATH))


['submissions', 'test_set', 'track_features', 'training_set']


In [2]:
# Collect, in sorted order, the "outDD_" and "subDD_" HDF5 files of the
# selected data version; both lists are built from the same pattern template.
half_1_logs, half_2_logs = (
    sorted(glob.glob(PATH + SUB_PATH + stem + v_data + "*.h5"))
    for stem in ("outDD_", "subDD_")
)

In [3]:
# Sanity check: show the first three matched "outDD_" files.
half_1_logs[:3]

['data/test_set/outDD_v2_00.h5',
 'data/test_set/outDD_v2_01.h5',
 'data/test_set/outDD_v2_02.h5']

In [4]:
# Sanity check: show the first three matched "subDD_" files.
half_2_logs[:3]

['data/test_set/subDD_v2_00.h5',
 'data/test_set/subDD_v2_01.h5',
 'data/test_set/subDD_v2_02.h5']

In [5]:
# Notebook-level track count.
# NOTE(review): presumably the value meant for the `n_trk` argument of
# make_predictions (one model per track index) -- TODO confirm.
n_trk = 5

# Read dataframes

In [6]:
from sklearn.preprocessing import LabelEncoder

def read_list_of_files(l1, l2, n_trk=1):
    """Load two groups of log files and turn them into model inputs.

    Every path in ``l1`` and in ``l2`` is read with ``read_log`` and the
    frames of each group are stacked row-wise with a fresh index.
    ``session_id`` is replaced by integer codes from a ``LabelEncoder``
    fitted on the first group only, so a session id that occurs only in the
    second group makes ``transform`` raise.  A ``not_skipped`` column, if
    present, is renamed to ``skip_4``.

    Parameters
    ----------
    l1, l2 : iterable
        Paths (as accepted by ``read_log``) forming the first and the second
        group.
    n_trk : int, default 1
        ``list(range(n_trk))`` is forwarded to ``get_XY`` as ``i_``.

    Returns
    -------
    Whatever ``get_XY`` returns; the caller in this notebook unpacks it as
    ``X_trn, [target_length], X_trk``.
    """
    def _load_group(paths):
        # Stack all files of one group into a single frame.
        return pd.concat([read_log(path) for path in paths],
                         axis=0, ignore_index=True)

    first_half = _load_group(l1)
    second_half = _load_group(l2)

    # The encoder only ever sees the session ids of the first group.
    encoder = LabelEncoder().fit(first_half['session_id'])
    for frame in (first_half, second_half):
        frame['session_id'] = encoder.transform(frame['session_id'])
        frame.rename({'not_skipped': 'skip_4'}, axis=1, inplace=True)

    # Release the encoder before the feature transformation, as before.
    del encoder

    track_indices = list(range(n_trk))
    return get_XY((first_half, second_half), aggs, reset_index=True,
                  list_musik_qualities_=list_musik_qualities,
                  aggs_music_qualities_=aggs_music_qualities,
                  i_=track_indices)

In [7]:
def pred_series_of_lists(list_preds, y_length, i_2fill=-2):
    """Expand per-position predictions into one list per row.

    Each row's list starts with its explicit predictions (one per entry of
    ``list_preds``) and is then padded with a repeated value until it has
    ``y_length`` elements.  If ``y_length`` is smaller than the number of
    explicit predictions, the list is truncated to ``y_length`` elements
    instead of being returned over-long.

    Parameters
    ----------
    list_preds : list of array-like
        ``list_preds[i]`` holds the prediction for position ``i`` of every
        row; values are cast to ``uint8``.
    y_length : pandas.Series
        Desired total list length per row (integer dtype); only ``.values``
        is used, so rows are matched by position.
    i_2fill : int, default -2
        Positional index into the row made of the prediction columns followed
        by the trailing ``'len'`` column; the value found there is used for
        padding.  The default ``-2`` selects the last explicit prediction.

    Returns
    -------
    pandas.Series
        One Python list per row.
    """
    n_pred = len(list_preds)

    # One uint8 column per predicted position.
    preds = pd.DataFrame(
        {'pred_{}'.format(i): list_preds[i] for i in range(n_pred)}
    ).astype(np.uint8)

    # Number of positions still missing after the explicit predictions.
    # Use signed arithmetic: with an unsigned y_length dtype the subtraction
    # would wrap around for rows shorter than n_pred.
    preds['len'] = y_length.values.astype(np.int64) - n_pred

    def _row_to_list(row):
        missing = int(row['len'])
        values = row.iloc[:-1].tolist() + [row.iloc[i_2fill]] * missing
        # A negative `missing` means fewer positions than explicit
        # predictions: cut the list down to the requested length.
        return values[:max(n_pred + missing, 0)]

    return preds.apply(_row_to_list, axis=1)

def make_predictions(l1, l2, n_trk=1):
    """Run the chain of per-track models on one chunk of files.

    The inputs are prepared by ``read_list_of_files``.  For every track index
    ``0 .. n_trk-1`` the model stored at
    ``models/model_<v_data>_<v_model>_<index>.pkl`` is loaded and asked for
    class-1 probabilities.  Its feature matrix is ``X_trn`` joined column-wise
    with that track's features and with the probabilities already produced
    for all earlier tracks (columns ``pred_trk<j>``).

    Parameters
    ----------
    l1, l2 : list
        File paths forwarded to ``read_list_of_files``.
    n_trk : int, default 1
        Number of track indices, i.e. of models, to evaluate.

    Returns
    -------
    The result of ``pred_series_of_lists`` applied to the probabilities
    thresholded at 0.50 and to the target lengths.
    """
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    import joblib

    X_trn, [target_length], X_trk = read_list_of_files(l1, l2, n_trk=n_trk)

    track_probs = []
    for trk in tqdm(list(range(n_trk))):
        features = pd.concat([X_trn, X_trk[trk]], axis=1)
        if track_probs:
            # Feed the probabilities of all previous tracks to this model.
            earlier = pd.DataFrame({'pred_trk{}'.format(j): prob
                                    for j, prob in enumerate(track_probs)})
            features = pd.concat([features, earlier], axis=1)

        model_path = 'models/model_{}_{}_{}.pkl'.format(v_data, v_model, trk)
        model = joblib.load(model_path)

        track_probs.append(model.predict_proba(features)[:, 1])

    decisions = [prob > 0.50 for prob in track_probs]
    return pred_series_of_lists(decisions, target_length, i_2fill=-2)

In [8]:
#!rm data/submissions/sub_v1.txt

In [9]:
# Number of chunks the file lists are split into; each chunk is predicted and
# written separately. reshape() below requires the number of files to be a
# multiple of n_part.
n_part = 33

# The two lists are paired chunk by chunk, so they must have the same length.
assert len(half_1_logs) == len(half_2_logs), 'outDD/subDD file lists differ in length'

sub_file = PATH+'submissions/sub_'+v_data+'_'+v_model+'.txt'

for i, (a1, a2) in enumerate(zip(np.array(half_1_logs).reshape(n_part, -1),
                                 np.array(half_2_logs).reshape(n_part, -1))):
    print(i)
    print(a1, a2)
    # Use the notebook-level n_trk (set in an earlier cell) rather than
    # repeating the literal value here.
    x1 = make_predictions(a1.tolist(), a2.tolist(), n_trk=n_trk)
    gc.collect()
    # Resident memory of this process in MiB, after prediction.
    log.info(psutil.Process(os.getpid()).memory_info().rss / 1024**2)

    # One line of concatenated digits per row.
    # - header=False: newer pandas would otherwise write a header line for
    #   every chunk into the submission file.
    # - The first chunk truncates the file, later chunks append, so re-running
    #   the cell does not pile predictions onto a stale file.
    x1.apply(lambda l: ''.join(map(str, l))).to_csv(
        sub_file, index=False, header=False, mode='w' if i == 0 else 'a')
    gc.collect()
    # Resident memory of this process in MiB, after writing.
    log.info(psutil.Process(os.getpid()).memory_info().rss / 1024**2)

0
['data/test_set/outDD_v2_00.h5' 'data/test_set/outDD_v2_01.h5'] ['data/test_set/subDD_v2_00.h5' 'data/test_set/subDD_v2_01.h5']


100%|██████████| 5/5 [00:33<00:00,  6.39s/it]


1
['data/test_set/outDD_v2_02.h5' 'data/test_set/outDD_v2_03.h5'] ['data/test_set/subDD_v2_02.h5' 'data/test_set/subDD_v2_03.h5']


100%|██████████| 5/5 [00:33<00:00,  6.39s/it]


2
['data/test_set/outDD_v2_04.h5' 'data/test_set/outDD_v2_05.h5'] ['data/test_set/subDD_v2_04.h5' 'data/test_set/subDD_v2_05.h5']


100%|██████████| 5/5 [00:33<00:00,  6.43s/it]


3
['data/test_set/outDD_v2_06.h5' 'data/test_set/outDD_v2_07.h5'] ['data/test_set/subDD_v2_06.h5' 'data/test_set/subDD_v2_07.h5']


100%|██████████| 5/5 [00:26<00:00,  5.24s/it]


4
['data/test_set/outDD_v2_08.h5' 'data/test_set/outDD_v2_09.h5'] ['data/test_set/subDD_v2_08.h5' 'data/test_set/subDD_v2_09.h5']


100%|██████████| 5/5 [00:27<00:00,  5.50s/it]


5
['data/test_set/outDD_v2_10.h5' 'data/test_set/outDD_v2_11.h5'] ['data/test_set/subDD_v2_10.h5' 'data/test_set/subDD_v2_11.h5']


100%|██████████| 5/5 [00:27<00:00,  5.55s/it]


6
['data/test_set/outDD_v2_12.h5' 'data/test_set/outDD_v2_13.h5'] ['data/test_set/subDD_v2_12.h5' 'data/test_set/subDD_v2_13.h5']


100%|██████████| 5/5 [00:28<00:00,  5.65s/it]


7
['data/test_set/outDD_v2_14.h5' 'data/test_set/outDD_v2_15.h5'] ['data/test_set/subDD_v2_14.h5' 'data/test_set/subDD_v2_15.h5']


100%|██████████| 5/5 [00:25<00:00,  5.15s/it]


8
['data/test_set/outDD_v2_16.h5' 'data/test_set/outDD_v2_17.h5'] ['data/test_set/subDD_v2_16.h5' 'data/test_set/subDD_v2_17.h5']


100%|██████████| 5/5 [00:27<00:00,  5.40s/it]


9
['data/test_set/outDD_v2_18.h5' 'data/test_set/outDD_v2_19.h5'] ['data/test_set/subDD_v2_18.h5' 'data/test_set/subDD_v2_19.h5']


100%|██████████| 5/5 [00:28<00:00,  5.74s/it]


10
['data/test_set/outDD_v2_20.h5' 'data/test_set/outDD_v2_21.h5'] ['data/test_set/subDD_v2_20.h5' 'data/test_set/subDD_v2_21.h5']


100%|██████████| 5/5 [00:24<00:00,  4.96s/it]


11
['data/test_set/outDD_v2_22.h5' 'data/test_set/outDD_v2_23.h5'] ['data/test_set/subDD_v2_22.h5' 'data/test_set/subDD_v2_23.h5']


100%|██████████| 5/5 [00:27<00:00,  5.45s/it]


12
['data/test_set/outDD_v2_24.h5' 'data/test_set/outDD_v2_25.h5'] ['data/test_set/subDD_v2_24.h5' 'data/test_set/subDD_v2_25.h5']


100%|██████████| 5/5 [00:27<00:00,  5.53s/it]


13
['data/test_set/outDD_v2_26.h5' 'data/test_set/outDD_v2_27.h5'] ['data/test_set/subDD_v2_26.h5' 'data/test_set/subDD_v2_27.h5']


100%|██████████| 5/5 [00:28<00:00,  5.59s/it]


14
['data/test_set/outDD_v2_28.h5' 'data/test_set/outDD_v2_29.h5'] ['data/test_set/subDD_v2_28.h5' 'data/test_set/subDD_v2_29.h5']


100%|██████████| 5/5 [00:25<00:00,  5.11s/it]


15
['data/test_set/outDD_v2_30.h5' 'data/test_set/outDD_v2_31.h5'] ['data/test_set/subDD_v2_30.h5' 'data/test_set/subDD_v2_31.h5']


100%|██████████| 5/5 [00:27<00:00,  5.43s/it]


16
['data/test_set/outDD_v2_32.h5' 'data/test_set/outDD_v2_33.h5'] ['data/test_set/subDD_v2_32.h5' 'data/test_set/subDD_v2_33.h5']


100%|██████████| 5/5 [00:30<00:00,  6.12s/it]


17
['data/test_set/outDD_v2_34.h5' 'data/test_set/outDD_v2_35.h5'] ['data/test_set/subDD_v2_34.h5' 'data/test_set/subDD_v2_35.h5']


100%|██████████| 5/5 [00:26<00:00,  5.29s/it]


18
['data/test_set/outDD_v2_36.h5' 'data/test_set/outDD_v2_37.h5'] ['data/test_set/subDD_v2_36.h5' 'data/test_set/subDD_v2_37.h5']


100%|██████████| 5/5 [00:27<00:00,  5.44s/it]


19
['data/test_set/outDD_v2_38.h5' 'data/test_set/outDD_v2_39.h5'] ['data/test_set/subDD_v2_38.h5' 'data/test_set/subDD_v2_39.h5']


100%|██████████| 5/5 [00:27<00:00,  5.55s/it]


20
['data/test_set/outDD_v2_40.h5' 'data/test_set/outDD_v2_41.h5'] ['data/test_set/subDD_v2_40.h5' 'data/test_set/subDD_v2_41.h5']


100%|██████████| 5/5 [00:28<00:00,  5.62s/it]


21
['data/test_set/outDD_v2_42.h5' 'data/test_set/outDD_v2_43.h5'] ['data/test_set/subDD_v2_42.h5' 'data/test_set/subDD_v2_43.h5']


100%|██████████| 5/5 [00:25<00:00,  5.03s/it]


22
['data/test_set/outDD_v2_44.h5' 'data/test_set/outDD_v2_45.h5'] ['data/test_set/subDD_v2_44.h5' 'data/test_set/subDD_v2_45.h5']


100%|██████████| 5/5 [00:27<00:00,  5.50s/it]


23
['data/test_set/outDD_v2_46.h5' 'data/test_set/outDD_v2_47.h5'] ['data/test_set/subDD_v2_46.h5' 'data/test_set/subDD_v2_47.h5']


100%|██████████| 5/5 [00:31<00:00,  6.09s/it]


24
['data/test_set/outDD_v2_48.h5' 'data/test_set/outDD_v2_49.h5'] ['data/test_set/subDD_v2_48.h5' 'data/test_set/subDD_v2_49.h5']


100%|██████████| 5/5 [00:24<00:00,  4.90s/it]


25
['data/test_set/outDD_v2_50.h5' 'data/test_set/outDD_v2_51.h5'] ['data/test_set/subDD_v2_50.h5' 'data/test_set/subDD_v2_51.h5']


100%|██████████| 5/5 [00:25<00:00,  5.04s/it]


26
['data/test_set/outDD_v2_52.h5' 'data/test_set/outDD_v2_53.h5'] ['data/test_set/subDD_v2_52.h5' 'data/test_set/subDD_v2_53.h5']


100%|██████████| 5/5 [00:28<00:00,  5.67s/it]


27
['data/test_set/outDD_v2_54.h5' 'data/test_set/outDD_v2_55.h5'] ['data/test_set/subDD_v2_54.h5' 'data/test_set/subDD_v2_55.h5']


100%|██████████| 5/5 [00:27<00:00,  5.44s/it]


28
['data/test_set/outDD_v2_56.h5' 'data/test_set/outDD_v2_57.h5'] ['data/test_set/subDD_v2_56.h5' 'data/test_set/subDD_v2_57.h5']


100%|██████████| 5/5 [00:24<00:00,  4.95s/it]


29
['data/test_set/outDD_v2_58.h5' 'data/test_set/outDD_v2_59.h5'] ['data/test_set/subDD_v2_58.h5' 'data/test_set/subDD_v2_59.h5']


100%|██████████| 5/5 [00:26<00:00,  5.30s/it]


30
['data/test_set/outDD_v2_60.h5' 'data/test_set/outDD_v2_61.h5'] ['data/test_set/subDD_v2_60.h5' 'data/test_set/subDD_v2_61.h5']


100%|██████████| 5/5 [00:28<00:00,  5.60s/it]


31
['data/test_set/outDD_v2_62.h5' 'data/test_set/outDD_v2_63.h5'] ['data/test_set/subDD_v2_62.h5' 'data/test_set/subDD_v2_63.h5']


100%|██████████| 5/5 [00:23<00:00,  4.71s/it]


32
['data/test_set/outDD_v2_64.h5' 'data/test_set/outDD_v2_65.h5'] ['data/test_set/subDD_v2_64.h5' 'data/test_set/subDD_v2_65.h5']


100%|██████████| 5/5 [00:26<00:00,  5.28s/it]


In [10]:
# Long listing of the submissions directory, sorted by modification time with the newest entry last.
!ls -ltr data/submissions/

total 2585700
-rwxrwxrwx 1 root root 294194705 Nov 13 10:06 fixed_submission_noskip.txt
-rwxrwxrwx 1 root root 294194705 Nov 13 10:06 fixed_submission_skip.txt
-rwxrwxrwx 1 root root 294194705 Nov 13 10:06 random_submission.txt
-rwxrwxrwx 1 root root 294194705 Dez 18 18:54 sub_last_played_0.csv
-rwxrwxrwx 1 root root 294194705 Dez 18 20:18 sub_last_played_mp_0.csv
-rwxrwxrwx 1 root root 294194705 Dez 22 10:40 sub_v1.txt
-rwxrwxrwx 1 root root 294194705 Dez 23 09:15 sub_v2_m01.txt
-rwxrwxrwx 1 root root 294194705 Dez 23 18:16 sub_v2_m02.txt
-rwxrwxrwx 1 root root 294194705 Dez 28 19:49 sub_v2_m04_4f.txt


In [11]:
!wc -l


^C
