In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)

import dask
import dask.dataframe as dd

from helpers import *
import keggler as kg

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import gc
gc.enable()

import warnings
warnings.simplefilter(action='ignore', category=Warning)

import os, psutil

# Set up a logger to dump messages to both log file and notebook
import logging as logging
def ini_log(filename):
    logger = logging.getLogger(__name__)
    ## avoid multiple printouts due to same handlers added several times
    if not logger.handlers:
        logger.setLevel(logging.DEBUG)

        handlers = [logging.StreamHandler(None), logging.FileHandler(filename, 'a')]

        fmt=logging.Formatter('%(asctime)-15s: %(levelname)s  %(message)s')
        for h in handlers:
            h.setFormatter(fmt)
            logger.addHandler(h)
    return logger
        
log = ini_log('out.log')

#PATH='data_mini/'
#prefix='_mini'

PATH='data/'
SUB_PATH='test_set/' #test_set #training_set
prefix=''#'_0_201807[1-2]'

print(os.listdir(PATH))


['submissions', 'test_set', 'track_features', 'training_set']


In [2]:
from multiprocessing.pool import ThreadPool
import dask
dask.config.set(pool=ThreadPool(3))

<dask.config.set at 0x7ffae3c89a20>

# Read in training data

In [3]:
# Training
# fin = '{}/training_set/log{}*.csv.gz'.format(PATH,prefix)

# Test
# fin=['{}/{}/log_prehistory_20180715_000000000000.csv.gz'.format(PATH,SUB_PATH),
#      '{}/{}/log_prehistory_20180716_000000000000.csv.gz'.format(PATH,SUB_PATH)
#     ]
fin = '{}/{}/log_input_*.csv.gz'.format(PATH,SUB_PATH)

In [4]:
fin

'data//test_set//log_input_*.csv.gz'

In [6]:
dd_trn = dd.read_csv(fin,
#                      usecols=[i for i in range(21) if i != 16], # disable this to process data/test_set/log_input_
#                      usecols=[i for i in range(5)],
                     compression='gzip')[['session_id', 'track_id_clean']]

In [7]:
dd_trn.npartitions

66

In [8]:
# dd_trn.memory_usage(deep=True).compute() / 1024**2

In [9]:
dd_trn.dtypes

session_id        object
track_id_clean    object
dtype: object

In [10]:
#dd_trn.head(5)

# Read in track features

In [11]:
df_trk = pd.concat([pd.read_csv(PATH+'track_features/'+f, 
                     usecols=[22+i for i in range(8)]+[0], 
                     compression='gzip'
                    ) for f in os.listdir(PATH+'track_features/') if f.startswith('tf_')
         ], axis=0).set_index('track_id')

In [12]:
df_trk.memory_usage(deep=True)/1024**2

Index                335.795269
acoustic_vector_0     28.277496
acoustic_vector_1     28.277496
acoustic_vector_2     28.277496
acoustic_vector_3     28.277496
acoustic_vector_4     28.277496
acoustic_vector_5     28.277496
acoustic_vector_6     28.277496
acoustic_vector_7     28.277496
dtype: float64

# Merge DS with Track features

In [13]:
dd_out = dd_trn.merge(df_trk, 
                      how='left', 
                      right_index=True, 
                      left_on='track_id_clean'
                     ).drop(['track_id_clean', 'session_id'], 
                            axis=1
                           )

Cast boolean into `unit8` to reduce file size

In [14]:
for c in dd_out.columns:
    if c.startswith('acoustic_vector_'):
        dd_out[c] = dd_out[c].astype(np.float32)

In [15]:
dd_out.dtypes

acoustic_vector_0    float32
acoustic_vector_1    float32
acoustic_vector_2    float32
acoustic_vector_3    float32
acoustic_vector_4    float32
acoustic_vector_5    float32
acoustic_vector_6    float32
acoustic_vector_7    float32
dtype: object

# Store the DD

In [16]:
dd_out.npartitions

66

In [17]:
# dd_out.to_csv(PATH+'/'+SUB_PATH+'/outDD_v2_*.csv.gz', index=False, float_format='%.5f', compression='gzip')
dd_out.to_hdf(PATH+'/'+SUB_PATH+'/subDD_v4_*.h5', key='df')

['data//test_set//subDD_v4_00.h5',
 'data//test_set//subDD_v4_01.h5',
 'data//test_set//subDD_v4_02.h5',
 'data//test_set//subDD_v4_03.h5',
 'data//test_set//subDD_v4_04.h5',
 'data//test_set//subDD_v4_05.h5',
 'data//test_set//subDD_v4_06.h5',
 'data//test_set//subDD_v4_07.h5',
 'data//test_set//subDD_v4_08.h5',
 'data//test_set//subDD_v4_09.h5',
 'data//test_set//subDD_v4_10.h5',
 'data//test_set//subDD_v4_11.h5',
 'data//test_set//subDD_v4_12.h5',
 'data//test_set//subDD_v4_13.h5',
 'data//test_set//subDD_v4_14.h5',
 'data//test_set//subDD_v4_15.h5',
 'data//test_set//subDD_v4_16.h5',
 'data//test_set//subDD_v4_17.h5',
 'data//test_set//subDD_v4_18.h5',
 'data//test_set//subDD_v4_19.h5',
 'data//test_set//subDD_v4_20.h5',
 'data//test_set//subDD_v4_21.h5',
 'data//test_set//subDD_v4_22.h5',
 'data//test_set//subDD_v4_23.h5',
 'data//test_set//subDD_v4_24.h5',
 'data//test_set//subDD_v4_25.h5',
 'data//test_set//subDD_v4_26.h5',
 'data//test_set//subDD_v4_27.h5',
 'data//test_set//su