In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)

import keggler as kg

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm

import gc
gc.enable()

import warnings
warnings.simplefilter(action='ignore', category=Warning)

import os, psutil

# Set up a logger to dump messages to both log file and notebook
import logging as logging
def ini_log(filename):
    logger = logging.getLogger(__name__)
    ## avoid multiple printouts due to same handlers added several times
    if not logger.handlers:
        logger.setLevel(logging.DEBUG)

        handlers = [#logging.StreamHandler(None), 
            logging.FileHandler(filename, 'a')
        ]

        fmt=logging.Formatter('%(asctime)-15s: %(levelname)s  %(message)s')
        for h in handlers:
            h.setFormatter(fmt)
            logger.addHandler(h)
    return logger
        
log = ini_log('out.log')

#PATH='data_mini/'
#prefix='_mini'

PATH='data/'
prefix=''

print(os.listdir(PATH))


['20181113_submissions.tar.gz', '20181113_test_set.tar.gz', 'submissions', 'track_features', 'training_set', 'training_set_0.tar.gz']


# Read in track features

In [2]:
from os import listdir
listdir(PATH+'track_features/')

['._tf_000000000000.csv',
 '._tf_000000000001.csv',
 'tf_000000000000.csv.gz',
 'tf_000000000001.csv.gz']

In [3]:
df_trk = pd.concat([pd.read_csv(PATH+'track_features/'+f, 
                     usecols=range(4), 
                     dtype={'release_year': np.uint32},
                     compression='gzip'
                    ) for f in listdir(PATH+'track_features/') if f.startswith('tf_')
         ], axis=0)

In [4]:
df_trk.memory_usage(deep=True)/1024**2

Index                      28.277496
track_id                  335.795269
duration                   28.277496
release_year               14.138748
us_popularity_estimate     28.277496
dtype: float64

In [5]:
# df_trk = kg.reduce_mem_usage(df_trk)

In [6]:
# df_trk.head()

# Merge DS with Track features

In [19]:
def preprocess_data(fin_, path_, df_trk_):
    df_trn = pd.read_csv(path_ + '/' + fin_,
                         usecols=[i for i in range(21) if i != 16]
                        )
    enc = {
        'hist_user_behavior_reason_start': {'trackdone': 1, 'fwdbtn': 2, 'trackerror': 8, 'remote': 7, 'clickrow': 4, 'backbtn': 3, 'playbtn': 6, 'appload': 5, 'endplay': 9},
        'context_type': {'radio': 3, 'personalized_playlist': 4, 'charts': 6, 'user_collection': 2, 'editorial_playlist': 1, 'catalog': 5},
        'hist_user_behavior_reason_end': {'trackdone': 1, 'fwdbtn': 2, 'logout': 5, 'clickrow': 7, 'backbtn': 3, 'endplay': 4, 'remote': 6}
    }

    for c in enc:
        df_trn[c] = df_trn[c].map(enc[c]).fillna(0).astype(np.uint8)
        
    df_out = df_trn.merge(df_trk_, 
                          how='left', 
                          right_on='track_id', 
                          left_on='track_id_clean'
                         ).drop(['track_id_clean', 'track_id'], 
                                axis=1
                               )
    cols_bool = [c for c in df_out.columns if df_out[c].dtype==bool]
    #print(cols_bool)
    df_out.loc[:,cols_bool] = df_out.loc[:,cols_bool].astype(np.uint8)
    
    df_out.to_csv(path_ + '/out_' + fin_, index=False, float_format='%.5f')#, compression='gzip')
    
    del df_out, df_trn
    gc.collect()

In [10]:
file_list = [c 
             for c in listdir(PATH+'training_set/') 
             if c.startswith('tmp_')
             and not os.path.isfile(PATH+'training_set/'+'out_'+c)
            ]

In [11]:
file_list

['tmp_0.csv']

In [20]:
for f in tqdm(file_list):
    #['log_0_20180715_000000000000.csv.gz', 'log_0_20180716_000000000000.csv.gz']
    preprocess_data(f, PATH+'training_set/', df_trk)

100%|██████████| 1/1 [00:01<00:00,  1.60s/it]


In [2]:
x = pd.read_csv('data/training_set/log_0_20180715_000000000000.csv.gz', nrows=100)

In [19]:
x.to_csv('data/training_set/tmp_0.csv', index=False)

In [16]:
x.head()

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,hist_user_behavior_n_seekback,hist_user_behavior_is_shuffle,hour_of_day,date,premium,context_type,hist_user_behavior_reason_start,hist_user_behavior_reason_end
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,False,False,False,True,0,0,0,0,0,0,True,16,2018-07-15,True,editorial_playlist,trackdone,trackdone
1,0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,20,t_9099cd7b-c238-47b7-9381-f23f2c1d1043,False,False,False,True,0,1,0,0,0,0,True,16,2018-07-15,True,editorial_playlist,trackdone,trackdone
2,0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,20,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0,False,False,False,True,0,1,0,0,0,0,True,16,2018-07-15,True,editorial_playlist,trackdone,trackdone
3,0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,20,t_23cff8d6-d874-4b20-83dc-94e450e8aa20,False,False,False,True,0,1,0,0,0,0,True,16,2018-07-15,True,editorial_playlist,trackdone,trackdone
4,0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,20,t_64f3743c-f624-46bb-a579-0f3f9a07a123,False,False,False,True,0,1,0,0,0,0,True,16,2018-07-15,True,editorial_playlist,trackdone,trackdone
