In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)

import keggler as kg
from helpers import *

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm

import gc
gc.enable()

import warnings
warnings.simplefilter(action='ignore', category=Warning)

import os, psutil
import glob

# Set up a logger that dumps messages to a log file (the notebook/stream handler below is currently commented out)
import logging as logging
def ini_log(filename):
    """Return the logger named after this module, configuring it on first use.

    On the first call the logger level is set to DEBUG and a single
    ``logging.FileHandler`` opened in append mode on ``filename`` is attached.
    If the logger already has handlers it is returned untouched, so
    re-running the cell does not duplicate output lines; in that case
    ``filename`` is not used.

    Parameters
    ----------
    filename : str
        Path of the log file that records are appended to.

    Returns
    -------
    logging.Logger
        The logger obtained via ``logging.getLogger(__name__)``.
    """
    logger = logging.getLogger(__name__)

    # Guard: handlers already present means a previous call configured it.
    if logger.handlers:
        return logger

    logger.setLevel(logging.DEBUG)

    # Only the file sink is active; a logging.StreamHandler(None) could be
    # added here as well to echo records into the notebook output.
    file_sink = logging.FileHandler(filename, 'a')
    file_sink.setFormatter(
        logging.Formatter('%(asctime)-15s: %(levelname)s  %(message)s')
    )
    logger.addHandler(file_sink)
    return logger
        
# Notebook-wide logger; on first configuration records are appended to 'out.log'.
log = ini_log('out.log')


In [2]:
# Root data folder -- point this to your data folder.
data_path = 'data/'
# Sub-folder holding the training-set files.
trn_path = '{}training_set/'.format(data_path)

# Gather the v2 HDF5 training logs in sorted order.
# (The v1 logs matched the pattern "outDD_v1*.csv.gz" instead.)
trn_input_logs = glob.glob(trn_path + "outDD_v2*.h5")
trn_input_logs.sort()

In [3]:
# Display the first ten matched training-log paths.
trn_input_logs[:10]

['data/training_set/outDD_v2_00.h5',
 'data/training_set/outDD_v2_01.h5',
 'data/training_set/outDD_v2_02.h5',
 'data/training_set/outDD_v2_03.h5',
 'data/training_set/outDD_v2_04.h5',
 'data/training_set/outDD_v2_05.h5',
 'data/training_set/outDD_v2_06.h5',
 'data/training_set/outDD_v2_07.h5',
 'data/training_set/outDD_v2_08.h5',
 'data/training_set/outDD_v2_09.h5']

# Read in the data

In [5]:
# Imported once for the whole cell instead of being re-executed on every
# loop iteration.
from sklearn.preprocessing import LabelEncoder

# Training logs with an index below this value are skipped.
# NOTE(review): presumably those files were already processed in an earlier
# run -- confirm, and set to 0 to process every file.
FIRST_LOG_IDX = 37

# For each remaining training log: read the columns needed for the labels,
# compact the session ids, build the ground-truth series and store it next
# to the training data as 'y_<idx>_.h5'.
for i, f in enumerate(trn_input_logs):
    if i < FIRST_LOG_IDX:
        continue

    # `read_log` comes from `helpers`; only the listed columns are requested.
    df_trn = read_log(f, cols_2read=['session_id', 'skip_2', 'session_position', 'session_length'])
    gc.collect()

    # Replace the raw session ids by integer codes stored as uint32.
    df_trn['session_id'] = LabelEncoder().fit_transform(df_trn['session_id'])
    df_trn['session_id'] = df_trn['session_id'].astype(np.uint32)
    gc.collect()

    # `get_y_truth` comes from `helpers`; its result is wrapped in a Series.
    y_competition_truth = pd.Series(get_y_truth(df_trn))
    # Report the in-memory size of the label series in MiB.
    print(y_competition_truth.memory_usage(deep=True)/1024**2)

    # One output file per input log, numbered with the log's two-digit index.
    y_competition_truth.to_hdf(trn_path+'y_'+'{0:02d}_.h5'.format(i), key='df')

100%|██████████| 193139/193139 [01:24<00:00, 2285.61it/s]


21.155410766601562


100%|██████████| 191998/191998 [01:38<00:00, 1946.67it/s]


21.03844451904297


100%|██████████| 191867/191867 [01:25<00:00, 2247.64it/s]


21.01141357421875


100%|██████████| 206540/206540 [02:02<00:00, 1692.89it/s]


22.62152099609375


100%|██████████| 181532/181532 [01:19<00:00, 2286.16it/s]


19.930831909179688


100%|██████████| 166033/166033 [01:07<00:00, 2472.77it/s]


18.24980926513672


100%|██████████| 188077/188077 [01:16<00:00, 2444.68it/s]


20.590133666992188


100%|██████████| 190317/190317 [01:15<00:00, 2533.21it/s]


20.814987182617188


100%|██████████| 190255/190255 [01:16<00:00, 2480.13it/s]


20.807937622070312


100%|██████████| 191482/191482 [01:15<00:00, 2538.90it/s]


20.947288513183594


100%|██████████| 215724/215724 [01:26<00:00, 2481.08it/s]


23.596237182617188


100%|██████████| 183527/183527 [01:13<00:00, 2499.77it/s]


20.130661010742188


100%|██████████| 164255/164255 [01:04<00:00, 2534.26it/s]


18.04863739013672


100%|██████████| 169280/169280 [01:05<00:00, 2575.11it/s]


18.643081665039062


100%|██████████| 189360/189360 [01:40<00:00, 1880.23it/s]


20.806602478027344


100%|██████████| 192363/192363 [3:54:08<00:00, 13.69it/s]    


21.110374450683594


100%|██████████| 194602/194602 [01:50<00:00, 1758.55it/s]


21.369491577148438


100%|██████████| 201122/201122 [01:52<00:00, 1788.83it/s]


22.08110809326172


100%|██████████| 173220/173220 [01:41<00:00, 1714.66it/s]


19.037796020507812


100%|██████████| 155330/155330 [01:23<00:00, 1851.45it/s]


17.10771942138672


100%|██████████| 187047/187047 [01:38<00:00, 1893.19it/s]


20.58197021484375


100%|██████████| 187710/187710 [01:40<00:00, 1873.01it/s]


20.610595703125


100%|██████████| 192053/192053 [01:43<00:00, 1859.69it/s]


21.091522216796875


100%|██████████| 193531/193531 [01:45<00:00, 1829.81it/s]


21.24614715576172


100%|██████████| 205927/205927 [01:55<00:00, 1786.10it/s]


22.624412536621094


100%|██████████| 175989/175989 [01:37<00:00, 1801.69it/s]


19.356285095214844


100%|██████████| 154753/154753 [01:27<00:00, 1775.85it/s]


17.038238525390625


100%|██████████| 188313/188313 [01:45<00:00, 1793.39it/s]


20.7064208984375


100%|██████████| 192201/192201 [01:47<00:00, 1789.82it/s]


21.10662078857422
