In [4]:
import os
import gc
import cudf
import pickle
import dask_cudf
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

# Lift pandas' display limits so rendered frames are never truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [5]:
%%time
# Folder that holds the input CSV files (relative to the notebook).
data_dir = Path('../input')

# Read the training labels with cuDF.
labels = cudf.read_csv(data_dir / 'train_labels.csv')

CPU times: user 893 ms, sys: 230 ms, total: 1.12 s
Wall time: 1.66 s


In [6]:
# Preview the first five label rows.
labels.head(n=5)

Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


## カラムごとに保存 (save one pickle per column-name prefix: D, S, P, B, R)

In [4]:
%%time

# Read just two rows: only the column names of the training CSV are needed here.
_df = pd.read_csv(data_dir.joinpath('train_data.csv'), nrows=2)

# Build one pickle per column-name prefix; customer_ID is kept in every file.
for s in ['D', 'S', 'P', 'B', 'R']:
    use_cols = ['customer_ID']
    use_cols += [name for name in _df.columns if name.startswith(s)]

    # Stream the CSV in 200,000-row chunks, restricted to this prefix's columns.
    train = pd.read_csv(
        data_dir.joinpath('train_data.csv'),
        usecols=use_cols,
        chunksize=200000,
    )

    # Materialise every chunk, then stitch them into a single frame.
    dfs = pd.concat(list(train), axis=0, ignore_index=True)

    with open(f'../input/train_data_{s}.pkl', 'wb') as f:
        pickle.dump(dfs, f)

    # Drop the merged frame before the next prefix is loaded.
    del dfs
    gc.collect()

CPU times: user 4min 37s, sys: 31.8 s, total: 5min 8s
Wall time: 5min 50s


## Train all: save every column in a single pickle with reduced dtypes

In [7]:
def reduce_mem_usage(df):
    """Shrink a DataFrame's memory footprint by narrowing column dtypes.

    Columns are converted in place, one at a time:

    * object / pandas string columns become ``category``;
    * signed and unsigned integer columns are narrowed to the smallest
      integer type of the same signedness whose range (bounds included)
      holds the column's min and max;
    * floating-point columns wider than 32 bits become ``float32`` when
      their min and max fit in ``float32`` (or when every value is
      missing), otherwise ``float64``;
    * every other dtype (bool, datetime, category, nullable extension
      types, ...) is left untouched.

    ``float32`` keeps only about 7 significant decimal digits, so downcast
    float columns may lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compact. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    f32 = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are narrowed. Bool, datetime,
        # category and extension dtypes must not fall through to the float
        # branch, where they would be coerced to float32 or raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            if col_type.itemsize <= 4:
                # Already float32 or narrower: converting would only grow it.
                continue
            # min() skips NaN, so a NaN result means there is no value to
            # range-check (empty or all-missing column); NaN fits in float32.
            all_missing = pd.isna(c_min)
            if all_missing or (f32.min <= c_min and c_max <= f32.max):
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
            continue

        if pd.isna(c_min):
            # Empty integer column: nothing to measure, keep the dtype.
            continue

        lo, hi = int(c_min), int(c_max)
        candidates = signed_types if col_type.kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            # Inclusive bounds: a value equal to a type's limit still fits.
            if limits.min <= lo and hi <= limits.max:
                df[col] = df[col].astype(candidate)
                break
        # No candidate fits -> the column needs its 64-bit type; leave it.

    return df

In [8]:
%%time
# Stream the full training CSV in 200,000-row chunks so each chunk can be
# downcast before the pieces are merged.
train = pd.read_csv(data_dir.joinpath('train_data.csv'), chunksize=200000)

dfs = [reduce_mem_usage(df) for df in train]
dfs = pd.concat(dfs, axis=0, ignore_index=True)

# Each chunk is reduced on its own, so one column can come back with a
# different dtype (or a different category set) per chunk. pd.concat then
# widens numeric columns to the largest dtype and turns categoricals whose
# categories differ back into object. Reducing once more on the merged frame
# makes the pickled dtypes consistent and actually compact.
dfs = reduce_mem_usage(dfs)

with open('../input/train_data.pkl', 'wb') as f:
    pickle.dump(dfs, f)

CPU times: user 2min 22s, sys: 55.4 s, total: 3min 18s
Wall time: 3min 27s
