In [3]:
%matplotlib inline

import os, glob, time, datetime, gc, pickle
from tqdm import tqdm_notebook as tqdm
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
data_dir = Path('../data/input')
os.listdir('../data/input')

['.DS_Store',
 'calendar.csv',
 'sell_prices.csv',
 'sales_train_validation.csv',
 'sample_submission.csv']

---
# Preprocessing

In [5]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('object')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [6]:
# divide train, val, test data
def get_data(train, sub):
    # Train  ###################################################
    # Concatenate Values per 'd_x'
    id_list = []
    v_list = []
    date_list = []

    for i in range(len(train)):

        v = train.iloc[i, 6:].values.tolist()
        ids = train.iloc[i, :1].values.tolist() * len(v)
        dates = train.columns[6:]

        v_list.extend(v)
        id_list.extend(ids)
        date_list.extend(dates)

    train = pd.DataFrame({
        'id': id_list,
        'd': date_list,
        'values': v_list
    })

    # validation  #######################################################
    id_list = []
    date_list = []
    set_ids = [i for i in sub['id'].values if 'validation' in i]
    for _id in set_ids:
        ids = [_id] * 28
        dates = [f'd_{i}' for i in np.arange(1914, 1941 + 1, 1)]

        id_list.extend(ids)
        date_list.extend(dates)

    vals = pd.DataFrame({
        'id': id_list,
        'd': date_list,
    })

    # evaluation  #######################################################
    id_list = []
    date_list = []
    set_ids = [i for i in sub['id'].values if 'evaluation' in i]
    for _id in set_ids:
        ids = [_id] * 28
        dates = [f'd_{i}' for i in np.arange(1942, 1969 + 1, 1)]

        id_list.extend(ids)
        date_list.extend(dates)

    evals = pd.DataFrame({
        'id': id_list,
        'd': date_list,
    })
    
    return train, vals, evals

def preprocessing(d, calendar, sell):
    # Set Id  #######################################################
    for i in range(5):
        d[f'id_{i}'] = d['id'].apply(lambda x: str(x.split('_')[i]))
    d['store_id'] = d['id_3'] + '_' + d['id_4']
    d['item_id'] = d['id_0'] + '_' + d['id_1'] + '_' + d['id_2']
    d.drop([f'id_{i}' for i in range(5)], axis=1, inplace=True)
    
    # Merge  #####################################################
    # Calendar
    d = pd.merge(d, calendar, on='d')

    # integrate 'snap' feature
    def snap(row):
        if 'CA' in row['store_id']:
            return row['snap_CA']
        elif 'TX' in row['store_id']:
            return row['snap_TX']
        elif 'WI' in row['store_id']:
            return row['snap_WI']
        else:
            pass

    d['snap'] = d.apply(snap, axis=1)
    
    # sellprice
    d = pd.merge(d, sell, on=['store_id', 'item_id', 'wm_yr_wk'])
    
    return d

In [5]:
%%time
# Preprocessing - Train
# Load
reader = pd.read_csv(data_dir.joinpath('sales_train_validation.csv'), chunksize=10000)

train = pd.DataFrame()

# train
for i, _train in enumerate(reader):
    print(i)
    sub = pd.read_csv(data_dir.joinpath('sample_submission.csv'))
    _train, test_public, test_private = get_data(_train, sub)
    del test_public, test_private, sub
    _train = reduce_mem_usage(_train)
    gc.collect()

    calendar = pd.read_csv(data_dir.joinpath('calendar.csv'))
    sell = pd.read_csv(data_dir.joinpath('sell_prices.csv'))
    _train = preprocessing(_train, calendar, sell)
    del calendar, sell
    gc.collect()

    with open(f'../data/prep_train_{i}.pkl', 'wb') as f:
        pickle.dump(_train, f, protocol=4)
        
    # Concatenate
    train = pd.concat([train, _train], axis=0, ignore_index=True)
    
    del _train
    gc.collect()
    
train = reduce_mem_usage(train)
print(train.dtypes)
    
with open('../data/prep_train.pkl', 'wb') as f:
    pickle.dump(train, f, protocol=4)

0
Memory usage of dataframe is 437.85 MB
Memory usage after optimization is: 328.39 MB
Decreased by 25.0%
1
Memory usage of dataframe is 437.85 MB
Memory usage after optimization is: 328.39 MB
Decreased by 25.0%
2
Memory usage of dataframe is 437.85 MB
Memory usage after optimization is: 328.39 MB
Decreased by 25.0%
3
Memory usage of dataframe is 21.45 MB
Memory usage after optimization is: 16.09 MB
Decreased by 25.0%
Memory usage of dataframe is 6759.93 MB
Memory usage after optimization is: 4126.19 MB
Decreased by 39.0%
id               object
d                object
values            int16
store_id         object
item_id          object
date             object
wm_yr_wk          int16
weekday          object
wday               int8
month              int8
year              int16
event_name_1     object
event_type_1     object
event_name_2     object
event_type_2     object
snap_CA            int8
snap_TX            int8
snap_WI            int8
snap               int8
sell_price      

In [6]:
del train
gc.collect()

20

In [11]:
%%time
# Preprocessing - vals
# Load
train = pd.read_csv(data_dir.joinpath('sales_train_validation.csv'))
sub = pd.read_csv(data_dir.joinpath('sample_submission.csv'))

# train
train, vals, _ = get_data(train, sub)
del train, _, sub
vals = reduce_mem_usage(vals)
gc.collect()

calendar = pd.read_csv(data_dir.joinpath('calendar.csv'))
sell = pd.read_csv(data_dir.joinpath('sell_prices.csv'))
vals = preprocessing(vals, calendar, sell)

with open('../data/validation.pkl', 'wb') as f:
    pickle.dump(vals, f, protocol=4)
    
del vals, calendar, sell
gc.collect()
print('DONE')

Memory usage of dataframe is 13.03 MB
Memory usage after optimization is: 13.03 MB
Decreased by 0.0%
DONE
CPU times: user 2min 20s, sys: 6.51 s, total: 2min 26s
Wall time: 2min 28s


In [7]:
%%time
# Preprocessing - evals
# Load
train = pd.read_csv(data_dir.joinpath('sales_train_validation.csv'))
sub = pd.read_csv(data_dir.joinpath('sample_submission.csv'))

# train
train, _, evals = get_data(train, sub)
del train, _, sub
evals = reduce_mem_usage(evals)
gc.collect()

calendar = pd.read_csv(data_dir.joinpath('calendar.csv'))
sell = pd.read_csv(data_dir.joinpath('sell_prices.csv'))
evals = preprocessing(evals, calendar, sell)

with open('../data/evaluation.pkl', 'wb') as f:
    pickle.dump(evals, f, protocol=4)
    
del evals, calendar, sell
gc.collect()
print('DONE')

Memory usage of dataframe is 13.03 MB
Memory usage after optimization is: 13.03 MB
Decreased by 0.0%
DONE
CPU times: user 2min 23s, sys: 6.38 s, total: 2min 29s
Wall time: 2min 33s


In [9]:
print('FINISH')

FINISH
