In [1]:
import pandas as pd
import numpy as np
import feather
import os

In [2]:
# Folder that receives every pickled, downcast dataframe written by this notebook.
# Joining with a trailing '' keeps the path separator at the end, because the cells
# below build file paths by plain string concatenation (data_folder + 'name.pkl').
data_folder = os.path.join(os.getcwd(), 'data', 'formatted_data', '')

# Create the folder up front so the first to_pickle call does not fail on a fresh checkout.
os.makedirs(data_folder, exist_ok=True)

**Note: multiple columns can be downcast at once, e.g. `df[float_cols] = df[float_cols].astype(np.int32)`**

In [3]:
# test dataframe: read the gzipped CSV and downcast every column to int32 in one call
test = pd.read_csv('data/test.csv.gz', compression='gzip', header=0).astype(np.int32)

# Label the frame; assigned after the cast because astype returns a new object.
test.name = 'test'

test.to_pickle(data_folder+'test.pkl')

In [16]:
def get_dt_block_num(datetime, base_year=2013):
    """Return the zero-based month index of a date, counted from January of ``base_year``.

    With the default ``base_year`` of 2013 this gives 0 for January 2013,
    12 for January 2014, 24 for January 2015, and keeps increasing for later
    years instead of folding every other year onto the 2015 range.

    Parameters
    ----------
    datetime : object exposing integer ``year`` and ``month`` attributes
        For example ``datetime.date``, ``datetime.datetime`` or ``pandas.Timestamp``.
    base_year : int, default 2013
        Year whose January is block 0.

    Returns
    -------
    int
        Number of whole months between January of ``base_year`` and the
        month of ``datetime``.
    """
    # 12 blocks per elapsed year, plus the zero-based month inside the year.
    return (datetime.year - base_year) * 12 + (datetime.month - 1)

In [31]:
# hol dataframe: holiday calendar reduced to a holiday count per date_block_num
hol = feather.read_dataframe('data/hol.feather')

# Holiday types that are filtered out before counting.
hol_to_drop = ['Season', 'Working day (moved weekend)', 'Observance', 'Observance, Orthodox']

# Row labels to discard: every row of an excluded type, plus row 17.
# NOTE(review): why row 17 is removed by hand is not visible here — confirm it still
# points at the intended record whenever hol.feather is regenerated.
drop_hol_idx = hol.index[hol['h_type'].isin(hol_to_drop)].tolist() + [17]
hol = hol.drop(index=drop_hol_idx).reset_index(drop=True)
hol['date_block_num'] = hol['date'].apply(get_dt_block_num)

# One row per month block with its holiday count; both columns stored as int32.
new_hol_df = (
    hol.groupby('date_block_num')
    .size()
    .reset_index(name='hol_count')
    .astype(np.int32)
)

new_hol_df.to_pickle(data_folder+'hol.pkl')

In [44]:
# items: load the item table and give its columns short names
items = feather.read_dataframe('data/it_en.feather')
items.columns = ['it_name', 'item_id', 'itc_id', 'en_it_name']

# downcast to int32: every non-object column is converted in a single astype call
items = items.astype({col: np.int32 for col in items.select_dtypes(exclude=object).columns})

# Label the frame; assigned after the cast because astype returns a new object.
items.name = 'items'

items.to_pickle(data_folder+'items.pkl')

In [45]:
# items categories: load the category table as stored in the feather file
icats = feather.read_dataframe('data/itc_en.feather')

# downcast to int32: every non-object column is converted in a single astype call
icats = icats.astype({col: np.int32 for col in icats.select_dtypes(exclude=object).columns})

# Label the frame; assigned after the cast because astype returns a new object.
icats.name = 'icats'

icats.to_pickle(data_folder+'icats.pkl')

In [46]:
# shops: load the shop table and give its columns short names
shops = feather.read_dataframe('data/sh_en.feather')
shops.columns = ['sname', 'shop_id', 'en_sname']

# downcast to int32: every non-object column is converted in a single astype call
shops = shops.astype({col: np.int32 for col in shops.select_dtypes(exclude=object).columns})

# Label the frame; assigned after the cast because astype returns a new object.
shops.name = 'shops'

shops.to_pickle(data_folder+'shops.pkl')

In [3]:
from multiprocessing import Pool

# Load the raw daily sales records (gzipped CSV, first row is the header) and preview them.
train = pd.read_csv(
    'data/sales_train.csv.gz',
    header=0,
    compression='gzip',
)
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
# re-format date feature
def to_datetime(v):
    """Parse date text written as day.month.year (e.g. "02.01.2013") with pandas.

    ``v`` is handed straight to ``pd.to_datetime``, so it may be a single
    string or any collection of strings that function accepts.
    """
    day_month_year = "%d.%m.%Y"
    return pd.to_datetime(v, format=day_month_year)

# Parse the whole 'date' column in one vectorised call instead of sending every
# string through a multiprocessing Pool: pd.to_datetime with an explicit format is
# already fast, and it avoids pickling each value to worker processes (which also
# requires the workers to be able to import a function defined in the notebook).
#
# Plain column assignment replaces the column, so it gets a datetime64 dtype and the
# `.dt` accessor used in the next cell works. Assigning through `.loc[:, 'date']`
# may write the values into the existing object-dtype column on pandas versions
# that set in place, leaving the dtype unchanged.
train['date'] = to_datetime(train['date'])

In [5]:
# train: split the parsed date into calendar features, downcast, and pickle
train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

# 1 for Monday-Friday, 0 for Saturday/Sunday. Series.dt.weekday numbers Monday as 0
# and Sunday as 6, so the working week is `< 5`; the previous `< 4` also flagged
# Friday as 0.
# NOTE(review): if Friday was excluded on purpose, restore `< 4` and rename the column.
train['weekday'] = (train['date'].dt.weekday < 5).astype(np.int32)

# The raw date is fully represented by the derived columns from here on.
train = train.drop(columns='date')

# downcast to int32 (item_cnt_day is read as float in the CSV and is truncated to int here)
int_cols = ['date_block_num', 'shop_id', 'item_id', 'item_cnt_day', 'day', 'month', 'year', 'weekday']
train = train.astype({col: np.int32 for col in int_cols})

# Label the frame; assigned last because drop/astype return new objects.
train.name = 'train'

train.to_pickle(data_folder+'train.pkl')

In [3]:
# read inflation file: the column names are supplied here via `names`
inflation = pd.read_csv(
    'data/monthly_inflation.csv',
    names=['date_block_num', 'infla_rate'],
)
inflation.head()

Unnamed: 0,date_block_num,infla_rate
0,0,0.97
1,1,0.56
2,2,0.34
3,3,0.51
4,4,0.66


In [4]:
# Downcast both columns in one call (month block -> int32, rate -> float32), then pickle.
inflation = inflation.astype({'date_block_num': np.int32, 'infla_rate': np.float32})
inflation.to_pickle(data_folder+'inflation.pkl')