In [1]:
import os
import numpy as np
import pandas as pd
import sklearn as skl
import xgboost as xgb
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)
np.set_printoptions(edgeitems=100)

#import matplotlib as mpl
import matplotlib.pyplot as plt
#import matplotlib.dates as mdates
%matplotlib inline 

from tqdm import tqdm_notebook
from itertools import product

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import gc

In [2]:
for p in [np, pd, skl, xgb]:
    print (p.__name__, p.__version__)

numpy 1.14.5
pandas 0.23.0
sklearn 0.19.1
xgboost 0.80


In [3]:
seed = 123
ctx = 'Search2_shop_id_item_id_one_hot_float32_'

In [4]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [5]:
from sklearn.metrics import mean_squared_error, r2_score

def score(est, ground_truths, predictions):
    print(est)
    print('RMSE is %f' % np.sqrt(mean_squared_error(ground_truths, predictions)))
    print('R-squared is %f' % r2_score(ground_truths, predictions))    

In [8]:
import h5py

CACHE_FOLDER = './cache/'
KEY = 'dataset_1'

def save_data(filename, data):
    path = str(os.path.join(CACHE_FOLDER, filename) + '.h5')
    print('Saving ' + path)
    if isinstance(data, pd.DataFrame):
        data.to_hdf(path, KEY, mode='w')
    elif isinstance(data, np.ndarray):
        with h5py.File(path, 'w') as f:
            f.create_dataset(KEY, data=data)
    else:
        raise('Not supported')
    
def load_data(filename, hint):
    path = str(os.path.join(CACHE_FOLDER, filename) + '.h5')
    print('Loading ' + path)
    if isinstance(hint, pd.DataFrame):
        data = pd.read_hdf(path, KEY)
    elif isinstance(hint, np.ndarray):
        with h5py.File(path,'r') as f:
            data = f[KEY][:]
    else:
        raise('Not supported')
        
    return data

In [78]:
from joblib import dump, load
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import load_model as keras_load_model

MODEL_FOLDER = './models/'

def save_model(filename, model):
    path = str(os.path.join(MODEL_FOLDER, filename) + '.jlib')
    print('Saving ' + path)    
    if isinstance(model, KerasRegressor):
        model.model.save(path)
    else:
        dump(model, path)        
        
def load_model(filename, hint):
    path = str(os.path.join(MODEL_FOLDER, filename) + '.jlib')
    print('Loading ' + path)
    if isinstance(hint, KerasRegressor):
        model = keras_load_model(path)
    else:
        model = load(path)
        
    return model        

In [18]:
DATA_FOLDER = './data/'

sales = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz'))
items = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

In [19]:
max_train_date_block_num = sales.date_block_num.max()
max_train_date_block_num

33

import nltk
#nltk.download('punkt')

tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
items['tokenized_name'] = items.apply(lambda row: tokenizer.tokenize(row['item_name']), axis=1)

from gensim.models import Word2Vec
w2v_item_name = Word2Vec(items.tokenized_name, min_count=1)

items['embedded_name'] = items.apply(lambda row: np.array[w2v_item_name[word] for word in row['tokenized_name']])

from gensim.models import Word2Vec
docs = items.tokenized_names
w2v_item_names = Word2Vec(
        docs,
        size=150,
        window=10,
        min_count=2,
        workers=10)
w2v_item_names(docs, total_examples=len(documents), epochs=10)

In [None]:
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()
index_cols = ['shop_id', 'item_id', 'date_block_num']

In [None]:
def create_grid(sales, index_cols):
    from tqdm import tqdm_notebook
    
    # For every month we create a grid from all shops/items combinations from that month
    grid = [] 
    for block_num in tqdm_notebook(sales['date_block_num'].unique()):
        cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
        cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

    # Turn the grid into a dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    # Groupby data to get shop-item-month aggregates
    gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
    # Fix column names
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] 
    # Clip
    gb.target = gb.target.clip(0,20) #TODO
    # Join it to the grid    
    all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

    # Same as above but with shop-month aggregates
    gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

    # Same as above but with item-month aggregates
    gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

    # Same as above but with category-month aggregates
    all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
    
    gb = pd.merge(sales, item_category_mapping, how='left', on='item_id')
    gb = gb.groupby(['item_category_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_category':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['item_category_id', 'date_block_num']).fillna(0)
    all_data = all_data.drop(['item_category_id'], axis=1)

    # Downcast dtypes from 64 to 32 bit to save memory
    all_data = downcast_dtypes(all_data)
    del grid, gb 
    gc.collect();
    
    return all_data

In [None]:
all_data = create_grid(
    pd.concat(
        [sales, test.assign(date_block_num=max_train_date_block_num+1)],
        ignore_index=True, sort=False),
    index_cols)
all_data.head()

In [None]:
def create_lags(all_data, shift_range = [1, 2, 3, 4, 5, 12]):
   
    lagged_data = all_data.copy()
    
    # List of columns that we will use to create lags
    cols_to_rename = list(lagged_data.columns.difference(index_cols))    

    for month_shift in tqdm_notebook(shift_range):
        train_shift = lagged_data[index_cols + cols_to_rename].copy()

        train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

        foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
        train_shift = train_shift.rename(columns=foo)

        lagged_data = pd.merge(lagged_data, train_shift, on=index_cols, how='left').fillna(0)

    del train_shift

    # List of all lagged features
    fit_cols = [col for col in lagged_data.columns if col[-1] in [str(item) for item in shift_range]]  
    # We will drop these at fitting stage
    to_drop_cols = list(set(list(lagged_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 
    
    lagged_data = pd.merge(lagged_data, item_category_mapping, how='left', on='item_id')

    lagged_data = downcast_dtypes(lagged_data)
    gc.collect();
    
    return lagged_data, to_drop_cols

In [None]:
lagged_data, to_drop_cols = create_lags(all_data)
lagged_data.head()
to_drop_cols

In [None]:
del all_data
gc.collect()

In [None]:
numeric_features = sorted(list(set(list(lagged_data.columns.values)) 
                               - set(to_drop_cols) - set(index_cols) - set(['item_category_id'])))
numeric_features

#categorical_features = list(set(index_cols + ['item_category_id']) - set(['date_block_num']))
categorical_features = ['shop_id', 'item_category_id'] 
categorical_features

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler #, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper, gen_features

def create_mapper():
    categorial_maps = gen_features(
        columns=[[feature] for feature in categorical_features],
        classes=[{'class': OneHotEncoder, 'dtype': np.float32, 'sparse':False, 'handle_unknown':'ignore'}])
    numeric_maps = gen_features(
        columns=[[feature] for feature in numeric_features],
        classes=[StandardScaler])
    return DataFrameMapper(categorial_maps + numeric_maps, default=None)
    #return DataFrameMapper(categorial_maps, default=None)

In [None]:
mapper = create_mapper()
mapped_data = mapper.fit_transform(lagged_data.drop(to_drop_cols, axis=1)).astype(np.float32)
#mapper.transformed_names_

In [None]:
#train = lagged_data.loc[lagged_data.date_block_num <= max_train_date_block_num]
#test_lagged = lagged_data.loc[lagged_data.date_block_num == max_train_date_block_num + 1]
train_indices = lagged_data.date_block_num <= max_train_date_block_num
test_indices = lagged_data.date_block_num == max_train_date_block_num + 1

In [None]:
X_train = mapped_data[train_indices]
X_test = mapped_data[test_indices]

target_col = 'target'
y_train = lagged_data.loc[train_indices, [target_col]].values.ravel()

In [None]:
dates_train=lagged_data.loc[train_indices,['date_block_num']]
#dates_train.head()
predictions = lagged_data.loc[test_indices, ['shop_id', 'item_id']]
#predictions.head()

In [None]:
del lagged_data
del mapped_data
gc.collect()

In [10]:
%pinfo2 np.empty

In [None]:
save_data(ctx + 'X_train', X_train)
save_data(ctx + 'X_test', X_test)
save_data(ctx + 'y_train', y_train)
save_data(ctx + 'dates_train', dates_train)
save_data(ctx + 'predictions', predictions)

In [12]:
X_train = load_data(ctx + 'X_train', np.empty(0))
X_test = load_data(ctx + 'X_test', np.empty(0))
y_train = load_data(ctx + 'y_train', np.empty(0))
dates_train = load_data(ctx + 'dates_train', pd.DataFrame())
predictions = load_data(ctx + 'predictions', pd.DataFrame())

Loading ./cache/Search2_shop_id_item_id_one_hot_float32_dates_train.h5
Loading ./cache/Search2_shop_id_item_id_one_hot_float32_predictions.h5


In [14]:
#print(X_train) 
#print(X_test)
#print(y_train) 
#print(dates_train)
#print(predictions)
X_train.dtype
X_train.shape
dates_train.shape

dtype('float32')

(10913850, 169)

(10913850, 1)

In [15]:
def gen_time_split(data, n_splits):
    for i in range(n_splits):
        #print(i)
        first_vali_date_block_num = max_train_date_block_num - i
        vali_indices = data.loc[:,'date_block_num'] == first_vali_date_block_num
        train_indices = data.loc[:,'date_block_num'] < first_vali_date_block_num
        yield (train_indices[train_indices].index.values, vali_indices[vali_indices].index.values)

In [None]:
xgb_est_filename = ctx + 'xgb_est.joblib'
#xgb_est = load(xgb_est_filename)
xgb_est = xgb.XGBRegressor(objective='reg:linear', n_jobs=-1, silent=0, **{'tree_method':'gpu_hist'}) #n_estimators=100, learning_rate=0.3, max_depth=7, 
#xgb_est.fit(X_train, y_train, verbose=2)
#dump(xgb_est, xgb_est_filename)
#xgb_est.get_params(deep=True)
#xgb_est.get_xgb_params()

In [None]:
from sklearn.model_selection import GridSearchCV
search_est = GridSearchCV(xgb_est,
                          {'n_estimators':[50, 100, 150], 'learning_rate':[0.1, 0.3], 'max_depth':[6,7,8]},
                          scoring='neg_mean_squared_error',
                          cv=gen_time_split(dates_train, 3),
                          refit=True,
                          return_train_score=True,
                          verbose=2)
search_est.fit(X_train, y_train, verbose=2)

In [None]:
search_est.cv_results_
search_est.best_score_

In [None]:
dump(search_est.best_estimator_, xgb_est_filename)

In [None]:
score("XGB train", y_train, search_est.predict(X_train))

RMSE for XGBoost is 0.857923
Train R-squared for XGBoost is 0.507406

In [22]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD, Adam

def create_keras_model(lr=0.1):
    model = Sequential()
    model.add(Dense(1000, activation='sigmoid'))
    model.add(Dense(300, activation='sigmoid'))
    model.add(Dense(100, activation='sigmoid'))
    model.add(Dense(1))
    model.compile(
        loss='mean_squared_error',
        optimizer='adam'
        #optimizer=SGD(lr=lr, momentum=0.0, decay=0.0, nesterov=False)
    )
    return model

keras_est_filename = ctx + 'keras_est'
keras_est = KerasRegressor(build_fn=create_keras_model, epochs=1, verbose=True) #lr=0.1, batch_size=10000, 

In [79]:
keras_est_loaded = KerasRegressor(build_fn=create_keras_model, epochs=1, verbose=True) #lr=0.1, batch_size=10000, 
keras_est_loaded.model = load_model(keras_est_filename + '_model', keras_est_loaded)
keras_est_loaded.fit(X_train, y_train, verbose=True)

Loading ./models/Search2_shop_id_item_id_one_hot_float32_keras_est_model.jlib
Epoch 1/1


<keras.callbacks.History at 0x7f95f8cb0438>

In [83]:
keras_est_loaded.get_params()

{'build_fn': <function __main__.create_keras_model(lr=0.1)>,
 'epochs': 1,
 'verbose': True}

In [23]:
from sklearn.model_selection import GridSearchCV
search_est = GridSearchCV(keras_est,
                          {'lr':[0.003, 0.001, 0.01], 'batch_size': [1000, 300, 100]}, #'epochs': [3,5,7], 
                          scoring='neg_mean_squared_error',
                          cv=gen_time_split(dates_train, 3),
                          refit=True,
                          return_train_score=True,
                          verbose=2)
search_est.fit(X_train, y_train, verbose=True)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] lr=0.003, batch_size=1000 .......................................
Epoch 1/1
[CV] ........................ lr=0.003, batch_size=1000, total= 1.1min
[CV] lr=0.003, batch_size=1000 .......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s


Epoch 1/1
[CV] ........................ lr=0.003, batch_size=1000, total= 1.0min
[CV] lr=0.003, batch_size=1000 .......................................
Epoch 1/1
[CV] ........................ lr=0.003, batch_size=1000, total= 1.0min
[CV] lr=0.001, batch_size=1000 .......................................
Epoch 1/1
[CV] ........................ lr=0.001, batch_size=1000, total= 1.1min
[CV] lr=0.001, batch_size=1000 .......................................
Epoch 1/1
[CV] ........................ lr=0.001, batch_size=1000, total= 1.1min
[CV] lr=0.001, batch_size=1000 .......................................
Epoch 1/1
[CV] ........................ lr=0.001, batch_size=1000, total= 1.0min
[CV] lr=0.01, batch_size=1000 ........................................
Epoch 1/1
[CV] ......................... lr=0.01, batch_size=1000, total= 1.1min
[CV] lr=0.01, batch_size=1000 ........................................
Epoch 1/1
[CV] ......................... lr=0.01, batch_size=1000, total= 1.1min
[CV] lr

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 135.1min finished


Epoch 1/1


GridSearchCV(cv=<generator object gen_time_split at 0x7f9872b8bd00>,
       error_score='raise',
       estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7f9872b90080>,
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'lr': [0.003, 0.001, 0.01], 'batch_size': [1000, 300, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=2)

In [53]:
search_est.best_score_
search_est.best_params_
search_est.cv_results_

-1.061886083575765

{'batch_size': 300, 'lr': 0.003}

{'mean_fit_time': array([ 62.48187931,  62.90376043,  63.5504454 , 173.87662554,
        175.34648577, 178.62468894, 460.19229786, 465.82288233,
        469.38978092]),
 'mean_score_time': array([0.43642346, 0.46623699, 0.52014248, 1.11015232, 1.15533201,
        1.26173973, 2.65385   , 2.95103129, 2.88988765]),
 'mean_test_score': array([-1.15886212, -1.12824089, -1.09928816, -1.06188608, -1.16670793,
        -1.10604122, -1.1798972 , -1.12459976, -1.19964502]),
 'mean_train_score': array([-1.32266271, -1.30504155, -1.26720564, -1.23914047, -1.35611387,
        -1.27658129, -1.38101848, -1.30370033, -1.41783949]),
 'param_batch_size': masked_array(data=[1000, 1000, 1000, 300, 300, 300, 100, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_lr': masked_array(data=[0.003, 0.001, 0.01, 0.003, 0.001, 0.01, 0.003, 0.001,
                    0.01],
              mask=

In [71]:
save_model(keras_est_filename + '_cv_results', search_est.cv_results_)

Saving ./models/Search2_shop_id_item_id_one_hot_float32_keras_est_cv_results.jlib


In [72]:
save_model(keras_est_filename + '_model', search_est.best_estimator_)

Saving ./models/Search2_shop_id_item_id_one_hot_float32_keras_est_model.jlib


In [None]:
y_test_pred = search_est.best_estimator_.predict(X_train)
score("Keras train", y_train, y_test_pred)

In [None]:
y_test_pred = None
predictions = predictions.assign(item_cnt_month=y_test_pred)
submission = test.merge(predictions, how='left')[['item_cnt_month']]
submission.head()
submission.to_csv(ctx + '.csv.gz', index_label='ID', compression='gzip') #header=['ID', 'item_cnt_month'])