In [1]:
import os
import warnings

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from fancyimpute import SimpleFill

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Suppress every FutureWarning for the rest of the session to keep cell output readable.
warnings.simplefilter(action='ignore', category=FutureWarning)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Product data for the train and test splits.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Sales records, which also carry exchange-rate and social-network columns.
sales = pd.read_csv('../input/sales.csv')

# Website navigation data (page views / add-to-cart events).
navigation = pd.read_csv('../input/navigation.csv')

# Product images vectorized with ResNet50 (first column is the SKU key, the rest are image features).
vimages = pd.read_csv('../input/vimages.csv')

# Data preparation

In [6]:
# Every column of `sales` from position 9 onward is treated as a currency-rate /
# social-network feature.
# NOTE(review): the offset 9 is purely positional — confirm it still matches the
# column layout of sales.csv.
currency_and_social_columns = sales.columns[9:].tolist()

In [7]:
# Rows of `sales` whose Date label is 'Day_1'.
first_day = sales.loc[sales.Date == 'Day_1',:]

In [8]:
# Per-SKU mean of the currency / social columns over all sales rows.
# The columns are selected BEFORE aggregating, so only the needed columns are
# averaged and unrelated (possibly non-numeric) columns never reach `.mean()`.
all_currency_and_social = sales.groupby('sku_hash')[currency_and_social_columns].mean()

# Same aggregation restricted to the first-day rows; the 'first_day_' prefix
# keeps these columns distinct from the all-days means when both are merged.
first_day_currency_and_social = first_day.groupby('sku_hash')[currency_and_social_columns].mean()
first_day_currency_and_social.columns = ['first_day_' + col for col in first_day_currency_and_social.columns]

In [9]:
# Total sales_quantity per SKU over all sales rows, as a one-column DataFrame.
# Selecting the column before `.sum()` avoids summing every other column of `sales`.
all_sales = sales.groupby('sku_hash')['sales_quantity'].sum().to_frame()

# First-day sales per SKU. Grouping also by weekday and month keeps those two
# attributes as ordinary columns after `reset_index`, with sku_hash as the index.
# NOTE(review): a SKU with more than one (weekday, month) combination on Day_1
# would produce several rows here — confirm that cannot happen in the data.
first_day_sales = (
    first_day
    .groupby(['sku_hash', 'day_transaction_date', 'Month_transaction'])['sales_quantity']
    .sum()
    .to_frame('first_day_sales')
    .reset_index()
    .set_index('sku_hash')
)

In [10]:
# Features for number of sales from each day from 1 to 7:
# one Series per day label ('Day_1' .. 'Day_7') holding the summed
# sales_quantity per sku_hash for that day.
sku_features = [
    sales.loc[sales.Date == 'Day_{}'.format(day_number), :]
         .groupby(['sku_hash'])['sales_quantity']
         .sum()
    for day_number in range(1, 8)
]

In [11]:
# Eighth per-SKU feature: the number of rows each sku_hash has in `sales`.
# NOTE: this appends in place, so re-running only this cell adds a duplicate entry.
sku_features.append(sales.groupby('sku_hash')['sku_hash'].count())

In [12]:
# Assemble the per-SKU sales table by inner-joining, on the sku_hash index,
# the total sales, the first-day sales, and both sets of currency/social means.
sales_data = (
    all_sales
    .merge(first_day_sales, left_index=True, right_index=True)
    .merge(all_currency_and_social, left_index=True, right_index=True)
    .merge(first_day_currency_and_social, left_index=True, right_index=True)
)

In [13]:
# Convert the numeric Month_transaction (1-12) into its three-letter month
# abbreviation ('Jan' .. 'Dec'); values missing from monthDict become NaN.
monthDict = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
sales_data.Month_transaction = sales_data.Month_transaction.astype('object').map(monthDict)

In [14]:
# Preview the first rows of the merged per-SKU sales table.
sales_data.head()

Unnamed: 0_level_0,sales_quantity,day_transaction_date,Month_transaction,first_day_sales,currency_rate_USD,currency_rate_GBP,currency_rate_CNY,currency_rate_JPY,currency_rate_KRW,currency_rate_USD_1_day_before,...,first_day_NetSentiment_6_day_before,first_day_PositiveSentiment_6_day_before,first_day_NegativeSentiment_6_day_before,first_day_Impressions_6_day_before,first_day_TotalBuzzPost_7_day_before,first_day_TotalBuzz_7_day_before,first_day_NetSentiment_7_day_before,first_day_PositiveSentiment_7_day_before,first_day_NegativeSentiment_7_day_before,first_day_Impressions_7_day_before
sku_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000cb631113e2f54ca5512139a6592e9584957aa,114,Friday,Jun,15,1.126212,0.874773,7.661876,123.874618,1258.19566,1.1264,...,71.497052,17888.0,2973.0,514585381.0,41276.0,64550.0,68.741355,20130.0,3729.0,491736200.0
0020d561eab8b88ab55dfde84a2f12b865e5e0b4,365,Friday,Apr,282,1.081008,0.841606,7.447166,119.015813,1223.698506,1.081296,...,69.491343,15711.0,2828.0,271425144.0,42770.0,60152.0,68.060358,18711.0,3556.0,352152000.0
0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2,110,Friday,Nov,15,1.161866,0.886239,7.708679,132.288017,1295.056829,1.161648,...,69.02664,24459.0,4482.0,542601613.0,61979.0,88755.0,55.200683,25463.0,7350.0,1189237000.0
00287bbb94c12066df6491dccd744ee87ff01a90,45,Friday,Nov,5,1.189247,0.889329,7.851854,132.787147,1290.424868,1.189655,...,70.311199,22493.0,3921.0,801621926.0,54173.0,78972.0,74.279561,25491.0,3762.0,512948500.0
003f8a76cb823eb7c58b6d052c57a8933f9275fd,398,Friday,Oct,95,1.181248,0.892284,7.793914,132.539294,1331.520249,1.181057,...,64.672699,22477.0,4822.0,419612863.0,55415.0,80913.0,67.001526,23523.0,4648.0,498309200.0


In [15]:
# Per-SKU navigation totals (all days and first day only).
# Only the two count columns are selected before `.sum()` so no other column
# of `navigation` is aggregated.
nav_count_cols = ['page_views', 'addtocart']

# NOTE(review): the label here is 'Day 1' (with a space) while the sales data
# is filtered on 'Day_1' — confirm this matches the Date values in navigation.csv.
first_day_navigation = navigation.loc[navigation.Date == 'Day 1',:]
first_day_views = first_day_navigation.groupby('sku_hash')[nav_count_cols].sum()
first_day_views.columns = ['first_day_page_views', 'first_day_addtocart']

views = navigation.groupby('sku_hash')[nav_count_cols].sum()

# Inner join: SKUs without any first-day navigation rows are absent from navigation_data.
navigation_data = pd.merge(views, first_day_views, left_index=True, right_index=True)

In [16]:
# Cast both sales counts to float64 so they are handled as float columns downstream.
for count_col in ('sales_quantity', 'first_day_sales'):
    sales_data[count_col] = sales_data[count_col].astype('float64')

In [17]:
# log(count + 1) versions of both sales counts.
sales_data['sales_quantity_log'] = np.log(sales_data['sales_quantity'] + 1)
sales_data['first_day_sales_log'] = np.log(sales_data['first_day_sales'] + 1)

In [18]:
# Load in raddar's features from the first kernel

# raddar's precomputed feature tables, read from the current working directory
# (unlike the competition data, which is read from ../input/). Both are joined
# on 'ID' later in the notebook.
darius_train = pd.read_csv('raddar-features-train.csv')
darius_test = pd.read_csv('raddar-features-test.csv')

In [19]:
# Join the training products with the per-SKU sales table (inner), the
# navigation totals (left) and the image vectors (inner), all keyed on sku_hash.
train_data = (
    train
    .merge(sales_data, left_on='sku_hash', right_index=True)
    .merge(navigation_data, how='left', left_on='sku_hash', right_index=True)
    .merge(vimages, left_on='sku_hash', right_on='sku_hash')
)

# Add raddar's features to my dataframe: any column raddar also provides
# (except the 'ID' join key) is dropped first so raddar's version replaces it.
cols = darius_train.columns
drop_cols = [c for c in cols if c != 'ID' and c in train_data.columns]

train_data = train_data.drop(drop_cols, axis=1).merge(darius_train, how='left', left_on='ID', right_on='ID')

# Add sales per day features as columns 'extra0', 'extra1', ... (one per entry of sku_features)
extra_features = []
for position, per_sku in enumerate(sku_features):
    column_name = 'extra{}'.format(position)
    train_data = train_data.merge(per_sku.to_frame(column_name), how='left', left_on='sku_hash', right_index=True)
    extra_features.append(column_name)

# Add feature which is ratio between 7th day sales and 1st day sales
# i.e. how quickly sales dropped off
train_data['extraratio'] = train_data['extra6'] / train_data['extra0']

In [20]:
# Same assembly as for the training frame, applied to the test products.
# NOTE(review): the joins with sales_data and vimages are inner joins, so a
# test row without sales or image data would be dropped — confirm none are.
test_data = (
    test
    .merge(sales_data, left_on='sku_hash', right_index=True)
    .merge(navigation_data, how='left', left_on='sku_hash', right_index=True)
    .merge(vimages, left_on='sku_hash', right_on='sku_hash')
)

# Add raddar's feature (drop_cols is the list computed in the training cell)
test_data = test_data.drop(drop_cols, axis=1).merge(darius_test, how='left', left_on='ID', right_on='ID')

# Per-day sales features 'extra0', 'extra1', ... (one per entry of sku_features)
extra_features = []
for position, per_sku in enumerate(sku_features):
    column_name = 'extra{}'.format(position)
    test_data = test_data.merge(per_sku.to_frame(column_name), how='left', left_on='sku_hash', right_index=True)
    extra_features.append(column_name)

# Ratio between 7th day sales and 1st day sales
test_data['extraratio'] = test_data['extra6'] / test_data['extra0']

In [21]:
# Products with no navigation rows got NaN from the left join; treat that as zero activity.
for frame in (train_data, test_data):
    frame[navigation_data.columns] = frame[navigation_data.columns].fillna(0)

# Modeling
## utils
The `PandasSelector` class below is taken from https://github.com/pjankiewicz/PandasSelector

In [22]:


class PandasSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks DataFrame columns by name and/or by dtype.

    Parameters
    ----------
    columns : str or collection of str, optional
        Column name(s) to select. A single string is wrapped in a list.
    dtype : optional
        Columns whose dtype equals this value are selected.
    inverse : bool
        When true, the selection is inverted (columns that do NOT match are kept).
    return_vector : bool
        When true and exactly one column is selected, `transform` returns that
        column as a plain list instead of a one-column DataFrame.
    """

    def __init__(self, columns=None, dtype=None, inverse=False,
                 return_vector=True):
        self.dtype = dtype
        # Normalise a single column name to a one-element list.
        self.columns = [columns] if isinstance(columns, str) else columns
        self.inverse = inverse
        self.return_vector = return_vector

    def check_condition(self, x, col):
        """Return whether column `col` of `x` is selected (after applying `inverse`)."""
        matched = self.dtype is not None and x[col].dtype == self.dtype
        if not matched:
            matched = self.columns is not None and col in self.columns
        return self.inverse ^ matched

    def fit(self, x, y=None):
        """Nothing to learn; returns self."""
        return self

    def _check_if_all_columns_present(self, x):
        """Raise KeyError if a requested column is absent (skipped when `inverse` is set)."""
        if self.inverse or self.columns is None:
            return
        absent = set(self.columns) - set(x.columns)
        if len(absent) > 0:
            raise KeyError('Keys are missing in the record: %s' %
                           ','.join(absent))

    def transform(self, x):
        """Return the selected columns of `x`.

        Raises KeyError when `x` is not a DataFrame or when a requested column
        is missing.
        """
        if not isinstance(x, pd.DataFrame):
            raise KeyError('Input is not a pandas DataFrame')

        chosen = [name for name in x.columns if self.check_condition(x, name)]

        # Explicitly requested columns must all exist in the frame.
        self._check_if_all_columns_present(x)

        # A single selected column is returned as a list when return_vector is set.
        if self.return_vector and len(chosen) == 1:
            return list(x[chosen[0]])
        return x[chosen]

## separate models for each prediction month

In [23]:
def _split_for_month(train_frame, test_frame, month):
    """Build the modelling frames for one prediction month.

    Replaces three copy-pasted blocks that differed only in the month number.

    Returns a 4-tuple (month_train, month_test, y_train, X_train):
    - month_train: rows of `train_frame` with `month == month`, without the
      'month', 'sku_hash' and 'ID' columns (still contains 'target')
    - month_test: rows of `test_frame` for that month, without 'month' and
      'sku_hash', indexed by 'ID'
    - y_train: log(target + 1) for month_train
    - X_train: month_train without the 'target' column
    """
    month_train = (
        train_frame.loc[train_frame.month == month, :]
        .drop(['month', 'sku_hash', 'ID'], axis=1)
    )
    month_test = (
        test_frame.loc[test_frame.month == month, :]
        .drop(['month', 'sku_hash'], axis=1)
        .set_index('ID')
    )
    y_train = (month_train.target + 1).apply(np.log)
    X_train = month_train.drop('target', axis=1)
    return month_train, month_test, y_train, X_train


train_data1, X_test1, y_train1, X_train1 = _split_for_month(train_data, test_data, 1)
train_data2, X_test2, y_train2, X_train2 = _split_for_month(train_data, test_data, 2)
train_data3, X_test3, y_train3, X_train3 = _split_for_month(train_data, test_data, 3)

In [26]:
# Image feature columns: every column of vimages except the first (the SKU key).
images_cols = vimages.columns[1:].tolist()

# float64 columns of the month-1 training frame that are not image features.
# The filter keeps the DataFrame's column order; the previous
# list(set(...) - set(...)) produced an order that depends on string hashing
# and could therefore differ between kernel sessions.
image_col_set = set(images_cols)
float_cols = [
    col for col in X_train1.dtypes[X_train1.dtypes == 'float64'].index
    if col not in image_col_set
]

# These are fed to the model directly (see x_all), so keep them out of the PCA input.
# `.remove` raises ValueError if a column is unexpectedly absent.
float_cols.remove('sales_quantity_log')
float_cols.remove('first_day_sales_log')
# 'sales_quantity' is deliberately not removed (its removal was disabled in the original).
float_cols.remove('first_day_sales')

In [27]:
# Object-dtype columns of the month-1 training frame, minus the two that are
# not one-hot encoded ('en_US_description' is vectorised separately as text).
is_object = X_train1.dtypes == 'object'
categorical_cols = X_train1.dtypes[is_object].index.tolist()
for excluded_col in ('en_US_description', 'color'):
    categorical_cols.remove(excluded_col)

In [28]:
# Empty placeholders.
# NOTE(review): neither name is referenced by any later cell visible here
# (train_kfold only uses x_train / x_test as its own parameters) — these look
# like leftovers; confirm before removing.
x_train = pd.DataFrame()
x_test = pd.DataFrame()

In [29]:
# Stack the three monthly training frames followed by the three monthly test
# frames, and record the cumulative row offsets so the stacked array can be
# cut back into the six pieces later.
month_frames = [train_data1, train_data2, train_data3, X_test1, X_test2, X_test3]
combined_data = pd.concat(month_frames, axis=0)

running_total = 0
split_points = [running_total]
for month_frame in month_frames:
    running_total += len(month_frame)
    split_points.append(running_total)

description features

In [30]:
# Text features: TF-IDF over the English product descriptions (English stop
# words removed, vocabulary capped at 5000 terms), reduced to 16 topic weights
# with Latent Dirichlet Allocation (despite the `vect_pca` name, this is LDA).
vect = TfidfVectorizer(stop_words='english', max_features=5000)
desc_tfidf = vect.fit_transform(combined_data['en_US_description'])

# LDA is stochastic; a fixed random_state makes the topic features
# reproducible under Restart & Run All.
vect_pca = LatentDirichletAllocation(n_components=16, random_state=1337)
descs = vect_pca.fit_transform(desc_tfidf)

In [31]:
# Image features: project the image vector columns onto 16 principal components.
# random_state is fixed because PCA may pick a randomized SVD solver, which
# would otherwise make the components vary between runs.
im_pca = PCA(n_components=16, random_state=1337)
im_data = im_pca.fit_transform(combined_data[images_cols])

In [32]:
# Numeric features: fill missing values with fancyimpute's SimpleFill (default
# settings), then project onto 16 principal components.
# random_state is fixed because PCA may pick a randomized SVD solver.
float_cols_pca = PCA(n_components=16, random_state=1337)
float_data = float_cols_pca.fit_transform(SimpleFill().fit_transform(combined_data[float_cols]))

In [33]:
# Categorical features: one-hot encode the categorical columns, then compress
# the sparse matrix to 32 components with TruncatedSVD (which accepts sparse input).
# random_state is fixed so the randomized SVD is reproducible.
# TruncatedSVD is imported with the other imports at the top of the notebook.
cat_pca = TruncatedSVD(n_components=32, random_state=1337)
cat_ohe = OneHotEncoder(handle_unknown='ignore')
cat_data = cat_pca.fit_transform(cat_ohe.fit_transform(combined_data[categorical_cols]))

In [38]:
#months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
#extra_columns = pd.DataFrame()
#extra_columns['1'] = combined_data['Month_transaction'].apply(months.index)
#extra_columns['2'] = combined_data['product_gender'].apply(['Women', 'Unisex', 'Men'].index)

In [39]:
# Distribution of transaction months among the month-1 training rows.
train_data1['Month_transaction'].value_counts()

Sep    344
Jan    236
Jun    197
May    153
Nov    152
Oct    150
Mar    137
Apr    120
Feb    112
Jul    107
Dec     51
Aug      7
Name: Month_transaction, dtype: int64

In [439]:
# List the columns supplied by raddar's feature file ('ID' first, then the features).
darius_train.columns

Index(['ID', 'product_type', 'product_gender', 'page_views', 'sales_quantity',
       'TotalBuzzPost', 'TotalBuzz', 'NetSentiment', 'PositiveSentiment',
       'NegativeSentiment', 'Impressions', 'fr_FR_price',
       'macro_function_count', 'function_count', 'sub_function_count',
       'model_count', 'aesthetic_sub_line_count', 'macro_material_count',
       'color_count', 'page_views_nav1', 'page_views_nav2', 'page_views_nav3',
       'page_views_nav4', 'page_views_nav5', 'page_views_nav6',
       'sales_quantity_type1', 'sales_quantity_type2', 'sales_quantity_zone1',
       'sales_quantity_zone2', 'sales_quantity_zone3', 'sales_quantity_zone4',
       'sales_quantity_zone5', 'mean_target'],
      dtype='object')

In [40]:
# Final feature matrix: the four reduced blocks (text topics, image PCA,
# numeric PCA, categorical SVD) followed by raw columns.
# Note that 'sales_quantity', 'page_views' and 'fr_FR_price' are listed in
# direct_cols and are also among raddar's columns, so they appear twice.
direct_cols = ['sales_quantity_log', 'first_day_sales_log', 'sales_quantity', 'first_day_sales', 'page_views', 'fr_FR_price']
raddar_cols = darius_train.columns[1:].tolist()  # everything except the leading 'ID'

feature_blocks = [
    descs,
    im_data,
    float_data,
    cat_data,
    combined_data[direct_cols].values,
    combined_data[extra_features + raddar_cols].values,
]
x_all = np.concatenate(feature_blocks, axis=1)

In [41]:
# Cut the stacked feature matrix back into its six pieces, in the order they
# were concatenated: train months 1-3, then test months 1-3.
s = split_points
x_train_1, x_train_2, x_train_3, x_test_1, x_test_2, x_test_3 = [
    x_all[start:stop] for start, stop in zip(s[:6], s[1:7])
]

In [42]:
# Taken from mlcrate
# https://github.com/mxbi/mlcrate
from mlcrate.time import Timer
from mlcrate.xgb import get_importances
def train_kfold(params, x_train, y_train, x_test=None, folds=5, stratify=None, random_state=1337, skip_checks=False, print_imp='final'):
    """Trains a set of XGBoost models with chosen parameters on cross-validation splits, returning out-of-fold
    training set predictions (useful for stacking) as well as test set predictions and the models themselves.
    Test set predictions are generated by averaging predictions from all the individual split models.
    Feature importances are also computed and summed across all models for convenience.

    Adapted from mlcrate (https://github.com/mxbi/mlcrate) with one modification to the splitting:
    - If `stratify` is given, a shuffled StratifiedKFold with `folds` splits and `random_state` is used.
    - Otherwise a RepeatedKFold with 10 repeats of `folds` splits and a fixed seed of 4242 is used, so
      10 * folds models are trained and `random_state` is not used. Each repeat overwrites the out-of-fold
      prediction of every row, so `p_train` holds the predictions of the most recent repeat.

    Keyword arguments:
    params -- Parameters passed to the xgboost model, as well as ['early_stopping_rounds', 'nrounds', 'verbose_eval'], which are passed to xgb.train()
              Defaults: early_stopping_rounds = 50, nrounds = 100000, verbose_eval = 1
    x_train -- The training set features
    y_train -- The training set labels
    x_test (optional) -- The test set features. When omitted, no test predictions are made and p_test is None.
    folds (default: 5) -- The number of folds per (repeated) split
    stratify (optional) -- An array to stratify the splits along
    random_state (default: 1337) -- Random seed for splitting folds (stratified case only, see above)
    skip_checks -- By default, this function tries to reorder the test set columns to match the order of the training set columns
                   and checks that both sets have the same number of features. Set this to disable this behaviour.
    print_imp -- One of ['every', 'final', None] - 'every' prints importances for every fold, 'final' prints combined importances at the end, None does not print importance

    Returns:
    models -- a list of trained xgboost.Booster objects
    p_train -- Out-of-fold training set predictions (shaped like y_train)
    p_test -- Mean of test set predictions from the models, or None without a test set
    imps -- dict with {feature: importance} pairs representing the sum feature importance from all the models.
    score -- Mean of the models' best_score values
    """

    from sklearn.model_selection import StratifiedKFold, RepeatedKFold  # Optional dependencies
    from collections import defaultdict
    import numpy as np
    import xgboost as xgb

    assert print_imp in ['every', 'final', None]

    # If it's a dataframe, we can take column names, otherwise just use column indices (eg. for printing importances).
    if hasattr(x_train, 'columns'):
        columns = x_train.columns.values
        columns_exists = True
    else:
        columns = np.arange(x_train.shape[1]).astype(str)
        columns_exists = False

    x_train = np.asarray(x_train)
    y_train = np.array(y_train)

    has_test = x_test is not None
    d_test = None

    if has_test:
        if columns_exists and not skip_checks:
            try:
                x_test = x_test[columns]
            except Exception as e:
                print('[mlcrate] Could not coerce x_test columns to match x_train columns. Set skip_checks=True to run anyway.')
                raise e

        x_test = np.asarray(x_test)
        d_test = xgb.DMatrix(x_test)

        # The feature-count check only makes sense when a test set was supplied;
        # previously it dereferenced x_test even when it was None.
        if not skip_checks:
            assert x_train.shape[1] == x_test.shape[1], "x_train and x_test have different numbers of features."

    print('[mlcrate] Training {} {}XGBoost models on training set {} {}'.format(folds, 'stratified ' if stratify is not None else '',
            x_train.shape, 'with test set {}'.format(x_test.shape) if has_test else 'without a test set'))

    # Init a timer to get fold durations
    t = Timer()

    if stratify is not None:
        kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
        splits = kf.split(x_train, stratify)
    else:
        # MODIFIED from upstream mlcrate: 10 repeats of a plain KFold with a fixed seed.
        kf = RepeatedKFold(n_repeats=10, n_splits=folds, random_state=4242)
        splits = kf.split(x_train)

    p_train = np.zeros_like(y_train, dtype=np.float32)
    ps_test = []
    models = []
    scores = []
    imps = defaultdict(int)

    fold_i = 0
    for train_kf, valid_kf in splits:
        print('[mlcrate] Running fold {}, {} train samples, {} validation samples'.format(fold_i, len(train_kf), len(valid_kf)))
        d_train = xgb.DMatrix(x_train[train_kf], label=y_train[train_kf])
        d_valid = xgb.DMatrix(x_train[valid_kf], label=y_train[valid_kf])

        # Start a timer for the fold
        t.add('fold{}'.format(fold_i))

        # Metrics to print; the last entry ('valid') is the one used for early stopping.
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        mdl = xgb.train(params, d_train, params.get('nrounds', 100000), watchlist,
                        early_stopping_rounds=params.get('early_stopping_rounds', 50), verbose_eval=params.get('verbose_eval', 1))

        scores.append(mdl.best_score)

        print('[mlcrate] Finished training fold {} - took {} - running score {}'.format(fold_i, t.format_elapsed('fold{}'.format(fold_i)), np.mean(scores)))

        # Get importances for this model and add to global importance
        imp = get_importances(mdl, columns)
        if print_imp == 'every':
            print('Fold {} importances:'.format(fold_i), imp)

        for f, i in imp:
            imps[f] += i

        # Out-of-fold predictions for the validation rows of this split.
        p_train[valid_kf] = mdl.predict(d_valid, ntree_limit=mdl.best_ntree_limit)

        # Test predictions are only produced (and collected) when a test set exists;
        # previously the append ran unconditionally and raised NameError without one.
        if has_test:
            ps_test.append(mdl.predict(d_test, ntree_limit=mdl.best_ntree_limit))

        models.append(mdl)

        fold_i += 1

    p_test = np.mean(ps_test, axis=0) if has_test else None

    print('[mlcrate] Finished training {} XGBoost models, took {}'.format(folds, t.format_elapsed(0)))

    if print_imp in ['every', 'final']:
        print('[mlcrate] Overall feature importances:', sorted(imps.items(), key=lambda x: x[1], reverse=True))

    return models, p_train, p_test, imps, np.mean(scores)



In [45]:
# XGBoost settings. 'verbose_eval' and 'early_stopping_rounds' are read back
# out of this dict by train_kfold and forwarded to xgb.train().
# (gamma = 5 and min_child_weight = 25 were tried and left disabled.)
params = {
    'objective': 'reg:linear',
    'max_depth': 3,
    'subsample': 0.7,
    'colsample_bylevel': 0.8,
    'eta': 0.02,
    'silent': 1,
    'eval_metric': 'rmse',
    'verbose_eval': 2000,
    'early_stopping_rounds': 500,
}

# Train 30 model folds per month
# folds=3 not 30 because I modified the train_kfold function from the mlcrate source
# (it repeats the 3-fold split 10 times).
_, p_train1, p_test1, _, score1 = train_kfold(params, x_train_1, y_train1, x_test_1, folds=3)
_, p_train2, p_test2, _, score2 = train_kfold(params, x_train_2, y_train2, x_test_2, folds=3)
_, p_train3, p_test3, _, score3 = train_kfold(params, x_train_3, y_train3, x_test_3, folds=3)

# Per-month mean validation scores followed by their overall mean.
month_scores = [score1, score2, score3]
print('{:.4f} {:.4f} {:.4f} {:.4f}'.format(score1, score2, score3, np.mean(month_scores)))

[mlcrate] Training 3 XGBoost models on training set (1766, 126) with test set (1722, 126)
[mlcrate] Running fold 0, 1177 train samples, 589 validation samples
[0]	train-rmse:4.82397	valid-rmse:4.77384
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[2000]	train-rmse:0.090203	valid-rmse:0.373127
Stopping. Best iteration:
[1670]	train-rmse:0.10829	valid-rmse:0.372846

[mlcrate] Finished training fold 0 - took 17s - running score 0.372846
[mlcrate] Running fold 1, 1177 train samples, 589 validation samples
[0]	train-rmse:4.77795	valid-rmse:4.86585
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.


  warn(message)


[2000]	train-rmse:0.087407	valid-rmse:0.372753
[4000]	train-rmse:0.032126	valid-rmse:0.370279


KeyboardInterrupt: 

In [515]:
# Do exp(target) before submission
# The models predict log(target + 1); invert with exp(prediction) - 1 and
# label each prediction with the ID of its test row.
# NOTE: this reassigns p_test1..3, so run the cell only once per training run.
p_test1 = np.exp(pd.Series(p_test1)) - 1
p_test1.index = X_test1.index

p_test2 = np.exp(pd.Series(p_test2)) - 1
p_test2.index = X_test2.index

p_test3 = np.exp(pd.Series(p_test3)) - 1
p_test3.index = X_test3.index

# Stack the three months into a single-column frame named 'target'.
submission = pd.concat([p_test1, p_test2, p_test3]).to_frame('target')

In [516]:
# Save submission
# Write the submission to the working directory; the index (the test 'ID') is
# written as the first column alongside 'target'.
submission.to_csv('submission_xgb6_bag30.csv')