In [1]:
import os.path
import pandas as pd
import numpy as np
import xgboost as xgb 

# directory the CSV tables are read from; the notebook's output CSVs are written to the same place
base_path = "../input/"

In [2]:
# load the data: every table is a CSV file located directly under base_path

navigation, sales, train, test, vimages, sub = (
    pd.read_csv(os.path.join(base_path, csv_name))
    for csv_name in (
        'navigation.csv',
        'sales.csv',
        'train.csv',
        'test.csv',
        'vimages.csv',
        'sample_submission.csv',
    )
)

In [3]:
# leave-one-out target encoding for different colors of the same product

# rows that agree on all of these columns form one encoding group
product_descriptor = ['product_type', 'product_gender', 'macro_function',
                      'function', 'sub_function', 'model', 'aesthetic_sub_line', 'macro_material',
                      'month']

# per-group sum and non-null count of the target, computed from train only
product_target_stats = (
    train.groupby(product_descriptor)['target']
         .agg(['sum', 'count'])
         .rename(columns = {'sum': 'sum_target', 'count': 'count_target'})
         .reset_index()
)

train = train.merge(product_target_stats, on = product_descriptor, how = 'left')
test = test.merge(product_target_stats, on = product_descriptor, how = 'left')

# train rows: take the row's own target out of its group before averaging
# (a group of size one gives 0/0, i.e. NaN)
own_target_removed = train['sum_target'] - train['target']
train['mean_target'] = own_target_removed / (train['count_target'] - 1)
# test rows: plain group mean of the train targets
test['mean_target'] = test['sum_target'] / test['count_target']

# the helper columns are not needed once mean_target exists
helper_cols = ['sum_target', 'count_target']
train = train.drop(helper_cols, axis=1)
test = test.drop(helper_cols, axis=1)

In [4]:
# counts for categorical features in train+test

count_vec_cols = ['macro_function', 'function', 'sub_function', 'model',
                  'aesthetic_sub_line', 'macro_material', 'color']

for col in count_vec_cols:
    # stack train and test, then count non-null sku_hash entries per value of `col`
    stacked = pd.concat([train[['sku_hash', col]], test[['sku_hash', col]]])
    level_counts = stacked.groupby(col)['sku_hash'].count().reset_index(name = col + '_count')

    # attach the count as a new `<col>_count` column on both frames
    train = train.merge(level_counts, on = col, how = 'left')
    test = test.merge(level_counts, on = col, how = 'left')

In [5]:
# sum of page views by different traffic source

# one row per sku_hash and one column per traffic_source value,
# each cell holding the summed page_views (NaN where the pair never occurs)
traffic_source_views = (
    navigation.groupby(['sku_hash', 'traffic_source'])['page_views']
              .sum()
              .unstack('traffic_source')
              .reset_index()
)
# positional rename: this assumes exactly six distinct traffic_source values
traffic_source_views.columns = [
    'sku_hash',
    'page_views_nav1', 'page_views_nav2', 'page_views_nav3',
    'page_views_nav4', 'page_views_nav5', 'page_views_nav6',
]

In [6]:
# sum of sales by different type

# one row per sku_hash and one column per `type` value holding the summed sales_quantity
type_sales = (
    sales.groupby(['sku_hash', 'type'])['sales_quantity']
         .sum()
         .unstack('type')
         .reset_index()
)
# positional rename: this assumes exactly two distinct `type` values
type_sales.columns = [
    'sku_hash',
    'sales_quantity_type1',
    'sales_quantity_type2',
]

In [7]:
# sum of sales by different zone

# one row per sku_hash and one column per zone_number holding the summed sales_quantity
zone_sales = (
    sales.groupby(['sku_hash', 'zone_number'])['sales_quantity']
         .sum()
         .unstack('zone_number')
         .reset_index()
)
# positional rename: this assumes exactly five distinct zone_number values
zone_sales.columns = [
    'sku_hash',
    'sales_quantity_zone1', 'sales_quantity_zone2', 'sales_quantity_zone3',
    'sales_quantity_zone4', 'sales_quantity_zone5',
]

In [8]:
# overall stats of sales, page views  and twitter sentiments

# total page views per sku_hash
navigation_stats = navigation.groupby('sku_hash')['page_views'].sum().reset_index(name='page_views')

# columns of `sales` that are summed per sku_hash
sales_stat_cols = ['sales_quantity', 'TotalBuzzPost', 'TotalBuzz',
                   'NetSentiment', 'PositiveSentiment', 'NegativeSentiment', 'Impressions']
# the columns are selected with a list: selecting several groupby columns with a bare
# tuple (groupby(...)['a', 'b']) is deprecated and is rejected by newer pandas releases
sales_stats = sales.groupby('sku_hash')[sales_stat_cols].sum().reset_index()

In [9]:
# define cross validation splits

# fold id = integer category code of sku_hash, modulo 5; it is a function of sku_hash
# only, so all rows of one sku_hash land in the same fold
train['idx'] = pd.Categorical(train['sku_hash']).codes % 5

In [10]:
# merge everything for train

X = train.copy()
# attach each per-sku aggregate table with a left join on sku_hash
for sku_table in (navigation_stats, sales_stats, traffic_source_views, type_sales, zone_sales):
    X = X.merge(sku_table, on = 'sku_hash', how = 'left')

# recode the two string categoricals as integers
product_type_codes = {'Accessories': '0', 'Leather Goods': '1'}
X['product_type'] = X['product_type'].replace(product_type_codes).astype(int)

product_gender_codes = {'Women': '-1', 'Unisex': '0', 'Men': '1'}
X['product_gender'] = X['product_gender'].replace(product_gender_codes).astype(int)

# transform label to meet the metric: the models are fit on log(target + 1)
X['y'] = np.log(1 + X['target'])

In [11]:
# merge everything for test

Z = test.copy()
# attach each per-sku aggregate table with a left join on sku_hash
for sku_table in (navigation_stats, sales_stats, traffic_source_views, type_sales, zone_sales):
    Z = Z.merge(sku_table, on = 'sku_hash', how = 'left')

# recode the two string categoricals as integers
product_type_codes = {'Accessories': '0', 'Leather Goods': '1'}
Z['product_type'] = Z['product_type'].replace(product_type_codes).astype(int)

product_gender_codes = {'Women': '-1', 'Unisex': '0', 'Men': '1'}
Z['product_gender'] = Z['product_gender'].replace(product_gender_codes).astype(int)

In [12]:
# model input columns, in the order they are handed to xgboost
features = (
    # recoded product categoricals
    ['product_type', 'product_gender']
    # per-sku totals
    + ['page_views', 'sales_quantity']
    + ['TotalBuzzPost', 'TotalBuzz', 'NetSentiment',
       'PositiveSentiment', 'NegativeSentiment', 'Impressions']
    + ['fr_FR_price']
    # train+test value counts of the categorical columns
    + ['macro_function_count', 'function_count', 'sub_function_count', 'model_count',
       'aesthetic_sub_line_count', 'macro_material_count', 'color_count']
    # page views split by traffic source
    + ['page_views_nav1', 'page_views_nav2', 'page_views_nav3',
       'page_views_nav4', 'page_views_nav5', 'page_views_nav6']
    # sales split by type and by zone
    + ['sales_quantity_type1', 'sales_quantity_type2']
    + ['sales_quantity_zone1', 'sales_quantity_zone2', 'sales_quantity_zone3',
       'sales_quantity_zone4', 'sales_quantity_zone5']
    # leave-one-out target encoding
    + ['mean_target']
)

In [13]:
# define function to generate xgboost objects for a specific month

def train_test_split(tr, te, mo, feats, num_folds):
    """Build the per-fold train/validation DMatrix objects and the test DMatrix for one month.

    Parameters
    ----------
    tr : training frame; its `month`, `idx` and `y` columns and the `feats` columns are read
    te : test frame; its `month` column and the `feats` columns are read
    mo : month value used to filter both frames
    feats : list of feature column names
    num_folds : number of folds; fold ids are the values 0..num_folds-1 of `tr.idx`

    Returns
    -------
    (dtrain, dval, dtest) : `dtrain[i]` holds the month's rows with idx != i, `dval[i]`
    the month's rows with idx == i (both labelled with `y`); `dtest` holds the month's
    test rows without labels.
    """
    in_month = tr.month == mo

    dtrain = []
    dval = []
    for fold in range(num_folds):
        held_out = tr.idx == fold
        fit_rows = in_month & ~held_out
        val_rows = in_month & held_out

        dtrain.append(xgb.DMatrix(tr.loc[fit_rows, feats].values,
                                  tr.loc[fit_rows, 'y'].values))
        dval.append(xgb.DMatrix(tr.loc[val_rows, feats].values,
                                tr.loc[val_rows, 'y'].values))

    dtest = xgb.DMatrix(te.loc[te.month == mo, feats].values)

    return dtrain, dval, dtest

In [14]:
# define xgboost parameters to use in models
# NOTE(review): 'reg:linear' and 'silent' look like legacy xgboost names - confirm
# they are still accepted by the installed xgboost version

param = {
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'booster': 'gbtree',
    'eta': 0.025,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'num_parallel_tree': 3,
    'min_child_weight': 25,
    'gamma': 5,
    'max_depth': 3,
    'silent': 1,
}

In [15]:
# train models for the 1 month

dtrain, dval, dtest = train_test_split(tr = X, te = Z, mo = 1, feats = features, num_folds = 5)

# one booster per fold: at most 50000 rounds, stopped once the fold's held-out
# rows ('eval') have not improved for 200 rounds
model_m1 = [
    xgb.train(
              param,
              dtrain[fold],
              50000,
              [(dtrain[fold],'train'), (dval[fold],'eval')],
              early_stopping_rounds = 200,
              verbose_eval = False)
    for fold in range(5)
]

# run predictions for the 1 month
# NOTE(review): predict() is called without restricting it to the early-stopped best
# iteration - confirm this is what the installed xgboost version should do here

oof_m1 = [booster.predict(dfold) for booster, dfold in zip(model_m1, dval)]
oof_test_m1 = [booster.predict(dtest) for booster in model_m1]

# test prediction = average over the five fold models
test_m1 = np.mean(oof_test_m1, axis=0)

# sku_hash -> month-1 prediction: out-of-fold for train skus, fold average for test skus
m1 = {}
for fold, fold_pred in enumerate(oof_m1):
    in_fold = (X.month==1) & (X.idx==fold)
    m1.update(zip(X.loc[in_fold,'sku_hash'], fold_pred))

m1.update(zip(Z.loc[(Z.month==1),'sku_hash'], test_m1))

oof_m1 = pd.DataFrame.from_dict(m1, orient='index').reset_index()
oof_m1.columns = ['sku_hash', 'oof_m1']

# inner join on sku_hash: every row of a sku receives that sku's month-1 prediction,
# which is then used as an extra feature
X2 = X.merge(oof_m1, on = 'sku_hash')
Z2 = Z.merge(oof_m1, on = 'sku_hash')
features2 = features + ['oof_m1']

In [16]:
# train models for the 2 month

dtrain2, dval2, dtest2 = train_test_split(tr = X2, te = Z2, mo = 2, feats = features2, num_folds = 5)

# one booster per fold: at most 50000 rounds, stopped once the fold's held-out
# rows ('eval') have not improved for 200 rounds
model_m2 = []

for i in range(5):
    model_m2.append(
        xgb.train(
                  param,
                  dtrain2[i],
                  50000,
                  [(dtrain2[i],'train'), (dval2[i],'eval')],
                  early_stopping_rounds = 200,
                  verbose_eval = False)
    )

# run predictions for the 2 month

oof_m2 = []
oof_test_m2 = []
for i in range(5):
    oof_m2.append(model_m2[i].predict(dval2[i]))
    oof_test_m2.append(model_m2[i].predict(dtest2))

# test prediction = average over the five fold models
test_m2 = np.mean(oof_test_m2, axis=0)

# sku_hash -> month-2 prediction.
# The keys are read from X2 / Z2, the frames the DMatrix objects above were built from,
# so each prediction is paired with the row it was computed for. X2 / Z2 come from an
# inner merge, so they can hold fewer rows than X / Z; zipping against X / Z would then
# silently pair predictions with the wrong sku_hash.
m2 = {}
for i in range(5):
    fold_rows = (X2.month==2) & (X2.idx==i)
    m2.update(zip(X2.loc[fold_rows,'sku_hash'], oof_m2[i]))

m2.update(zip(Z2.loc[(Z2.month==2),'sku_hash'], test_m2))

oof_m2 = pd.DataFrame.from_dict(m2, orient='index').reset_index()
oof_m2.columns = ['sku_hash', 'oof_m2']

# inner join on sku_hash: every row of a sku receives that sku's month-2 prediction,
# which is then used as an extra feature
X3 = pd.merge(X2.copy(), oof_m2, on = 'sku_hash')
Z3 = pd.merge(Z2.copy(), oof_m2, on = 'sku_hash')
features3 = features2 + ['oof_m2']

In [17]:
# train models for the 3 month

dtrain3, dval3, dtest3 = train_test_split(tr = X3, te = Z3, mo = 3, feats = features3, num_folds = 5)

# one booster per fold: at most 50000 rounds, stopped once the fold's held-out
# rows ('eval') have not improved for 200 rounds
model_m3 = []

for i in range(5):
    model_m3.append(
        xgb.train(
                  param,
                  dtrain3[i],
                  50000,
                  [(dtrain3[i],'train'),(dval3[i],'eval')],
                  early_stopping_rounds = 200,
                  verbose_eval = False)
    )

# run predictions for the 3 month

oof_m3 = []
oof_test_m3 = []
for i in range(5):
    oof_m3.append(model_m3[i].predict(dval3[i]))
    oof_test_m3.append(model_m3[i].predict(dtest3))

# test prediction = average over the five fold models
test_m3 = np.mean(oof_test_m3, axis=0)

# sku_hash -> month-3 prediction.
# The keys are read from X3 / Z3, the frames the DMatrix objects above were built from,
# so each prediction is paired with the row it was computed for. X3 / Z3 come from
# inner merges, so they can hold fewer rows than X / Z; zipping against X / Z would then
# silently pair predictions with the wrong sku_hash.
m3 = {}
for i in range(5):
    fold_rows = (X3.month==3) & (X3.idx==i)
    m3.update(zip(X3.loc[fold_rows,'sku_hash'], oof_m3[i]))

m3.update(zip(Z3.loc[(Z3.month==3),'sku_hash'], test_m3))

oof_m3 = pd.DataFrame.from_dict(m3, orient='index').reset_index()
oof_m3.columns = ['sku_hash', 'oof_m3']

# X3 / Z3 are merged onto themselves here; dropping an 'oof_m3' column left over from a
# previous run of this cell keeps the cell re-runnable (otherwise the merge would
# produce oof_m3_x / oof_m3_y). On a first run the drop is a no-op.
X3 = pd.merge(X3.drop(columns = 'oof_m3', errors = 'ignore'), oof_m3, on = 'sku_hash')
Z3 = pd.merge(Z3.drop(columns = 'oof_m3', errors = 'ignore'), oof_m3, on = 'sku_hash')

In [18]:
# create a single vector of predictions for both train and test

# For every row, copy the prediction of the model that matches the row's month:
# into 'target' for test (Z3) and into 'pred_target' for train (X3).
for frame, out_col in ((Z3, 'target'), (X3, 'pred_target')):
    # initialise as float: the predictions are floats, and writing them into an integer
    # column relies on silent upcasting that newer pandas versions warn about
    frame[out_col] = 0.0
    for mo in (1, 2, 3):
        in_month = frame.month == mo
        frame.loc[in_month, out_col] = frame.loc[in_month, f'oof_m{mo}']

In [19]:
# some cross validation diagnostics

def _rmse(frame):
    """Root mean squared difference between the `y` and `pred_target` columns of `frame`."""
    residual = frame['y'] - frame['pred_target']
    return np.sqrt(np.mean(residual**2))

# per-month error on the out-of-fold predictions, then the error over all rows
print(f"month1: {_rmse(X3.loc[X3.month==1])}")
print(f"month2: {_rmse(X3.loc[X3.month==2])}")
print(f"month3: {_rmse(X3.loc[X3.month==3])}")
print(f"overall: {_rmse(X3)}")

month1: 0.44243199303177
month2: 0.5833878625335227
month3: 0.7168724677773026
overall: 0.5916061757809871


In [20]:
# make a submission

# undo the log(target + 1) transform
# NOTE: Z3['target'] is overwritten in place, so running this cell a second time
# would apply the transform twice
Z3['target'] = np.exp(Z3['target']) - 1

final_sub = Z3.loc[:, ['ID', 'target']]
submission_path = os.path.join(base_path, 'silly-raddar-sub4.csv')
final_sub.to_csv(submission_path, index=None)

In [21]:
# create an out-of-fold (train) version of a submission

# NOTE: this replaces the 'target' column of X3 with the back-transformed predictions
X3['target'] = np.exp(X3['pred_target']) - 1

cv_sub = X3.loc[:, ['ID', 'target']]
cv_path = os.path.join(base_path, 'silly-raddar-cv4.csv')
cv_sub.to_csv(cv_path, index=None)

In [22]:
# export features for anokas to use

# ID plus the base feature columns (the stacked oof_* columns are not included)
export_cols = ['ID'] + features
for frame, csv_name in ((X3, 'raddar-features-train.csv'),
                        (Z3, 'raddar-features-test.csv')):
    frame[export_cols].to_csv(os.path.join(base_path, csv_name), index=None)