In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
# Load the three raw competition tables used for the previews below.
sell_prices = pd.read_csv("data/sell_prices.csv")
calander = pd.read_csv("data/calendar.csv")
df = pd.read_csv("data/sales_train_validation.csv")

In [3]:
# Preview the first rows of the calendar table.
calander.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [None]:
# Preview the first rows of the sell-price table.
sell_prices.head()

In [4]:
# Preview the first rows of the wide sales table (one d_<n> column per day).
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [None]:
# NOTE(review): 'train_val_df_2011.csv' is not produced anywhere in the visible
# notebook, and `train_df` is not referenced by any later cell shown here.
# Confirm whether this cell is still needed.
train_df = pd.read_csv('train_val_df_2011.csv')

In [7]:
def create_train_data(ds,de):
    """Build LightGBM training and validation datasets from the raw CSVs.

    Reads ``data/calendar.csv`` and ``data/sales_train_validation.csv``,
    reshapes the wide sales table to one row per series per day, joins the
    calendar features and holds out the last 28 available days for validation.

    Parameters
    ----------
    ds : int
        First calendar year to include (inclusive).
    de : int
        Calendar year to stop at (exclusive).

    Returns
    -------
    tuple
        ``(train_data, val_data)`` as ``lgb.Dataset`` objects. ``val_data`` is
        created with ``reference=train_data``.

    Raises
    ------
    ValueError
        If no calendar rows fall in the requested years, or if the joined data
        does not contain more than 28 distinct days (nothing would be left to
        train on after the hold-out).
    """
    horizon = 28  # number of trailing days held out for validation

    # removing dept_id and store from features, not enough training data atm for these
    features = ['item_id','cat_id','state_id','wday','month','year']
    cat = ['item_id','cat_id','state_id']
    target = 'sales'

    # Import raw data. sell_prices.csv is not read: no feature uses it.
    calander = pd.read_csv('data/calendar.csv')
    df = pd.read_csv('data/sales_train_validation.csv')

    # Keep only the calendar columns the model needs so the merge below does
    # not copy unused columns onto every melted row. `.copy()` makes the
    # filtered frame independent, so assigning 'd' does not write into a slice.
    in_range = (calander['year'] >= ds) & (calander['year'] < de)
    calander = calander.loc[in_range, ['d','wday','month','year']].copy()
    if calander.empty:
        raise ValueError(f"no calendar rows with {ds} <= year < {de}")
    calander['d'] = pd.to_numeric(calander['d'].str.split('_',expand=True)[1])

    # Day columns are discovered from the file instead of hard-coding d_1..d_1913.
    numcols = [c for c in df.columns if c.startswith('d_')]
    day_number = {c: int(c.split('_')[1]) for c in numcols}

    # Convert category columns to numbers
    for c in cat:
        df[c] = df[c].astype('category').cat.codes

    # Before: one row holds all sales for one series (wide format).
    # After: one row holds one day's sales for one series (long format).
    df = pd.melt(df, id_vars=cat, value_vars=numcols, var_name='d', value_name='sales')
    # Map 'd_<n>' -> n through a small dict rather than string-splitting every row.
    df['d'] = df['d'].map(day_number)

    # Merge with calander (inner join: only days present in both tables remain)
    df_train = df.merge(calander, on='d')

    # Sort the distinct days explicitly so the split does not depend on row order.
    days = np.sort(df_train['d'].unique())
    if len(days) <= horizon:
        raise ValueError(
            f"need more than {horizon} distinct days to split, found {len(days)}"
        )
    split_day = days[-horizon]
    is_val = df_train['d'] >= split_day
    train_df = df_train[~is_val]
    val_df = df_train[is_val]

    X = train_df[features].values
    y = train_df[target].values
    X_val = val_df[features].values
    y_val = val_df[target].values

    train_data = lgb.Dataset(X,label=y,feature_name = features, categorical_feature = cat,free_raw_data=False)
    # Tie the validation set to the training set so it shares the same
    # feature binning and categorical handling.
    val_data = lgb.Dataset(X_val,label=y_val,reference=train_data)

    return train_data, val_data

In [None]:
# Build the LightGBM datasets from calendar years 2011 (inclusive) up to 2016 (exclusive).
train_data,val_data = create_train_data(ds=2011,de=2016)

In [None]:
# LightGBM settings: RMSE as objective and metric, 31 leaves per tree.
paramaters = dict(
    objective='rmse',
    metric='rmse',
    num_leaves=31,
)

In [None]:
# Train for up to 1000 rounds, stopping once the validation metric has not
# improved for 100 rounds. Early stopping is requested through the callback
# because the `early_stopping_rounds` keyword was removed from lgb.train in
# LightGBM 4.x; the callback works on older releases as well.
model = lgb.train(
    paramaters,
    train_data,
    num_boost_round=1000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

Submission File
Each row contains an id that is a concatenation of an item_id, a store_id, and a suffix that is either validation (corresponding to the Public leaderboard) or evaluation (corresponding to the Private leaderboard). You are predicting 28 forecast days (F1-F28) of items sold for each row. For the validation rows, this corresponds to d_1914 - d_1941, and for the evaluation rows, this corresponds to d_1942 - d_1969. (Note: a month before the competition closes, the ground truth for the validation rows will be provided.)

The files must have a header and should look like the following:

id,F1,...F28

HOBBIES_1_001_CA_1_validation,0,...,2 

HOBBIES_1_002_CA_1_validation,2,...,11

...
HOBBIES_1_001_CA_1_evaluation,3,...,7

HOBBIES_1_002_CA_1_evaluation,1,...,4

In [None]:
def create_test_data(ds=1914,de=1941):
    """Create the feature frame the model predicts on.

    Reads ``data/calendar.csv`` and ``data/sales_train_validation.csv`` and
    pairs every row of the sales file with every calendar day in ``ds``..``de``.
    The output is grouped by sales-file row, so each series occupies
    ``de - ds + 1`` consecutive rows.

    Parameters
    ----------
    ds : int
        Date start (day number), inclusive. e.g 1914 (for this comp)
    de : int
        Date end (day number), inclusive. e.g 1941 (for this comp)

    Returns
    -------
    tuple
        ``(df_test, ids)``: the feature DataFrame with columns
        ``item_id, cat_id, state_id, wday, month, year`` and the unmodified
        ``id`` column of the sales file.
    """
    features = ['item_id','cat_id','state_id','wday','month','year']
    cat = ['item_id','cat_id','state_id']

    # sell_prices.csv is not read: no feature uses it.
    calander = pd.read_csv('data/calendar.csv')
    df = pd.read_csv('data/sales_train_validation.csv')

    # Work on an independent copy so the encoding below does not assign into
    # a slice of `df` (which triggers pandas' chained-assignment warning).
    df_test = df[cat].copy()
    for c in cat:
        df_test[c] = df_test[c].astype('category').cat.codes

    # Turn 'd_<n>' labels into integers and keep only the requested window.
    calander['d'] = pd.to_numeric(calander['d'].str.split('_',expand=True)[1])
    in_window = (calander['d'] >= ds) & (calander['d'] <= de)
    calander = calander.loc[in_window, ['wday','month','year']]

    # Cross join through a constant key: every series x every day in the window.
    # `.assign` returns new frames, so neither input is modified in place.
    merged = df_test.assign(key=0).merge(calander.assign(key=0), on='key')
    return merged[features], df['id']
    

In [None]:
def create_28_day_test_submission(test_df,ids,model):
    """Turn model predictions into a submission frame with columns id, F1..F28.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Feature rows to predict on; each series must occupy 28 consecutive
        rows, one per forecast day.
    ids : sequence of str
        One id per series, in the same order as the series appear in
        ``test_df``. A pandas Series is accepted; its index is ignored.
    model : object
        Anything with a ``predict(array)`` method returning one value per row.

    Returns
    -------
    pandas.DataFrame
        One row per series with columns ``['id', 'F1', ..., 'F28']``.

    Raises
    ------
    ValueError
        If the number of predictions is not a multiple of 28 or does not
        match the number of ids.
    """
    horizon = 28
    y_pred = np.asarray(model.predict(test_df.values))
    if y_pred.size % horizon != 0:
        raise ValueError(
            f"expected a multiple of {horizon} predictions, got {y_pred.size}"
        )

    test_day_columns = ['F'+str(i) for i in range(1,horizon+1)]
    df_sub = pd.DataFrame(y_pred.reshape(-1, horizon), columns=test_day_columns)

    # Assign ids by position. Assigning a Series directly would align on index
    # labels and silently produce NaN ids when its index is not 0..n-1.
    id_values = np.asarray(ids)
    if len(id_values) != len(df_sub):
        raise ValueError(
            f"got {len(id_values)} ids for {len(df_sub)} forecast rows"
        )
    df_sub.insert(0, 'id', id_values)
    return df_sub
    

In [None]:
# Features and ids for the validation window (d_1914 - d_1941).
val_test_data,val_ids = create_test_data(ds=1914,de=1941)

In [None]:
# Features for the evaluation window (d_1942 - d_1969). The returned ids are read
# from sales_train_validation.csv, so they still need changing from validation
# to evaluation.
eval_test_data, eval_ids = create_test_data(ds=1942,de=1969)

In [None]:
# Swap 'validation' for 'evaluation' in every id.
eval_ids = eval_ids.str.replace('validation','evaluation')

In [None]:
# Predict both 28-day windows with the trained model, then preview the validation rows.
df_val_sub = create_28_day_test_submission(val_test_data,val_ids,model)
df_eval_sub = create_28_day_test_submission(eval_test_data,eval_ids,model)
df_val_sub.head()

In [None]:
# Stack the validation rows on top of the evaluation rows. ignore_index gives the
# combined frame a fresh 0..n-1 index instead of repeating each part's labels.
final_submission = pd.concat([df_val_sub,df_eval_sub],ignore_index=True)

In [None]:
# Write the submission without the index column, so the file header is id,F1,...,F28.
final_submission.to_csv('sub_1.csv',index=False)