In [1]:
# Mount Google Drive and switch to the notebook folder so that the CSV files
# read later can be referenced by bare file name.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/'My Drive'/'Colab Notebooks'/

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks


In [0]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import lightgbm as lgb
#import dask_xgboost as xgb
#import dask.dataframe as dd
from sklearn import preprocessing, metrics
from sklearn.preprocessing import LabelEncoder
import gc
import os
from tqdm import tqdm, tqdm_notebook
from scipy.sparse import csr_matrix


In [0]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched. The frame is modified
    in place and also returned so the function can be used with ``.pipe``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated).
    verbose : bool, default True
        Print the resulting memory footprint and the relative reduction.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. float16 keeps only about
        three significant decimal digits, so pass ``False`` when the rounding
        of values such as prices matters; float32 is then the narrowest choice.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose extreme equals the dtype
            # limit (e.g. 127 for int8) still fits in that dtype.
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of range for the narrower floats, or min/max are NaN
                # (empty / all-missing column): keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that occupied no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [0]:
def read_data(data_dir='.'):
    """Load the four competition CSV files.

    Parameters
    ----------
    data_dir : str, default '.'
        Directory containing ``calendar.csv``, ``sell_prices.csv``,
        ``sales_train_validation.csv`` and ``sample_submission.csv``. The
        default keeps the original behaviour of reading from the current
        working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(calendar, sell_prices, sales_train_validation, submission)``.
        The calendar and sell-price frames have already been downcast with
        ``reduce_mem_usage``; the other two are returned as read.
    """
    print('Reading files...')
    calendar = reduce_mem_usage(pd.read_csv(os.path.join(data_dir, 'calendar.csv')))
    print('Calendar has {} rows and {} columns'.format(calendar.shape[0], calendar.shape[1]))
    sell_prices = reduce_mem_usage(pd.read_csv(os.path.join(data_dir, 'sell_prices.csv')))
    print('Sell prices has {} rows and {} columns'.format(sell_prices.shape[0], sell_prices.shape[1]))
    sales_train_validation_df = pd.read_csv(os.path.join(data_dir, 'sales_train_validation.csv'))
    print('Sales train validation has {} rows and {} columns'.format(sales_train_validation_df.shape[0], sales_train_validation_df.shape[1]))
    submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
    return calendar, sell_prices, sales_train_validation_df, submission

In [0]:
import IPython

def display(*dfs, head=True):
    """Render each given frame with IPython's rich display.

    When ``head`` is true (the default) only ``frame.head()`` is shown;
    otherwise the object is displayed as passed.
    """
    for frame in dfs:
        shown = frame.head() if head else frame
        IPython.display.display(shown)

In [6]:
# Load the raw competition tables into notebook-level frames.
calendar_df, sell_prices_df, sales_train_validation_df, submission = read_data()

Reading files...
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar has 1969 rows and 14 columns
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices has 6841121 rows and 4 columns
Sales train validation has 30490 rows and 1919 columns


In [0]:
# One row per series in the wide sales table.
NUM_ITEMS = sales_train_validation_df.shape[0]  # 30490
# Forecast horizon in days. NOTE(review): not referenced by the cells shown here.
DAYS_PRED = 28
# Number of trailing melted sales rows to keep: two years of days for every series.
nrows = 365 * 2 * NUM_ITEMS

In [8]:
def encode_categorical(df, cols):
    """Label-encode the given columns of ``df`` in place, keeping missing values missing.

    For every column in ``cols`` a fresh ``LabelEncoder`` is fitted on the
    non-null entries only; rows that were null receive no code and therefore
    stay NaN after the index-aligned assignment. Returns ``df``.
    """
    for name in cols:
        present = df[name].dropna()
        encoder = preprocessing.LabelEncoder()
        codes = encoder.fit_transform(present)
        # Assigning a Series aligns on the index, so null rows remain NaN.
        df[name] = pd.Series(codes, index=present.index)

    return df


# Replace string categories by integer codes and downcast the result.
# NOTE(review): item_id / store_id are encoded independently for the sales and
# the price frames; the later merge on these keys presumably relies on both
# frames containing the same set of labels - confirm.
calendar_df = encode_categorical(calendar_df, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]).pipe(reduce_mem_usage)
sales_train_validation_df = encode_categorical(sales_train_validation_df, ["item_id", "dept_id", "cat_id", "store_id", "state_id"]).pipe(reduce_mem_usage)
sell_prices_df = encode_categorical(sell_prices_df, ["item_id", "store_id"]).pipe(reduce_mem_usage)

Mem. usage decreased to  0.08 Mb (36.9% reduction)
Mem. usage decreased to 94.01 Mb (78.9% reduction)
Mem. usage decreased to 45.67 Mb (65.0% reduction)


In [0]:
def zapsmall(z, tol=1e-6):
    """Set every entry of ``z`` whose magnitude is below ``tol`` to 0, in place, and return ``z``."""
    negligible = abs(z) < tol
    z[negligible] = 0
    return z
    
def prep_selling_prices(df):
    """Add price-dynamics features per (store_id, item_id) to ``df`` and return it.

    New columns (all stored, together with ``sell_price``, as float32):

    * ``sell_price_rel_diff`` - relative change versus the previous row of the
      same store/item group.
    * ``sell_price_cumrel`` - position of the price between the running
      minimum and running maximum of its group, with 1 added to the
      denominator.
    * ``sell_price_roll_sd7`` - standard deviation over a window of 7 rows
      within the group, with near-zero values snapped to 0.

    Rows are processed in their existing order; presumably the frame is
    ordered by week within each group - verify against the caller.
    """
    price_by_series = df.groupby(["store_id", "item_id"])["sell_price"]

    df["sell_price_rel_diff"] = price_by_series.pct_change()

    running_min = price_by_series.cummin()
    running_max = price_by_series.cummax()
    # shift(0) yields the group's own prices aligned to df's index.
    current_price = price_by_series.shift(0)
    df["sell_price_cumrel"] = (current_price - running_min) / (1 + running_max - running_min)

    rolling_sd = price_by_series.transform(lambda prices: prices.rolling(7).std())
    df["sell_price_roll_sd7"] = zapsmall(rolling_sd)

    float_cols = ["sell_price", "sell_price_rel_diff", "sell_price_cumrel", "sell_price_roll_sd7"]
    df[float_cols] = df[float_cols].astype("float32")

    return df

# Add the per-store/item price features to the price table.
sell_prices_df = prep_selling_prices(sell_prices_df)

In [0]:
def melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 55000000, merge = False):
    """Build the long-format modelling frame from the raw competition tables.

    Steps:

    1. Melt the wide sales table into one row per (id, day), downcast it and
       keep only the last ``nrows`` rows.
    2. Split ``submission`` into its 'validation' and 'evaluation' rows, rename
       their 28 forecast columns to day labels (``d_1914``..``d_1941`` and
       ``d_1942``..``d_1969``), attach the product attributes and melt them too.
    3. Tag the three pieces with a ``part`` column ('train', 'test1', 'test2'),
       concatenate them, then drop the 'test2' rows again.
    4. If ``merge`` is true, left-join the calendar on the day label and the
       prices on (store_id, item_id, wm_yr_wk).

    Parameters
    ----------
    calendar : pandas.DataFrame
        Calendar table. It is not modified: the ``weekday``, ``wday``,
        ``month`` and ``year`` columns are dropped from a copy, so the cell
        calling this function can be re-run.
    sell_prices : pandas.DataFrame
        Price table with ``store_id``, ``item_id`` and ``wm_yr_wk`` keys.
    sales_train_validation : pandas.DataFrame
        Wide sales table: six id columns followed by one column per day.
    submission : pandas.DataFrame
        Sample submission: ``id`` plus 28 forecast columns.
    nrows : int, default 55000000
        Number of trailing melted sales rows to keep.
    merge : bool, default False
        Whether to join calendar and price information.

    Returns
    -------
    pandas.DataFrame
        The 'train' and 'test1' rows, with calendar/price columns if merged.
    """
    id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    horizon = 28
    first_validation_day = 1914
    first_evaluation_day = first_validation_day + horizon

    # melt sales data, get it ready for training
    sales_train_validation = pd.melt(sales_train_validation, id_vars = id_cols, var_name = 'day', value_name = 'demand')
    print('Melted sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
    sales_train_validation = reduce_mem_usage(sales_train_validation)

    # Copy the slice so the 'part' column added below is written to an
    # independent frame rather than to a view of the full melted table.
    sales_train_validation = sales_train_validation.iloc[-nrows:, :].copy()

    # separate test dataframes (copies, because their columns and ids are edited)
    test1 = submission[submission['id'].str.contains('validation', regex = False, na = False)].copy()
    test2 = submission[submission['id'].str.contains('evaluation', regex = False, na = False)].copy()

    # change column names from F1..F28 to the day labels they stand for
    test1.columns = ['id'] + ['d_{}'.format(day) for day in range(first_validation_day, first_evaluation_day)]
    test2.columns = ['id'] + ['d_{}'.format(day) for day in range(first_evaluation_day, first_evaluation_day + horizon)]

    # get product table
    product = sales_train_validation[id_cols].drop_duplicates()

    # merge with product table; evaluation ids are temporarily renamed so they
    # match the '_validation' ids present in the product table
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation', regex = False)
    test1 = test1.merge(product, how = 'left', on = 'id')
    test2 = test2.merge(product, how = 'left', on = 'id')
    test2['id'] = test2['id'].str.replace('_validation', '_evaluation', regex = False)

    # long format, same layout as the training rows
    test1 = pd.melt(test1, id_vars = id_cols, var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = id_cols, var_name = 'day', value_name = 'demand')

    sales_train_validation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'

    data = pd.concat([sales_train_validation, test1, test2], axis = 0)

    del sales_train_validation, test1, test2

    print(data.shape)

    # delete test2 for now
    data = data[data['part'] != 'test2']

    if merge:
        # drop some calendar features (on a copy: the caller's frame stays intact)
        calendar = calendar.drop(columns = ['weekday', 'wday', 'month', 'year'])
        # notebook crash with the entire dataset (maybe use tensorflow, dask, pyspark)
        data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
        data = data.drop(columns = ['d', 'day'])
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

    gc.collect()

    return data


In [11]:
# Show how many melted sales rows will be kept.
nrows

22257700

In [12]:
# Build the long-format modelling frame from the last `nrows` melted sales rows,
# joined with the calendar and price tables (merge=True).
# A different window size that was tried earlier: nrows = 27500000.
data = melt_and_merge(calendar_df, sell_prices_df, sales_train_validation_df, submission, nrows = nrows, merge = True)

Melted sales train validation has 58327370 rows and 8 columns
Mem. usage decreased to 1335.01 Mb (0.0% reduction)
(23965140, 9)
Our final dataset to train has 23111420 rows and 21 columns


In [13]:
# Preview the merged frame.
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,sell_price_rel_diff,sell_price_cumrel,sell_price_roll_sd7
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,0,train,2014-04-26,11413,,,,,0,0,0,8.257812,0.0,0.0,0.0
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,0,train,2014-04-26,11413,,,,,0,0,0,3.970703,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,0,train,2014-04-26,11413,,,,,0,0,0,2.970703,0.0,0.0,0.0
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,1,train,2014-04-26,11413,,,,,0,0,0,4.640625,0.0,0.231201,0.0
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,0,train,2014-04-26,11413,,,,,0,0,0,3.080078,0.0,0.375244,0.0


In [0]:
# Running total of demand per series id, accumulated in row order.
data['demand_total'] = data.groupby('id')['demand'].cumsum()
# Drop training rows whose running total is still zero - presumably the days
# before a series' first recorded sale (assumes rows are in chronological
# order within each id; confirm). Rows of other parts are always kept.
data = data[~((data['demand_total']==0)&(data['part']=='train'))]

In [15]:
def simple_fe(data):
    """Add lagged-demand statistics and calendar features to ``data``.

    The frame is modified in place and returned. All demand features are
    computed per ``id`` on demand shifted by 28 rows, so each value only uses
    observations at least 28 rows older than the row it is attached to:

    * ``lag_t28`` - the shifted demand itself.
    * ``rolling_mean_t{7,15,30,50,70,90,120,150,180}`` - rolling means.
    * ``rolling_std_t7``, ``rolling_std_t30``, ``rolling_skew_t30``,
      ``rolling_kurt_t30`` - rolling dispersion / shape statistics.

    ``date`` is converted to datetime and expanded into ``year`` (int16) and
    ``quarter``, ``month``, ``week``, ``day``, ``dayofweek`` plus the
    ``is_*_start`` / ``is_*_end`` flags (int8). ``is_weekend`` is 1 when
    ``dayofweek`` is 5 or 6.
    """
    lag = 28
    # One groupby is enough: every feature below is a transform of the same
    # per-id demand series.
    demand_by_id = data.groupby(['id'])['demand']

    def _lagged_rolling(window, stat):
        # Shift first so the window never includes the most recent `lag` rows.
        return lambda x: getattr(x.shift(lag).rolling(window), stat)()

    data['lag_t28'] = demand_by_id.transform(lambda x: x.shift(lag))
    for window in (7, 15, 30, 50, 70, 90, 120, 150, 180):
        data['rolling_mean_t{}'.format(window)] = demand_by_id.transform(_lagged_rolling(window, 'mean'))
    for stat, window in (('std', 7), ('std', 30), ('skew', 30), ('kurt', 30)):
        data['rolling_{}_t{}'.format(stat, window)] = demand_by_id.transform(_lagged_rolling(window, stat))

    # time features
    data['date'] = pd.to_datetime(data['date'])
    attrs = ["year", "quarter", "month", "week", "day", "dayofweek", "is_year_end", "is_year_start", "is_quarter_end",
        "is_quarter_start", "is_month_end", "is_month_start",
    ]

    dates = data['date'].dt
    for attr in attrs:
        dtype = np.int16 if attr == "year" else np.int8
        if attr == "week" and hasattr(dates, "isocalendar"):
            # `Series.dt.week` was removed from recent pandas; isocalendar()
            # provides the same ISO week number where it is available.
            values = dates.isocalendar().week
        else:
            values = getattr(dates, attr)
        data[attr] = values.astype(dtype)
    data["is_weekend"] = data["dayofweek"].isin([5, 6]).astype(np.int8)

    return data

# Add the demand-lag and calendar features, then downcast the enlarged frame.
data = simple_fe(data)
data = reduce_mem_usage(data)

Mem. usage decreased to 2239.11 Mb (48.8% reduction)


In [16]:
# List the columns available after feature engineering.
data.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'demand',
       'part', 'date', 'wm_yr_wk', 'event_name_1', 'event_type_1',
       'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'sell_price_rel_diff', 'sell_price_cumrel',
       'sell_price_roll_sd7', 'demand_total', 'lag_t28', 'rolling_mean_t7',
       'rolling_mean_t15', 'rolling_mean_t30', 'rolling_mean_t50',
       'rolling_mean_t70', 'rolling_mean_t90', 'rolling_mean_t120',
       'rolling_mean_t150', 'rolling_mean_t180', 'rolling_std_t7',
       'rolling_std_t30', 'rolling_skew_t30', 'rolling_kurt_t30', 'year',
       'quarter', 'month', 'week', 'day', 'dayofweek', 'is_year_end',
       'is_year_start', 'is_quarter_end', 'is_quarter_start', 'is_month_end',
       'is_month_start', 'is_weekend'],
      dtype='object')

In [0]:
def run_lgb(data, feature_cols=None, demand_scale=1.04):
    """Train a LightGBM model on ``data`` and predict the rows after the validation window.

    The split is by ``date``: rows up to 2016-03-27 are used for training, the
    following rows up to 2016-04-24 for validation / early stopping, and every
    later row is the prediction target.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing ``date``, ``demand`` and the feature columns.
    feature_cols : list of str, optional
        Columns fed to the model. Defaults to the notebook-level ``features``
        list, which is what this function always used before.
    demand_scale : float, default 1.04
        Multiplier applied to the raw predictions before they are stored.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The prediction rows with ``demand`` replaced by the scaled forecast,
        and a one-column ``importance`` frame indexed by feature name (as
        returned by ``model.feature_importance()`` with its default settings).
    """
    if feature_cols is None:
        feature_cols = features

    # going to evaluate with the last 28 days
    x_train = data[data['date'] <= '2016-03-27']
    y_train = x_train['demand']
    x_val = data[(data['date'] > '2016-03-27') & (data['date'] <= '2016-04-24')]
    y_val = x_val['demand']
    # Independent copy: its 'demand' column is overwritten with predictions below.
    test = data[(data['date'] > '2016-04-24')].copy()
    # Only releases this function's reference; the caller's frame is unaffected.
    del data
    gc.collect()

    params = {
    'metric': 'rmse',
    'objective': 'poisson',
    'seed': 20,
    'learning_rate': 0.075,
    'lambda': 0.1,
    'num_leaves': 63,
    'bagging_fraction': 0.66,
    'bagging_freq': 1,
    'colsample_bytree': 0.77
    }
    train_set = lgb.Dataset(x_train[feature_cols], y_train)
    val_set = lgb.Dataset(x_val[feature_cols], y_val)

    del x_train, y_train

    model = lgb.train(params, train_set, num_boost_round = 2000, early_stopping_rounds = 50, valid_sets = [train_set, val_set], verbose_eval = 30)

    val_pred = model.predict(x_val[feature_cols], num_iteration=model.best_iteration)
    # sklearn's signature is (y_true, y_pred).
    val_score = np.sqrt(metrics.mean_squared_error(y_val, val_pred))
    importance = pd.DataFrame(model.feature_importance(), index=feature_cols, columns=['importance'])
    print(f'Our val rmse score is {val_score}')
    pred = model.predict(test[feature_cols], num_iteration=model.best_iteration)
    test['demand'] = pred * demand_scale
    return test, importance


def predict(test, submission, output_path='submission53.csv'):
    """Write a submission file from long-format forecasts.

    ``test`` is pivoted to one row per ``id`` with the 28 forecast dates, in
    chronological order, renamed ``F1``..``F28``. Ids of ``submission`` that
    have a forecast are written first, followed by the 'evaluation' rows of
    ``submission`` unchanged.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain ``id``, ``date`` and ``demand``.
    submission : pandas.DataFrame
        Sample submission (``id`` plus ``F1``..``F28``).
    output_path : str, default 'submission53.csv'
        Where the CSV is written.

    Raises
    ------
    ValueError
        If ``test`` does not cover exactly 28 distinct dates.
    """
    horizon = 28
    predictions = pd.pivot(test[['id', 'date', 'demand']], index = 'id', columns = 'date', values = 'demand').reset_index()
    n_days = predictions.shape[1] - 1
    if n_days != horizon:
        raise ValueError('expected {} forecast dates, found {}'.format(horizon, n_days))
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(horizon)]

    # Evaluation rows are passed through exactly as they are in `submission`.
    evaluation = submission[submission['id'].str.contains('evaluation', regex = False, na = False)]

    # Inner join: keeps only the submission ids for which a forecast exists.
    validation = submission[['id']].merge(predictions, on = 'id')
    final = pd.concat([validation, evaluation])

    final.to_csv(output_path, index = False)
    

In [18]:
# Preview the frame after feature engineering.
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,sell_price_rel_diff,sell_price_cumrel,sell_price_roll_sd7,demand_total,lag_t28,rolling_mean_t7,rolling_mean_t15,rolling_mean_t30,rolling_mean_t50,rolling_mean_t70,rolling_mean_t90,rolling_mean_t120,rolling_mean_t150,rolling_mean_t180,rolling_std_t7,rolling_std_t30,rolling_skew_t30,rolling_kurt_t30,year,quarter,month,week,day,dayofweek,is_year_end,is_year_start,is_quarter_end,is_quarter_start,is_month_end,is_month_start,is_weekend
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,1,train,2014-04-26,11413,,,,,0,0,0,4.640625,0.0,0.231201,0.0,1,,,,,,,,,,,,,,,2014,2,4,17,26,5,0,0,0,0,0,0,1
7,HOBBIES_1_008_CA_1_validation,1444,3,1,0,0,7,train,2014-04-26,11413,,,,,0,0,0,0.459961,0.0,0.037079,0.0,7,,,,,,,,,,,,,,,2014,2,4,17,26,5,0,0,0,0,0,0,1
9,HOBBIES_1_010_CA_1_validation,1446,3,1,0,0,2,train,2014-04-26,11413,,,,,0,0,0,2.970703,0.0,0.0,0.0,2,,,,,,,,,,,,,,,2014,2,4,17,26,5,0,0,0,0,0,0,1
12,HOBBIES_1_013_CA_1_validation,1449,3,1,0,0,1,train,2014-04-26,11413,,,,,0,0,0,1.969727,0.0,0.0,0.0,1,,,,,,,,,,,,,,,2014,2,4,17,26,5,0,0,0,0,0,0,1
14,HOBBIES_1_015_CA_1_validation,1451,3,1,0,0,11,train,2014-04-26,11413,,,,,0,0,0,0.700195,0.0,0.019226,0.0,11,,,,,,,,,,,,,,,2014,2,4,17,26,5,0,0,0,0,0,0,1


In [0]:
# Columns passed to LightGBM by run_lgb. Every name listed here must exist in
# `data` before training; note that 'holidays' is not produced by simple_fe
# and has to be added to `data` separately.
features = ["item_id","dept_id",
    "cat_id","store_id","state_id",
    "event_name_1","event_type_1","snap_CA",
    "snap_TX","snap_WI","sell_price", "sell_price_rel_diff", "sell_price_cumrel", "sell_price_roll_sd7",
    # demand features.
     "lag_t28", "rolling_mean_t7", "rolling_mean_t15", "rolling_mean_t30", 
     "rolling_mean_t50", "rolling_mean_t70", "rolling_mean_t90", 
     "rolling_mean_t120", "rolling_mean_t150", "rolling_mean_t180",
     "rolling_std_t7", "rolling_std_t30",
     "rolling_skew_t30","rolling_kurt_t30",
     # calendar features; the flags in the trailing comments are computed but not used.
       'year', 'quarter', 'month', 'week', 'day', 'dayofweek', 'holidays'#, 'is_year_end',
     #  'is_year_start', 'is_quarter_end', 'is_quarter_start', 'is_month_end',
     #  'is_month_start', 'is_weekend'
       ]

In [0]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# Add `holidays`: 1 when the row's date is a US federal holiday or falls on a
# Saturday/Sunday, else 0.
cal = calendar()
# Use the true date range instead of the first/last row so the result does
# not depend on `data` being sorted by date.
holidays = cal.holidays(start=data['date'].min(), end=data['date'].max())
data['holidays'] = (
    data['date'].isin(holidays) | (data['date'].dt.weekday >= 5)
).astype(np.int8)

In [0]:
# Split the frame by the category name embedded in the series id, so that a
# separate model can be trained on each subset.
data_hobby = data[data['id'].str.contains("HOBBIES")]
data_house = data[data['id'].str.contains("HOUSE")]
data_foods = data[data['id'].str.contains("FOODS")]

In [22]:
# Train and predict on the rows whose id contains "HOBBIES".
test_hobby, importance_hobby = run_lgb(data_hobby)

Training until validation scores don't improve for 50 rounds.
[30]	training's rmse: 1.74969	valid_1's rmse: 1.70229
[60]	training's rmse: 1.70835	valid_1's rmse: 1.66492
[90]	training's rmse: 1.69748	valid_1's rmse: 1.66097
[120]	training's rmse: 1.6881	valid_1's rmse: 1.65958
[150]	training's rmse: 1.67999	valid_1's rmse: 1.65886
[180]	training's rmse: 1.67197	valid_1's rmse: 1.65793
[210]	training's rmse: 1.66265	valid_1's rmse: 1.65788
[240]	training's rmse: 1.65501	valid_1's rmse: 1.65701
[270]	training's rmse: 1.64651	valid_1's rmse: 1.65727
[300]	training's rmse: 1.63989	valid_1's rmse: 1.65673
[330]	training's rmse: 1.63341	valid_1's rmse: 1.65714
Early stopping, best iteration is:
[290]	training's rmse: 1.64246	valid_1's rmse: 1.65657
Our val rmse score is 1.656565355722221


In [23]:
# Train and predict on the rows whose id contains "HOUSE".
test_house, importance_house = run_lgb(data_house)

Training until validation scores don't improve for 50 rounds.
[30]	training's rmse: 1.564	valid_1's rmse: 1.54768
[60]	training's rmse: 1.47977	valid_1's rmse: 1.47768
[90]	training's rmse: 1.45887	valid_1's rmse: 1.47011
[120]	training's rmse: 1.44718	valid_1's rmse: 1.46721
[150]	training's rmse: 1.4395	valid_1's rmse: 1.46581
[180]	training's rmse: 1.43235	valid_1's rmse: 1.4644
[210]	training's rmse: 1.42598	valid_1's rmse: 1.46377
[240]	training's rmse: 1.41985	valid_1's rmse: 1.46343
[270]	training's rmse: 1.41308	valid_1's rmse: 1.46228
[300]	training's rmse: 1.40611	valid_1's rmse: 1.46208
[330]	training's rmse: 1.40151	valid_1's rmse: 1.46118
[360]	training's rmse: 1.39658	valid_1's rmse: 1.46064
[390]	training's rmse: 1.39165	valid_1's rmse: 1.45995
[420]	training's rmse: 1.38768	valid_1's rmse: 1.45938
[450]	training's rmse: 1.38393	valid_1's rmse: 1.4592
[480]	training's rmse: 1.37958	valid_1's rmse: 1.45859
[510]	training's rmse: 1.37724	valid_1's rmse: 1.45817
[540]	train

In [24]:
# Train and predict on the rows whose id contains "FOODS".
test_foods, importance_foods = run_lgb(data_foods)

Training until validation scores don't improve for 50 rounds.
[30]	training's rmse: 3.32398	valid_1's rmse: 2.97718
[60]	training's rmse: 3.10912	valid_1's rmse: 2.73235
[90]	training's rmse: 3.05827	valid_1's rmse: 2.69065
[120]	training's rmse: 3.02139	valid_1's rmse: 2.68017
[150]	training's rmse: 2.99519	valid_1's rmse: 2.66993
[180]	training's rmse: 2.97071	valid_1's rmse: 2.66482
[210]	training's rmse: 2.94777	valid_1's rmse: 2.65838
[240]	training's rmse: 2.927	valid_1's rmse: 2.65881
[270]	training's rmse: 2.90994	valid_1's rmse: 2.65553
[300]	training's rmse: 2.88845	valid_1's rmse: 2.6504
[330]	training's rmse: 2.8756	valid_1's rmse: 2.6515
[360]	training's rmse: 2.85842	valid_1's rmse: 2.64967
[390]	training's rmse: 2.84598	valid_1's rmse: 2.64644
[420]	training's rmse: 2.83233	valid_1's rmse: 2.64561
[450]	training's rmse: 2.82085	valid_1's rmse: 2.64552
Early stopping, best iteration is:
[404]	training's rmse: 2.83977	valid_1's rmse: 2.64479
Our val rmse score is 2.6447854

In [0]:
# Stack the three per-category prediction frames. Each frame is filtered once
# more by the substring its input subset was selected with.
test = pd.concat([test_hobby[test_hobby['id'].str.contains("HOBBIES")], test_house[test_house['id'].str.contains("HOUSE")], test_foods[test_foods['id'].str.contains("FOODS")]])

In [0]:
# Pivot the combined forecasts and write the submission CSV.
predict(test, submission)

In [79]:
# Sanity check: number of prediction rows and columns.
test.shape

(853720, 49)

In [80]:
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth

# Download the Kaggle API key (kaggle.json) from Google Drive to the location
# /root/.kaggle/kaggle.json used by the Kaggle CLI invoked later.
auth.authenticate_user()

drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
if not kaggle_api_key:
    # Fail with a clear message instead of an IndexError further down.
    raise FileNotFoundError("kaggle.json was not found in Google Drive")

filename = "/root/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
# The context manager closes (and thereby flushes) the file once the download ends.
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))
# Owner read/write only. The mode must be octal: decimal 600 is 0o1130.
os.chmod(filename, 0o600)

Download 100%.


In [81]:
# Restrict the API key file to owner read/write, then upload the submission
# file to the m5-forecasting-accuracy competition with the Kaggle CLI.
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions submit -c m5-forecasting-accuracy -f submission53.csv -m "ver7"

100% 20.5M/20.5M [00:00<00:00, 41.8MB/s]
Successfully submitted to M5 Forecasting - Accuracy