In [None]:
import math
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression,LinearRegression,BayesianRidge, Lasso
from statistics import mean
from math import sqrt



import gc
import datetime
import warnings
from tqdm import tqdm
from pathlib import Path
import time
from copy import deepcopy
import os

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    For each column whose dtype is one of int16/32/64 or float16/32/64 the
    column's min and max are compared (strictly) against the limits of
    progressively wider candidate dtypes, and the column is cast to the first
    candidate whose range strictly contains both. Float columns that fit
    neither float16 nor float32 are cast to float64.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print the final memory footprint and the
            percentage saved.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    bytes_per_mb = 1024**2

    start_mem = df.memory_usage().sum() / bytes_per_mb
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type).startswith('int'):
            # Narrowest integer type first; bounds are exclusive on both ends.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this also covers an all-NaN column,
                # where every comparison is False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
def read_data():
    """Load the four input CSV files from the current working directory.

    ``calendar.csv`` and ``sell_prices.csv`` are passed through
    ``reduce_mem_usage``; ``sales_train_validation.csv`` and
    ``sample_submission.csv`` are returned as read. The shape of the first
    three tables is printed.

    Returns:
        Tuple ``(calendar, sell_prices, sales_train_validation, submission)``.
    """
    calendar = reduce_mem_usage(pd.read_csv('calendar.csv'))
    print('Calendar has {} rows and {} columns'.format(*calendar.shape))

    sell_prices = reduce_mem_usage(pd.read_csv('sell_prices.csv'))
    print('Sell prices has {} rows and {} columns'.format(*sell_prices.shape))

    sales_train_validation = pd.read_csv('sales_train_validation.csv')
    print('Sales train validation has {} rows and {} columns'.format(*sales_train_validation.shape))

    submission = pd.read_csv('sample_submission.csv')
    return calendar, sell_prices, sales_train_validation, submission

In [None]:
def melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 55000000, merge = False):
    """Build the long-format modelling frame from the raw tables.

    Steps:
      1. Melt the wide sales table to one row per (id, day) with a ``demand``
         column, keeping only the rows from position ``nrows`` onwards.
      2. Split ``submission`` into its ``validation`` and ``evaluation`` rows,
         rename their 28 forecast columns to ``d_1914..d_1941`` and
         ``d_1942..d_1969`` respectively, attach the product attributes and
         melt them the same way.
      3. Concatenate train + validation rows (the evaluation rows are built
         but dropped for now) and tag each row with a ``part`` label.
      4. If ``merge`` is true, join the calendar (on ``day`` == ``d``) and the
         sell prices (on store_id, item_id, wm_yr_wk).

    Args:
        calendar: Calendar table; must contain ``weekday``, ``wday``,
            ``month`` and ``year`` columns (they are dropped from a copy; the
            caller's frame is left untouched so the cell can be re-run).
        sell_prices: Price table joined on store_id / item_id / wm_yr_wk.
        sales_train_validation: Wide sales table with the six id columns
            followed by one column per day.
        submission: Sample submission with an ``id`` column followed by 28
            forecast columns.
        nrows: Number of leading melted training rows to discard.
        merge: Whether to join calendar and price information.

    Returns:
        The long-format DataFrame.
    """
    id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # Product attributes, taken from the wide table (one row per id) rather
    # than from the much larger melted frame.
    product = sales_train_validation[id_cols].drop_duplicates()

    # melt sales data, get it ready for training
    sales_train_validation = pd.melt(sales_train_validation, id_vars = id_cols, var_name = 'day', value_name = 'demand')
    print('Melted sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
    sales_train_validation = reduce_mem_usage(sales_train_validation)

    # separate test dataframes; copy so the edits below do not touch `submission`
    test1 = submission[submission['id'].str.contains('validation', regex = False)].copy()
    test2 = submission[submission['id'].str.contains('evaluation', regex = False)].copy()

    # change column names
    test1.columns = ['id'] + [f'd_{d}' for d in range(1914, 1942)]
    test2.columns = ['id'] + [f'd_{d}' for d in range(1942, 1970)]

    # merge with product table; the product ids carry the '_validation'
    # suffix, so the evaluation ids are renamed for the join and restored after
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation', regex = False)
    test1 = test1.merge(product, how = 'left', on = 'id')
    test2 = test2.merge(product, how = 'left', on = 'id')
    test2['id'] = test2['id'].str.replace('_validation', '_evaluation', regex = False)

    test1 = pd.melt(test1, id_vars = id_cols, var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = id_cols, var_name = 'day', value_name = 'demand')

    # Discard the first `nrows` training rows *before* concatenating. Slicing
    # the concatenated frame by label is fragile because the three parts share
    # index labels.
    sales_train_validation = sales_train_validation.iloc[nrows:]

    sales_train_validation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'

    data = pd.concat([sales_train_validation, test1, test2], axis = 0)

    del sales_train_validation, test1, test2

    # Work on a copy so the caller's calendar keeps its columns.
    calendar = calendar.drop(columns = ['weekday', 'wday', 'month', 'year'])

    # delete test2 for now, don't delete when we do next stage of testing in June
    data = data[data['part'] != 'test2']

    if merge:
        data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
        data.drop(['day'], inplace = True, axis = 1)
        data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

    return data

In [None]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the session.
pd.options.mode.chained_assignment = None 

# Read the four raw CSV tables.
calendar, sell_prices, sales_train_validation, submission = read_data()

# Build the long-format frame, skipping the first 35,000,000 melted training
# rows and joining calendar and price information.
data = melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 35000000, merge = True)

In [None]:
import datetime
def truncate(data, start_date):
    """Keep only the rows dated strictly after ``start_date``.

    The ``date`` column of ``data`` is converted to datetime in place before
    filtering, so the caller's frame is modified as a side effect.

    Args:
        data: DataFrame with a ``date`` column parseable by ``pd.to_datetime``.
        start_date: Exclusive lower bound (anything comparable with a
            datetime column, e.g. a ``'YYYY-MM-DD'`` string).

    Returns:
        The subset of ``data`` with ``date > start_date``.
    """
    data["date"] = pd.to_datetime(data["date"])
    return data.loc[data["date"] > start_date]

In [None]:
# The raw tables are no longer referenced after `data` has been built; drop
# the names and ask the garbage collector to run.
del calendar, sell_prices, sales_train_validation, submission
gc.collect()

In [None]:
from sklearn.preprocessing import LabelEncoder

def transform(data):
    """Encode categorical columns and derive calendar features.

    * Missing values in the four event columns are replaced by ``'unknown'``.
    * The id and event columns are label-encoded (a fresh ``LabelEncoder`` per
      column, so the integer codes are only meaningful within this frame).
    * ``year, month, quarter, week, day, dayofweek, dayofyear`` are derived
      from ``date`` as int16 columns.
    * The result is passed through ``reduce_mem_usage``.

    Args:
        data: DataFrame containing the id, event and ``date`` columns.
            Columns are assigned on this object.

    Returns:
        The transformed DataFrame.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the result back: chained `data[col].fillna(..., inplace=True)`
        # is deprecated and has no effect under pandas copy-on-write.
        data[feature] = data[feature].fillna('unknown')

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        encoder = LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])

    data['date'] = pd.to_datetime(data['date'])
    time_features = ['year', 'month', 'quarter', 'week', 'day', 'dayofweek', 'dayofyear']
    dtype = np.int16
    date_accessor = data['date'].dt
    for time_feature in time_features:
        if time_feature == 'week':
            # `Series.dt.week` was removed in pandas 2.0; the ISO calendar
            # week is its replacement.
            values = date_accessor.isocalendar().week
        else:
            values = getattr(date_accessor, time_feature)
        data[time_feature] = values.astype(dtype)

    data = reduce_mem_usage(data)

    return data

In [None]:
# Keep rows dated after 2014-04-01, then encode categoricals and add the
# calendar features.
data = truncate(data, '2014-04-01')
data = transform(data)

In [None]:
def get_weights(data, weights_path = "weight_scale_1914.csv"):
    """Attach a per-series ``scaled_weight`` column to ``data``.

    The weights file is filtered to its ``Level12`` rows. For those rows
    ``scaled_weight = weight / sqrt(scale)`` and a join key
    ``<Agg_Level_1>_<Agg_Level_2>_validation`` is built, which is left-joined
    onto ``data['id']``. The join key is kept in the result as ``combined``.

    Args:
        data: DataFrame with an ``id`` column.
        weights_path: CSV with columns ``Level_id, Agg_Level_1, Agg_Level_2,
            weight, scale``. Defaults to the file the notebook ships with.

    Returns:
        A new DataFrame: ``data`` plus ``combined`` and ``scaled_weight``
        (NaN where an id has no Level12 entry).
    """
    weights = pd.read_csv(weights_path)

    weights = weights[['Level_id', 'Agg_Level_1', 'Agg_Level_2', 'weight', 'scale']]
    # Copy the filtered rows so the new columns are added to an independent
    # frame instead of a slice of `weights`.
    level_12 = weights[weights.Level_id == 'Level12'].copy()

    level_12['scaled_weight'] = level_12['weight'] / np.sqrt(level_12['scale'])
    level_12['combined'] = level_12['Agg_Level_1'] + '_' + level_12['Agg_Level_2'] + '_validation'

    temp_weights = level_12[['combined', 'scaled_weight']]
    data = pd.merge(data, temp_weights, how = 'left', left_on = 'id', right_on = 'combined')

    del level_12, weights
    gc.collect()

    return data

In [None]:
# Add the per-series weight column, downcast again and collect garbage.
data = get_weights(data)
data = reduce_mem_usage(data)
gc.collect()

In [None]:
# Wide sales table for the evaluation period.
train_df = pd.read_csv('sales_train_evaluation.csv')
# Drop, in place, the columns at positions 6..1164.
# NOTE(review): presumably these are the earliest day columns, so that the
# remaining days line up with the truncated `data` frame — confirm.
train_df.drop(train_df.columns[6:1165], axis=1, inplace = True)
# Every remaining column whose name contains 'd_'.
d_cols = [col for col in train_df.columns if 'd_' in col]
# 2-D array (one row per row of train_df, one column per entry of d_cols)
# used as the input of the rolling-window features below.
rec = (train_df[d_cols].values)


def add_lags(grid_df, shift, num_series = 30490):
    """Add lagged demand, price and event columns to ``grid_df`` in place.

    Every lag is implemented as a positional row shift of
    ``num_series * (lag + shift)`` rows, so the function relies on ``grid_df``
    being ordered day by day with exactly ``num_series`` rows per day.

    Columns written (all float16):
      * ``lag_<i>`` and ``price_lag_<i>`` for i in 0..7 and 28;
      * ``event_{name,type}{1,2}_lag_<i>`` for i in (0, 7);
      * ``event_{name,type}{1,2}_future``: the event columns shifted
        *backwards* by 7 days (``shift`` is not applied to these).

    Args:
        grid_df: DataFrame with ``demand``, ``sell_price`` and the four
            ``event_*`` columns.
        shift: Extra number of days added to every lag.
        num_series: Number of rows per day.

    Returns:
        None; ``grid_df`` is modified in place.
    """
    # Imported locally: the notebook binds `time` to the module, so the bare
    # `time()` calls used previously were not callable.
    from time import perf_counter

    lags = [0, 1, 2, 3, 4, 5, 6, 7, 28]
    # output-name prefix -> source column
    event_sources = {
        'event_name1': 'event_name_1',
        'event_type1': 'event_type_1',
        'event_name2': 'event_name_2',
        'event_type2': 'event_type_2',
    }

    start_time = perf_counter()
    print( 72 * '#', '\nAdding lag columns')
    for i in lags:
        offset = num_series * (i + shift)
        grid_df[f'lag_{i}'] = grid_df['demand'].shift(offset).astype(np.float16)
        grid_df[f'price_lag_{i}'] = grid_df['sell_price'].shift(offset).astype(np.float16)

        if i in (0, 7):
            for prefix, source in event_sources.items():
                grid_df[f'{prefix}_lag_{i}'] = grid_df[source].shift(offset).astype(np.float16)

        if i == 7:
            # Look-ahead event features: negative shift pulls values from
            # 7 days later.
            for prefix, source in event_sources.items():
                grid_df[f'{prefix}_future'] = grid_df[source].shift(-num_series * i).astype(np.float16)

    print(f'Time: {(perf_counter() - start_time):.2f} seconds')
        
        
############################################################       
################# Rolling window columns ###################

def rolling_window(a, window):
    """Return a strided view of ``a`` with an extra trailing window axis.

    The last axis of length ``n`` is replaced by two axes of shape
    ``(n - window + 1, window)``; entry ``[..., i, :]`` is
    ``a[..., i:i + window]``. No data is copied.
    """
    last_axis_len = a.shape[-1]
    last_stride = a.strides[-1]
    view_shape = (*a.shape[:-1], last_axis_len - window + 1, window)
    # Stepping along the new window axis advances by one element of the
    # original last axis.
    view_strides = (*a.strides, last_stride)
    return np.lib.stride_tricks.as_strided(a, shape=view_shape, strides=view_strides)



def make_rolling_col(rw, window, function, weights = False, n_chunks = 10):
    """Reduce rolling windows to one flat, day-major, one-day-shifted column.

    ``function`` is applied over the last (window) axis of ``rw``. The last
    window of every series is dropped so that the value stored for a day only
    covers earlier days, the result is transposed to day-major order and
    flattened, and ``window`` days' worth of NaN are prepended for the days
    that have no complete window yet.

    Args:
        rw: Array of shape (n_series, n_windows, window), e.g. the output of
            ``rolling_window``.
        window: Window length in days.
        function: Reducer called as ``function(chunk, -1)`` or, when
            ``weights`` is true, ``function(chunk, -1, weights=...)``.
        weights: When true, use the built-in weight vector for ``window``
            (defined for 7, 14 and 28).
        n_chunks: ``rw`` is reduced in this many row chunks — presumably to
            bound peak memory; the result does not depend on it.

    Returns:
        float16 array of length ``n_series * (window + n_windows - 1)``.

    Raises:
        ValueError: If ``weights`` is true and no weight vector exists for
            ``window``.
    """
    window_weights = {
        7: [1, 1, 1, 1, 1, 1, 3],
        14: [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3],
        28: [1, 1, 1, 1, 1, 1, 1.2, 1, 1, 1, 1, 1, 1, 1.5, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3],
    }

    reducer_kwargs = {}
    if weights:
        if window not in window_weights:
            raise ValueError(f'no weight vector defined for window {window}')
        reducer_kwargs['weights'] = window_weights[window]

    n_series = rw.shape[0]
    # Never ask for more chunks than there are rows (avoids empty chunks).
    n_chunks = max(1, min(n_chunks, n_series))

    # Reduce exactly once, weighted or not.
    chunks = np.array_split(rw, n_chunks, axis=0)
    col = np.concatenate([function(chunk, -1, **reducer_kwargs) for chunk in chunks])

    # We need to take off the last column to get the rolling feature shifted
    # one day, then flatten day by day.
    col = col[:, :-1].T.reshape(-1,)

    # The new column must be prepended with NaNs to account for the first
    # `window` days, for which no shifted window exists.
    padding = np.full(n_series * window, np.nan)
    return np.append(padding, col).astype(np.float16)

    




def add_rolling_cols(df: pd.DataFrame, rec: np.ndarray, windows: list, weighted_windows, functions: list, function_names: list):
    """Adds rolling features to df.

    For every window in ``windows`` each function whose name is not
    ``'weighted_mean'`` produces a column
    ``shift_1_rolling_<name>_<window>``; for every window in
    ``weighted_windows`` the function named ``'weighted_mean'`` produces the
    weighted counterpart. Columns are computed from ``rec`` via
    ``rolling_window`` and ``make_rolling_col`` and assigned positionally, so
    ``rec`` has to cover the same series and days as ``df``.

    Args:
        df: Target frame, modified in place.
        rec: 2-D array of demand values (series x days).
        windows: Window lengths for the unweighted functions.
        weighted_windows: Window lengths for the weighted mean.
        functions: Reducers, parallel to ``function_names``.
        function_names: Names used in the generated column names.

    Returns:
        None.
    """
    # Imported locally: the notebook binds `time` to the module, so the bare
    # `time()` calls used previously were not callable.
    from time import perf_counter

    print( 72 * '#', '\nAdding rolling columns\n',  )
    start_time = perf_counter()
    named_functions = list(zip(functions, function_names))

    def _add(window_list, weighted):
        """Add the weighted (or the unweighted) columns for each window."""
        for window in window_list:
            rw = rolling_window(rec, window)
            for function, name in named_functions:
                if (name == 'weighted_mean') != weighted:
                    continue
                s_time = perf_counter()
                df[f'shift_1_rolling_{name}_{str(window)}'] = make_rolling_col(rw, window, function, weights = weighted)
                print(f'{name} with window {window} time: {(perf_counter() - s_time):.2f} seconds')

    _add(windows, False)
    _add(weighted_windows, True)

    print(f'Total time for rolling cols: {(perf_counter() - start_time)/60:.2f}')
    
    
    
############################################################       
################# Shifting function ########################
def add_shift_cols(grid_df, shifts, cols, num_series=30490):
    """Derive ``shift_<n>`` variants of existing ``shift_1`` columns in place.

    For every ``shift`` and every column name in ``cols`` a column is written
    whose name has ``'shift_1'`` replaced by ``'shift_<shift>'`` and whose
    values are the source column moved down by ``(shift - 1) * num_series``
    rows (i.e. ``shift - 1`` further days when the frame holds ``num_series``
    rows per day). With ``shift == 1`` the source column is rewritten
    unchanged.

    Args:
        grid_df: Frame containing the source columns; modified in place.
        shifts: Iterable of day shifts to generate.
        cols: Names of the ``shift_1`` source columns.
        num_series: Number of rows per day.

    Returns:
        None.
    """
    # Imported locally: the notebook binds `time` to the module, so the bare
    # `time()` calls used previously were not callable.
    from time import perf_counter

    print( 72 * '#', '\nAdding shift columns',  )
    start_time = perf_counter()
    for shift in shifts:
        for col in cols:
            target = col.replace('shift_1', f'shift_{shift}')
            grid_df[target] = grid_df[col].shift((shift - 1) * num_series)
    print(f'Time: {(perf_counter() - start_time):.2f} seconds')


         
            
############################################################       
################# Create lags df ###########################
def make_lags_df_day(data, rec, day):
    """Add all lag / rolling features for a forecast horizon of ``day`` days.

    In order:
      1. ``add_lags(data, day)``: lagged demand, price and event columns.
      2. ``add_rolling_cols``: ``shift_1_rolling_{mean,std,weighted_mean}_*``
         for windows 7, 14 and 28, computed from ``rec``.
      3. Per-id rolling standard deviations of ``demand`` shifted by ``day``.
         NOTE: these are stored under the fixed names
         ``shift_28_rolling_std_{7,14,28}`` whatever ``day`` is.
      4. ``add_shift_cols``: ``shift_<day>`` variants of the rolling mean and
         weighted-mean columns.

    Args:
        data: Long-format frame with ``id`` and ``demand`` plus the columns
            required by ``add_lags``; modified in place.
        rec: 2-D demand array handed to ``add_rolling_cols``.
        day: Forecast horizon in days.

    Returns:
        ``data`` (the same object).
    """
    # Imported locally: the notebook binds `time` to the module, so the bare
    # `time()` calls used previously were not callable.
    from time import perf_counter

    start_time = perf_counter()
    rolling_spans = [7, 14, 28]

    add_lags(data, day)
    add_rolling_cols(data,
                     rec,
                     windows=[7, 14, 28],
                     weighted_windows = [7, 14, 28],
                     functions=[np.mean, np.std, np.average],
                     function_names=['mean', 'std', 'weighted_mean'])

    group = data.groupby('id')['demand']
    for span in rolling_spans:
        # `span=span` binds the current window length into the lambda.
        data[f'shift_28_rolling_std_{span}'] = group.transform(
            lambda x, span=span: x.shift(day).rolling(span).std())

    shifts = [day]
    for family in ('mean', 'weighted_mean'):
        source_cols = [f'shift_1_rolling_{family}_{i}' for i in rolling_spans]
        add_shift_cols(data, shifts, source_cols, num_series=30490)

    elapsed = perf_counter() - start_time
    print(72 * '#', f'Total time: {elapsed//60:} : {elapsed%60:.2f}')
    return data

In [None]:
import sklearn.metrics as metrics 
from datetime import datetime, timedelta
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from sklearn.model_selection import PredefinedSplit

def _series_major(frame, num_series):
    """Reorder a day-major frame so that each series' rows are contiguous.

    Row ``b * num_series + a`` (day ``b``, series ``a``) moves to position
    ``a * n_days + b``, where ``n_days`` is the number of distinct dates.
    """
    n_days = frame['date'].nunique()
    order = np.arange(n_days * num_series).reshape(n_days, num_series).T.ravel()
    return frame.reset_index(drop = True).reindex(order).reset_index(drop = True)


def _train_booster(params, train_set, val_set, num_boost_round = 4000, patience = 150, log_period = 20):
    """Train with early stopping on whichever LightGBM API is installed.

    LightGBM 4 removed the ``early_stopping_rounds`` / ``verbose_eval``
    arguments of ``lgb.train`` in favour of callbacks; older releases without
    ``lgb.log_evaluation`` still need the keyword arguments.
    """
    valid_sets = [train_set, val_set]
    if hasattr(lgb, 'early_stopping') and hasattr(lgb, 'log_evaluation'):
        return lgb.train(params, train_set, num_boost_round = num_boost_round,
                         valid_sets = valid_sets,
                         callbacks = [lgb.early_stopping(patience), lgb.log_evaluation(log_period)])
    return lgb.train(params, train_set, num_boost_round = num_boost_round,
                     valid_sets = valid_sets, verbose_eval = log_period,
                     early_stopping_rounds = patience)


def run_lgb_daily(data, cat_indices, rec = rec):
    """Train one LightGBM model per forecast day (1..28) and predict that day.

    For each horizon ``j`` the lag / rolling features are rebuilt with
    ``make_lags_df_day(data, rec, j)``, a model is trained on the rows up to
    2016-03-27, validated on 2016-03-28..2016-04-24, and used to predict the
    single day ``2016-04-24 + j``. Each horizon writes ``test_<j>(80).csv``
    and ``model_1_day<j>.lgb`` to the working directory.

    Relies on ``data`` being ordered day by day with 30490 rows per day and on
    the module-level names ``lgb`` and ``make_lags_df_day``.

    Args:
        data: Long-format frame; feature columns are added to and removed
            from it in place.
        cat_indices: Positions, within the feature list, of the categorical
            features.
        rec: 2-D demand array used for the rolling features (defaults to the
            module-level ``rec`` at definition time).

    Returns:
        DataFrame with the 28 per-day prediction files concatenated.
    """
    num_series = 30490
    horizon = 28
    rolling_spans = (7, 14, 28)
    train_end = '2016-03-27'
    val_start, val_end = '2016-03-28', '2016-04-24'
    last_known_day = pd.Timestamp('2016-04-24')

    # define hyperparameters
    new_params = {'boosting_type': 'gbdt',
            'seed': 42,
            'num_leaves': 2047,
            'max_bin': 3050,
            'n_estimators': 4000,
            'verbose': 20,
            'bagging_fraction': 0.85,
            'bagging_freq': 1,
            'colsample_bytree': 0.85,
            'colsample_bynode': 0.85,
            'lambda_l1': 0.2,
            'lambda_l2': 0.2,
            'objective': 'tweedie',
            'metric': 'rmse',
            "tweedie_variance_power":1.1}

    # Feature groups that do not depend on the horizon.
    base_features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2',
                     'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'year',
                     'month', 'quarter', 'week', 'day', 'dayofweek', 'dayofyear',
                     'lag_0', 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'lag_28']
    # 'season' is not created by the feature functions above, so it is only
    # used when the caller has added it to `data`.
    optional_features = [name for name in ['season'] if name in data.columns]
    tail_features = ['event_name1_lag_0', 'event_type1_lag_0',
                     'event_name1_lag_7', 'event_type1_lag_7', 'event_name2_lag_0', 'event_type2_lag_0',
                     'event_name2_lag_7', 'event_type2_lag_7', 'price_lag_0', 'price_lag_1', 'price_lag_2', 'price_lag_3',
                     'price_lag_4', 'price_lag_5', 'price_lag_6',
                     'price_lag_7', 'price_lag_28', 'event_name1_future', 'event_type1_future', 'event_name2_future', 'event_type2_future']

    # First date present in the frame (rows are in date order).
    starting_date = pd.Timestamp(data['date'].iloc[0])

    for j in range(1, horizon + 1):

        make_lags_df_day(data, rec, j)
        gc.collect()

        print(j)

        # The largest lag is 28 + j days, so earlier rows have incomplete
        # lag features and are left out of training.
        start_date = starting_date + pd.to_timedelta(28 + j, unit="D")
        forecast_day = last_known_day + pd.to_timedelta(j, unit="D")

        # Model j is responsible for forecast day j only; this keeps
        # (id, date) unique across the 28 output files.
        temp_test = data[data['date'] == forecast_day]
        x_train = data[(data['date'] >= start_date) & (data['date'] <= train_end)]
        x_val = data[(data['date'] >= val_start) & (data['date'] <= val_end)]

        x_train = _series_major(x_train, num_series)
        x_val = _series_major(x_val, num_series)

        print(f'Training fold for day {j}')

        # Horizon-specific rolling features. The shifted std columns are
        # stored by make_lags_df_day under fixed 'shift_28_' names.
        mean_cols = [f'shift_{j}_rolling_mean_{w}' for w in rolling_spans]
        std_cols = [f'shift_28_rolling_std_{w}' for w in rolling_spans]
        weighted_cols = [f'shift_{j}_rolling_weighted_mean_{w}' for w in rolling_spans]
        features = base_features + optional_features + mean_cols + std_cols + weighted_cols + tail_features

        train_set = lgb.Dataset(x_train[features], label = x_train['demand'],
                                categorical_feature = cat_indices, feature_name = features,
                                weight = x_train["scaled_weight"])

        val_set = lgb.Dataset(x_val[features],
                              label = x_val['demand'],
                              categorical_feature = cat_indices, feature_name = features,
                              weight = x_val["scaled_weight"])

        model = _train_booster(new_params, train_set, val_set)

        preds = model.predict(temp_test[features])

        print('-'*50)
        print('\n')

        temp_test = temp_test[['id', 'date', 'demand']].copy()
        temp_test['demand'] = preds
        temp_test.to_csv(f'test_{j}(80).csv')
        del x_train, x_val, train_set, val_set, temp_test

        # Drop the rolling columns; they are rebuilt for the next horizon.
        # A set is used because the shift_1 and shift_j names coincide when
        # j == 1.
        rolling_cols = set(mean_cols) | set(std_cols) | set(weighted_cols)
        for family in ('mean', 'std', 'weighted_mean'):
            rolling_cols.update(f'shift_1_rolling_{family}_{w}' for w in rolling_spans)
        data.drop(columns = [col for col in rolling_cols if col in data.columns], inplace = True)

        gc.collect()
        model.save_model(f"model_1_day{j}.lgb")

    daily_predictions = [pd.read_csv(f'test_{j}(80).csv') for j in range(1, horizon + 1)]
    test = pd.concat(daily_predictions, ignore_index = True)

    return test

In [None]:
def submit(test, submission, output_path = 'model_68.csv'):
    """Write the submission file from long-format predictions.

    ``test`` is pivoted to one row per id with one column per date; the date
    columns are renamed ``F1..F28`` in sorted date order. Those rows are
    matched to the ids in ``submission``, and the ``evaluation`` rows of
    ``submission`` are appended unchanged.

    Args:
        test: DataFrame with ``id``, ``date`` and ``demand`` columns, holding
            exactly 28 distinct dates and one row per (id, date).
        submission: Sample submission with an ``id`` column and ``F1..F28``.
        output_path: Destination CSV path.

    Returns:
        None; the result is written to ``output_path``.

    Raises:
        ValueError: If ``test`` does not pivot to exactly 28 date columns or
            contains duplicate (id, date) pairs.
    """
    predictions = test[['id', 'date', 'demand']]
    predictions = pd.pivot(predictions, index = 'id', columns = 'date', values = 'demand').reset_index()
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

    # Evaluation rows are passed through as they are in the sample file.
    evaluation = submission[submission['id'].str.contains('evaluation', regex = False)]

    validation = submission[['id']].merge(predictions, on = 'id')
    final = pd.concat([validation, evaluation])
    final.to_csv(output_path, index = False)


In [None]:
# Reload the sample submission (the earlier copy was deleted above) and
# downcast its numeric columns.
submission = pd.read_csv('sample_submission.csv')
submission = reduce_mem_usage(submission)

In [None]:
# Hand-written list of dates in June 2014 and June 2015. In the visible
# notebook it is only referenced by the commented-out 'LeBron' flag cell below.
nba_finals = ['2014-06-05', '2014-06-08', '2014-06-10', '2014-06-12', '2014-06-15', '2015-06-04', '2015-06-07', 
             '2015-06-09', '2015-06-11', '2015-06-14', '2015-06-16']

In [None]:
#data['LeBron'] = np.zeros(len(data))
#for nba in nba_finals:
    #data.loc[data.date==nba, 'LeBron'] = 1

In [None]:
def change_events(data):
    """Overwrite the primary event codes on four specific dates, in place.

    Each entry sets ``event_name_1`` and ``event_type_1`` for every row whose
    ``date`` equals the given day.
    NOTE(review): the integer codes are presumably the LabelEncoder outputs
    produced by ``transform`` — confirm they still match the encoding.

    Returns:
        The same ``data`` object.
    """
    # (date, event_name_1 code, event_type_1 code)
    overrides = [
        ('2014-06-15', 7, 0),    # father_date
        ('2014-06-05', 30, 4),   # nba_2014_start
        ('2015-06-04', 30, 4),   # nba_2015_start
        ('2015-06-16', 30, 4),   # nba_2015_end
    ]
    for day, name_code, type_code in overrides:
        on_day = data.date == day
        data.loc[on_day, 'event_name_1'] = name_code
        data.loc[on_day, 'event_type_1'] = type_code

    return data
    
#data = change_events(data)

In [None]:
# Downcast once more and collect garbage before training.
data = reduce_mem_usage(data)
gc.collect()

In [None]:
# Positions of the categorical features inside the feature list used by
# run_lgb_daily (its first nine entries).
cat_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8]
# The notebook defines run_lgb_daily; no run_lgb_weekly exists in it.
test = run_lgb_daily(data, cat_indices)