In [1]:
import pandas as pd
import numpy as np
import torch 
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
import warnings
import seaborn as sns
from sklearn.metrics import mean_absolute_error as mse
#var
import statsmodels.api as sm
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller

# Cointegration analysis
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.vector_ar.vecm import coint_johansen

# VECM (vector error correction model) analysis
from statsmodels.tsa.vector_ar.vecm import coint_johansen,VECM

# Granger causality test
from statsmodels.tsa.stattools import grangercausalitytests

from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

In [2]:
# Product families handled by this notebook. The execute_* functions read one
# CSV per entry from data/ ('BREAD/BAKERY' is looked up as BREADBAKERY.csv).
list_of_families = [
    'AUTOMOTIVE',
    'BABY CARE',
    'BEAUTY',
    'BEVERAGES',
    'BOOKS',
    'BREAD/BAKERY',
    'CELEBRATION',
    'CLEANING',
    'DAIRY',
    'DELI',
    'EGGS',
    'FROZEN FOODS',
    'GROCERY I',
    'GROCERY II',
    'HARDWARE',
    'HOME AND KITCHEN I',
    'HOME AND KITCHEN II',
    'HOME APPLIANCES',
    'HOME CARE',
    'LADIESWEAR',
    'LAWN AND GARDEN',
    'LINGERIE',
    'LIQUOR,WINE,BEER',
    'MAGAZINES',
    'MEATS',
    'PERSONAL CARE',
    'PET SUPPLIES',
    'PLAYERS AND ELECTRONICS',
    'POULTRY',
    'PREPARED FOODS',
    'PRODUCE',
    'SCHOOL AND OFFICE SUPPLIES',
    'SEAFOOD',
]

In [3]:
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
 
# Submission template; its 'sales' column is overwritten with the model's
# predictions in the final cell before being written back out.
sample_submission = pd.read_csv('data/sample_submission.csv')
def scorethis_rmsle(prediction_list, y_list):
    """Root-mean-square of ``| |prediction| - |truth| |`` over all elements.

    No logarithm is taken inside this function, so the result is an RMSLE
    only when both inputs are already on a log scale.

    Parameters
    ----------
    prediction_list : sequence
        Predictions. Each entry may be a scalar or an array-like (list,
        ndarray, pandas Series); array-like entries contribute one error
        term per element.
    y_list : sequence
        Ground truths, indexed in parallel with ``prediction_list``. Only the
        first ``len(prediction_list)`` entries are read.

    Returns
    -------
    numpy.float64
        The score rounded to 3 decimals, or NaN when there is nothing to score.
    """
    residual_chunks = []

    for idx in range(len(prediction_list)):
        # Absolute values are taken before differencing, so a prediction and a
        # truth of equal magnitude but opposite sign count as a perfect match.
        residual = np.abs(np.abs(prediction_list[idx]) - np.abs(y_list[idx]))

        # Flatten scalars and array-likes alike into a 1-D chunk; this replaces
        # a bare ``except`` that was used to tell Series from scalars.
        residual_chunks.append(np.asarray(residual).ravel())

    if not residual_chunks:
        return np.float64(np.nan)

    residuals = np.concatenate(residual_chunks)
    if residuals.size == 0:
        return np.float64(np.nan)

    return np.round(np.sqrt(np.mean(residuals ** 2)), 3)
def create_validation(this_family_df, validation=True, holdout_rows=864):
    """Scale one family's features and split off the trailing holdout rows.

    The caller's DataFrame is left untouched: features are built on a new
    frame rather than dropped/overwritten in place.

    Parameters
    ----------
    this_family_df : pandas.DataFrame
        Must contain a 'sales' column (target) and a 'date' column; every
        other column is treated as a feature and passed to ``MinMaxScaler``.
    validation : bool, default True
        When exactly ``True``, the last ``holdout_rows`` rows are discarded
        first, so the returned test split is the block just before them and
        comes with ground-truth sales.
    holdout_rows : int, default 864
        Number of trailing rows used as the test split (and, in validation
        mode, also the number of rows discarded beforehand).

    Returns
    -------
    train, train_y, test, test_y
        Scaled feature frames and the matching 'sales' Series.

    Raises
    ------
    ValueError
        If ``holdout_rows`` is smaller than 1.
    """
    if holdout_rows < 1:
        # A non-positive value would make the negative slices below select the
        # wrong rows (``[:-0]`` is empty) instead of failing loudly.
        raise ValueError('holdout_rows must be a positive integer')

    if validation is True:
        # Remove the trailing submission rows if it is for validation
        this_family_df = this_family_df[:-holdout_rows]

    this_family_sales = this_family_df['sales']

    # Non-inplace drop: avoids mutating the caller's frame (validation=False)
    # and avoids writing into a slice of it (validation=True).
    features = this_family_df.drop(columns=['sales', 'date'])

    ########################
    # Scale Data           #
    ########################

    # NOTE: the scaler is fitted on every remaining row, i.e. including the
    # rows that become the test split below.
    scaler = MinMaxScaler()
    features = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )

    ########################
    # Split Train and Test #
    ########################

    test = features.iloc[-holdout_rows:]
    test_y = this_family_sales.iloc[-holdout_rows:]

    train = features.iloc[:-holdout_rows]
    train_y = this_family_sales.iloc[:-holdout_rows]

    return train, train_y, test, test_y
def lgbmr_run(train, train_y, test, test_y,
           validation=True):
    """Fit a LightGBM regressor on the training split and predict the test split.

    Parameters
    ----------
    train, train_y
        Training features and target passed to ``LGBMRegressor.fit``.
    test
        Features to predict on.
    test_y
        Ground truth for ``test``; only read (via ``.to_list()``) when
        ``validation`` compares equal to ``True``.
    validation : bool, default True
        Selects the return shape, see below.

    Returns
    -------
    (list, list) when ``validation == True``: rounded predictions and the
    ground truths as a plain list. Otherwise just the list of predictions.
    Predictions are rounded to 2 decimals.
    """
    # Fixed hyper-parameters for every family.
    hyperparams = dict(
        colsample_bytree=0.7,
        learning_rate=0.055,
        min_child_samples=10,
        num_leaves=19,
        objective='regression',
        n_estimators=1000,
        n_jobs=4,
        random_state=337,
    )
    regressor = LGBMRegressor(**hyperparams)

    # Train, then predict and round each value with Python's built-in round.
    regressor.fit(train, train_y)
    raw_predictions = regressor.predict(test).tolist()
    rounded_predictions = list(map(lambda value: round(value, 2), raw_predictions))

    if validation == True:
        # validation set also has ground truths
        return rounded_predictions, test_y.to_list()

    return rounded_predictions
def execute_validation(thisfunc):
    """Run ``thisfunc`` on the validation split of every family.

    For each entry of ``list_of_families`` the matching CSV is read from
    ``data/``, split with ``create_validation`` and handed to ``thisfunc``
    with ``validation=True``.

    Parameters
    ----------
    thisfunc : callable
        Called as ``thisfunc(train, train_y, test, test_y, validation=True)``
        and expected to return ``(predictions, ground_truths)``.

    Returns
    -------
    (list, list)
        All predictions and all ground truths, concatenated family by family
        in ``list_of_families`` order.
    """
    predictions_by_family = []
    truths_by_family = []

    for family in list_of_families: # 33

        # The slash in 'BREAD/BAKERY' would be read as a directory separator,
        # so that family is looked up as BREADBAKERY.csv.
        file_stem = 'BREADBAKERY' if family == 'BREAD/BAKERY' else family

        print('Evaluating '+str(file_stem)+'...')

        family_df = pd.read_csv('data/' + str(file_stem) + '.csv')

        train, train_y, test, test_y = create_validation(family_df)
        family_pred, family_truth = thisfunc(train, train_y, test, test_y, validation=True)

        if file_stem == 'BOOKS':
            # BOOKS gets a constant prediction instead of the model output.
            # 0.6931471805599453 is ln(2); execute_submission maps it to
            # exactly 0 through expm1(x) - 1.
            family_pred = [0.6931471805599453] * 864

        predictions_by_family.append(family_pred) # 33 * [864]
        truths_by_family.append(family_truth) # 33 * [864]

    # unpack 33 * 864 into flat lists
    list_of_predictions = [value
                           for family_values in predictions_by_family
                           for value in family_values]
    list_of_ground_truths = [value
                             for family_values in truths_by_family
                             for value in family_values]

    return list_of_predictions, list_of_ground_truths
# --- Execute LGBMR Model On Validation Set --- #
 
# Run this code if you want to do a validation test + see the score:
 
# list_of_lgbmr_predictions, list_of_ground_truths = execute_validation(lgbmr_run)
# scorethis_rmsle(list_of_lgbmr_predictions, list_of_ground_truths)

In [None]:
def TemporalFusionTransformer(train, train_y, test, test_y,
           validation=True, model=None):
    """Fit ``model`` on the training split and return rounded predictions.

    No Temporal Fusion Transformer is constructed inside this function; the
    estimator has to be supplied through ``model``. Without one the function
    raises ``NotImplementedError`` rather than failing on an undefined name.

    Parameters
    ----------
    train, train_y
        Training features and target passed to ``model.fit``.
    test
        Features passed to ``model.predict``; the result must support
        ``.tolist()``.
    test_y
        Ground truth for ``test``; only read (via ``.to_list()``) when
        ``validation`` compares equal to ``True``.
    validation : bool, default True
        Selects the return shape, see below.
    model : object, optional
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    (list, list) when ``validation == True``: predictions rounded to 2
    decimals and the ground truths as a plain list. Otherwise just the list
    of predictions.

    Raises
    ------
    NotImplementedError
        If ``model`` is ``None``.
    """
    if model is None:
        raise NotImplementedError(
            'TemporalFusionTransformer has no built-in model yet; '
            'pass an estimator with fit/predict via the `model` argument.'
        )

    #################
    # Execute Model #
    #################

    model.fit(train, train_y)
    predictions = [round(value, 2) for value in model.predict(test).tolist()]

    if validation == True:
        # validation set also has ground truths
        return predictions, test_y.to_list()

    return predictions

In [None]:
def execute_submission(thisfunc):
    """Produce the submission predictions for every family.

    For each entry of ``list_of_families`` the matching CSV is read from
    ``data/``. BOOKS gets a constant prediction; every other family is split
    with ``create_validation(..., validation=False)`` and predicted by
    ``thisfunc``.

    Parameters
    ----------
    thisfunc : callable
        Called as ``thisfunc(train, train_y, test, test_y=None,
        validation=False)`` and expected to return a sequence of at least
        864 predictions.

    Returns
    -------
    numpy.ndarray
        864 * 33 values ordered row by row: for each of the 864 rows, the 33
        families' predictions in ``list_of_families`` order, after applying
        ``expm1(x) - 1``.
    """
    per_family_predictions = []

    for family in list_of_families:

        # The slash in 'BREAD/BAKERY' would be read as a directory separator,
        # so that family is looked up as BREADBAKERY.csv.
        file_stem = 'BREADBAKERY' if family == 'BREAD/BAKERY' else family

        print('Evaluating '+str(file_stem)+'...')
        family_df = pd.read_csv('data/' + str(file_stem) + '.csv')

        if file_stem == 'BOOKS':
            # Constant prediction: 0.6931471805599453 is ln(2), which the
            # inverse transform at the end of this function maps to exactly 0.
            per_family_predictions.append([0.6931471805599453] * 864)
            continue

        train, train_y, test, test_y = create_validation(family_df, validation=False)
        family_pred = thisfunc(train, train_y, test, test_y=None, validation=False)
        per_family_predictions.append(family_pred)

    ###############################
    # Put Back In Submission Form #
    ###############################

    # Interleave: row 0 of all 33 families, then row 1 of all 33, and so on.
    interleaved = [per_family_predictions[family_idx][row_idx]
                   for row_idx in range(864)
                   for family_idx in range(33)]

    # NOTE(review): expm1 already subtracts 1, so this evaluates exp(x) - 2.
    # Confirm it matches the transform applied when the per-family CSVs were
    # built; the BOOKS constant above is chosen to land on 0 under it.
    restructured_predictions = np.expm1(interleaved) - 1

    return restructured_predictions
# --- Execute Submission --- #

# Predict every family with the LightGBM runner and place the result in the
# submission template.
restructured_predictions = execute_submission(lgbmr_run)
sample_submission['sales'] = restructured_predictions

# Convert some (slightly) negative predictions to a zero prediction:
sales_column = sample_submission['sales']
sample_submission['sales'] = sales_column.mask(sales_column < 0, 0)

sample_submission.to_csv('data/submission.csv', index=False)