改动最多的版本，score：0.42868，文件为submission3.csv

In [73]:
import numpy as np
import pandas as pd
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import make_scorer
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import random
import os


def set_seed(seed=42):
    """Seed the global random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = f'{seed}'
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Global seed: applied to the RNGs here and passed as random_state to the
# estimators built in create_Model.
SEED = 2021
set_seed(SEED)
# Switch for the optional MinMax scaling path; the scaling, fitting and
# prediction cells all branch on this flag.
is_scale = False

# Timeline类
初始化，用于记录每天的节日、工作日、油价等情况

In [74]:
class Timeline:
    """Daily calendar of exogenous features: oil price, holidays and workdays.

    ``timeline`` is a DataFrame indexed by a daily ``PeriodIndex`` covering
    2013-01-01 through 2017-08-31. Columns are added by the loader methods.

    Call order matters:
    * ``get_Workday`` joins the table stored by ``get_Holidays``, so the
      holidays must be loaded first.
    * ``get_oil`` forward-fills and drops NaN rows across *every* column
      present at the time, so it should run before ``get_Workday``.
    """

    def __init__(self):
        self.timeline = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31')).to_period('D')
        # Filled by get_Holidays(); None means "not loaded yet".
        self.holiday = None

    def get_oil(self, file_path):
        """Add a smoothed oil price column ``mean_oil`` to the timeline.

        Parameters
        ----------
        file_path : str or path-like
            CSV with a ``date`` column and a ``dcoilwtico`` price column.

        Notes
        -----
        Rows of the timeline that still have no ``mean_oil`` after the
        forward fill (i.e. dates before the first complete 7-row window)
        are dropped.
        """
        # `infer_datetime_format` is intentionally not passed: it is
        # deprecated since pandas 2.0, where parse_dates already infers a
        # single consistent format.
        oil = pd.read_csv(file_path,
                          parse_dates=['date'],
                          index_col='date').to_period('D')
        # Treat zero prices as missing (0 -> NaN) so they are interpolated
        # together with the genuinely missing values.
        prices = oil['dcoilwtico'].mask(oil['dcoilwtico'] == 0)
        oil['dcoilwtico'] = prices.interpolate()
        # Mean over a window of 7 consecutive rows of the oil file.
        oil['mean_oil'] = oil['dcoilwtico'].rolling(7).mean()
        # Dates absent from the oil file inherit the last known value.
        # `.ffill()` replaces the deprecated `fillna(method='ffill')`.
        self.timeline = self.timeline.join(oil['mean_oil']).ffill().dropna()

    def get_Holidays(self, file_path):
        """Load the holiday table and keep one nationwide entry per day.

        Parameters
        ----------
        file_path : str or path-like
            CSV with a ``date`` column and a ``locale`` column; the other
            columns are carried along unchanged.

        The result is stored in ``self.holiday`` and is not joined to the
        timeline until ``get_Workday`` is called.
        """
        holiday = pd.read_csv(file_path,
                              parse_dates=['date'],
                              index_col='date').to_period('D')
        holiday = holiday.sort_index()
        # Only nationwide entries are kept (locale == 'National');
        # every other locale is ignored.
        holiday = holiday[holiday['locale'] == 'National']
        # Several entries can share a date: keep the first one per day.
        self.holiday = holiday.groupby(holiday.index).first()

    def get_Workday(self):
        """Join the holiday table and derive ``dayofweek`` / ``workday``.

        ``workday`` starts as True on Monday-Friday and False on weekends,
        then is overridden by the holiday ``type`` of the day:

        * ``Work Day``                      -> True
        * ``Transfer`` / ``Bridge``         -> False
        * ``Holiday`` with transferred False -> False
        * ``Holiday`` with transferred True  -> True

        Finally ``type`` is one-hot encoded into ``type_*`` columns and the
        descriptive holiday columns are dropped.

        Raises
        ------
        ValueError
            If ``get_Holidays`` has not been called yet.
        """
        if self.holiday is None:
            raise ValueError('get_Holidays() must be called before get_Workday()')

        timeline = self.timeline.join(self.holiday)
        timeline['dayofweek'] = timeline.index.dayofweek
        timeline['workday'] = True
        # dayofweek 5 and 6 are Saturday and Sunday.
        timeline.loc[timeline['dayofweek'] > 4, 'workday'] = False

        day_type = timeline['type']
        timeline.loc[day_type == 'Work Day', 'workday'] = True
        timeline.loc[day_type == 'Transfer', 'workday'] = False
        timeline.loc[day_type == 'Bridge', 'workday'] = False
        # `transferred` holds NaN on days without a holiday entry, so both
        # comparisons are spelled out instead of negating one of them.
        is_holiday = day_type == 'Holiday'
        timeline.loc[is_holiday & timeline['transferred'].eq(False), 'workday'] = False
        timeline.loc[is_holiday & timeline['transferred'].eq(True), 'workday'] = True

        timeline = pd.get_dummies(timeline, columns=['type'])
        self.timeline = timeline.drop(
            columns=['locale', 'locale_name', 'description', 'transferred'])

In [75]:
# Build the shared calendar once. The call order is significant:
# get_oil forward-fills/drops rows over the whole timeline, and get_Workday
# joins the table that get_Holidays stores on the instance.
T = Timeline()
T.get_oil('data/oil.csv')
T.get_Holidays('data/holidays_events.csv')
T.get_Workday()

## train

读取数据：`parse_dates`将指定列解析为`datetime64[ns]`数据类型；`infer_datetime_format`用于加快日期列的解析速度（注意：自 pandas 2.0 起该参数已弃用，日期格式推断已成为默认行为）。
将`store_nbr`指定为`object`数据类型，并只从`train.csv`中读取`date`、`store_nbr`、`family`、`sales`四列到变量`train`。

In [76]:
# Training sales in long format, indexed by (date, store_nbr, family) and
# sorted on that index. store_nbr is kept as a string (object dtype).
# `infer_datetime_format` is no longer passed: it is deprecated since
# pandas 2.0, where parse_dates already infers one consistent format.
# The whole load is a single expression so re-running the cell is idempotent.
train = (
    pd.read_csv('data/train.csv',
                parse_dates=['date'],
                dtype={'store_nbr': 'object'},
                usecols=['date', 'store_nbr', 'family', 'sales'])
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index(['date', 'store_nbr', 'family'])
    .sort_index()
)

## test

In [77]:
# Test keys in long format, indexed by (date, store_nbr, family) and sorted
# on that index. store_nbr is kept as a string (object dtype).
# `infer_datetime_format` is no longer passed: it is deprecated since
# pandas 2.0, where parse_dates already infers one consistent format.
# The whole load is a single expression so re-running the cell is idempotent.
test = (
    pd.read_csv('data/test.csv',
                parse_dates=['date'],
                dtype={'store_nbr': 'object'},
                usecols=['date', 'store_nbr', 'family'])
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index(['date', 'store_nbr', 'family'])
    .sort_index()
)

## 训练样本清洗

In [78]:
# Training window (both ends inclusive).
train_date_start = '2017-04-01'
train_date_end = '2017-08-15'

# Wide target matrix: one row per day, one column per (store_nbr, family).
y_raw = train.unstack(['store_nbr', 'family']).loc[train_date_start:train_date_end]

# Deterministic features: linear trend (order=1) plus weekly Fourier terms.
fourier = CalendarFourier(freq='W', order=4)
dp = DeterministicProcess(
    index=y_raw.index,
    constant=False,
    order=1,
    seasonal=False,
    additional_terms=[fourier],
    drop=True,
)
X_raw = dp.in_sample()

# Calendar features for the same window, sliced once from the shared timeline.
calendar_train = T.timeline.loc[train_date_start:train_date_end]
for col in ('mean_oil', 'dayofweek', 'workday'):
    # Positional copy: relies on the slice having one row per training day.
    X_raw[col] = calendar_train[col].values
X_raw = X_raw.join(calendar_train.loc[:, 'type_Additional':'type_Work Day'])
X_raw = pd.get_dummies(X_raw, columns=['dayofweek'], drop_first=True)
# Disabled: dropping the trend column.
# X.drop('trend', axis=1, inplace=True)

In [79]:
# Forecast window (both ends inclusive): 16 days, matching steps=16 below.
test_date_start = '2017-08-16'
test_date_end = '2017-08-31'

# Continue the deterministic features past the end of the training index.
X_test = dp.out_of_sample(steps=16)

# Calendar features for the forecast window, sliced once from the timeline.
calendar_test = T.timeline.loc[test_date_start:test_date_end]
for col in ('mean_oil', 'dayofweek', 'workday'):
    # Positional copy: relies on the slice having one row per forecast day.
    X_test[col] = calendar_test[col].values
X_test = X_test.join(calendar_test.loc[:, 'type_Additional':'type_Work Day'])
X_test = pd.get_dummies(X_test, columns=['dayofweek'], drop_first=True)
# Disabled: dropping the trend column.
# X_test.drop('trend', axis=1, inplace=True)

## 归一化（可选）
如需进行归一化操作，则令`is_scale=True`即可

In [80]:
if is_scale:
    from sklearn.preprocessing import MinMaxScaler

    # Separate scalers: the target scaler is needed again later to map
    # predictions back to the original sales scale.
    scaler_X, scaler_y = MinMaxScaler(), MinMaxScaler()

    # Features: fit on the training window only, then reuse for the test window.
    X_scaled = scaler_X.fit_transform(X_raw)
    X_test_scaled = scaler_X.transform(X_test)

    # Targets: scaled from the training window.
    y_scaled = scaler_y.fit_transform(y_raw)

## 模型训练函数

`train_model`函数通过网格搜索（`GridSearchCV`）确定模型的最优参数，仅用于调参阶段，正式预测中不再使用

In [81]:
def my_score(y, y_pred):
    """Root mean squared logarithmic error (RMSLE) over all series.

    Both inputs are wrapped in DataFrames whose columns are
    ``y_raw.columns`` and then stacked on ``store_nbr`` / ``family`` so the
    metric is computed over one long ``sales`` column. Predictions are
    clipped at 0 first.

    Parameters
    ----------
    y : array-like
        True values, one column per entry of ``y_raw.columns``.
    y_pred : array-like
        Predicted values with the same column layout as ``y``.

    Returns
    -------
    float
        ``sqrt(mean_squared_log_error)``. The value is also printed.
    """
    y_pred = pd.DataFrame(y_pred, columns=y_raw.columns)
    y_pred = y_pred.stack(['store_nbr', 'family']).clip(0.)
    y_ = pd.DataFrame(y, columns=y_raw.columns)
    y_ = y_.stack(['store_nbr', 'family'])
    # A single square root: `msle` is the *mean squared* log error, so one
    # sqrt yields RMSLE. The previous double sqrt returned its fourth root.
    rmsle = np.sqrt(msle(y_['sales'], y_pred['sales']))
    print(f'rmse:{rmsle}')
    return rmsle


def train_model(model, gridsearch_params, X, y):
    """Tune ``model`` with a grid search and return the best estimator.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    gridsearch_params : dict or list of dict
        Parameter grid to search.
    X, y : array-like
        Data used both for the search and for the final in-sample report.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search. The estimator and its
        in-sample score (via ``my_score``) are printed.
    """
    # my_score is an error, so it is registered as "lower is better".
    scorer = make_scorer(my_score, greater_is_better=False)
    search = GridSearchCV(
        model,
        gridsearch_params,
        cv=3,
        scoring=scorer,
        verbose=1,
        return_train_score=True,
    )
    search.fit(X, y)

    best = search.best_estimator_
    fitted = best.predict(X)

    rule = '------------'
    print(rule)
    print(best)
    print(rule)
    print(f'rmse={my_score(y, fitted)}')
    return best

def create_Model(X, y):
    """Build and fit the five-member voting ensemble on one target.

    The ensemble averages a ridge regression, an SVR and three tree
    ensembles (extra trees, random forest, gradient boosting) of 200
    estimators each. Every member that takes ``random_state`` gets ``SEED``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values passed straight to ``VotingRegressor.fit``.

    Returns
    -------
    VotingRegressor
        The fitted ensemble.
    """
    n_trees = 200
    members = [
        ('ridge', Ridge(alpha=0.75, random_state=SEED)),
        ('svr', SVR(C=0.2)),
        ('extra', ExtraTreesRegressor(n_estimators=n_trees, random_state=SEED)),
        ('rf', RandomForestRegressor(n_estimators=n_trees, random_state=SEED)),
        ('gbdt', GradientBoostingRegressor(n_estimators=n_trees, random_state=SEED)),
    ]
    ensemble = VotingRegressor(members)
    ensemble.fit(X, y)
    return ensemble

def model_fit(X, y):
    """Fit one independent ensemble per target column, in parallel.

    Parameters
    ----------
    X : array-like
        Feature matrix shared by every target; passed unchanged to
        ``create_Model``.
    y : pandas.DataFrame or 2-D array-like
        Targets, one column per series.

    Returns
    -------
    list
        The fitted estimators as returned by ``joblib.Parallel``, in column
        order of ``y``.

    Raises
    ------
    ValueError
        If ``y`` is not two-dimensional.
    """
    # How a column is extracted depends on the container of `y`, not of `X`
    # (the previous check on type(X) broke when the two differed).
    if isinstance(y, pd.DataFrame):
        columns = y.iloc
    else:
        y = np.asarray(y)
        columns = y
    if y.ndim != 2:
        raise ValueError(f'y must be 2-dimensional, got {y.ndim} dimension(s)')

    n_targets = y.shape[1]
    estimators = Parallel(n_jobs=-1, verbose=0)(
        delayed(create_Model)(X, columns[:, i]) for i in tqdm(range(n_targets)))
    return estimators

def predict(estimators_, X):
    """Run every fitted estimator on ``X`` in parallel.

    Parameters
    ----------
    estimators_ : iterable of fitted estimators
        Each one must expose ``predict``.
    X : array-like
        Feature matrix handed to every estimator.

    Returns
    -------
    numpy.ndarray
        The per-estimator predictions stacked along axis 1, i.e. one column
        per estimator, in the order of ``estimators_``.
    """
    runner = Parallel(n_jobs=-1, verbose=0)
    jobs = (delayed(fitted.predict)(X) for fitted in tqdm(estimators_))
    per_estimator = runner(jobs)
    return np.stack(per_estimator, axis=1)

## 函数拟合

In [82]:
# Fit one ensemble per target column. Note that `model` holds whatever
# model_fit returns (the collection of fitted per-column estimators), not a
# single estimator.
if is_scale:
    model = model_fit(X_scaled, y_scaled)
else:
    model = model_fit(X_raw, y_raw)

  0%|          | 0/1782 [00:00<?, ?it/s]

## 预测输出

In [83]:
# Predict the forecast window; on the scaled path, map predictions back to
# the original target scale with the scaler fitted on y.
if is_scale:
    y_pred = predict(model, X_test_scaled)
    y_pred = scaler_y.inverse_transform(y_pred)
else:
    y_pred = predict(model, X_test)
# Wide frame (one column per series), with negative predictions clipped to 0.
y_pred = pd.DataFrame(y_pred, index=X_test.index, columns=y_raw.columns).clip(0.)
# Back to long format: one row per (date, store_nbr, family).
y_pred = y_pred.stack(['store_nbr', 'family'])
submit_file = pd.read_csv('data/sample_submission.csv')
# NOTE(review): predictions are assigned by position, not by key. This assumes
# the rows of sample_submission.csv are in the same (date, store_nbr, family)
# order as the stacked frame, where store_nbr sorts as a string -- verify.
submit_file['sales'] = y_pred.values
submit_file.to_csv('submission.csv', index=False)

  0%|          | 0/1782 [00:00<?, ?it/s]