In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Directory (relative to the notebook's working directory) that holds the
# competition CSV files. The tables themselves are read by the dedicated
# data-loading cell below, which loads train/stores/test/oil together with the
# holiday calendar; reading the same four files here as well only repeated the
# disk I/O and produced frames that were overwritten before any use.
data_dir = 'store-sales-time-series-forecasting'


In [4]:

# Load the raw competition tables. Every file is re-read from disk, so running
# this cell always restores the unmodified frames before any preprocessing.
train_df, stores_df, test_df, oil_df, holiday_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in (
        'train.csv',
        'stores.csv',
        'test.csv',
        'oil.csv',
        'holidays_events.csv',
    )
)



In [5]:
# Parse the `date` column of every table into pandas datetimes so the tables
# share one key dtype and expose the `.dt` accessor.
for dated_frame in (train_df, test_df, oil_df, holiday_df):
    dated_frame['date'] = pd.to_datetime(dated_frame['date'])
del dated_frame  # do not leave the loop variable in the notebook namespace

# Attach the oil table to the train and test rows. A left join keeps every
# sales row; dates that are missing from oil_df get NaN in the oil columns.
train_df = pd.merge(train_df, oil_df, on='date', how='left')
test_df = pd.merge(test_df, oil_df, on='date', how='left')

# Collapse the holiday calendar to a single row per date before joining it.
# The calendar is an event list and may contain several events for one date;
# joining it unchanged would duplicate every train/test row of such a date,
# inflating the training set and producing repeated ids in the submission.
# 'Holiday'-type rows are placed first so keep='first' retains one of them
# whenever a date has any, which keeps a `type == 'Holiday'` check on the
# merged frame meaningful.
is_holiday_type = holiday_df['type'].eq('Holiday')
holiday_by_date = pd.concat(
    [holiday_df[is_holiday_type], holiday_df[~is_holiday_type]]
).drop_duplicates(subset='date', keep='first')

# Attach the de-duplicated holiday calendar. validate='many_to_one' makes
# pandas raise if the right-hand side is ever not unique per date, instead of
# silently multiplying rows.
train_df = pd.merge(
    train_df, holiday_by_date, on='date', how='left', validate='many_to_one'
)
test_df = pd.merge(
    test_df, holiday_by_date, on='date', how='left', validate='many_to_one'
)

# Feature engineering: the same calendar and flag columns are derived for the
# train and the test frame, so the logic lives in one helper.
def add_basic_features(frame):
    """Return a copy of ``frame`` with calendar parts and binary flags appended.

    Reads the columns ``date`` (must already be datetime), ``onpromotion`` and
    ``type`` and adds, in this order:

    * ``month``, ``day``, ``dayofweek`` -- taken from ``date`` via ``.dt``
      (pandas numbers Monday as 0).
    * ``is_promotion`` -- 1 where ``onpromotion`` is non-zero, else 0.
    * ``is_holiday`` -- 1 where ``type`` equals ``'Holiday'``, else 0
      (missing ``type`` therefore yields 0).

    The flags use vectorised comparisons rather than a per-row ``apply``,
    which gives the same values without a Python-level call for every row.
    """
    dates = frame['date']
    return frame.assign(
        month=dates.dt.month,
        day=dates.dt.day,
        dayofweek=dates.dt.dayofweek,
        is_promotion=frame['onpromotion'].ne(0).astype('int64'),
        is_holiday=frame['type'].eq('Holiday').astype('int64'),
    )


train_df = add_basic_features(train_df)
test_df = add_basic_features(test_df)



In [31]:
# 特征工程
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [32]:


# One-hot encode the `family` column. The categories are learned from the
# training rows only and then reused unchanged for the test rows; `.toarray()`
# turns each encoded result into a dense array.
enc = OneHotEncoder()
enc.fit(train_df[['family']])
family_train = enc.transform(train_df[['family']]).toarray()
family_test = enc.transform(test_df[['family']]).toarray()



In [33]:
# Column labels for the one-hot block, kept as a plain list so it can be
# concatenated with other column-name lists. The 'x0_' strip is a legacy
# cleanup for auto-generated prefixes; because the input feature name
# 'family' is passed explicitly it is expected to change nothing.
family_names = [
    name.replace('x0_', '')
    for name in enc.get_feature_names_out(['family'])
]

# Give the one-hot frames the index of the frame they were computed from.
# concat(axis=1) aligns on index labels, so a default 0..n-1 index would only
# line up if train_df/test_df happen to carry exactly that index; otherwise
# rows would be misaligned and padded with NaN without any error.
family_train_df = pd.DataFrame(
    family_train, columns=family_names, index=train_df.index
)
family_test_df = pd.DataFrame(
    family_test, columns=family_names, index=test_df.index
)

# Append the one-hot columns to the train and test frames. One-hot columns
# left over from a previous run of this cell are dropped first, so re-running
# the cell replaces them instead of creating duplicated column labels.
train_df = pd.concat(
    [train_df.drop(columns=family_names, errors='ignore'), family_train_df],
    axis=1,
)
test_df = pd.concat(
    [test_df.drop(columns=family_names, errors='ignore'), family_test_df],
    axis=1,
)


In [34]:

from sklearn.ensemble import RandomForestRegressor

# Feature selection: store number, calendar parts, the two binary flags, the
# `dcoilwtico` column (presumably the oil price) and one indicator column per
# product family. The target is the `sales` column.
feature_columns = [
    'store_nbr',
    'month',
    'day',
    'dayofweek',
    'is_promotion',
    'is_holiday',
    'dcoilwtico',
] + family_names
y_train = train_df['sales']

# Replace missing values with per-column means learned on the training rows;
# the same fitted means are then applied to the test rows.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(train_df[feature_columns])
X_test = imputer.transform(test_df[feature_columns])

# Train the model: a random forest with a fixed seed.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)


In [35]:

# Predict sales for every row of the test feature matrix with the fitted model.
y_pred = rf.predict(X_test)


In [36]:

# Write the result: pair each test row id with its predicted sales value and
# save the table as CSV without the index column.
output = test_df[['id']].assign(sales=y_pred)
output.to_csv('submission_baseline.csv', index=False)