In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# 设置数据目录路径
data_dir = 'store-sales-time-series-forecasting'

# 读取训练数据、商店数据、测试数据和油价数据
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
stores_df = pd.read_csv(os.path.join(data_dir, 'stores.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
oil_df = pd.read_csv(os.path.join(data_dir, 'oil.csv'))
holidays_df = pd.read_csv(os.path.join(data_dir, 'holidays_events.csv'))

# 合并油价数据到训练集和测试集
train_df = pd.merge(train_df, oil_df, on='date', how='left')
test_df = pd.merge(test_df, oil_df, on='date', how='left')




In [2]:
# 添加节假日特征
train_df['date'] = pd.to_datetime(train_df['date'])
train_df['is_holiday'] = train_df['date'].isin(holidays_df['date']).astype(int)
test_df['date'] = pd.to_datetime(test_df['date'])
test_df['is_holiday'] = test_df['date'].isin(holidays_df['date']).astype(int)

# 对family字段进行编码处理
label_encoder = LabelEncoder()
train_df['family_encoded'] = label_encoder.fit_transform(train_df['family'])
test_df['family_encoded'] = label_encoder.transform(test_df['family'])



In [4]:
test_df

Unnamed: 0,id,date,store_nbr,family,onpromotion,dcoilwtico,is_holiday,family_encoded
0,3000888,2017-08-16,1,AUTOMOTIVE,0,46.80,0,0
1,3000889,2017-08-16,1,BABY CARE,0,46.80,0,1
2,3000890,2017-08-16,1,BEAUTY,2,46.80,0,2
3,3000891,2017-08-16,1,BEVERAGES,20,46.80,0,3
4,3000892,2017-08-16,1,BOOKS,0,46.80,0,4
...,...,...,...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1,47.26,0,28
28508,3029396,2017-08-31,9,PREPARED FOODS,0,47.26,0,29
28509,3029397,2017-08-31,9,PRODUCE,1,47.26,0,30
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9,47.26,0,31


In [3]:
# 填充缺失值
train_df.fillna(train_df.mean(), inplace=True)
test_df.fillna(test_df.mean(), inplace=True)

# 特征选择
features = ['store_nbr', 'family_encoded', 'dcoilwtico', 'is_holiday']
target = 'sales'

  train_df.fillna(train_df.mean(), inplace=True)
  train_df.fillna(train_df.mean(), inplace=True)
  test_df.fillna(test_df.mean(), inplace=True)
  test_df.fillna(test_df.mean(), inplace=True)


In [None]:
# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(train_df[features], train_df[target], test_size=0.2, random_state=42)

# 随机森林回归模型
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)