In [1]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 加载数据
data_dir = 'store-sales-time-series-forecasting'
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
stores_df = pd.read_csv(os.path.join(data_dir, 'stores.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
oil_df = pd.read_csv(os.path.join(data_dir, 'oil.csv'))
holidays_df = pd.read_csv(os.path.join(data_dir, 'holidays_events.csv'))

# 合并数据集
train_df = train_df.merge(stores_df, on='store_nbr', how='left')
train_df = train_df.merge(oil_df, on='date', how='left')
train_df = train_df.merge(holidays_df, on='date', how='left')







In [None]:
# 处理缺失值
train_df.fillna(train_df.mean(), inplace=True)

In [3]:
train_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type_x,cluster,dcoilwtico,type_y,locale,locale_name,description,transferred,family_encoded
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,Quito,Pichincha,D,13,,Holiday,National,Ecuador,Primer dia del ano,False,0
1,1,2013-01-01,1,BABY CARE,0.000,0,Quito,Pichincha,D,13,,Holiday,National,Ecuador,Primer dia del ano,False,1
2,2,2013-01-01,1,BEAUTY,0.000,0,Quito,Pichincha,D,13,,Holiday,National,Ecuador,Primer dia del ano,False,2
3,3,2013-01-01,1,BEVERAGES,0.000,0,Quito,Pichincha,D,13,,Holiday,National,Ecuador,Primer dia del ano,False,3
4,4,2013-01-01,1,BOOKS,0.000,0,Quito,Pichincha,D,13,,Holiday,National,Ecuador,Primer dia del ano,False,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,3000883,2017-08-15,9,POULTRY,438.133,0,Quito,Pichincha,B,6,47.57,Holiday,Local,Riobamba,Fundacion de Riobamba,False,28
3054344,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,Quito,Pichincha,B,6,47.57,Holiday,Local,Riobamba,Fundacion de Riobamba,False,29
3054345,3000885,2017-08-15,9,PRODUCE,2419.729,148,Quito,Pichincha,B,6,47.57,Holiday,Local,Riobamba,Fundacion de Riobamba,False,30
3054346,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,Quito,Pichincha,B,6,47.57,Holiday,Local,Riobamba,Fundacion de Riobamba,False,31


In [4]:
# 特征编码
label_encoder = LabelEncoder()
train_df['family_encoded'] = label_encoder.fit_transform(train_df['family'])
train_df['is_holiday'] = train_df['type_y'].apply(lambda x: 1 if x == 'Holiday' else 0)

# 划分训练集和测试集
X = train_df[['store_nbr', 'family_encoded', 'dcoilwtico', 'is_holiday']]
y = train_df['sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# 定义XGBoost模型
model = xgb.XGBRegressor(learning_rate=0.1, n_estimators=100, max_depth=3)

# 训练模型
model.fit(X_train, y_train)



In [21]:


y_pred = model.predict(X_test)

# 创建布尔索引
id_range = range(3000888, 3029400)
bool_index = np.isin(test_df['id'], id_range)

# 获取布尔索引对应的索引值
indices = np.where(bool_index)[0]

# 过滤y_pred
filtered_y_pred = y_pred[indices]

# 构建输出的DataFrame
output = pd.DataFrame({'id': test_df['id'][indices], 'sales': filtered_y_pred})

# 输出到CSV文件
output.to_csv('submission.csv', index=False)


In [19]:
y_pred


array([ 447.21158 ,  -50.004795,  425.39346 , ..., 3185.4744  ,
        -31.237642,   89.038536], dtype=float32)

In [7]:

# 评估模型
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print('Root Mean Squared Error:', rmse)

Root Mean Squared Error: 650.3403121315996


In [9]:
output

Unnamed: 0,id
0,3000888
1,3000889
2,3000890
3,3000891
4,3000892
...,...
28507,3029395
28508,3029396
28509,3029397
28510,3029398


In [12]:
# 构建输出的DataFrame
output = test_df[test_df['id'].between(3000888, 3029399)][['id']]




In [13]:
output

Unnamed: 0,id
0,3000888
1,3000889
2,3000890
3,3000891
4,3000892
...,...
28507,3029395
28508,3029396
28509,3029397
28510,3029398


In [14]:
output['sales'] = y_pred

# 输出到CSV文件
output.to_csv('submission.csv', index=False)

ValueError: Length of values (610870) does not match length of index (28512)