In [131]:
import pandas as pd
import numpy as np
import plotly.express as px
import chardet
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from datetime import datetime
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
import sklearn.metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

#### 説明変数、温度だけで出荷個数を予測する(2018-2022)


In [132]:
df_list = [pd.read_csv(f"data/{year}_df") for year in range(2018, 2022)]

In [133]:
all_df = pd.concat(df_list)
all_df["year"] = pd.to_datetime(all_df["date"]).dt.year
all_df["month"] = pd.to_datetime(all_df["date"]).dt.month
all_df = all_df[~(all_df["個数"] == 0)]
all_df.reset_index(drop=True, inplace=True)
all_df.drop("date", axis=1, inplace=True)
all_df

Unnamed: 0,平均気温(℃),個数,year,month
0,19.066667,2.0,2018,6
1,19.300000,13.0,2018,6
2,22.200000,63.0,2018,6
3,23.866667,74.0,2018,6
4,23.633333,173.0,2018,7
...,...,...,...,...
144,19.933333,185.0,2021,9
145,15.933333,102.0,2021,9
146,18.300000,62.0,2021,9
147,18.133333,39.0,2021,10


In [134]:

train_df = all_df
column_list = train_df.drop("個数", axis=1).columns
# train_df = result_df[result_df.index]
# データを訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(
    train_df[column_list], train_df["個数"], test_size=0.2, random_state=42)

# XGBoostの回帰モデルを訓練
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# テストデータで予測
y_pred = model.predict(X_test)

# モデルの評価（例：平均二乗誤差）
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 387.5973162865345


In [135]:
y_pred

array([  59.06142  ,  342.20016  ,  769.0999   ,  399.26477  ,
          2.9758914,   54.162506 ,  330.71234  ,   87.58029  ,
        101.555    ,  416.60654  ,   37.70062  ,  346.52518  ,
        586.7416   ,  506.64746  ,  107.151146 ,  843.5275   ,
        876.65967  ,  106.07385  ,  383.3526   ,  205.2547   ,
         89.18     ,  560.1416   ,  290.91806  , 1050.7832   ,
       1418.5459   ,   85.74245  ,   26.62786  ,  173.2804   ,
        509.7687   ,  106.383705 ], dtype=float32)

#### 説明変数：温度,最高温度,最低温度だけで出荷個数を予測する(2018-2022)


In [136]:
all_df2 = pd.concat(df_list)
for i in range(1, 11):
    all_df2[f"前{5*i}日の平均気温"] = all_df2["平均気温(℃)"].shift(i)
# all_df2["積算"] = (all_df2.iloc[:, 1] + all_df2.iloc[:, 3:].sum(axis=1))
all_df2["year"] = pd.to_datetime(all_df2["date"]).dt.year
all_df2["month"] = pd.to_datetime(all_df2["date"]).dt.month
all_df2.drop("date", axis=1, inplace=True)
all_df2 = all_df2[~(all_df2["個数"] == 0)]
all_df2.reset_index(drop=True, inplace=True)
all_df2

Unnamed: 0,平均気温(℃),個数,前5日の平均気温,前10日の平均気温,前15日の平均気温,前20日の平均気温,前25日の平均気温,前30日の平均気温,前35日の平均気温,前40日の平均気温,前45日の平均気温,前50日の平均気温,year,month
0,19.066667,2.0,16.933333,16.933333,19.700000,20.000000,18.233333,18.833333,14.766667,18.166667,18.433333,15.100000,2018,6
1,19.300000,13.0,19.066667,16.933333,16.933333,19.700000,20.000000,18.233333,18.833333,14.766667,18.166667,18.433333,2018,6
2,22.200000,63.0,19.300000,19.066667,16.933333,16.933333,19.700000,20.000000,18.233333,18.833333,14.766667,18.166667,2018,6
3,23.866667,74.0,22.200000,19.300000,19.066667,16.933333,16.933333,19.700000,20.000000,18.233333,18.833333,14.766667,2018,6
4,23.633333,173.0,23.866667,22.200000,19.300000,19.066667,16.933333,16.933333,19.700000,20.000000,18.233333,18.833333,2018,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,19.933333,185.0,19.566667,19.500000,19.466667,20.033333,19.000000,18.300000,18.766667,23.966667,25.800000,23.933333,2021,9
145,15.933333,102.0,19.933333,19.566667,19.500000,19.466667,20.033333,19.000000,18.300000,18.766667,23.966667,25.800000,2021,9
146,18.300000,62.0,15.933333,19.933333,19.566667,19.500000,19.466667,20.033333,19.000000,18.300000,18.766667,23.966667,2021,9
147,18.133333,39.0,18.300000,15.933333,19.933333,19.566667,19.500000,19.466667,20.033333,19.000000,18.300000,18.766667,2021,10


In [137]:
train_df = all_df2
column_list = train_df.drop("個数", axis=1).columns

# データを訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(
    train_df[column_list], train_df["個数"], test_size=0.2)

# XGBoostの回帰モデルを訓練
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# テストデータで予測
y_pred = model.predict(X_test)

# モデルの評価（例：平均二乗誤差）
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = sklearn.metrics.mean_absolute_error(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

RMSE: 269.7374211299715
MAE: 199.22894287109375


In [138]:
fig = px.scatter(x=y_pred, y=y_test)
fig.show()

In [139]:
column_list[1]

'前5日の平均気温'

In [140]:
df_list[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   date     122 non-null    object 
 1   平均気温(℃)  122 non-null    float64
 2   個数       122 non-null    float64
dtypes: float64(2), object(1)
memory usage: 3.0+ KB


In [141]:
# XGBoostモデルを作成
model = xgb.XGBRegressor()

# ハイパーパラメーターの探索範囲を指定
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
}

# RandomizedSearchCVを設定
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=100,  # 試行回数
    scoring='neg_mean_squared_error',  # 最小化する目的関数
    cv=5,  # 交差検証の分割数
    verbose=1,
    n_jobs=-1  # 並列処理を使用
)

# ランダムサーチを実行
random_search.fit(X_train, y_train)

# 最適なハイパーパラメーター設定を表示
print("Best parameters found: ", random_search.best_params_)

# 最適なモデルを取得
best_model = random_search.best_estimator_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Best parameters found:  {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.7}


In [142]:
# 最適なモデルで予測を実行
predictions = best_model.predict(X_test)

# 予測結果を表示
print("Predictions:", predictions)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = sklearn.metrics.mean_absolute_error(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

Predictions: [336.36307  333.7201   535.66656  195.81279  182.63011  442.35596
 393.18118  396.82758  189.1668    44.100803 214.36998  705.80414
 659.5542   662.34766  586.85895  425.33237  430.59613  548.4922
 157.17415  656.7014   295.17676  810.9034   414.18326  153.59612
 492.4065   229.20485  656.34973  737.08435  650.3524    96.963066]
RMSE: 269.7374211299715
MAE: 199.22894287109375
