In [2]:
import pandas as pd

# Read the source CSV into the working DataFrame `df`.
file_path = "online_0616d24.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# To inspect the schema, run `df.info()` and `df.head()` in a scratch cell.


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Step 1: Drop the "Date" column so it is not used as a feature.
# errors="ignore" keeps this cell idempotent: `df` is reassigned here, so a
# second run of the cell would otherwise raise KeyError because "Date" has
# already been removed.
df = df.drop(columns=["Date"], errors="ignore")

# Step 2: Separate numerical and categorical feature columns.
# "Quantity" is the regression target, so it is excluded from the numeric features.
num_cols = df.select_dtypes(include=["number"]).columns.drop("Quantity")
cat_cols = df.select_dtypes(include=["object"]).columns

# Step 3: Impute missing values (median for numeric, most frequent value for categorical).
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Step 4: One-hot encode categorical columns.
# handle_unknown="ignore" avoids an error at predict time when the test split
# contains a category that was not present in the training split.
cat_encoder = OneHotEncoder(handle_unknown="ignore")

# Combine the per-column-type transformations into one preprocessing step.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_imputer, num_cols),
        ("cat", Pipeline(steps=[("imputer", cat_imputer), ("encoder", cat_encoder)]), cat_cols)
    ]
)

# Full pipeline: preprocessing followed by a random forest regressor.
# The step name "model" is what later hyper-parameter grids refer to ("model__...").
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split features / target, then hold out 20% of the rows for evaluation.
X = df.drop(columns=["Quantity"])
y = df["Quantity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training split only, so imputation statistics and encoder
# categories are learned without looking at the test rows.
pipeline.fit(X_train, y_train)

# Predict on the held-out split and evaluate.
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Last expression of the cell: displayed as (mse, r2).
mse, r2


(204.88974666286367, 0.5205547212451591)

In [None]:
from sklearn import set_config
# Render estimators as an interactive HTML diagram instead of a plain-text repr.
set_config(display='diagram')

# The pipeline has already been fitted in the training cell above, so it is not
# re-fitted here: fitting again would only repeat the training work and is not
# what draws the diagram.
# Leaving the pipeline as the cell's last expression is what renders the diagram.
pipeline

### GridSearchCV

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Candidate hyper-parameters for the random forest. The "model__" prefix routes
# each entry to the pipeline step named "model".
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 20],
    model__min_samples_split=[2, 5],
    model__min_samples_leaf=[1, 2],
)

# Exhaustive 5-fold cross-validated search over the grid, scored by R2 and
# using all available CPU cores; fitted on the training split only.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Winning configuration and the pipeline refitted with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# R2 of the tuned pipeline on the held-out test split.
y_pred_best = best_model.predict(X_test)
best_r2 = r2_score(y_test, y_pred_best)

(best_params, best_r2)


KeyboardInterrupt: 

In [4]:
# Assemble the export table: the test features plus the actual and predicted
# target values. `assign` returns a new frame, so X_test itself is not modified.
results_df = X_test.assign(
    Actual_Quantity=y_test.values,
    Predicted_Quantity=y_pred,
)




In [6]:
# Write the results table to a CSV file, leaving out the DataFrame index.
output_path = "random_forest_062003.csv"
results_df.to_csv(path_or_buf=output_path, index=False)

# Echo the destination path as the cell output.
output_path

'random_forest_062003.csv'