### Load Model and Target DF

In [5]:
import pandas as pd
import joblib

# 1. 載入模型
model = joblib.load("stacked_model_refined.pkl")

# 2. 載入新資料（不含 Quantity）
df_new = pd.read_csv("online_0616d25.csv")



### Missing Value Input

In [7]:
df_new['ShippingCost'] = df_new['ShippingCost'].fillna(df_new['ShippingCost'].median())
df_new['WarehouseLocation'] = df_new['WarehouseLocation'].fillna(df_new['WarehouseLocation'].mode()[0])

In [8]:
df_new.isnull().sum()

Description          0
Quantity             0
Year                 0
Quarter              0
Month                0
DateofWeek           0
Hour                 0
InvoiceDate          0
UnitPrice            0
Country              0
Discount             0
PaymentMethod        0
ShippingCost         0
Category             0
ShipmentProvider     0
WarehouseLocation    0
dtype: int64

### Feature Selection

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# Define your categorical columns
categorical_columns = df_new.select_dtypes(include=['object', 'category']).columns.tolist()
# numerical_cols = df_new.select_dtypes(include=['int64', 'float64'])
# Specify your categorical columns

# Define the encoder and the transformation
column_transformer = ColumnTransformer([
    ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
], remainder='passthrough')

# Separate features and target
X = df_new.drop('Quantity', axis=1)
y = df_new['Quantity']

# Encode the features
X_encoded = column_transformer.fit_transform(X)
feature_names_encoded = column_transformer.get_feature_names_out()

# Split the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize and fit the RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Get the feature importances
importances = model.feature_importances_

# Combine feature names and their importances, sort them, and select the top 15
features_importances = sorted(zip(feature_names_encoded, importances), key=lambda x: x[1], reverse=True)[:15]

# Extract just the feature names for the top 15
top_features = [feature for feature, importance in features_importances]

# Print the top 15 features
print("Top 15 Features based on Importance:")
for feature in top_features:
    print(feature)

Top 15 Features based on Importance:
remainder__UnitPrice
remainder__ShippingCost
remainder__Discount
encoder__WarehouseLocation_Paris
encoder__ShipmentProvider_DHL
encoder__ShipmentProvider_Royal Mail
encoder__PaymentMethod_Credit Card
encoder__PaymentMethod_paypall
encoder__Category_Electronics
encoder__DateofWeek_Fri
encoder__ShipmentProvider_UPS
encoder__Category_Stationery
encoder__Category_Accessories
encoder__Category_Apparel
encoder__DateofWeek_Sun


In [16]:
# Trim the first word and underscores
feature_selected = ['__'.join(feature.split('__')[1:]) for feature in top_features]

# Print the trimmed feature names
print("Trimmed Features:")
for feature in feature_selected:
    print(feature)

Trimmed Features:
UnitPrice
ShippingCost
Discount
WarehouseLocation_Paris
ShipmentProvider_DHL
ShipmentProvider_Royal Mail
PaymentMethod_Credit Card
PaymentMethod_paypall
Category_Electronics
DateofWeek_Fri
ShipmentProvider_UPS
Category_Stationery
Category_Accessories
Category_Apparel
DateofWeek_Sun


### Onehot-Encoding

In [17]:
df_new['Year'] = df_new['Year'].astype('object')
df_new['Hour'] = df_new['Hour'].astype('object')

df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5934 entries, 0 to 5933
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Description        5934 non-null   object 
 1   Quantity           5934 non-null   int64  
 2   Year               5934 non-null   object 
 3   Quarter            5934 non-null   object 
 4   Month              5934 non-null   object 
 5   DateofWeek         5934 non-null   object 
 6   Hour               5934 non-null   object 
 7   InvoiceDate        5934 non-null   object 
 8   UnitPrice          5934 non-null   float64
 9   Country            5934 non-null   object 
 10  Discount           5934 non-null   float64
 11  PaymentMethod      5934 non-null   object 
 12  ShippingCost       5934 non-null   float64
 13  Category           5934 non-null   object 
 14  ShipmentProvider   5934 non-null   object 
 15  WarehouseLocation  5934 non-null   object 
dtypes: float64(3), int64(1),

In [18]:
from sklearn.preprocessing import OneHotEncoder

# Identifying categorical columns in 'df_new'
categorical_cols = df_new.select_dtypes(include=['object']).columns.tolist()


# Initializing the OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, drop=None)  # `drop='first'` to avoid dummy variable trap

# Fitting and transforming the categorical columns
# Note: `.fit_transform()` expects a 2D array, hence the double brackets `[[column]]`
encoded_data = ohe.fit_transform(df_new[categorical_cols])

# Creating a DataFrame with the encoded data
# `get_feature_names_out()` gives new column names
encoded_df = pd.DataFrame(encoded_data, columns=ohe.get_feature_names_out(categorical_cols))



# Concatenating the encoded_df with the original DataFrame (minus the categorical columns)
df_new_numeric = df_new.drop(columns=categorical_cols)
df_encoded = pd.concat([df_new_numeric.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

# Display the first few rows to verify
print(df_encoded.head())

   Quantity  UnitPrice  Discount  ShippingCost  Description_Backpack  \
0        18      28.65      0.41         28.06                   0.0   
1        33      85.88      0.05         11.00                   0.0   
2        47      75.47      0.28         14.92                   0.0   
3        18      67.85      0.36         16.41                   0.0   
4        45      98.30      0.06         25.36                   0.0   

   Description_Blue Pen  Description_Desk Lamp  Description_Headphones  \
0                   0.0                    0.0                     0.0   
1                   0.0                    0.0                     0.0   
2                   0.0                    0.0                     0.0   
3                   0.0                    0.0                     0.0   
4                   1.0                    0.0                     0.0   

   Description_Notebook  Description_Office Chair  ...  Category_Stationery  \
0                   0.0                    

In [19]:
# Include 'quantity' in the data frame
df_selected = df_encoded[feature_selected]
df_selected['Quantity'] = df_encoded['Quantity']
df_selected.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Quantity'] = df_encoded['Quantity']


Unnamed: 0,UnitPrice,ShippingCost,Discount,WarehouseLocation_Paris,ShipmentProvider_DHL,ShipmentProvider_Royal Mail,PaymentMethod_Credit Card,PaymentMethod_paypall,Category_Electronics,DateofWeek_Fri,ShipmentProvider_UPS,Category_Stationery,Category_Accessories,Category_Apparel,DateofWeek_Sun,Quantity
0,28.65,28.06,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,18
1,85.88,11.0,0.05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,33
2,75.47,14.92,0.28,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,47
3,67.85,16.41,0.36,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,18
4,98.3,25.36,0.06,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,45


In [20]:
# Examine the data frame by exporting it
df_selected.to_csv('df_selected0619.csv', index=False)

In [22]:
# 3. 保留與訓練時一致的特徵欄位（你自己補上 feature_columns）
X_new = df_selected

# 4. 預測
df_selected['Predicted_Quantity'] = model.predict(X_new)

# 5. 輸出結果
df_selected.to_csv("predicted_output.csv", index=False)



ValueError: X has 16 features, but RandomForestRegressor is expecting 6021 features as input.