In [24]:
import warnings

from ecommerce.configs import columns as c
from ecommerce.operations.base.read import ReadOperation
from ecommerce.operations.base.write import WriteOperation
from ecommerce.operations.custom.data_splitter import SplitDataOperation
from ecommerce.operations.custom.feature_extractor import FeatureExtractionOperation
from ecommerce.operations.custom.read_criteo_seach_data import ReadCriteoSearchData

# Silence FutureWarning messages for the rest of the session; no other
# warning category is filtered here.
warnings.simplefilter("ignore", FutureWarning)

In [2]:
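# NOTE: one-off standardization step, intentionally left commented out for reference.
# It appears to convert the raw Criteo TSV export into a parquet dataset partitioned
# by partner id (the dataset read by the next cell) - confirm before re-enabling.
# data = ReadCriteoSearchData(path="../../data/raw/CriteoSearchData", object_format='tsv', header=0).execute()
# WriteOperation(data=data,
#                path="../../data/standardized/df_source.parquet",
#                data_format="parquet",
#                mode="w",
#                partition_cols=[c.PARTNER_ID]).execute()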

In [3]:
# Read a single partner's partition of the standardized parquet dataset,
# derive the modelling features, then split them into train / validation /
# test frames (validation_days / test_days are forwarded to the splitter).
partner_partition_path = (
    "../../data/standardized/df_source.parquet"
    "/partner_id=E68029E9BCE099A60571AF757CBB6A08"
)
data = ReadOperation(path=partner_partition_path, object_format='parquet').execute()

feature_extractor = FeatureExtractionOperation()
df_features = feature_extractor.execute(df=data)

splitter = SplitDataOperation()
df_train, df_validation, df_test = splitter.execute(
    df=df_features,
    validation_days=7,
    test_days=7,
)

  df.loc[:, c.SALES_AMOUNT_IN_EURO] = df[c.SALES_AMOUNT_IN_EURO].astype(float).apply(lambda x: max(x, 0))
  df.loc[:, c.PRODUCT_PRICE] = df[c.PRODUCT_PRICE].apply(float)


In [7]:
# Inspect the training split; the rendering below is truncated by pandas
# (middle rows are elided), so this is a sanity check rather than a full dump.
df_train

Unnamed: 0,unique_product_id,date,day_of_campaign,TotalSalesAmountInEuro,NumberOfClicks,product_day_id,product_day_index
4,1,2020-10-07,1,0.00000,4.0,1_2020-10-07,5
5,1,2020-10-11,5,63.94464,10.0,1_2020-10-11,6
6,1,2020-10-13,7,0.00000,6.0,1_2020-10-13,7
7,1,2020-10-12,6,0.00000,14.0,1_2020-10-12,8
8,1,2020-10-10,4,472.33280,14.0,1_2020-10-10,9
...,...,...,...,...,...,...,...
15781,564,2020-10-14,8,0.00000,0.0,,15782
15784,564,2020-10-08,2,0.00000,0.0,,15785
15785,564,2020-10-16,10,0.00000,0.0,,15786
15790,564,2020-10-15,9,0.00000,0.0,,15791


In [25]:
import optuna
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


class TotalSalesPredictor:
    """Tune, fit and apply a random-forest model for ``TotalSalesAmountInEuro``.

    The predictor keeps private copies of the train / validation / test frames,
    turns each frame into a feature matrix (adding a ``day_of_week`` feature
    derived from ``date``), tunes a ``StandardScaler`` + ``RandomForestRegressor``
    pipeline with Optuna, and predicts on the test frame.

    Parameters
    ----------
    train_df, val_df, test_df : pandas.DataFrame
        Frames containing at least a ``date`` column. The train and validation
        frames must also contain ``TotalSalesAmountInEuro``.
    n_trials : int, default 50
        Number of Optuna trials used by ``hyperparameter_optimization``.
    random_state : int, default 42
        Seed shared by the forest, the K-fold shuffling and the Optuna sampler
        so that repeated runs give the same result.
    """

    # Name of the regression target.
    TARGET_COLUMN = 'TotalSalesAmountInEuro'
    # Columns that are never passed to the model as features.
    NON_FEATURE_COLUMNS = ('TotalSalesAmountInEuro', 'product_day_id', 'date')

    # Class-level defaults so the tuning methods also work on instances that
    # were created without running __init__.
    n_trials = 50
    random_state = 42

    def __init__(self, train_df, val_df, test_df, n_trials=50, random_state=42):
        self.train_df = train_df.copy()
        self.val_df = val_df.copy()
        self.test_df = test_df.copy()
        self.n_trials = n_trials
        self.random_state = random_state
        self.model = None
        # RMSE on the validation frame; populated by train_model().
        self.validation_rmse_ = None

    @staticmethod
    def _rmse(y_true, y_pred):
        """Return the root mean squared error between ``y_true`` and ``y_pred``."""
        # The `squared=False` keyword of mean_squared_error was deprecated in
        # scikit-learn 1.4 and later removed; taking the root explicitly works
        # on every version.
        return mean_squared_error(y_true, y_pred) ** 0.5

    def _build_pipeline(self):
        """Return an unfitted scaler + random-forest pipeline with default hyperparameters."""
        return Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', RandomForestRegressor(random_state=self.random_state)),
        ])

    def preprocess_data(self, df):
        """Split ``df`` into a feature matrix and a target vector.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with a ``date`` column (datetime or parseable strings).

        Returns
        -------
        tuple
            ``(X, y)`` where ``X`` is ``df`` plus a ``day_of_week`` column
            (Monday=0) minus the non-feature columns, and ``y`` is the
            ``TotalSalesAmountInEuro`` column, or ``None`` when ``df`` has no
            target column (e.g. unlabeled data passed for prediction).

        Raises
        ------
        KeyError
            If ``df`` has no ``date`` column.
        ValueError
            If any date is missing / unparseable, or the column cannot be
            converted to a datetime dtype.
        """
        df = df.copy()  # never mutate the caller's frame

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            df['date'] = pd.to_datetime(df['date'], errors='coerce')

        # errors='coerce' turns unparseable values into NaT; surface them loudly.
        if df['date'].isnull().any():
            raise ValueError("NaT values found in 'date' column after conversion. Please check the input data.")

        # Accept any datetime64 flavour (timezone-aware or non-nanosecond
        # resolution included); comparing against the exact string
        # 'datetime64[ns]' rejected such otherwise valid columns.
        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            raise ValueError('Date conversion failed. Date column is not a datetime dtype.')

        df['day_of_week'] = df['date'].dt.dayofweek

        # Drop only the non-feature columns that are actually present so that
        # frames without a target or without 'product_day_id' still work.
        present = [col for col in self.NON_FEATURE_COLUMNS if col in df.columns]
        X = df.drop(columns=present)
        y = df[self.TARGET_COLUMN] if self.TARGET_COLUMN in df.columns else None

        return X, y

    def objective(self, trial, X_train, y_train):
        """Optuna objective: mean 5-fold cross-validated RMSE for one trial.

        Parameters
        ----------
        trial : optuna.trial.Trial
            Trial used to sample the forest hyperparameters.
        X_train, y_train : pandas.DataFrame, pandas.Series
            Training features and target (indexed positionally via ``iloc``).

        Returns
        -------
        float
            Average RMSE over the folds (lower is better).
        """
        pipeline = self._build_pipeline()

        sampled_params = {
            'regressor__n_estimators': trial.suggest_int('regressor__n_estimators', 50, 200),
            'regressor__max_depth': trial.suggest_int('regressor__max_depth', 10, 30),
            'regressor__min_samples_split': trial.suggest_int('regressor__min_samples_split', 2, 10),
            'regressor__min_samples_leaf': trial.suggest_int('regressor__min_samples_leaf', 1, 4),
        }
        pipeline.set_params(**sampled_params)

        # NOTE(review): folds are shuffled, so rows from different dates are
        # mixed between the fit and evaluation parts of each fold; consider a
        # time-ordered split if temporal leakage matters for this data.
        kf = KFold(n_splits=5, shuffle=True, random_state=self.random_state)

        fold_scores = []
        for train_index, val_index in kf.split(X_train):
            X_t, X_v = X_train.iloc[train_index], X_train.iloc[val_index]
            y_t, y_v = y_train.iloc[train_index], y_train.iloc[val_index]
            pipeline.fit(X_t, y_t)
            fold_scores.append(self._rmse(y_v, pipeline.predict(X_v)))

        return sum(fold_scores) / len(fold_scores)

    def hyperparameter_optimization(self, X_train, y_train, n_trials=None):
        """Tune the pipeline with Optuna and return it refitted on all training data.

        Parameters
        ----------
        X_train, y_train : pandas.DataFrame, pandas.Series
            Training features and target.
        n_trials : int, optional
            Number of trials; defaults to ``self.n_trials``.

        Returns
        -------
        sklearn.pipeline.Pipeline
            Pipeline using the best hyperparameters found, fitted on
            ``X_train`` / ``y_train``.
        """
        if n_trials is None:
            n_trials = self.n_trials

        # Seed the sampler so the search is reproducible across runs.
        sampler = optuna.samplers.TPESampler(seed=self.random_state)
        study = optuna.create_study(direction='minimize', sampler=sampler)
        study.optimize(lambda trial: self.objective(trial, X_train, y_train), n_trials=n_trials)

        # The trial parameter names are already pipeline parameter names
        # ('regressor__...'), so they can be applied directly.
        best_pipeline = self._build_pipeline()
        best_pipeline.set_params(**study.best_params)
        best_pipeline.fit(X_train, y_train)
        return best_pipeline

    def train_model(self):
        """Tune and fit the model on the training frame, then report validation RMSE.

        The fitted pipeline is stored in ``self.model`` and the validation
        RMSE in ``self.validation_rmse_``; the RMSE is also printed.

        Raises
        ------
        ValueError
            If the training or validation frame has no target column.
        """
        X_train, y_train = self.preprocess_data(self.train_df)
        X_val, y_val = self.preprocess_data(self.val_df)

        if y_train is None or y_val is None:
            raise ValueError("Training and validation data must contain the 'TotalSalesAmountInEuro' column.")

        self.model = self.hyperparameter_optimization(X_train, y_train)

        rmse = self._rmse(y_val, self.model.predict(X_val))
        self.validation_rmse_ = rmse
        print(f'Validation RMSE: {rmse}')

    def predict(self):
        """Return the model's predictions for the test frame.

        Raises
        ------
        RuntimeError
            If called before ``train_model()``.
        """
        if self.model is None:
            raise RuntimeError('Model has not been trained. Call train_model() before predict().')

        X_test, _ = self.preprocess_data(self.test_df)
        return self.model.predict(X_test)


In [26]:
# Tune and fit on the training split (validation RMSE is printed by
# train_model), then score the held-out test split.
predictor = TotalSalesPredictor(
    train_df=df_train,
    val_df=df_validation,
    test_df=df_test,
)
predictor.train_model()
predictions = predictor.predict()
print(predictions)

[I 2024-05-17 19:46:12,201] A new study created in memory with name: no-name-65727bc1-9c70-4a85-b899-55b88cde2092


[I 2024-05-17 19:46:17,298] Trial 0 finished with value: 18.547141877928944 and parameters: {'regressor__n_estimators': 187, 'regressor__max_depth': 17, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 1}. Best is trial 0 with value: 18.547141877928944.
[I 2024-05-17 19:46:21,874] Trial 1 finished with value: 18.122982655636093 and parameters: {'regressor__n_estimators': 165, 'regressor__max_depth': 26, 'regressor__min_samples_split': 9, 'regressor__min_samples_leaf': 2}. Best is trial 1 with value: 18.122982655636093.
[I 2024-05-17 19:46:24,313] Trial 2 finished with value: 18.47147177844352 and parameters: {'regressor__n_estimators': 92, 'regressor__max_depth': 12, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 1}. Best is trial 1 with value: 18.122982655636093.
[I 2024-05-17 19:46:25,642] Trial 3 finished with value: 17.9871101003231 and parameters: {'regressor__n_estimators': 50, 'regressor__max_depth': 15, 'regressor__min_samples_split': 7, 'reg

Validation RMSE: 15.462443620532932
[67.53849496 46.27829074 31.62882263 ...  3.68699839  0.
  0.        ]
