### Import libraries

In [None]:
from pathlib import Path

import pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from tpot import TPOTRegressor

# Lift pandas' column display limit so wide DataFrames are shown without truncation.
pd.set_option('display.max_columns', None)  

### Read dataset

In [None]:
# Location of the input dataset, relative to the notebook's working directory.
DATASET_PATH = 'data/dataset_v3.csv'

# Load the full dataset into memory; later cells split it by the 'date' column.
df = pd.read_csv(DATASET_PATH)

I decided to use TPOT as the AutoML framework. I initially considered auto-sklearn, but chose TPOT after reading this article, which recommends it for regression problems: https://medium.com/georgian-impact-blog/choosing-the-best-automl-framework-4f2a90cb1826

Now let's split this dataframe into train and test. We'll hold out the most recent 20% of the unique dates as the test set; the split is made by date, so no day appears in both sets.

In [None]:
# Fraction of the distinct dates (the most recent ones) held out as the test set.
TEST_FRACTION = 0.2

# Distinct dates in ascending order. Sorting explicitly matters: Series.unique()
# returns values in order of appearance, so indexing it from the end only yields
# the latest dates if the CSV happens to be sorted by date.
# NOTE(review): if 'date' is stored as strings, this ordering (and the < / >=
# comparisons below) is lexicographic -- assumes an ISO-like YYYY-MM-DD format; confirm.
unique_dates = df['date'].dropna().drop_duplicates().sort_values()
if len(unique_dates) < 2:
    raise ValueError("Need at least two distinct dates to build a train/test split.")

# Number of distinct dates in the test window. Clamp to at least 1: with
# num_dates == 0 the index -0 would select the FIRST date and leave train empty.
num_dates = max(1, int(len(unique_dates) * TEST_FRACTION))

# First date that belongs to the test window.
split_date = unique_dates.iloc[-num_dates]

# Split by date so that no day is shared between train and test.
# The training rows are put in chronological order (stable sort keeps the original
# order within a day) because TimeSeriesSplit builds its folds from row positions.
train = df[df['date'] < split_date].sort_values('date', kind='stable')
test = df[df['date'] >= split_date]

# Target column, plus the columns that are dropped rather than used as features.
target_col = 'pageviews_-1d_lag'
non_feature_cols = [target_col, 'offer_id', 'date']  # 'date' is not a feature

X_train = train.drop(columns=non_feature_cols)
y_train = train[target_col]

X_test = test.drop(columns=non_feature_cols)
y_test = test[target_col]

Define the search space.

In [None]:
# Custom TPOT search space: maps each estimator's import path to the
# hyperparameter values TPOT may choose from when building pipelines.
tpot_config = {
    # Existing ensemble models
    'sklearn.ensemble.RandomForestRegressor': {
        'n_estimators': [100, 200, 300],
        # 1.0 (use all features) replaces "auto": for the regressor "auto" meant
        # n_features, and the string was deprecated in scikit-learn 1.1 and
        # removed in 1.3, where it makes every candidate using it fail.
        'max_features': [1.0, "sqrt", "log2"],
        'max_depth': [1, 5, 10],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 5, 10, 15, 20],
        'bootstrap': [True, False]
    },
    'xgboost.XGBRegressor': {
        'n_estimators': [100, 200, 300],
        'max_depth': [1, 5, 10],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [1, 2, 5, 10]
    },
    'lightgbm.LGBMRegressor': {
        'num_leaves': [20, 50, 100, 150],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 200, 300],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        # LightGBM only applies `subsample` when bagging is enabled via
        # subsample_freq > 0 (default 0); without it the five subsample values
        # above would all produce the same model.
        'subsample_freq': [1],
        'max_depth': [1, 5, 10]
    },
}

### Instantiate the TPOTRegressor for AutoML

In [None]:
# Number of cross-validation folds used during the pipeline search.
n_splits = 5

# Time-ordered cross-validator handed to TPOT as its `cv` strategy.
# NOTE(review): folds are built from row positions, so this presumes the
# training rows are in chronological order -- confirm against the split cell.
time_series_cv = TimeSeriesSplit(n_splits=n_splits)

In [None]:
# Settings for the AutoML search, collected in one place for readability.
tpot_settings = dict(
    # Search space and evaluation strategy
    config_dict=tpot_config,
    cv=time_series_cv,
    # Size of the evolutionary search
    generations=5,
    population_size=50,
    # Time budgets, in minutes: whole search / single pipeline evaluation
    max_time_mins=60,
    max_eval_time_mins=10,
    # Execution and reporting
    n_jobs=-1,
    random_state=42,
    verbosity=2,
)

tpot = TPOTRegressor(**tpot_settings)

# Launch the search on the training data.
tpot.fit(X_train, y_train)

The cell above runs the TPOT optimization; the search is capped at 60 minutes by `max_time_mins`.

Check the score of the best pipeline.

In [None]:
# Evaluate the fitted TPOT object on the held-out test set.
# NOTE(review): no `scoring` was passed to TPOTRegressor, so this is presumably
# TPOT's default regression metric -- confirm which one before interpreting it.
test_score = tpot.score(X_test, y_test)
print("Test Score: ", test_score)

Export the best pipeline as a Python script file.

In [None]:
# Destination script for the best pipeline found by the search.
export_path = Path('tpot_model_selection/tpot_model_pipeline_ensemble.py')

# Create the target folder first: without it the export would fail on a missing
# directory only after the whole (long-running) search has finished.
export_path.parent.mkdir(parents=True, exist_ok=True)

tpot.export(str(export_path))