In [1]:
import pandas as pd
import numpy as np

# Constants
ALPHA = 0.5  # Weight for purchase frequency
BETA = 0.5   # Weight for recency
K = 5        # Desired bundle size
C_C_MAX = 1000  # Consultant's purchasing power
THETA = 0.5  # Threshold percentage of C_C_MAX
N_CATEGORIES = 2  # Minimum categories for diversity

# Step 1: Load and preprocess data
def load_data(input_csv):
    return pd.read_csv(input_csv)

def preprocess_data(data):
    data["frequency"] = data.groupby("product_id")["product_id"].transform("count")
    data["recency"] = normalize(
        pd.to_datetime(data["purchase_date"]).max() - pd.to_datetime(data["purchase_date"])
    )
    return data

# Utility function
def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

# Step 2: Anchor product selection
def select_anchor_product(purchase_history):
    purchase_history["combined_score"] = (
        ALPHA * purchase_history["frequency"] + BETA * purchase_history["recency"]
    )
    return purchase_history.loc[purchase_history["combined_score"].idxmax(), "product_id"]

# Step 3: Generate related items mapping (dummy implementation)
def generate_related_items_mapping(data):
    related_items = {}
    for product in data["product_id"].unique():
        related_items[product] = data[data["product_id"] != product]["product_id"].sample(min(3, len(data) - 1)).tolist()
    return related_items

# Step 4: Bundle creation
def create_bundle(data, related_items, anchor_product):
    bundle = [anchor_product]
    catalog = data["product_id"].unique()
    while len(bundle) < K:
        candidates = generate_candidates(bundle, related_items, catalog)
        candidates_df = filter_candidates(data, candidates)
        if candidates_df.empty:
            break
        next_product = select_next_product(data, candidates_df, bundle)
        if not validate_bundle(data, bundle, next_product):
            continue
        bundle.append(next_product)
    return bundle

# Step 4(a): Generate candidate products
def generate_candidates(bundle, related_items, catalog):
    candidates = set()
    for product in bundle:
        candidates.update(related_items.get(product, []))
    return list(candidates - set(bundle))

# Step 4(b): Filter candidates by cost threshold
def filter_candidates(data, candidates):
    candidates_df = data[data["product_id"].isin(candidates)].copy()
    return candidates_df[candidates_df["price"] <= THETA * C_C_MAX]

# Step 4(c): Score and select the next product
def select_next_product(data, candidates_df, bundle):
    candidates_df["score"] = candidates_df.apply(
        lambda row: 0.7 * (row["category"] in data[data["product_id"].isin(bundle)]["category"].values) + 0.3,
        axis=1
    )
    return candidates_df.sort_values("score", ascending=False).iloc[0]["product_id"]

# Step 4(d): Validate the bundle constraints
def validate_bundle(data, bundle, next_product):
    temp_bundle = bundle + [next_product]
    total_cost = data[data["product_id"].isin(temp_bundle)]["price"].sum()
    if total_cost > C_C_MAX:
        return False
    bundle_categories = data[data["product_id"].isin(temp_bundle)]["category"].nunique()
    return bundle_categories >= N_CATEGORIES

# Main function
def main(input_csv):
    data = load_data(input_csv)
    data = preprocess_data(data)
    related_items = generate_related_items_mapping(data)
    anchor_product = select_anchor_product(data)
    bundle = create_bundle(data, related_items, anchor_product)
    return data[data["product_id"].isin(bundle)]

if __name__ == "__main__":
    input_csv = "data/F1_sample_data_1000_consultora_with_date.csv"
    final_bundle = main(input_csv)
    print("Final Bundle:")
    print(final_bundle)


KeyboardInterrupt: 