In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from unique_key_generator import generate_keys_for_column , generate_transaction_ID
from item_sampler import get_sampled_item
from seasonality_adjustments import adjust_weekly_seasonality, adjust_monthly_seasonality, adjust_yearly_seasonality,add_trend_factor



## ID Generation
To avoid accidentally generating duplicate IDs or keys, make sure the key length you pass in is at least `Num_values + 1`.

In [2]:
# Load the original item list from "Item Category.csv".
items_df = pd.read_csv("../Original Files/Item Category.csv")

# One-off key generation, left disabled. Uncommenting the lines below calls
# generate_keys_for_column on the "Category", "Subcategory" and "Item Name" columns
# (the numeric argument is presumably the key length mentioned in the "ID Generation"
# note - confirm in unique_key_generator) and then saves the keyed table as CSV.
# NOTE(review): the disabled to_csv writes to "./Output Files/..." while the
# function-call cell reads "../Output Files/Item Category.csv" - confirm which
# path is intended before re-enabling.
#items_df = generate_keys_for_column("Category", 3, items_df)
#items_df =  generate_keys_for_column("Subcategory", 4, items_df)
#items_df =  generate_keys_for_column("Item Name", 5, items_df)
#items_df.to_csv("./Output Files/Item Category.csv", index=False)
#items_df.head(5)

## Set weights
If a certain category or store should have higher sales, increase its weight.

NOTE: If you want subcategory- or item-level weights, be sure to adjust `item_sampler.py` accordingly.

In [3]:
# Sampling weight per item category; handed to get_sampled_item by
# generate_transactions. Raise a value to make that category sell more.
categorical_weights = {
    'Bakery & Desserts': 0.5,
    'Snacks': 0.5,
    'Pantry & Dry Goods': 0.7,
    'Beverages & Water': 0.5,
    'Alcoholic Beverages': 0.3,
    'Deli': 0.8,
    'Meat & Seafood': 0.6,
    'Fresh Produce': 0.9,
    'Household and Outdoors': 0.3,
    'Electronics': 0.2,
}

# Store ID -> [low, high]. generate_transactions draws the base number of daily
# transactions with random.randint(low, high); `low` is also reused there to cap
# the items per transaction (low + 3) and as part of the yearly trend rate (low / 90).
store_weights = {
    'CPT001': [5, 21],
    'CPT003': [8, 30],
    'CPT004': [9, 34],
    'STB001': [5, 18],
    'STB003': [1, 20],
    'JHB001': [2, 16],
    'JHB002': [7, 25],
    'DBN001': [3, 17],
}

# First trading day per store; generate_transactions skips a store on earlier dates.
store_start_date = {
    'CPT001': datetime(2020, 1, 1),
    'CPT003': datetime(2020, 1, 1),
    'CPT004': datetime(2021, 6, 1),
    'STB001': datetime(2020, 1, 1),
    'STB003': datetime(2022, 1, 1),
    'JHB001': datetime(2020, 1, 1),
    'JHB002': datetime(2020, 10, 1),
    'DBN001': datetime(2020, 1, 1),
}

# Per-store trend term added to low / 90 and passed to add_trend_factor;
# a negative value is presumably a declining store - confirm in seasonality_adjustments.
store_trend = {
    'CPT001': 0.06,
    'CPT003': 0.07,
    'CPT004': 0.09,
    'STB001': 0.04,
    'STB003': -0.02,
    'JHB001': 0.04,
    'JHB002': 0.08,
    'DBN001': 0.02,
}

# Controls how many units of an item are bought in one transaction line,
# i.e. per customer. item_probs are relative likelihoods that need not sum to 1;
# they are normalised below so np.random.choice accepts them as probabilities.
item_quantity = [1, 2, 3, 4, 5, 6]
item_probs = [0.4, 0.25, 0.18749999999999997, 0.125, 0.0625, 0.0625]
total_sum = sum(item_probs)
item_quantity_probabilities = [p / total_sum for p in item_probs]

## Transactions Data Generation Function


In [4]:
def generate_transactions(start_date: datetime,
                          end_date: datetime,
                          store_list: list,
                          items_df: pd.DataFrame,
                          categorical_weights: dict,
                          store_weights: dict,
                          store_start_date: dict,
                          store_trend: dict,
                          item_quantity: list,
                          item_quantity_probabilities: list,
                          seed: "int | None" = None,
                          ) -> pd.DataFrame:
    """Simulate item-level sales transactions for each store and day in a date range.

    For every day from ``start_date`` up to (but not including) ``end_date`` and
    every store that is already open, a base transaction count is drawn, passed
    through the yearly/monthly/weekly seasonality helpers and the trend helper,
    and that many transactions are generated. Each transaction holds a random
    number of sampled items and contributes one output row per item.

    Args:
        start_date: First day to simulate (inclusive). Its year is also passed
            to ``add_trend_factor`` as the reference year.
        end_date: Day at which simulation stops (exclusive).
        store_list: Store IDs to simulate.
        items_df: Item table handed to ``get_sampled_item``; each sampled item
            must expose ``"Item_ID"``, ``"Category"`` and ``"Price"``.
        categorical_weights: Category weights handed to ``get_sampled_item``.
        store_weights: Store ID -> ``[low, high]``. ``random.randint(low, high)``
            gives the base daily transaction count; ``low`` also bounds the items
            per transaction (``1 .. low + 3``) and adds ``low / 90`` to the trend rate.
        store_start_date: Store ID -> first trading day; earlier days are skipped.
        store_trend: Store ID -> trend term added to ``low / 90``.
        item_quantity: Possible quantities for one item line.
        item_quantity_probabilities: Probabilities matching ``item_quantity``
            (must sum to 1, as required by ``np.random.choice``).
        seed: Optional seed applied to both the ``random`` and ``np.random``
            global generators before generation starts. Helpers are reproducible
            only if they draw from those same generators - not verifiable here.

    Returns:
        DataFrame with columns ``Transaction ID``, ``Transaction Date``
        (``YYYY-MM-DD`` string), ``Store ID``, ``Item ID``, ``Quantity``,
        ``Unit Price`` and ``Total Amount``; empty if nothing was generated.

    Raises:
        KeyError: If a store in ``store_list`` is missing from
            ``store_start_date``, ``store_weights`` or ``store_trend``.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # Dummy time-of-day each store starts from; it only helps build transaction IDs.
    opening_clock = datetime.strptime("08:00:00", "%H:%M:%S")

    transaction_data = []
    transaction_date = start_date
    while transaction_date < end_date:
        # TODO: add a weight-adjusting function for holidays and such
        # (category_weights = default_weights).
        print(f"Transaction Date: {transaction_date}")
        date_label = transaction_date.strftime("%Y-%m-%d")
        for store in store_list:
            # Index directly so an unconfigured store fails with a KeyError naming
            # the store instead of an opaque TypeError on None.
            if store_start_date[store] > transaction_date:
                continue
            store_weight = store_weights[store]
            num_transactions = random.randint(store_weight[0], store_weight[1])

            # Add seasonality.
            num_transactions = adjust_yearly_seasonality(transaction_date, num_transactions, store_weight)
            num_transactions = adjust_monthly_seasonality(transaction_date, num_transactions, store_weight)
            num_transactions = adjust_weekly_seasonality(transaction_date, num_transactions, store_weight)

            # Add a decreasing/increasing trend across years.
            num_transactions = add_trend_factor(
                num_transactions,
                start_date.year,
                transaction_date.year,
                store_weight[0] / 90 + store_trend[store],
            )

            # TODO: add an event adjustment to the number of items per transaction.
            time_object = opening_clock
            transaction = 0
            # Strict "<" so exactly num_transactions transactions are produced
            # ("<=" generated one extra). A while loop is kept because the helpers
            # may return a non-integer count.
            while transaction < num_transactions:
                transaction_id = generate_transaction_ID(store, transaction_date, time_object)
                # TODO: adjust the number of items in a transaction given a condition?
                num_items_in_transaction = random.randint(1, store_weight[0] + 3)

                for _ in range(num_items_in_transaction):
                    item = get_sampled_item(items_df, categorical_weights)
                    # Remove this special case if you are using your own
                    # item category file.
                    if item["Category"] == "Electronics":
                        quantity = 1
                    else:
                        quantity = np.random.choice(item_quantity, p=item_quantity_probabilities)
                    unit_price = item['Price']
                    transaction_data.append(
                        [transaction_id,
                         date_label,
                         store,
                         item["Item_ID"],
                         quantity,
                         unit_price,
                         unit_price * quantity])
                # Advance the dummy clock by a random gap before the next transaction.
                time_object += timedelta(minutes=random.randint(0, 30), seconds=random.randint(0, 59))
                transaction += 1
        transaction_date += timedelta(days=1)

    columns = ['Transaction ID',
               'Transaction Date',
               'Store ID',
               'Item ID',
               'Quantity',
               'Unit Price',
               'Total Amount']
    return pd.DataFrame(transaction_data, columns=columns)

In [5]:
# NOTE(review): apart from the name `default_weights`, this cell repeats the
# values of the "Set weights" cell; consider keeping a single copy of the settings.

# Baseline sampling weight for each item category.
default_weights = {
    "Bakery & Desserts": 0.5,
    "Snacks": 0.5,
    "Pantry & Dry Goods": 0.7,
    "Beverages & Water": 0.5,
    "Alcoholic Beverages": 0.3,
    "Deli": 0.8,
    "Meat & Seafood": 0.6,
    "Fresh Produce": 0.9,
    "Household and Outdoors": 0.3,
    "Electronics": 0.2,
}

# Two-element [low, high] weight pair for each store ID.
store_weights = {
    "CPT001": [5, 21],
    "CPT003": [8, 30],
    "CPT004": [9, 34],
    "STB001": [5, 18],
    "STB003": [1, 20],
    "JHB001": [2, 16],
    "JHB002": [7, 25],
    "DBN001": [3, 17],
}

# Date associated with the start of each store.
store_start_date = {
    "CPT001": datetime(2020, 1, 1),
    "CPT003": datetime(2020, 1, 1),
    "CPT004": datetime(2021, 6, 1),
    "STB001": datetime(2020, 1, 1),
    "STB003": datetime(2022, 1, 1),
    "JHB001": datetime(2020, 1, 1),
    "JHB002": datetime(2020, 10, 1),
    "DBN001": datetime(2020, 1, 1),
}

# Trend value for each store.
store_trend = {
    "CPT001": 0.06,
    "CPT003": 0.07,
    "CPT004": 0.09,
    "STB001": 0.04,
    "STB003": -0.02,
    "JHB001": 0.04,
    "JHB002": 0.08,
    "DBN001": 0.02,
}

# Candidate quantities and their relative likelihoods, rescaled to sum to 1.
item_quantity = [1, 2, 3, 4, 5, 6]
item_probs = [
    0.4,
    0.25,
    0.18749999999999997,
    0.125,
    0.0625,
    0.0625,
]
total_sum = sum(item_probs)
item_quantity_probabilities = [weight / total_sum for weight in item_probs]

## Function call

In [6]:
# Inputs: the store master list and the item table from the output folder.
stores_df = pd.read_csv("../Original Files/StoreMaster.csv")
items_df = pd.read_csv("../Output Files/Item Category.csv")

# Simulation window; generate_transactions stops before end_date, so the
# last simulated day is 30 January 2020.
start_date = datetime(2020, 1, 1)
end_date = datetime(2020, 1, 31)

# Every distinct store ID found in the store master list.
store_list = stores_df["Store_ID"].unique()

transactions_df = generate_transactions(
    start_date=start_date,
    end_date=end_date,
    store_list=store_list,
    items_df=items_df,
    categorical_weights=categorical_weights,
    store_weights=store_weights,
    store_start_date=store_start_date,
    store_trend=store_trend,
    item_quantity=item_quantity,
    item_quantity_probabilities=item_quantity_probabilities,
)

# Persist the generated transactions without the DataFrame index.
transactions_df.to_csv("../Output Files/example_output.csv", index=False)



TypeError: generate_transactions() missing 4 required positional arguments: 'store_start_date', 'store_trend', 'item_quantity', and 'item_quantity_probabilities'