### Pavan Kumar Mistry - Team-1 
#### Homework-4

In [2]:
import pandas as pd
import sqlite3 as lite

In [3]:
NUM_USERS_LOW = 1000
NUM_USERS_HIGH = 1040
PRICE_MULTIPLIER = 1.05
MAX_ITEMS = 80
WEEKEND_INCREASE = 75

database_name = "store2"

In [4]:
class ProductManager:
    def __init__(self, products_file):
        self.products_file = products_file
        self.products_frame = self._read_products_file() 
        self.sku_info = self.pull_sku_info(self.products_frame)

    def get_products_file(self):
        return self.products_file

    def get_all_skus(self):
        return self.sku_info.keys()

    def show_information(self):
        '''
        Show information about the products frame
        '''
        print(len(self.products_frame))
        print(self.products_frame.columns)
        print(self.products_frame.head())
        print(self.products_frame["itemType"].unique())
    
    def get_sku_info(self, sku):
        '''Get the record from dataframe using sku value'''
        return self.sku_info[sku]

    def get_skus_of_item_type(self, item_type):
        '''Get the list of sku values for a given item type'''
        item_type_frame = self.products_frame[self.products_frame['itemType'] == item_type]
        return item_type_frame['SKU'].values

    def get_non_special_items(self, item_types):
        '''Get all the skus for items outside a set of given item_types'''
        item_type_frame = self.products_frame[~self.products_frame['itemType'].isin(item_types)]
        return item_type_frame['SKU'].values
    
    def _read_products_file(self):
        df = pd.read_csv(self.products_file, delimiter="|",
                         encoding="ISO-8859-1")
        df.fillna("Unknow", inplace=True)
        return df 

    def pull_sku_info(self, products_frame):
        output = {}

        for index, row in products_frame.iterrows():
            output[row['SKU']] = {
                "BasePrice": row['BasePrice'],
                "ProductName": row['Product Name'],
                "Category": row['itemType']
            }
    
        return output

In [5]:
conn = lite.connect(f'{database_name}.db')
cur = conn.cursor()

# Drop all existing tables
cur.execute("Drop table if exists products")
cur.execute("Drop table if exists user_transactions")

conn.commit()

In [6]:
# Create the table
cur.execute("""
    create table if not exists products(
            Manufacturer varchar(30),
            ProductName varchar(200),
            Size varchar(20),
            SKU varchar(20),
            itemType varchar(20),
            BasePrice varchar(10)
            )
""")

# cur.execute("""
#     CREATE TABLE IF NOT EXISTS inventory (
#         Manufacturer varchar(30),
#         ProductName varchar(200),
#         Size varchar(20),
#         SKU varchar(20),
#         itemType varchar(20),
#         BasePrice varchar(10),
#         items_left INT,
#         total_cases_ordered INT DEFAULT 0
#     )
# """)

<sqlite3.Cursor at 0x2543af10bc0>

In [7]:
import datetime

In [8]:
class UserTransactionManager:
    def __init__(self, database_file):
        self.database_file = database_file
        self.conn, self.cursor = self._setup_database(self.database_file)

    def commit_changes(self):
        self.conn.commit()

    def close_db_connection(self):
        self.conn.close()
    
    def reopen_db_connection(self):
        self.conn, self.cursor = self._setup_database(self.database_file)
    
    def get_user_transactions(self):
        return self.cursor.execute("select * from user_transactions").fetchall()

    def fetch_transactions_by_date(self, date):
        date_str = "{:%Y/%m/%d}".format(date)
        return self.cursor.execute('''
            select * from user_transactions where date = ?
        ''', (date_str, ))

    def get_total_transaction_count(self):
        return self.cursor.execute('''
            select count(*) from user_transactions
        ''').fetchall()[0][0]

    def get_total_sales(self):
        return self.cursor.execute('''
            select sum(sale_price) from user_transactions
        ''').fetchall()[0][0]

    def top_selling_items(self):
        return self.cursor.execute("""
            select count(transaction_id) as cnt1, sku from user_transactions group by sku order by cnt1 desc limit 10
        """).fetchall()

    def get_number_of_users(self):
        return len(self.cursor.execute("select count(transaction_id) as count from user_transactions group by date, customer_number").fetchall())

    def load_transaction(self, _date, user_id, sku, sale_price, item_left, total_cases_ordered):
        date_str = "{:%Y/%m/%d}".format(datetime.strptime(_date, "%Y%m%d"))
        self.cursor.execute("INSERT INTO user_transactions(date, customer_number, sku, sale_price, items_left, cases_ordered) VALUES(?, ?, ?, ?, ?, ?)",
                            (date_str, user_id, sku, sale_price, item_left, total_cases_ordered))

    def get_database_file(self):
        return self.database_file
    
    def _setup_database(self, database_file):
        try:
            conn = lite.connect(database_file)
            cursor = conn.cursor()

            cursor.execute("CREATE TABLE IF NOT EXISTS " +
                           "user_transactions(transaction_id INTEGER PRIMARY KEY AUTOINCREMENT, " +
                           "date TEXT, " +
                           "customer_number INT, " +
                           "sku INT, " +
                           "sale_price DECIMAL, items_left INT, cases_ordered INT)")
            return conn, cursor
        except lite.Error as e:
            print("Unable to connect to database or create user_transactions table")
            print(e)

In [9]:
product_manager = ProductManager(products_file="./Products1.txt")
userTransactionManager = UserTransactionManager(database_file=f'{database_name}.db')

product_df = product_manager._read_products_file()

product_df.to_sql("products", userTransactionManager.conn,
                  if_exists="replace", index=False)

2075

In [10]:
import random
from time import time 

In [11]:
def inclusive_random(a, b):
    """
    Generate a random integer N such that a <= N <= b.
    """
    return random.randint(a, b + 1)

In [12]:
inclusive_random(980, 1020)

1003

In [13]:
def dice_roll(probability):
    """
    Simulate a single dice roll with the given probability of success.
    
    ex: 
    >>> dice_roll(0.5) = 50% then it returns True else returns False
    """

    return random.random() <= probability

In [14]:
dice_roll(.80)

True

In [15]:
def get_random_element(input_list):
    """Returns a random element from the input list."""

    if len(input_list) == 0:
        raise Exception("Input list is empty!!!")
    
    return input_list[inclusive_random(0, len(input_list) - 2)]

In [16]:
sample_list = [1,2,3,4,4,5,3,3]
get_random_element(sample_list)

1

In [17]:
speical_items = ["Milk", "Cereal", "Baby Food",
                     "Diapers", "Bread", "Peanut Butter", "Jelly/Jam"]

item_skus = {}

for item in speical_items:
    item_skus[item] = product_manager.get_skus_of_item_type(item)

In [18]:
print(list(item_skus.items())[:2])

[('Milk', array([42355001, 42356001, 42357001, 42358001, 42359001, 42360001])), ('Cereal', array([42091001, 42099001, 42336001, 42346001, 42347001, 42348001,
       42390001, 42391001, 42392001, 42439001, 42440001, 42441001,
       42442001, 42443001, 42444001, 42445001, 42446001, 42447001,
       42448001, 42449001, 42450001, 42651001, 42652001, 42653001,
       42654001, 42873001, 42874001, 42896001, 42897001, 42898001,
       42899001, 42900001, 42901001, 42902001, 42903001, 42904001,
       42905001, 42906001, 42908001, 42909001, 42913001, 42914001,
       42915001, 42919001, 42920001, 42921001, 42922001, 42923001,
       42924001, 42926001, 42930001, 42932001, 42933001, 42934001,
       42938001, 42939001, 42941001, 42942001, 43384001, 43385001,
       43386001, 43387001, 43388001, 43389001, 43390001, 43391001,
       43392001, 43393001, 43394001, 43395001, 43396001, 43397001,
       43398001, 43399001, 43400001, 43401001, 43402001, 43403001,
       43404001, 43530001, 43531001, 4

In [19]:
item_skus['other'] = product_manager.get_non_special_items(speical_items)

In [20]:
item_skus['other']

array([42081001, 42082001, 42083001, ..., 44158001, 44159001, 44160001],
      shape=(1660,))

In [21]:
for item_type in item_skus.keys():
    print(f"{item_type} : {len(item_skus[item_type])}")

Milk : 6
Cereal : 93
Baby Food : 162
Diapers : 82
Bread : 48
Peanut Butter : 20
Jelly/Jam : 4
other : 1660


In [22]:
def get_item_by_category(category):
    """Return a list of items that belong to the given category."""

    sku = get_random_element(item_skus[category])
    item = product_manager.get_sku_info(sku)
    sale_price = round((float(item['BasePrice'][1:]) * PRICE_MULTIPLIER), 2)

    return int(sku), sale_price

In [23]:
get_item_by_category("Cereal")

(42926001, 4.61)

In [24]:
# Homework-4:::::::::::::::::::::::::::

daily_items_per_cat = {
    "Milk": 737,
    "Cereal": 384,
    "Baby Food": 211,
    "Diapers": 177,
    "Bread": 527,
    "Peanut Butter": 105,
    "Jelly/Jam": 142,
    "other": 40376
}

In [25]:
# Set-up inventory
static_required = {}
item_inventory = {}
cases_ordered = {}

In [26]:
for category in item_skus.keys():
    category_skus = item_skus[category]

    # set inventory multiplier
    inventory_multiplier = 3

    if category == "Milk":
        inventory_multiplier = 1.5

    
    for sku in category_skus:
        items_per_sku_category = inventory_multiplier * daily_items_per_cat[category]

        # Dividing to find the average per item
        item_inventory[sku] = int(items_per_sku_category / len(category_skus))

        # add to account for cases of size 12
        if not item_inventory[sku] % 12 == 0:
            item_inventory[sku] += 12 - (item_inventory[sku] % 12)
        
        static_required[sku] = item_inventory[sku]

        # set total number of cases ordered
        cases_ordered[sku] = int(item_inventory[sku] / 12)


In [27]:
list(cases_ordered.items())[:5]

[(np.int64(42355001), 16),
 (np.int64(42356001), 16),
 (np.int64(42357001), 16),
 (np.int64(42358001), 16),
 (np.int64(42359001), 16)]

In [28]:
def is_items_exist(category):
    skus = item_skus[category]

    for sku in skus:
        if item_inventory[int(sku)] > 0:
            return True
    
    return False

In [29]:
is_items_exist("Milk")

True

### Transaction Database

In [30]:
import tqdm
from datetime import datetime, timedelta

# Initialize dates
current_date = datetime(2023, 12, 31)
end_date = datetime(2024, 12, 31)

# Calculate total days between start and end dates
total_days = (end_date - current_date).days

In [31]:
total_days

366

In [32]:

# Progress bar
pbar = tqdm.tqdm(total=total_days, desc="Processing Days")

# Main loop
while current_date < end_date:  # Stop before reaching end_date
    # Increment date at the beginning of the loop
    current_date += timedelta(days=1)
    date_str = "{:%Y/%m/%d}".format(current_date)

    increase = 0

    # Adding Weekdays
    weekday = current_date.strftime("%A")
    if weekday == "Saturday" or weekday == "Sunday":
        increase = WEEKEND_INCREASE
    
    # Random number of users, [low, high]
    num_of_users_today = inclusive_random(NUM_USERS_LOW + increase, NUM_USERS_HIGH + increase)


    """Simulate Deliveries"""

    # Milk
    for milk_sku in item_skus['Milk']:
        # Number we need vs number we have
        discrep = static_required[milk_sku] - item_inventory[milk_sku]

        # Number of cases needed (if we need even one extra, we order a whole new case)
        cases_to_order = int(discrep / 12)
        if not discrep % 12 == 0:
            cases_to_order += 1

        # Add items and cases
        item_inventory[milk_sku] += cases_to_order * 12
        cases_ordered[milk_sku] += cases_to_order

    # Simulate the rest of the stuff
    if weekday in ['Monday', "Wednesday", "Friday"]:
        for category in item_skus.keys():
            if not category == "Milk":
                for sku in item_skus[category]:
                    # Number we need vs number we have
                    discrep = static_required[sku] - item_inventory[sku]

                    # Number of cases needed (if we need even one extra, we order a whole new case)
                    cases_to_order = int(discrep / 12)
                    if not discrep % 12 == 0:
                        cases_to_order += 1

                    # Add items and cases
                    item_inventory[sku] += cases_to_order * 12
                    cases_ordered[sku] += cases_to_order

    # Simulate user transactions
    for user_i in range(1, num_of_users_today + 1):
        # Number of user items
        num_user_items_to_buy = inclusive_random(1, MAX_ITEMS)
        num_user_items_currently = 0
        user_items = []

        # Simulate milk buying
        if dice_roll(.7):  # 70% of users will buy milk
            user_items.append("Milk")
            if dice_roll(.5):  # 50% chance to also buy cereal
                user_items.append("Cereal")
        else:
            if dice_roll(.05):  # 5% chance to buy cereal if not buying milk
                user_items.append("Cereal")

        # Simulate baby food buying
        if dice_roll(.2):  # 20% buy baby food
            user_items.append("Baby Food")
            if dice_roll(.8):  # 80% chance to also buy diapers
                user_items.append("Diapers")
        else:
            if dice_roll(.01):  # 1% chance to buy diapers if not buying baby food
                user_items.append("Diapers")

        # Simulate bread buying
        if dice_roll(.5):  # 50% buy bread
            user_items.append("Bread")

        # Simulate peanut butter buying
        if dice_roll(.1):  # 10% buy peanut butter
            user_items.append("Peanut Butter")
            if dice_roll(.9):  # 90% chance to also buy jelly/jam
                user_items.append("Jelly/Jam")
        else:
            if dice_roll(.05):  # 5% chance to buy jelly/jam if not buying peanut butter
                user_items.append("Jelly/Jam")

        # Simulate the rest of the purchases
        for item_i in range(num_user_items_currently, num_user_items_to_buy):
            user_items.append("other")

        # Add all user transactions to the db
        for item in user_items[:num_user_items_to_buy]:
            # Manage inventory
            if is_items_exist(item):
                while True:
                    sku, sale_price = get_item_by_category(item)
                    if item_inventory[sku] > 0:
                        break
            else:
                if is_items_exist("other"):
                    while True:
                        sku, sale_price = get_item_by_category("other")
                        if item_inventory[sku] > 0:
                            break
                else:
                    break  # Exit if no items are available

            # Execute transaction
            item_inventory[sku] -= 1
            userTransactionManager.load_transaction(
                current_date.strftime('%Y%m%d'), user_i, int(
                    sku), sale_price, item_inventory[sku], cases_ordered[sku]
            )

    # Commit changes every 100 records
    if user_i % 100 == 0:
        userTransactionManager.commit_changes()

    # Update progress bar
    pbar.update(1)

# Close progress bar
pbar.close()

Processing Days: 100%|██████████| 366/366 [13:20<00:00,  2.19s/it]


In [33]:
top_10_sellling = userTransactionManager.top_selling_items()

In [34]:
top_10_sellling

[(44627, 42355001),
 (44592, 42358001),
 (44568, 42360001),
 (44554, 42359001),
 (44409, 42357001),
 (44341, 42356001),
 (12703, 42311001),
 (12498, 42313001),
 (12469, 42314001),
 (12461, 42312001)]

In [35]:
userTransactionManager.get_total_transaction_count()

15651599

In [36]:
userTransactionManager.get_number_of_users()

381641

In [37]:
userTransactionManager.get_total_sales()

50827941.02

In [38]:
transactions = userTransactionManager.get_user_transactions()

transactions_df = pd.DataFrame(transactions, columns=['Index', "Date", "Customer Number", "SKU","Sales Price", "Items_left", "Total cases Ordered"])

In [39]:
transactions_df.head(10)

Unnamed: 0,Index,Date,Customer Number,SKU,Sales Price,Items_left,Total cases Ordered
0,1,2024/01/01,1,42357001,3.87,191,16
1,2,2024/01/01,1,42448001,4.61,11,1
2,3,2024/01/01,1,43287001,5.76,11,1
3,4,2024/01/01,1,43031001,24.14,11,1
4,5,2024/01/01,1,43676001,3.02,71,6
5,6,2024/01/01,1,43077001,1.5,71,6
6,7,2024/01/01,1,42911001,2.62,71,6
7,8,2024/01/01,1,42454001,1.68,71,6
8,9,2024/01/01,1,43791001,1.05,71,6
9,10,2024/01/01,1,43859001,5.45,71,6


In [40]:
product_df = product_manager._read_products_file()
product_df.head(10)

Unnamed: 0,Manufacturer,Product Name,Size,itemType,SKU,BasePrice
0,Zatarains,Jambalaya Rice Mix,12 oz,Rice/Rice Mix,42081001,$2.49
1,Zatarains,Jambalaya Rice Mix,8 oz,Rice/Rice Mix,42082001,$1.79
2,Yucatan,Guacamole Regular,8 oz,Unknow,42083001,$3.99
3,Yuban,Coffee Original Blend,12 oz,Coffee/Creamer,42084001,$3.99
4,Yoplait,GoGurt Variety Pack,8 ct,Yogurt,42085001,$2.99
5,Wishbone,Italian Dressing,16 oz,Salad Dressing,42086001,$2.00
6,White Castle,Cheeseburger Heat & Serve Sliders,29.28 oz,Unknow,42087001,$11.59
7,Whiskas,Choice Cuts Poultry,36 oz,Pet Food,42088001,$4.99
8,Welchs,Farmers Pick Concord Grape,46 oz,Unknow,42089001,$3.59
9,Welchs,Juice Red Grape,64 oz,Juice,42090001,$4.79


In [41]:
product_df.isnull().sum()

Manufacturer    0
Product Name    0
Size            0
itemType        0
SKU             0
BasePrice       0
dtype: int64

In [42]:
final_output = []
for sku, items_left in item_inventory.items():
    product_info = product_df[product_df["SKU"] == sku]

    if not product_info.empty:
        # Get the first matching row
        product_info = product_info.iloc[0]  
    else:
        # If SKU not found, use default values for "other" items
        product_info = pd.Series({
            "Manufacturer": "GenericCo",
            "Product Name": "Generic Item",
            "Size": "N/A",
            "itemType": "Misc",
            "BasePrice": 1.00,
        })

    final_output.append({
        "Manufacturer": product_info["Manufacturer"],
        "Product Name": product_info["Product Name"],
        "Size": product_info["Size"],
        "itemType": product_info["itemType"],
        "SKU": sku,
        "BasePrice": product_info["BasePrice"],
        "Items_left_count": items_left,
        "Total_case_ordered": cases_ordered.get(sku, 0),
    })

In [43]:
len(final_output)

2075

In [44]:
final_df = pd.DataFrame(final_output)
final_df

Unnamed: 0,Manufacturer,Product Name,Size,itemType,SKU,BasePrice,Items_left_count,Total_case_ordered
0,Rowan Dairy,1.00% Milk,1 gal,Milk,42355001,$3.69,85,3726
1,Rowan Dairy,1.00% Milk,1/2 gal,Milk,42356001,$1.89,83,3702
2,Rowan Dairy,2.00% Milk,1 gal,Milk,42357001,$3.69,75,3707
3,Rowan Dairy,2.00% Milk,1/2 gal,Milk,42358001,$1.89,84,3723
4,Rowan Dairy,Whole Milk Milk,1 gal,Milk,42359001,$3.69,62,3718
...,...,...,...,...,...,...,...,...
2070,Aidells,Meatballs Pineapple Teriyaki,12 oz,Frozen Food,44156001,$6.59,19,748
2071,Aidells,Meatballs Sundried Tomato,12 oz,Frozen Food,44157001,$6.59,27,743
2072,Act II,Popcorn Butter,6 pkg,Frozen Food,44158001,$3.99,28,737
2073,A Taste Of Thai,Coconut Milk Unsweetened,13.5 oz,Drink,44159001,$3.59,28,738


In [45]:
userTransactionManager.commit_changes()
userTransactionManager.close_db_connection()

In [48]:
# conn = lite.connect(database_name + ".db")
# cursor = conn.cursor()

In [49]:
# final_df.to_sql("inventory", conn, if_exists="replace", index=False)

In [50]:
# cursor.execute("select * from inventory")

# inventory_df = pd.DataFrame(cursor.fetchall(), columns=[
#     "Manufacturer", "ProductName", "Size", "itemType", "SKU", "BasePrice", "Items_left_count", "Total_cases_ordered"
# ])

In [51]:
# inventory_df.head(10)

In [52]:
# inventory_df.shape