In [111]:
import sqlite3
import pandas as pd

# Define the store databases for Region A (Stores 1-4)

products_df = pd.read_csv("./Products1.txt", delimiter="|")

store_databases = {
    "Store 1": {
        "database_name": "store1.db",
        "transactions": {
            "table_name": "user_transactions",
            "column_names": {
                "id": "transaction_id",
                "date": "date",
                "customer_number": "customer_number",
                "sku": "sku",
                "sale_price": "sales_price",
                "items_left": "items_left",
                "cases_ordered": "cases_ordered"
            },
        },

    },
    "Store 2": {
        "database_name": "store2.db",
        "transactions":  {
            "table_name": "sales",
            "column_names": {
                "id": "",
                "date": "saleDate",
                "customer_number": "customerNum",
                "sku": "sku",
                "sales_price": "price",
                "items_left": "stockCount",
                "cases_ordered": "caseCount"
            },
        },
    },
    "Store 3": {
        "database_name": "store3.db",
        "transactions": {
            "table_name": "sales",
            "column_names": {
                "id": "",
                "date": "Date",
                "customer_number": "Customer #",
                "sku": "SKU",
                "sales_price": "Sale Price",
                "items_left": "ItemsLeft",
                "cases_ordered": "TotalCasesOrdered"
            },
        },
    },
    "Store 4": {
        "database_name": "store4.db",
        "transactions": {
            "table_name": "sales_transactions",
            "column_names": {
                "id": "",
                "date": "date",
                "customer_number": "customer_number",
                "sku": "product_sku",
                "sales_price": "SalesPrice",
                "items_left": "items_left",
                "cases_ordered": "cases_ordered"
            }
        },
    }
}

In [112]:

# Function to load December data from a store's database
def load_december_data(store):
    conn = sqlite3.connect(store['database_name'])

    table_name = store['transactions']['table_name']
    _date = store['transactions']['column_names']['date']

    # Load December transactions (filter for month = 12)
    december_transactions_query = """
        SELECT * 
        FROM """ + table_name + """ 
        WHERE """ +  _date + """ LIKE '%/12/%' OR """ + _date+ """ LIKE '%-12-%' OR """ + _date+ """ LIKE '%12%'
    """

    print(december_transactions_query)
    december_transactions_df = pd.read_sql_query(
        december_transactions_query, conn)

    conn.close()

    return december_transactions_df, products_df

In [113]:
# Function to get top-selling items for December
def get_december_top_selling_items(december_transactions_df, products_df,  top_n=25):
    # Merging transactions with products to get product names
    merged_df = pd.merge(december_transactions_df, products_df,
                         left_on="sku", right_on="SKU", how="left")

    # Group by sku and Product Name, then sum the quantities sold
    top_items = merged_df.groupby(
        ["sku", "Product Name"]).size().reset_index(name="Quantity Sold")

    # Sort by quantity sold in descending order
    top_items = top_items.sort_values(by="Quantity Sold", ascending=False)

    # Add a rank column (1 to N) based on quantity sold
    top_items["Rank"] = top_items["Quantity Sold"].rank(
        method="min", ascending=False).astype(int)

    # Get the top N items
    top_items = top_items.head(top_n)

    return top_items

In [114]:
def calculate_december_customer_count(december_transactions_df, col_name):
    return december_transactions_df[col_name].nunique()

In [115]:
def calculate_december_sales(december_transactions_df, col_name):
    return december_transactions_df[col_name].sum()

In [116]:
result = {
    "Store": [],
    "December Sales": [],
    "December Customer Count": [],
}

for store_name, db_path in store_databases.items():
    print(f"Processing {store_name}...")

    # Load December data
    december_transactions_df, products_df = load_december_data(db_path)

    transactions_col_name = december_transactions_df.columns
    products_col_name = products_df.columns	

    sales_price_format = [x for x in transactions_col_name if x in ['sale_price', 'sales_price', 'SalesPrice',
                                                                    "salesPrice", "salesprice", "Sale Price"]]
    customer_number_format = [x for x in transactions_col_name if x in ['customer_number', "Customer Number",
                                                                        "customer number", "Customer#", "Customer #"]]
    
    print(sales_price_format, customer_number_format)

    if sales_price_format:
        december_sales = calculate_december_sales(
            december_transactions_df, sales_price_format[0])
    else:
        print(f"Error: Sales price column not found in {store_name}. Skipping this store.")
        continue

    if customer_number_format:
        december_customer_count = calculate_december_customer_count(
            december_transactions_df, customer_number_format[0])
    else:
        print(f"Error: Customer number column not found in {store_name}. Skipping this store.")
        continue
    
    result["Store"].append(store_name)
    result["December Sales"].append(december_sales)
    result["December Customer Count"].append(december_customer_count)

    sales_price_format = []
    customer_number_format = []

Processing Store 1...

        SELECT * 
        FROM user_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sale_price'] ['customer_number']
Processing Store 2...

        SELECT * 
        FROM sales 
        WHERE saleDate LIKE '%/12/%' OR saleDate LIKE '%-12-%' OR saleDate LIKE '%12%'
    
[] []
Error: Sales price column not found in Store 2. Skipping this store.
Processing Store 3...

        SELECT * 
        FROM sales 
        WHERE Date LIKE '%/12/%' OR Date LIKE '%-12-%' OR Date LIKE '%12%'
    
['Sale Price'] ['Customer #']
Processing Store 4...

        SELECT * 
        FROM sales_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['salesPrice'] ['customer_number']


In [117]:
result = pd.DataFrame(result)
result

Unnamed: 0,Store,December Sales,December Customer Count
0,Store 1,4538694.32,1085
1,Store 3,518304.19,42
2,Store 4,5339914.35,1126


In [118]:
# Combined total sales of december month
combined_total_sales = result['December Sales'].sum()
print(f"Combined total sales of december month: ${combined_total_sales:.2f}")

# combined totall customer count of december month
combined_total_customer_count = result['December Customer Count'].sum()
print("Combined total customer count of december month: ", combined_total_customer_count)

Combined total sales of december month: $10396912.86
Combined total customer count of december month:  2253


In [119]:
# Dictionary to store top-selling items for all stores
all_top_items = {}

# Get the top 25 products from Store 1
print("Processing Store 1 to determine top 25 products...")

december_transactions_df_store1, products_df_store1 = load_december_data(
    store_databases["Store 1"])

top_25_products_store1 = get_december_top_selling_items(
    december_transactions_df_store1, products_df_store1)

top_25_products_store1

Processing Store 1 to determine top 25 products...

        SELECT * 
        FROM user_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    


Unnamed: 0,sku,Product Name,Quantity Sold,Rank
275,42356001,1.00% Milk,5018,1
276,42357001,2.00% Milk,5018,1
274,42355001,1.00% Milk,4998,3
278,42359001,Whole Milk Milk,4978,4
279,42360001,Whole Milk Milk,4931,5
277,42358001,2.00% Milk,4887,6
233,42314001,Squeeze Jelly Grape,1391,7
230,42311001,Jam Grape,1357,8
232,42313001,Jelly Grape,1356,9
231,42312001,Jam Strawberry,1354,10


In [120]:
# Extract the SKUs of the top 25 products from Store 1
top_25_skus = top_25_products_store1["sku"].tolist()
top_25_skus

[42356001,
 42357001,
 42355001,
 42359001,
 42360001,
 42358001,
 42314001,
 42311001,
 42313001,
 42312001,
 44155001,
 43496001,
 42725001,
 43245001,
 42863001,
 43200001,
 42486001,
 42501001,
 44001001,
 42370001,
 42142001,
 43666001,
 43522001,
 42866001,
 44146001]

In [121]:
# Using above store-1 skus, get sales data for all stores
for store_name, db_path in store_databases.items():
    print(f"Processing {store_name}...")

    # Load December data for each store
    december_transactions_df, products_df = load_december_data(db_path)

    transactions_col_names = december_transactions_df.columns
    products_col_names = products_df.columns


    trans_sku_format = [x for x in transactions_col_names if x in ["sku", "SKU"]]
    prod_sku_format = [x for x in products_col_names if x in ["sku", "SKU"]]

    print(trans_sku_format, prod_sku_format)

    # Ensure that the lists are not empty before accessing their elements
    if trans_sku_format and prod_sku_format:
        # Merge transactions with products to get product names
        merged_df = pd.merge(december_transactions_df, products_df,
                             left_on=trans_sku_format[0], right_on=prod_sku_format[0], how="left")
    else:
        print(f"Error: SKU column not found in {store_name}. Skipping this store.")
        continue

    # Filter for the top 25 SKUs from Store 1
    filtered_df = merged_df[merged_df[trans_sku_format[0]].isin(top_25_skus)]

    # Group by sku and Product Name, then sum the quantities sold
    top_items = filtered_df.groupby(
        [trans_sku_format[0], "Product Name"]).size().reset_index(name="Quantity Sold")

    # Sort by quantity sold in descending order
    top_items = top_items.sort_values(by="Quantity Sold", ascending=False)

    # Add a rank column (1 to 25) based on quantity sold
    top_items["Rank"] = top_items["Quantity Sold"].rank(
        method="min", ascending=False).astype(int)

    all_top_items[store_name] = top_items

    trans_sku_format = []
    prod_sku_format = []

Processing Store 1...

        SELECT * 
        FROM user_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sku'] ['SKU']
Processing Store 2...

        SELECT * 
        FROM sales 
        WHERE saleDate LIKE '%/12/%' OR saleDate LIKE '%-12-%' OR saleDate LIKE '%12%'
    
['sku'] ['SKU']
Processing Store 3...

        SELECT * 
        FROM sales 
        WHERE Date LIKE '%/12/%' OR Date LIKE '%-12-%' OR Date LIKE '%12%'
    
['SKU'] ['SKU']
Processing Store 4...

        SELECT * 
        FROM sales_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sku'] ['SKU']


In [122]:
# Creating a single Dataframe
combined_top_items = pd.DataFrame()

# Iterate through each store's top-selling items
for store_name, top_items in all_top_items.items():
    # Standardize column names to lowercase for consistency
    top_items.columns = top_items.columns.str.lower()
    top_items = top_items.rename(columns={
        "quantity sold": f"{store_name} Quantity Sold",
        "rank": f"{store_name} Rank"
    })

    # Merge with the combined DataFrame
    if combined_top_items.empty:
        combined_top_items = top_items[[
            "sku", "product name", f"{store_name} Quantity Sold", f"{store_name} Rank"]]
    else:
        combined_top_items = pd.merge(
            combined_top_items,
            top_items[["sku", f"{store_name} Quantity Sold",
                       f"{store_name} Rank"]],
            on="sku",
            how="outer"
        ).sort_values(by="Store 1 Rank")

In [123]:
combined_top_items.head()

Unnamed: 0,sku,product name,Store 1 Quantity Sold,Store 1 Rank,Store 2 Quantity Sold,Store 2 Rank,Store 3 Quantity Sold,Store 3 Rank,Store 4 Quantity Sold,Store 4 Rank
6,42356001,1.00% Milk,5018,1,,,61,1,5285,5
7,42357001,2.00% Milk,5018,1,,,61,1,5291,4
5,42355001,1.00% Milk,4998,3,,,61,1,5213,6
9,42359001,Whole Milk Milk,4978,4,,,61,1,5297,3
10,42360001,Whole Milk Milk,4931,5,,,61,1,5332,2


In [124]:
# Fill NaN values with 0 for Quantity Sold and "N/A" for Rank
combined_top_items = combined_top_items.fillna({
    "Store 1 Quantity Sold": 0,
    "Store 1 Rank": "N/A",
    "Store 2 Quantity Sold": 0,
    "Store 2 Rank": "N/A",
    "Store 3 Quantity Sold": 0,
    "Store 3 Rank": "N/A",
    "Store 4 Quantity Sold": 0,
    "Store 4 Rank": "N/A"
})

combined_top_items = combined_top_items.sort_values(by="Store 1 Rank")

print("\nBest Selling Items Combined for December in Region A (Stores 1-4):")
combined_top_items = combined_top_items.reset_index(drop=True)
combined_top_items


Best Selling Items Combined for December in Region A (Stores 1-4):


Unnamed: 0,sku,product name,Store 1 Quantity Sold,Store 1 Rank,Store 2 Quantity Sold,Store 2 Rank,Store 3 Quantity Sold,Store 3 Rank,Store 4 Quantity Sold,Store 4 Rank
0,42356001,1.00% Milk,5018,1,0.0,,61,1,5285,5
1,42357001,2.00% Milk,5018,1,0.0,,61,1,5291,4
2,42355001,1.00% Milk,4998,3,0.0,,61,1,5213,6
3,42359001,Whole Milk Milk,4978,4,0.0,,61,1,5297,3
4,42360001,Whole Milk Milk,4931,5,0.0,,61,1,5332,2
5,42358001,2.00% Milk,4887,6,0.0,,61,1,5392,1
6,42314001,Squeeze Jelly Grape,1391,7,0.0,,61,1,1485,8
7,42311001,Jam Grape,1357,8,0.0,,61,1,1433,10
8,42313001,Jelly Grape,1356,9,0.0,,61,1,1441,9
9,42312001,Jam Strawberry,1354,10,0.0,,61,1,1500,7


In [125]:
combined_cal = pd.DataFrame({
    "combined_total_sales": combined_total_sales,
    "combined_total_customer_count": combined_total_customer_count,
}, index=[1])

combined_cal

Unnamed: 0,combined_total_sales,combined_total_customer_count
1,10396912.86,2253


In [126]:
# Save the result DataFrame
result.to_csv("Final-deliverable-1.csv", index=False, header=True)

combined_cal.to_csv("Final-deliverable-1.csv", index=False, mode='a', header=True)

# Append the combined_top_items DataFrame to the same file
combined_top_items.to_csv("Final-deliverable-1.csv", index=False, mode='a', header=True)

### Deliverable 2a

In [44]:
import pandas as pd

# Existing products table
products = pd.read_csv('Products1.txt', delimiter="|")
product_class = pd.read_csv(
    'product_class.csv')

In [45]:
products.head()

Unnamed: 0,Manufacturer,Product Name,Size,itemType,SKU,BasePrice
0,Zatarains,Jambalaya Rice Mix,12 oz,Rice/Rice Mix,42081001,$2.49
1,Zatarains,Jambalaya Rice Mix,8 oz,Rice/Rice Mix,42082001,$1.79
2,Yucatan,Guacamole Regular,8 oz,,42083001,$3.99
3,Yuban,Coffee Original Blend,12 oz,Coffee/Creamer,42084001,$3.99
4,Yoplait,GoGurt Variety Pack,8 ct,Yogurt,42085001,$2.99


In [46]:
product_class.head(99)

Unnamed: 0,product_class_id,product_subcategory,product_category,product_department,product_family
0,1,Nuts,Specialty,Produce,Food
1,2,Shellfish,Seafood,Seafood,Food
2,3,Canned Fruit,Fruit,Canned Products,Food
3,4,Spices,Baking Goods,Baking Goods,Food
4,5,Pasta,Starchy Foods,Starchy Foods,Food
...,...,...,...,...,...
94,95,Anchovies,Canned Anchovies,Canned Foods,Food
95,96,Clams,Canned Clams,Canned Foods,Food
96,97,Oysters,Canned Oysters,Canned Foods,Food
97,98,Sardines,Canned Sardines,Canned Foods,Food


In [47]:
[products.shape, product_class.shape]

[(2075, 6), (114, 5)]

In [49]:
product_class[product_class['product_class_id'] == 99]

Unnamed: 0,product_class_id,product_subcategory,product_category,product_department,product_family
98,99,Fresh Fruit,Fruit,Produce,Food


#### Total Null itemType

In [50]:
total_null_items = len(products[products['itemType'].isnull()])
total_null_items

294

In [51]:
subcategory_mapping = {
    'Frito Lay': 'Chips',
    'Whole Milk': 'Dairy',
    'Potato Chips': 'Chips',
    'Sliced Bread': 'Sliced Bread',
    'Yogurt': 'Yogurt',
    'Cheese': 'Cheese',
    'Fresh Vegetables': 'Fresh Vegetables',
    'Chips': 'Chips',
    'Soda': 'Soda',
    'Juice': 'Juice',
    'Cereal': 'Cereal',
    'Pasta': 'Pasta',
    'Coffee': 'Coffee',
    'Ice Cream': 'Ice Cream',
    'Cookies': 'Cookies',
    'Pretzels': 'Pretzels',
    'Popcorn': 'Popcorn',
    'Crackers': 'Crackers',
    'Dips': 'Dips',
    'Donuts': 'Donuts'
}


def map_subcategory(product_name):
    for keyword, subcategory in subcategory_mapping.items():
        if keyword in product_name:
            return subcategory
    return None

In [52]:
products['Subcategory'] = products['Product Name'].apply(map_subcategory)

In [57]:
products.sample(10)

Unnamed: 0,Manufacturer,Product Name,Size,itemType,SKU,BasePrice,Subcategory
1917,Best Choice,Best Choice Fudge Brownies,6.23,Cookies,44003001,$1.73,
1514,Entenmanns,Cake Lemon Crunch,26 oz,Baked Goods Other than Bread,43600001,$7.59,
1554,Edwards,Pie Key Lime,36 oz,,43640001,$7.99,
1142,Golden,Golden Frozen Chicken Wings,13.3,Frozen Chicken,43228001,$3.48,
1300,General Mills,Cheerios Cereal,36 oz,Cereal,43386001,$6.59,Cereal
109,Tell Tale,Tell Tale Canned Peanuts,6.86,Nuts,42190001,$3.40,
1955,Beech-Nut,Stage 2 Yellow Cling Peaches,4 oz,Baby Food,44041001,$0.79,
1669,Coke,Sprite Soda,67.6 oz,Soda,43755001,$1.50,Soda
376,Pleasant,Pleasant Canned Tuna in Water,3.4,Tuna,42457001,$2.96,
949,Huggies,Snug & Dry - Disney Size 5,24 ct,Diapers,43033001,$10.99,


In [58]:
uncategorized_products = products[products['Subcategory'].isnull()]
uncategorized_products.count()

Manufacturer    1484
Product Name    1484
Size            1484
itemType        1255
SKU             1484
BasePrice       1484
Subcategory        0
dtype: int64

In [60]:
subcategories = list(product_class['product_subcategory'])

for subcategory in subcategories:
    products.loc[products['Product Name'].str.contains(
        subcategory, case=False) & products['Subcategory'].isnull(), 'Subcategory'] = subcategory

In [61]:
# Filter products where 'itemType' is null and select the 'Product Name' column
products[products['Product Name'].isin(products[products['itemType'].isnull()]['Product Name'].unique())]

Unnamed: 0,Manufacturer,Product Name,Size,itemType,SKU,BasePrice,Subcategory
2,Yucatan,Guacamole Regular,8 oz,,42083001,$3.99,
6,White Castle,Cheeseburger Heat & Serve Sliders,29.28 oz,,42087001,$11.59,Cheese
8,Welchs,Farmers Pick Concord Grape,46 oz,,42089001,$3.59,
19,Turkey Hill,Iced Tea Lemon,128 oz,,42100001,$2.99,
88,Thomas,Bagels Blueberry,22 oz,Baked Goods Other than Bread,42169001,$3.00,Bagels
...,...,...,...,...,...,...,...
2019,Barber,Chicken Breast Stuffed Broccoli & Cheese,2.5 oz,,44105001,$5.49,Cheese
2023,Banquet,Chicken Breast Strips,24 oz,,44109001,$4.39,
2024,Banquet,Chicken Patties,27 oz,,44110001,$4.39,
2025,Banquet,Chicken Patty,27 oz,,44111001,$4.39,


In [62]:
remaining_uncategorized = products[products['Subcategory'].isnull()]
print(len(remaining_uncategorized))

998


In [63]:
if len(remaining_uncategorized) > 0:
    print("\nSuggesting new subcategories for remaining uncategorized products:")
    for product_name in remaining_uncategorized['Product Name'].unique():
        print(
            f"Product: {product_name} → Suggested Subcategory: [New Subcategory]")


Suggesting new subcategories for remaining uncategorized products:
Product: Guacamole Regular → Suggested Subcategory: [New Subcategory]
Product: GoGurt Variety Pack → Suggested Subcategory: [New Subcategory]
Product: Italian Dressing → Suggested Subcategory: [New Subcategory]
Product: Choice Cuts Poultry → Suggested Subcategory: [New Subcategory]
Product: Farmers Pick Concord Grape → Suggested Subcategory: [New Subcategory]
Product: Splash Berry Blend → Suggested Subcategory: [New Subcategory]
Product: Splash Mango Peach → Suggested Subcategory: [New Subcategory]
Product: V-Fusion Berry → Suggested Subcategory: [New Subcategory]
Product: V-Fusion Pomegranate Blueberry → Suggested Subcategory: [New Subcategory]
Product: V-Fusion Refreshers Peach Strawberry → Suggested Subcategory: [New Subcategory]
Product: Iced Tea Lemon → Suggested Subcategory: [New Subcategory]
Product: Trop50 Calcium → Suggested Subcategory: [New Subcategory]
Product: Tri-State Almonds  → Suggested Subcategory: [N

In [71]:
new_products_columns = [
    'Product Key', 'SKU', 'Product Name', 'Product Class ID', 'Subcategory',
    'Category', 'Department', 'Product Family', 'Size', '#Per Case',
    'Brand Name', 'Manufacturer', 'Supplier'
]

new_products = products[['SKU', 'Product Name', 'Subcategory']].copy()

In [72]:
# Unique product keys
new_products['Product Key'] = range(
    1, len(new_products) + 1)


# Ensure the product_subcategory column in product_class has unique values
product_class_unique = product_class.drop_duplicates(
    subset='product_subcategory')

new_products['Product Class ID'] = new_products['Subcategory'].map(
    product_class_unique.set_index('product_subcategory')['product_class_id'])
new_products['Category'] = new_products['Subcategory'].map(
    product_class_unique.set_index('product_subcategory')['product_category'])
new_products['Department'] = new_products['Subcategory'].map(
    product_class_unique.set_index('product_subcategory')['product_department'])
new_products['Product Family'] = new_products['Subcategory'].map(
    product_class_unique.set_index('product_subcategory')['product_family'])
new_products['Size'] = 'Standard'  # Default size
new_products['#Per Case'] = 12  # Default #Per Case
new_products['Brand Name'] = 'Generic'  # Default brand name
new_products['Manufacturer'] = 'Rowan Warehouse'  # Default manufacturer
new_products['Supplier'] = new_products['Product Name'].apply(
    lambda x: 'Rowan Dairy' if 'Milk' in x else 'Rowan Warehouse')

In [74]:
# Reorder columns to match the required format
new_products = new_products[new_products_columns]
new_products.head()

Unnamed: 0,Product Key,SKU,Product Name,Product Class ID,Subcategory,Category,Department,Product Family,Size,#Per Case,Brand Name,Manufacturer,Supplier
0,1,42081001,Jambalaya Rice Mix,57.0,Rice,Starchy Foods,Starchy Foods,Food,Standard,12,Generic,Rowan Warehouse,Rowan Warehouse
1,2,42082001,Jambalaya Rice Mix,57.0,Rice,Starchy Foods,Starchy Foods,Food,Standard,12,Generic,Rowan Warehouse,Rowan Warehouse
2,3,42083001,Guacamole Regular,,,,,,Standard,12,Generic,Rowan Warehouse,Rowan Warehouse
3,4,42084001,Coffee Original Blend,7.0,Coffee,Dry Goods,Baking Goods,Drink,Standard,12,Generic,Rowan Warehouse,Rowan Warehouse
4,5,42085001,GoGurt Variety Pack,,,,,,Standard,12,Generic,Rowan Warehouse,Rowan Warehouse


In [75]:
new_products.to_csv('new_products.csv', index=False)
print("\nNew Products Table saved as 'new_products.csv'.")

PermissionError: [Errno 13] Permission denied: 'new_products.csv'