In [6]:
import sqlite3
import pandas as pd

# Region A consists of Stores 1-4; their SQLite databases are configured in
# `store_databases` below.

# Product catalogue read from a pipe-delimited text file. Later cells join it to
# the transactions on its "SKU" column to obtain the "Product Name".
products_df = pd.read_csv("./Products1.txt", delimiter="|")

# Per-store SQLite configuration: the database file and the name of the table
# holding that store's transactions (table names differ between stores).
store_databases = {
    store: {"database_name": db_file, "transactions_table_name": table}
    for store, db_file, table in (
        ("Store 1", "store1.db", "user_transactions"),
        ("Store 2", "store2.db", "sales"),
        ("Store 3", "store3.db", "user_transactions"),
        ("Store 4", "store4.db", "sales_transactions"),
    )
}

In [7]:
# Standard column name -> every spelling of that column seen across the store
# databases. A column whose name appears in a list is renamed to the key.
column_mapping = dict(
    transaction_id=["id", "transaction id", "transaction_id"],
    date=[
        "date", "transaction date", "transaction_date",
        "saleDate", "SaleDate", "sale Date",
    ],
    customer_number=[
        "customer_number", "Customer Number", "customer number",
        "Customer#", "Customer #", "CustomerNum", "customerNum",
    ],
    sku=["sku", "SKU", "product_sku", "product sku"],
    sales_price=[
        "sale_price", "sales_price", "SalesPrice", "salesPrice",
        "salesprice", "Sale Price", "price", "Price",
    ],
    items_left=["items_left", "total_item_left", "stockCount"],
    cases_ordered=["cases_ordered", "total_cases_ordered", "caseCount"],
)

In [8]:
def _quote_identifier(name):
    """Return `name` as a double-quoted SQLite identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def standardize_columns(databases=None, mapping=None):
    """Rename known column-name variants to their standard names, in place.

    For every store, the transactions table's columns are inspected and any
    column whose name is listed as a variant in the mapping is renamed to the
    corresponding standard name with ``ALTER TABLE ... RENAME COLUMN``.

    Parameters
    ----------
    databases : dict, optional
        ``{store_name: {"database_name": ..., "transactions_table_name": ...}}``.
        Defaults to the module-level ``store_databases``.
    mapping : dict, optional
        ``{standard_name: [variant, ...]}``. Defaults to the module-level
        ``column_mapping``.

    Returns
    -------
    None
        Progress and problems are reported with ``print``; an error in one
        store does not stop the remaining stores from being processed.
    """
    if databases is None:
        databases = store_databases
    if mapping is None:
        mapping = column_mapping

    for store_name, db_config in databases.items():

        print(f"Processing {store_name}...")
        conn = None
        try:
            # Connect to the SQLite database
            conn = sqlite3.connect(db_config['database_name'])
            cursor = conn.cursor()

            table_name = db_config['transactions_table_name']
            quoted_table = _quote_identifier(table_name)

            # Get current column names
            cursor.execute(f"PRAGMA table_info({quoted_table})")
            current_columns = [column[1] for column in cursor.fetchall()]

            print(current_columns)

            # PRAGMA table_info yields no rows for a table that does not exist
            # (e.g. a wrong table name, or a wrong database path for which
            # sqlite3 opened a fresh empty file). Report that instead of
            # claiming the store was standardized.
            if not current_columns:
                print(
                    f"Table {table_name} not found (or has no columns) in "
                    f"{store_name}; nothing to standardize.")
                continue

            # Generate column mapping for this database
            rename_operations = {}
            for standard_col, possible_names in mapping.items():
                # If the standard name is already present, renaming a variant
                # onto it could only fail with a duplicate-column error.
                if standard_col in current_columns:
                    continue
                for col in current_columns:
                    if col in possible_names:
                        rename_operations[col] = standard_col
                        break

            # Execute ALTER TABLE statements for each column to rename
            for old_name, new_name in rename_operations.items():
                try:
                    # SQLite ALTER TABLE RENAME COLUMN syntax (requires SQLite 3.25.0+)
                    cursor.execute(
                        f"ALTER TABLE {quoted_table} "
                        f"RENAME COLUMN {_quote_identifier(old_name)} "
                        f"TO {_quote_identifier(new_name)}"
                    )
                    print(f"Renamed {old_name} to {new_name} in {store_name}")
                except sqlite3.OperationalError as e:
                    print(
                        f"Couldn't rename {old_name} in {store_name}: {str(e)}")

            conn.commit()
            print(f"Completed standardization for {store_name}")

        except Exception as e:
            # Best effort across stores: report the failure and move on.
            print(f"Error processing {store_name}: {str(e)}")
            if conn:
                conn.rollback()
        finally:
            if conn:
                conn.close()

In [9]:
# Apply the renames to every configured store database. This issues ALTER TABLE
# statements and commits them, so the .db files themselves are modified.
standardize_columns()

Processing Store 1...
['transaction_id', 'date', 'customer_number', 'sku', 'sales_price', 'items_left', 'cases_ordered']
Completed standardization for Store 1
Processing Store 2...
['date', 'customer_number', 'sku', 'sales_price', 'items_left', 'cases_ordered']
Completed standardization for Store 2
Processing Store 3...
['transaction_id', 'date', 'customer_number', 'sku', 'sales_price', 'items_left', 'cases_ordered']
Completed standardization for Store 3
Processing Store 4...
['date', 'customer_number', 'sku', 'sales_price', 'items_left', 'cases_ordered']
Completed standardization for Store 4


In [10]:

# Function to load December data from a store's database
def load_december_data(store):
    """Load the rows of a store's transactions table whose date looks like December.

    Parameters
    ----------
    store : dict
        Store configuration with the keys ``"database_name"`` (SQLite file)
        and ``"transactions_table_name"``.

    Returns
    -------
    tuple
        ``(december_transactions_df, products_df)`` where the second element is
        the module-level product catalogue, returned unchanged.
    """
    table_name = store['transactions_table_name']
    _date = 'date'

    # Load December transactions (filter for month = 12).
    # NOTE(review): the last pattern, '%12%', matches ANY date text containing
    # "12" (a day of 12, a year such as 2012, ...) and makes the first two
    # patterns redundant, so this may select non-December rows. The stored date
    # format is not visible here; tighten the filter once it is confirmed.
    december_transactions_query = f"""
        SELECT *
        FROM {table_name}
        WHERE {_date} LIKE '%/12/%' OR {_date} LIKE '%-12-%' OR {_date} LIKE '%12%'
    """

    print(december_transactions_query)

    conn = sqlite3.connect(store['database_name'])
    try:
        december_transactions_df = pd.read_sql_query(
            december_transactions_query, conn)
    finally:
        # Close the connection even when the query fails (e.g. missing table),
        # so a bad store does not leave an open handle on its database file.
        conn.close()

    return december_transactions_df, products_df

In [11]:
# Function to get top-selling items for December
def get_december_top_selling_items(december_transactions_df, products_df,  top_n=25):
    """Return the `top_n` most frequently sold products.

    Transactions are left-joined to the product catalogue (transaction ``sku``
    against catalogue ``SKU``), then the number of transaction rows per
    (``sku``, ``Product Name``) pair is counted as "Quantity Sold".

    Returns a DataFrame with the columns ``sku``, ``Product Name``,
    ``Quantity Sold`` and ``Rank``, ordered by "Quantity Sold" descending.
    Tied quantities share the lowest rank of their group.
    """
    # Attach product names to every transaction row.
    with_names = december_transactions_df.merge(
        products_df, how="left", left_on="sku", right_on="SKU")

    # One row per product with its transaction count, best sellers first.
    counts = (
        with_names
        .groupby(["sku", "Product Name"])
        .size()
        .reset_index(name="Quantity Sold")
        .sort_values(by="Quantity Sold", ascending=False)
    )

    # Rank 1 is the best seller; ties receive the same (minimum) rank.
    ranks = counts["Quantity Sold"].rank(method="min", ascending=False)
    counts["Rank"] = ranks.astype(int)

    return counts.head(top_n)

In [12]:
def calculate_december_customer_count(december_transactions_df, col_name):
    """Return the number of distinct values in column `col_name` of the given transactions."""
    customer_ids = december_transactions_df[col_name]
    distinct_customers = customer_ids.nunique()
    return distinct_customers

In [13]:
def calculate_december_sales(december_transactions_df, col_name):
    """Return the sum of column `col_name` over the given transactions."""
    prices = december_transactions_df[col_name]
    total = prices.sum()
    return total

In [14]:
# Per-store December totals, collected column-wise so the dict can be turned
# into a DataFrame afterwards.
result = {
    "Store": [],
    "December Sales": [],
    "December Customer Count": [],
}

for store_name, db_config in store_databases.items():
    print(f"Processing {store_name}...")

    # Load December data. The loader also returns the shared product catalogue,
    # which is not needed here, so only the transactions are kept (this avoids
    # re-binding the module-level `products_df` on every iteration).
    december_transactions_df = load_december_data(db_config)[0]

    transactions_col_name = december_transactions_df.columns

    # Reuse the alias lists from `column_mapping` rather than repeating them.
    sales_price_format = [
        x for x in transactions_col_name if x in column_mapping["sales_price"]]
    customer_number_format = [
        x for x in transactions_col_name if x in column_mapping["customer_number"]]

    print(sales_price_format, customer_number_format)

    if not sales_price_format:
        print(f"Error: Sales price column not found in {store_name}. Skipping this store.")
        continue

    if not customer_number_format:
        print(f"Error: Customer number column not found in {store_name}. Skipping this store.")
        continue

    # An empty result yields totals of zero; say so explicitly so a store with
    # no matching rows is not mistaken for a store with no sales.
    if december_transactions_df.empty:
        print(f"Warning: no December transactions found for {store_name}; "
              "recording zero sales and zero customers.")

    december_sales = calculate_december_sales(
        december_transactions_df, sales_price_format[0])
    december_customer_count = calculate_december_customer_count(
        december_transactions_df, customer_number_format[0])

    result["Store"].append(store_name)
    result["December Sales"].append(december_sales)
    result["December Customer Count"].append(december_customer_count)

Processing Store 1...

        SELECT * 
        FROM user_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sales_price'] ['customer_number']
Processing Store 2...

        SELECT * 
        FROM sales 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sales_price'] ['customer_number']
Processing Store 3...

        SELECT * 
        FROM user_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sales_price'] ['customer_number']
Processing Store 4...

        SELECT * 
        FROM sales_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sales_price'] ['customer_number']


In [15]:
# Turn the column-wise dict of per-store totals into a DataFrame.
# Note: this re-binds `result` from a dict to a DataFrame.
result = pd.DataFrame(result)
result

Unnamed: 0,Store,December Sales,December Customer Count
0,Store 1,4538694.32,1085
1,Store 2,0.0,0
2,Store 3,7270866.56,1133
3,Store 4,5339914.35,1126


In [16]:
# Combined total sales for the month of December (sum of the per-store totals)
combined_total_sales = result['December Sales'].sum()
print(f"Combined total sales of december month: ${combined_total_sales:.2f}")

# Combined total customer count for the month of December.
# NOTE(review): this adds up each store's count of distinct customer numbers, so
# a customer number present in more than one store is counted once per store.
# Confirm that this is the intended definition.
combined_total_customer_count = result['December Customer Count'].sum()
print("Combined total customer count of december month: ", combined_total_customer_count)

Combined total sales of december month: $17149475.23
Combined total customer count of december month:  3344


In [17]:
# Dictionary to store top-selling items for all stores
# (filled by a later cell, keyed by store name)
all_top_items = {}

# Get the top 25 products from Store 1
print("Processing Store 1 to determine top 25 products...")

# Returns Store 1's December transactions plus the shared product catalogue.
december_transactions_df_store1, products_df_store1 = load_december_data(
    store_databases["Store 1"])

# top_n is left at its default of 25.
top_25_products_store1 = get_december_top_selling_items(
    december_transactions_df_store1, products_df_store1)

top_25_products_store1

Processing Store 1 to determine top 25 products...

        SELECT * 
        FROM user_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    


Unnamed: 0,sku,Product Name,Quantity Sold,Rank
275,42356001,1.00% Milk,5018,1
276,42357001,2.00% Milk,5018,1
274,42355001,1.00% Milk,4998,3
278,42359001,Whole Milk Milk,4978,4
279,42360001,Whole Milk Milk,4931,5
277,42358001,2.00% Milk,4887,6
233,42314001,Squeeze Jelly Grape,1391,7
230,42311001,Jam Grape,1357,8
232,42313001,Jelly Grape,1356,9
231,42312001,Jam Strawberry,1354,10


In [18]:
# Extract the SKUs of the top 25 products from Store 1, in rank order
# (the source frame is already sorted by quantity sold, descending).
top_25_skus = top_25_products_store1["sku"].tolist()
top_25_skus

[42356001,
 42357001,
 42355001,
 42359001,
 42360001,
 42358001,
 42314001,
 42311001,
 42313001,
 42312001,
 44155001,
 43496001,
 42725001,
 43245001,
 42863001,
 43200001,
 42486001,
 42501001,
 44001001,
 42370001,
 42142001,
 43666001,
 43522001,
 42866001,
 44146001]

In [19]:
# Using above store-1 skus, get sales data for all stores
for store_name, db_path in store_databases.items():
    print(f"Processing {store_name}...")

    # Load December data for each store. Only the transactions are kept; the
    # catalogue the loader also returns is the module-level `products_df`, so
    # it is used directly instead of being re-bound on every iteration.
    december_transactions_df = load_december_data(db_path)[0]

    transactions_col_names = december_transactions_df.columns
    products_col_names = products_df.columns

    trans_sku_format = [x for x in transactions_col_names if x in ["sku", "SKU"]]
    prod_sku_format = [x for x in products_col_names if x in ["sku", "SKU"]]

    print(trans_sku_format, prod_sku_format)

    # Ensure that the lists are not empty before accessing their elements
    if not (trans_sku_format and prod_sku_format):
        print(f"Error: SKU column not found in {store_name}. Skipping this store.")
        continue

    trans_sku_col = trans_sku_format[0]

    # Filter for the top 25 SKUs from Store 1 *before* joining, so the product
    # lookup only runs on the rows that are actually counted.
    filtered_df = december_transactions_df[
        december_transactions_df[trans_sku_col].isin(top_25_skus)]

    # Merge transactions with products to get product names
    merged_df = pd.merge(filtered_df, products_df,
                         left_on=trans_sku_col, right_on=prod_sku_format[0], how="left")

    # Group by sku and Product Name, then count the transaction rows per product
    top_items = merged_df.groupby(
        [trans_sku_col, "Product Name"]).size().reset_index(name="Quantity Sold")

    # Sort by quantity sold in descending order
    top_items = top_items.sort_values(by="Quantity Sold", ascending=False)

    # Add a rank column (1 = best seller within this store; ties share a rank)
    top_items["Rank"] = top_items["Quantity Sold"].rank(
        method="min", ascending=False).astype(int)

    all_top_items[store_name] = top_items

Processing Store 1...

        SELECT * 
        FROM user_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sku'] ['SKU']
Processing Store 2...

        SELECT * 
        FROM sales 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sku'] ['SKU']
Processing Store 3...

        SELECT * 
        FROM user_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sku'] ['SKU']
Processing Store 4...

        SELECT * 
        FROM sales_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
['sku'] ['SKU']


In [20]:
# Creating a single DataFrame with one row per SKU and, for every store, a
# "<store> Quantity Sold" and a "<store> Rank" column.
combined_top_items = None

# Iterate through each store's top-selling items
for store_name, top_items in all_top_items.items():
    quantity_col = f"{store_name} Quantity Sold"
    rank_col = f"{store_name} Rank"

    # Standardize column names to lowercase, then label the measures with the
    # store name. This works on a renamed copy, so the frames stored in
    # `all_top_items` are left untouched.
    store_items = top_items.rename(columns=str.lower).rename(columns={
        "quantity sold": quantity_col,
        "rank": rank_col,
    })

    # `None` (not `.empty`) marks "no store merged yet": a first store with no
    # rows must still become the base frame, otherwise its columns would be
    # missing from the result.
    if combined_top_items is None:
        combined_top_items = store_items[[
            "sku", "product name", quantity_col, rank_col]]
    else:
        combined_top_items = pd.merge(
            combined_top_items,
            store_items[["sku", quantity_col, rank_col]],
            on="sku",
            how="outer"
        )

if combined_top_items is None:
    # No stores at all: keep the original behaviour of an empty frame.
    combined_top_items = pd.DataFrame()
elif "Store 1 Rank" in combined_top_items.columns:
    # Sort once after all merges; a stable sort keeps tied ranks in a
    # deterministic order.
    combined_top_items = combined_top_items.sort_values(
        by="Store 1 Rank", kind="stable")

In [21]:
# Preview the first rows of the combined table before missing values are filled.
combined_top_items.head()

Unnamed: 0,sku,product name,Store 1 Quantity Sold,Store 1 Rank,Store 2 Quantity Sold,Store 2 Rank,Store 3 Quantity Sold,Store 3 Rank,Store 4 Quantity Sold,Store 4 Rank
6,42356001,1.00% Milk,5018,1,,,5114,6,5285,5
7,42357001,2.00% Milk,5018,1,,,5357,1,5291,4
5,42355001,1.00% Milk,4998,3,,,5178,5,5213,6
9,42359001,Whole Milk Milk,4978,4,,,5223,3,5297,3
10,42360001,Whole Milk Milk,4931,5,,,5211,4,5332,2


In [22]:
# Sort while the rank columns are still numeric: once missing ranks are
# replaced by the string "N/A", a rank column can hold both numbers and text,
# and sorting on such a column fails.
combined_top_items = combined_top_items.sort_values(by="Store 1 Rank", kind="stable")

# Fill NaN values with 0 for Quantity Sold and "N/A" for Rank. The columns are
# discovered from their name suffix, so any number of stores is handled.
combined_top_items = combined_top_items.fillna({
    **{col: 0 for col in combined_top_items.columns
       if str(col).endswith(" Quantity Sold")},
    **{col: "N/A" for col in combined_top_items.columns
       if str(col).endswith(" Rank")},
})

print("\nBest Selling Items Combined for December in Region A (Stores 1-4):")
combined_top_items = combined_top_items.reset_index(drop=True)
combined_top_items


Best Selling Items Combined for December in Region A (Stores 1-4):


Unnamed: 0,sku,product name,Store 1 Quantity Sold,Store 1 Rank,Store 2 Quantity Sold,Store 2 Rank,Store 3 Quantity Sold,Store 3 Rank,Store 4 Quantity Sold,Store 4 Rank
0,42356001,1.00% Milk,5018,1,0.0,,5114,6,5285,5
1,42357001,2.00% Milk,5018,1,0.0,,5357,1,5291,4
2,42355001,1.00% Milk,4998,3,0.0,,5178,5,5213,6
3,42359001,Whole Milk Milk,4978,4,0.0,,5223,3,5297,3
4,42360001,Whole Milk Milk,4931,5,0.0,,5211,4,5332,2
5,42358001,2.00% Milk,4887,6,0.0,,5268,2,5392,1
6,42314001,Squeeze Jelly Grape,1391,7,0.0,,1417,9,1485,8
7,42311001,Jam Grape,1357,8,0.0,,1486,7,1433,10
8,42313001,Jelly Grape,1356,9,0.0,,1370,10,1441,9
9,42312001,Jam Strawberry,1354,10,0.0,,1441,8,1500,7


In [23]:
# One-row summary of the region-wide December totals (row label 1).
combined_cal = pd.DataFrame(
    [
        {
            "combined_total_sales": combined_total_sales,
            "combined_total_customer_count": combined_total_customer_count,
        }
    ],
    index=[1],
)

combined_cal

Unnamed: 0,combined_total_sales,combined_total_customer_count
1,17149475.23,3344


In [24]:
# Save the result DataFrame (default write mode, so an existing file is replaced)
result.to_csv("Final-deliverable-1.csv", index=False, header=True)

# Append the one-row combined totals to the same file, with its own header row
combined_cal.to_csv("Final-deliverable-1.csv", index=False, mode='a', header=True)

# Append the combined_top_items DataFrame to the same file.
# The file therefore holds three stacked tables, each preceded by its own header.
combined_top_items.to_csv("Final-deliverable-1.csv", index=False, mode='a', header=True)