### Deliverable 2a: Products Dimension table

In [1]:
import pandas as pd
import numpy as np
import sqlite3 as lite

In [2]:
output_file_path = "./output/"

In [3]:
# Load data
products = pd.read_csv("Products1.txt", delimiter="|")
product_class = pd.read_csv("product_class.csv")

conformed_products = pd.read_csv('./ConformedProducts2.txt', delimiter="\t")

In [4]:
# Create mapping dictionaries
subcategory_map = dict(
    zip(product_class['product_subcategory'], product_class['product_class_id']))

len(subcategory_map)

111

In [5]:
subcategory_list = list(subcategory_map.keys())
subcategory_list[:20]

['Nuts',
 'Shellfish',
 'Canned Fruit',
 'Spices',
 'Pasta',
 'Yogurt',
 'Coffee',
 'Deli Meats',
 'Ice Cream',
 'TV Dinner',
 'Cheese',
 'Chips',
 'Fresh Vegetables',
 'Sour Cream',
 'Cottage Cheese',
 'Deli Salads',
 'Dried Meat',
 'Paper Wipes',
 'Soda',
 'Deodorizers']

In [6]:
products['clean_itemType'] = products['itemType'].copy()


# ItemType in subcategory
valid_mask = products['clean_itemType'].isin(subcategory_map.keys())
print("ItemType matched with subcategory => ", valid_mask.sum())

ItemType matched with subcategory =>  1186


In [7]:
needed_mapping = ~valid_mask & products['clean_itemType'].notna()
print("ItemType which are not null and not valid => ", needed_mapping.sum())

ItemType which are not null and not valid =>  595


In [8]:
conn = lite.connect("store1.db")
curr = conn.cursor()

In [9]:
curr.execute("DROP TABLE IF EXISTS ProductDimension")

curr.execute("""
    CREATE TABLE ProductDimension (
    ProductKey INTEGER PRIMARY KEY AUTOINCREMENT,
    SKU VARCHAR(50) NOT NULL,
    ProductName VARCHAR(255) NOT NULL,
    ProductClassID INT NOT NULL,
    Subcategory VARCHAR(100) NOT NULL,
    Category VARCHAR(100) NOT NULL,
    Department VARCHAR(100) NOT NULL,
    ProductFamily VARCHAR(100) NOT NULL,
    Size VARCHAR(50),
    PerCase INT NOT NULL,
    BrandName VARCHAR(100),
    Supplier VARCHAR(100) NOT NULL
    );

""")

<sqlite3.Cursor at 0x1640b046cc0>

In [10]:
conformed_products.to_sql("ProductDimension", conn, if_exists="replace", index=False)

2075

In [11]:
columns = conformed_products.columns
columns

Index(['ProductKey', 'sku', 'product_name', 'product_class_id', 'subcategory',
       'category', 'department', 'product_family', 'size', 'brandName',
       'supplier'],
      dtype='object')

In [12]:
curr.execute("Select * from ProductDimension")

rows = curr.fetchall()

product_dim = pd.DataFrame(rows, columns=columns)
product_dim.head(10)

Unnamed: 0,ProductKey,sku,product_name,product_class_id,subcategory,category,department,product_family,size,brandName,supplier
0,1,42081001,Jambalaya Rice Mix,57,Rice,Starchy Foods,Starchy Foods,Food,12 oz,Zatarains,Rowan Warehouse
1,2,42082001,Jambalaya Rice Mix,57,Rice,Starchy Foods,Starchy Foods,Food,8 oz,Zatarains,Rowan Warehouse
2,3,42083001,Guacamole Regular,99,Fresh Fruit,Fruit,Produce,Food,8 oz,Yucatan,Rowan Warehouse
3,4,42084001,Coffee Original Blend,90,Coffee,Hot Beverages,Beverages,Drink,12 oz,Yuban,Rowan Warehouse
4,5,42085001,GoGurt Variety Pack,6,Yogurt,Dairy,Dairy,Food,8 ct,Yoplait,Rowan Warehouse
5,6,42086001,Italian Dressing,260,Dressings,Baking Goods,Baking Products,Food,16 oz,Wishbone,Rowan Warehouse
6,7,42087001,Cheeseburger Heat & Serve Sliders,65,Hamburger,Meat,Meat,Food,29.28 oz,White Castle,Rowan Warehouse
7,8,42088001,Choice Cuts Poultry,77,Fresh Chicken,Meat,Deli,Food,36 oz,Whiskas,Rowan Warehouse
8,9,42089001,Farmers Pick Concord Grape,99,Fresh Fruit,Fruit,Produce,Food,46 oz,Welchs,Rowan Warehouse
9,10,42090001,Juice Red Grape,30,Juice,Pure Juice Beverages,Beverages,Drink,64 oz,Welchs,Rowan Warehouse


In [13]:
product_dim.to_csv(f"{output_file_path}ProductDimension-final.csv", index=False)

##### Deliverable 2b Implementation: Metadata Tracking and Date Standardization

##### Metadata Tracking System

In [14]:
source_definition = pd.DataFrame({
    'source_id': [1, 2, 3, 4, 5],
    'source_description': [
        'Original product table',
        'Manually mapped by Pavan Kumar Mistry',
        'String match (e.g., Product Name contains "Frito Lay" → "Chips")',
        'Fuzzy matched from itemType',
        'Keyword analysis from Product Name'
    ],
    'responsible_party': [
        'ETL System',
        'Pavan Kumar Mistry',
        'Automated Matcher',
        'Fuzzy Matching Algorithm',
        'Keyword Analyzer'
    ]

})

In [15]:
valid_mask = products['clean_itemType'].isin(subcategory_map.keys())
needed_mapping = ~valid_mask & products['clean_itemType'].notna()
still_null = products['clean_itemType'].isna()

In [16]:
all_in_map = products['clean_itemType'].apply(
    lambda x: x in subcategory_map.keys()).all()

all_in_map

np.False_

In [17]:
if all_in_map:
    # tracking based on whether itemType was modified
    product_sources = pd.DataFrame({
        'SKU': products['SKU'],
        'source_id': np.where(
            products['itemType'] == products['clean_itemType'],
            1,  # original matched exactly
            np.where(
                products['itemType'].isna(),
                4,  # came from keyword analysis
                3   # was modified through fuzzy/string matching
            )
        ),
        'mapping_date': pd.Timestamp.now().strftime('%Y%m%d')
    })
else:
    # Original logic if not all values are in the map
    valid_mask = products['clean_itemType'].isin(subcategory_map.keys())
    needed_mapping = ~valid_mask & products['clean_itemType'].notna()
    still_null = products['clean_itemType'].isna()

    product_sources = pd.DataFrame({
        'SKU': products['SKU'],
        'source_id': np.where(
            valid_mask, 1,
            np.where(
                needed_mapping, 3,  # modified though fuzzy/string matching
                np.where(
                    still_null, 4,  # using keyword analysis
                    2  # Manually
                )
            )
        ),
        'mapping_date': pd.Timestamp.now().strftime('%Y%m%d')
    })

In [18]:
# Test
assert len(product_sources) == len(products), "Lengths still don't match!"

In [19]:
# Create tables
curr.execute("DROP TABLE IF EXISTS SourceDefinitions")
curr.execute("""
    CREATE TABLE SourceDefinitions (
        source_id INT PRIMARY KEY,
        source_description TEXT NOT NULL,
        responsible_party TEXT NOT NULL
    );
""")

<sqlite3.Cursor at 0x1640b046cc0>

In [20]:
source_definition.to_sql("SourceDefinitions", conn, if_exists="replace", index=False)

5

In [21]:
res = curr.execute("select * from SourceDefinitions")
rows = curr.fetchall()
for row in rows:
    print(row)

(1, 'Original product table', 'ETL System')
(2, 'Manually mapped by Pavan Kumar Mistry', 'Pavan Kumar Mistry')
(3, 'String match (e.g., Product Name contains "Frito Lay" → "Chips")', 'Automated Matcher')
(4, 'Fuzzy matched from itemType', 'Fuzzy Matching Algorithm')
(5, 'Keyword analysis from Product Name', 'Keyword Analyzer')


In [22]:
source_definition.to_csv(f"{output_file_path}SourceDefinitions.csv", index=False)

In [23]:
curr.execute("DROP TABLE IF EXISTS ProductMetaData")
curr.execute("""
    CREATE TABLE ProductMetaData (
        SKU VARCHAR(50) NOT NULL,
        source_id INT NOT NULL,
        mapping_date CHAR(8) NOT NULL,
        PRIMARY KEY (SKU, source_id),
        FOREIGN KEY (source_id) REFERENCES SourceDefinitions(source_id),
        FOREIGN KEY (SKU) REFERENCES ProductDimension(SKU)
    );
""")

<sqlite3.Cursor at 0x1640b046cc0>

In [24]:
product_sources.to_sql("ProductMetaData", conn, if_exists='replace', index=False)

2075

In [25]:
product_sources.to_csv(f"{output_file_path}ProductMetaData-final.csv", index=False)

In [26]:
res = curr.execute("select * from ProductMetaData")

product_meta_data = pd.DataFrame(res, columns=["SKU", "source_id", "mapping_date"])
product_meta_data.sample(10)

Unnamed: 0,SKU,source_id,mapping_date
1156,43242001,1,20250503
421,42502001,1,20250503
1743,43829001,1,20250503
751,42834001,1,20250503
378,42459001,1,20250503
935,43019001,1,20250503
558,42639001,4,20250503
519,42600001,1,20250503
1027,43113001,1,20250503
303,42384001,4,20250503


In [27]:
product_sources['source_id'].value_counts()

source_id
1    1186
3     595
4     294
Name: count, dtype: int64

In [28]:
curr.close()
conn.close()