In [1]:
import pandas as pd

In [2]:
# Change as needed
BASE_DIR = r'C:\Users\lukef\Documents\projects\fetch_rewards'

In [3]:
# Define data paths
data_dir = BASE_DIR + '/data'
raw_data_dir = data_dir + '/raw'
cleansed_data_dir = data_dir + '/cleansed'

In [4]:
# Import data sets
dim_brands = pd.read_parquet(cleansed_data_dir + '/dim_brands.parquet')
dim_users = pd.read_parquet(cleansed_data_dir + '/dim_users.parquet')
fact_receipts = pd.read_parquet(cleansed_data_dir + '/fact_receipts.parquet')
fact_receipt_items = pd.read_parquet(cleansed_data_dir + '/fact_receipt_items.parquet')

# Helper functions

In [5]:
def basic_checks(df, column):
    """
    Performs basic checks on a DataFrame column including count of unique values and number of null or empty values.
    """
    print('\n' + '*' * 20 + '\n' + column + '\n' + '*' * 20)
    print(f"{column}: {df[column].nunique()} unique values out of {len(df)}")
    print(f"{column}: {len(df.loc[df[column].isnull() | (df[column] == '')])} null or empty values")

def id_column_checks(df, column):
    """
    Performs checks specific to ID columns: basic checks plus uniqueness check.
    """
    basic_checks(df, column)
    if df[column].nunique() == len(df):
        print(f"{column} is unique")
    else:
        print(f"{column} is not unique")

def varchar_column_checks(df, column):
    """
    Performs checks specific to varchar/string columns: basic checks plus frequency counts of values (including missing).
    """
    basic_checks(df, column)
    print(df.fillna('#Missing').groupby(column).size().sort_values(ascending=False))

def numeric_column_checks(df, column):
    """
    Performs checks specific to numeric columns: basic checks plus descriptive statistics.
    """
    basic_checks(df, column)
    print(df[column].describe())

def all_column_checks(df, column_type_dict):
    """
    Performs appropriate checks on all columns in a DataFrame, based on column types provided in a dictionary.
    """
    if 'id_columns' in column_type_dict:
        for column in column_type_dict['id_columns']:
            id_column_checks(df, column)

    if 'varchar_columns' in column_type_dict:
        for column in column_type_dict['varchar_columns']:
            varchar_column_checks(df, column)

    if 'numeric_columns' in column_type_dict:
        for column in column_type_dict['numeric_columns']:
            numeric_column_checks(df, column)

def date_order_check(df, earlier_date_column, later_date_column):
    """
    Checks for records where the later date column contains dates earlier than the earlier date column.
    """
    print(f"{sum(df[later_date_column] < df[earlier_date_column])} records with {later_date_column} before {earlier_date_column}")

def referential_integrity_check(df, column, reference_column):
    """
    Checks for orphaned values in a DataFrame column by comparing against a reference column.
    """
    print(f"{len(df.loc[~df[column].isin(reference_column)])} out of {len(df)} orphaned values of {column}")

# dim_brands

In [6]:
dim_brands.head()

Unnamed: 0,brandId,barcode,categoryCode,category,cpgId,cpgRef,name,topBrand,brandCode
0,601ac115be37ce2ead437551,511111019862,BAKING,Baking,601ac114be37ce2ead437550,Cogs,test brand @1612366101024,0.0,
1,601c5460be37ce2ead43755f,511111519928,BEVERAGES,Beverages,5332f5fbe4b03c9a25efd0ba,Cogs,Starbucks,0.0,STARBUCKS
2,601ac142be37ce2ead43755d,511111819905,BAKING,Baking,601ac142be37ce2ead437559,Cogs,test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176
3,601ac142be37ce2ead43755a,511111519874,BAKING,Baking,601ac142be37ce2ead437559,Cogs,test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051
4,601ac142be37ce2ead43755e,511111319917,CANDY_AND_SWEETS,Candy & Sweets,5332fa12e4b03c9a25efd1e7,Cogs,test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827


In [7]:
dim_brands_cols = {
    'id_columns': ['brandId', 'cpgId'],
    'varchar_columns': ['barcode', 'categoryCode', 'category', 'cpgRef', 'name', 'topBrand', 'brandCode']
}

all_column_checks(dim_brands, dim_brands_cols)


********************
brandId
********************
brandId: 1167 unique values out of 1167
brandId: 0 null or empty values
brandId is unique

********************
cpgId
********************
cpgId: 196 unique values out of 1167
cpgId: 0 null or empty values
cpgId is not unique

********************
barcode
********************
barcode: 1160 unique values out of 1167
barcode: 0 null or empty values
barcode
511111504139    2
511111305125    2
511111704140    2
511111605058    2
511111004790    2
               ..
511111303893    1
511111303732    1
511111303664    1
511111303503    1
511111919803    1
Length: 1160, dtype: int64

********************
categoryCode
********************
categoryCode: 14 unique values out of 1167
categoryCode: 650 null or empty values
categoryCode
#Missing                         650
BAKING                           359
CANDY_AND_SWEETS                  71
BEER_WINE_SPIRITS                 31
HEALTHY_AND_WELLNESS              14
GROCERY                        

In [8]:
# Check category hierarchy
dim_brands.fillna('#Missing').groupby(['categoryCode', 'category']).size()

categoryCode                   category                   
#Missing                       #Missing                       155
                               Baby                            11
                               Baking                          10
                               Beauty                           9
                               Beauty & Personal Care           6
                               Beer Wine Spirits               59
                               Beverages                       62
                               Breakfast & Cereal              40
                               Canned Goods & Soups            12
                               Condiments & Sauces             27
                               Dairy                           33
                               Deli                             6
                               Frozen                          23
                               Grocery                         28
                 

In [9]:
# Check brand hierarchy
dim_brands.fillna('#Missing').groupby(['brandCode', 'name']).size()

brandCode        name                                    
                 Baked!                                      1
                 Coors Banquet                               1
                 Cracker Barrel Macaroni & Cheese Dinners    1
                 DOVE MEN+CARE                               1
                 Gevalia Iced Coffee                         1
                                                            ..
YOPLAIT GO-GURT  Yoplait® Go-GURT®                           1
YQ YOPLAIT       YQ by Yoplait®                              1
YUBAN            Yuban Coffee                                1
ZIMA             Zima                                        1
ZUMBIDA          Zumbida                                     1
Length: 1166, dtype: int64

## Findings for dim_brands
- `brandId` is unique.
- `brandName` appears to have junk values. More investigation required.
- `category`, `brandCode`, and `topBrand` have many missing values.
- Category hierarchy is consistent.
- Brand hierarchy needs more investigation.

# dim_users

In [10]:
dim_users.head()

Unnamed: 0,userId,active,createdDateTime,lastLoginDateTime,role,signUpSource,state
0,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI
1,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI
2,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,2021-01-03 15:25:30.554,2021-01-03 15:25:30.597,consumer,Email,WI
4,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI


In [11]:
dim_users_cols = {
    'id_columns': ['userId'],
    'varchar_columns': ['active', 'role', 'signUpSource', 'state'],
    'numeric_columns': ['createdDateTime', 'lastLoginDateTime']
}

all_column_checks(dim_users, dim_users_cols)


********************
userId
********************
userId: 212 unique values out of 495
userId: 0 null or empty values
userId is not unique

********************
active
********************
active: 2 unique values out of 495
active: 0 null or empty values
active
True     494
False      1
dtype: int64

********************
role
********************
role: 2 unique values out of 495
role: 0 null or empty values
role
consumer       413
fetch-staff     82
dtype: int64

********************
signUpSource
********************
signUpSource: 2 unique values out of 495
signUpSource: 48 null or empty values
signUpSource
Email       443
#Missing     48
Google        4
dtype: int64

********************
state
********************
state: 8 unique values out of 495
state: 56 null or empty values
state
WI          396
#Missing     56
NH           20
AL           12
OH            5
IL            3
CO            1
KY            1
SC            1
dtype: int64

********************
createdDateTime
*********

In [12]:
date_order_check(dim_users, 'createdDateTime', 'lastLoginDateTime')

0 records with lastLoginDateTime before createdDateTime


In [13]:
id_column_checks(dim_users.drop_duplicates(), 'userId')


********************
userId
********************
userId: 212 unique values out of 212
userId: 0 null or empty values
userId is unique


## Findings for dim_users
- `userId` is not unique. However, `userId` is unique in the deduplicated dataset (i.e. it contains perfect duplicates)
- `role` contains values other than "consumer", which contradicts the data dictionary
- `signUpSource` is not in the data dictionary
- `signUpSource`, `state`, and `lastLoginDateTime` contain several missing values
- `lastLoginDateTime` is consistent with `createdDateTime` whenever it is not missing
- The earliest `createdDateTime` is in 2014, which is long before most values

# fct_receipts

In [14]:
fact_receipts.head()

Unnamed: 0,receiptId,bonusPointsEarned,bonusPointsEarnedReason,createDateTime,scannedDateTime,finishedDateTime,modifyDateTime,pointsAwardedDateTime,pointsEarned,purchaseDateTime,purchasedItemCount,rewardsReceiptStatus,totalSpent,userId
0,5ff1e1eb0a720f0523000575,500.0,"Receipt number 2 completed, bonus point schedu...",2021-01-03 15:25:31,2021-01-03 15:25:31,2021-01-03 15:25:31,2021-01-03 15:25:36,2021-01-03 15:25:31,500.0,2021-01-03 00:00:00,5.0,FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,5ff1e1bb0a720f052300056b,150.0,"Receipt number 5 completed, bonus point schedu...",2021-01-03 15:24:43,2021-01-03 15:24:43,2021-01-03 15:24:43,2021-01-03 15:24:48,2021-01-03 15:24:43,150.0,2021-01-02 15:24:43,2.0,FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,5ff1e1f10a720f052300057a,5.0,All-receipts receipt bonus,2021-01-03 15:25:37,2021-01-03 15:25:37,NaT,2021-01-03 15:25:42,NaT,5.0,2021-01-03 00:00:00,1.0,REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b
3,5ff1e1ee0a7214ada100056f,5.0,All-receipts receipt bonus,2021-01-03 15:25:34,2021-01-03 15:25:34,2021-01-03 15:25:34,2021-01-03 15:25:39,2021-01-03 15:25:34,5.0,2021-01-03 00:00:00,4.0,FINISHED,28.0,5ff1e1eacfcf6c399c274ae6
4,5ff1e1d20a7214ada1000561,5.0,All-receipts receipt bonus,2021-01-03 15:25:06,2021-01-03 15:25:06,2021-01-03 15:25:11,2021-01-03 15:25:11,2021-01-03 15:25:06,5.0,2021-01-02 15:25:06,2.0,FINISHED,1.0,5ff1e194b6a9d73a3a9f1052


In [15]:
fact_receipts_cols = {
    'id_columns': ['receiptId', 'userId'],
    'varchar_columns': ['bonusPointsEarnedReason', 'rewardsReceiptStatus'],
    'numeric_columns': ['bonusPointsEarned', 'createDateTime', 'scannedDateTime', 'finishedDateTime', 'modifyDateTime', 'pointsAwardedDateTime', 'pointsEarned', 'purchaseDateTime', 'purchasedItemCount', 'totalSpent']
}

all_column_checks(fact_receipts, fact_receipts_cols)


********************
receiptId
********************
receiptId: 1119 unique values out of 1119
receiptId: 0 null or empty values
receiptId is unique

********************
userId
********************
userId: 258 unique values out of 1119
userId: 0 null or empty values
userId is not unique

********************
bonusPointsEarnedReason
********************
bonusPointsEarnedReason: 9 unique values out of 1119
bonusPointsEarnedReason: 575 null or empty values
bonusPointsEarnedReason
#Missing                                                                               575
All-receipts receipt bonus                                                             183
Receipt number 1 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)    119
COMPLETE_NONPARTNER_RECEIPT                                                             71
COMPLETE_PARTNER_RECEIPT                                                                39
Receipt number 3 completed, bonus point schedule DEFAULT (5cef

In [16]:
fact_receipts['hasBonusPointsEarned'] = ~fact_receipts['bonusPointsEarned'].isnull()
fact_receipts['hasBonusPointsEarnedReason'] = ~fact_receipts['bonusPointsEarnedReason'].isnull()
fact_receipts.groupby(['hasBonusPointsEarned', 'hasBonusPointsEarnedReason']).size()

hasBonusPointsEarned  hasBonusPointsEarnedReason
False                 False                         575
True                  True                          544
dtype: int64

In [17]:
fact_receipts['hasPointsEarned'] = ~fact_receipts['pointsEarned'].isnull()
fact_receipts['hasPointsAwardedDateTime'] = ~fact_receipts['pointsAwardedDateTime'].isnull()
fact_receipts.groupby(['hasPointsEarned', 'hasPointsAwardedDateTime']).size()

hasPointsEarned  hasPointsAwardedDateTime
False            False                       510
True             False                        72
                 True                        537
dtype: int64

In [18]:
date_order_check(fact_receipts, 'createDateTime', 'modifyDateTime')
date_order_check(fact_receipts, 'createDateTime', 'scannedDateTime')
date_order_check(fact_receipts, 'scannedDateTime', 'finishedDateTime')
date_order_check(fact_receipts, 'createDateTime', 'purchaseDateTime')
date_order_check(fact_receipts, 'purchaseDateTime', 'pointsAwardedDateTime')

0 records with modifyDateTime before createDateTime
0 records with scannedDateTime before createDateTime
0 records with finishedDateTime before scannedDateTime
628 records with purchaseDateTime before createDateTime
7 records with pointsAwardedDateTime before purchaseDateTime


In [19]:
print(f"{sum(fact_receipts['finishedDateTime'] < fact_receipts['createDateTime'])} records with lastLoginDateTime before createdDateTime")

0 records with lastLoginDateTime before createdDateTime


## Findings for fact_receipts
- `receiptId` is unique
- `'hasBonusPointsEarned` and `hasBonusPointsEarnedReason` are consistent: they are either both null or both non-null
- `bonusPointsEarned` has a reasonable distribution (positive, between 5 and 750)
- `finishedDateTime` contains several missing values, which may be valid if the transactions did not finish.
- `purchaseDateTime` contains several missing values, which may be valid if the transactions did not result in a purchase.
- `purchaseDateTime` often occurs before `createDateTime`. On closer inspection, `purchaseDateTime` often contains only the date, not the date/time.
- `totalSpent` has a reasonable distribution (between $0 and $4,721.95)
- 7 records with `pointsAwardedDateTime` before `purchaseDateTime`. Should this be possible?

# fct_receipt_items

In [20]:
fact_receipt_items.head()

Unnamed: 0,receiptItemId,receiptId,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,...,itemNumber,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId
0,80bab23e-218c-4445-ab4e-538e430f2671,5ff1e1eb0a720f0523000575,4011.0,ITEM NOT FOUND,26.0,26.0,False,1,1.0,5.0,...,,,,,,,,,,
1,76cf43a0-0052-4c48-b76b-5ce1538ef076,5ff1e1bb0a720f052300056b,4011.0,ITEM NOT FOUND,1.0,1.0,,1,,1.0,...,,,,,,,,,,
1,4eba9404-308f-4729-9bca-78449262e3b1,5ff1e1bb0a720f052300056b,28400642255.0,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.0,10.0,True,2,1.0,1.0,...,,,,,,,,,,
2,cd94aab2-c8c5-448e-8278-437d6af4a3e8,5ff1e1f10a720f052300057a,,,,,False,1,1.0,,...,,,,,,,,,,
3,f6ea8b25-6a48-4e87-9466-dabe53d54d1a,5ff1e1ee0a7214ada100056f,4011.0,ITEM NOT FOUND,28.0,28.0,False,1,1.0,4.0,...,,,,,,,,,,


In [21]:
fact_receipt_items_cols = {
    'id_columns': ['receiptItemId', 'receiptId', 'partnerItemId'],
    'varchar_columns': ['barcode', 'description', 'brandCode', 'rewardsGroup', 'deleted'],
    'numeric_columns': ['finalPrice', 'itemPrice', 'pointsEarned']
}

all_column_checks(fact_receipt_items, fact_receipt_items_cols)


********************
receiptItemId
********************
receiptItemId: 6941 unique values out of 6941
receiptItemId: 0 null or empty values
receiptItemId is unique

********************
receiptId
********************
receiptId: 679 unique values out of 6941
receiptId: 0 null or empty values
receiptId is not unique

********************
partnerItemId
********************
partnerItemId: 916 unique values out of 6941
partnerItemId: 0 null or empty values
partnerItemId is not unique

********************
barcode
********************
barcode: 568 unique values out of 6941
barcode: 3851 null or empty values
barcode
#Missing        3851
4011             177
036000320893      92
034100573065      90
036000391718      87
                ... 
043000013229       1
043000012871       1
043000004944       1
043000003596       1
B08DQDHR2S         1
Length: 569, dtype: int64

********************
description
********************
description: 1889 unique values out of 6941
description: 381 null or e

## Findings for fct_receipt_items
- `receiptItemId` is unique
- `barcode` has several missing values, and also a lot of values of "4011" which are inconsistent with other values in the column
- `description` has many missing values and several more "ITEM NOT FOUND" values
- `brandCode`, `rewardsGroup` and `pointsEarned` are missing for the majority of records
- `deleted` looks reasonable, assuming blanks correspond to False
- `itemPrice` and `finalPrice` have reasonable distributions (between $0 and $441.58)

# Foreign key referential integrity

In [22]:
referential_integrity_check(fact_receipts, 'userId', dim_users['userId'])

148 out of 1119 orphaned values of userId


In [23]:
referential_integrity_check(fact_receipt_items, 'receiptId', fact_receipts['receiptId'])

0 out of 6941 orphaned values of receiptId


In [24]:
referential_integrity_check(fact_receipt_items, 'barcode', dim_brands['barcode'])

6941 out of 6941 orphaned values of barcode


In [25]:
referential_integrity_check(fact_receipt_items, 'brandCode', dim_brands['brandCode'])

1971 out of 6941 orphaned values of brandCode


## Findings for foreign key referential integrity:
- There are many orphaned `brandCode` values in the receipt items table. This is concerning because it affects the brand-level analysis in the other notebook. The data dictionary says: that `brandCode` is a "string that corresponds with the brand column in a partner product file", so the orphaned values may be unexpected.
- There are 148 orphaned `userId` values in the receipts table.
- The referential integrity of `receiptId` in the receipt items table was ensured by the data pipeline.