## Install dependencies

In [None]:
# Optional one-off setup: uncomment the lines below if these packages are missing.
# (Inside a notebook, `%pip install ...` installs into the running kernel's environment.)
# !pip install openpyxl
# !pip install pandas

## Set up environment and creds

In [None]:
import os
import calendar
import pandas as pd
import logging as log
from google.cloud import bigquery as bq
from google.oauth2 import service_account

# Location of the service-account key file used to authenticate.
JSON_KEYS_PATH = '../json-keys/gch-prod-dwh01-data-pipeline.json'

# Build service-account credentials from the key file, then a BigQuery client
# bound to the project recorded in those credentials.
credentials = service_account.Credentials.from_service_account_file(
	JSON_KEYS_PATH,
)
bq_client = bq.Client(
	project=credentials.project_id,
	credentials=credentials,
)

## Read expected results into a dataframe

In [3]:
# Load the expected totals from sheet "Sheet3"; the first row holds the column names.
expected_df = pd.read_excel(
	'srp.xlsx',
	sheet_name='Sheet3',
	header=0,
)

# Distinct values of the `date` column, in order of first appearance.
all_dates = expected_df.loc[:, 'date'].unique()

## Check if all unique dates are extracted correctly

In [4]:
# Bare expression so the notebook renders the array of unique dates.
all_dates

<DatetimeArray>
['2024-08-01 00:00:00', '2024-08-02 00:00:00', '2024-08-03 00:00:00',
 '2024-08-04 00:00:00', '2024-08-05 00:00:00', '2024-08-06 00:00:00',
 '2024-09-01 00:00:00', '2024-09-02 00:00:00', '2024-09-03 00:00:00',
 '2024-09-04 00:00:00', '2024-09-05 00:00:00', '2024-09-06 00:00:00',
 '2024-09-07 00:00:00', '2024-09-08 00:00:00', '2024-09-09 00:00:00',
 '2024-09-10 00:00:00', '2024-09-11 00:00:00', '2024-09-12 00:00:00',
 '2024-09-13 00:00:00', '2024-09-14 00:00:00', '2024-09-15 00:00:00',
 '2024-09-16 00:00:00', '2024-09-17 00:00:00', '2024-09-18 00:00:00',
 '2024-09-19 00:00:00', '2024-09-20 00:00:00', '2024-10-01 00:00:00',
 '2024-10-02 00:00:00', '2024-10-03 00:00:00', '2024-10-04 00:00:00',
 '2024-10-05 00:00:00', '2024-10-06 00:00:00', '2024-10-07 00:00:00',
 '2024-10-08 00:00:00', '2024-10-09 00:00:00', '2024-10-10 00:00:00',
 '2024-10-11 00:00:00', '2024-10-12 00:00:00', '2024-10-13 00:00:00',
 '2024-10-14 00:00:00', '2024-10-15 00:00:00']
Length: 41, dtype: datetime

In [5]:
# Number of distinct dates that will be queried against BigQuery.
len(all_dates)

41

## Inspect expected_results df

In [6]:
# Preview the first rows of the expected totals loaded from the spreadsheet.
expected_df.head()

Unnamed: 0,date,total_qty_sales,total_sales,total_margin
0,2024-08-01,21236.2142,126738.61,29163.94
1,2024-08-02,18902.6783,125851.2,29148.37
2,2024-08-03,26079.3416,169499.2,36124.83
3,2024-08-04,26414.1901,183690.14,39238.73
4,2024-08-05,20538.136,129752.49,30793.1


### Rename ```total_sales_qty``` to ```total_qty_sales```

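Even after the column was renamed in the spreadsheet, it may still be loaded as ```total_sales_qty```, so rename it explicitly here. If the column is already called ```total_qty_sales``` (as in the preview above), this step is a no-op.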

In [7]:
# Normalise the quantity column name. DataFrame.rename ignores keys that are
# not present, so this leaves the frame unchanged when the column is already
# called `total_qty_sales`.
expected_df = expected_df.rename(columns={'total_sales_qty': 'total_qty_sales'})

## Inspect expected_df again

In [8]:
# Re-check the column names after the rename step.
expected_df.head()

Unnamed: 0,date,total_qty_sales,total_sales,total_margin
0,2024-08-01,21236.2142,126738.61,29163.94
1,2024-08-02,18902.6783,125851.2,29148.37
2,2024-08-03,26079.3416,169499.2,36124.83
3,2024-08-04,26414.1901,183690.14,39238.73
4,2024-08-05,20538.136,129752.49,30793.1


## Get all mismatched data

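Query the current BigQuery totals for every expected date and collect them into a single dataframe. Mismatches are identified later, in the comparison table.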

In [10]:
# Fetch the current per-date totals from BigQuery for every expected date.
#
# NOTE: despite its name, `mismatch_df` holds the totals for ALL requested
# dates; rows that actually differ are filtered out in the comparison step.
#
# One parameterized query replaces the previous one-query-per-date loop, which
# issued a separate BigQuery job for each date and grew the dataframe with
# repeated pd.concat calls.
query = """
SELECT
	date,
	SUM(total_qty_sales) AS total_qty_sales,
	SUM(total_sales) AS total_sales,
	SUM(total_margin) AS total_margin
FROM `gch-prod-dwh01.srp.agg_possales_copy`
WHERE date IN UNNEST(@dates)
GROUP BY date
ORDER BY date
"""

# Bind the dates as a typed DATE array parameter rather than formatting them
# into the SQL text.
query_dates = [pd.Timestamp(d).date() for d in all_dates]
job_config = bq.QueryJobConfig(
	query_parameters=[
		bq.ArrayQueryParameter('dates', 'DATE', query_dates),
	]
)

# Rows come back in ascending date order (ORDER BY date).
mismatch_df = bq_client.query(query, job_config=job_config).to_dataframe()

# A date with no rows in the table produces no result row; surface that
# instead of letting the frames silently end up with different lengths.
if len(mismatch_df) != len(query_dates):
	log.warning(
		'BigQuery returned %d date rows but %d dates were requested',
		len(mismatch_df),
		len(query_dates),
	)


## Inspect the mismatched data

In [11]:
# NOTE: despite its name, mismatch_df holds the BigQuery totals for every
# queried date; the query above applies no mismatch filter.
mismatch_df.head()

Unnamed: 0,date,total_qty_sales,total_sales,total_margin
0,2024-08-01,21236.2142,126738.61,29163.94
1,2024-08-02,9555.8705,43142.5,10286.87
2,2024-08-03,13903.5728,56188.88,11392.43
3,2024-08-04,13970.0486,64507.47,13068.66
4,2024-08-05,11566.8014,43778.06,10385.05


## Create a comparison table

Set ```different``` to ```True``` for every date where at least one metric (quantity, sales or margin) differs between the expected and the current values.

In [12]:
# Build a side-by-side comparison of expected vs. current totals, matched on
# the calendar date rather than on row position. Positional alignment compares
# the wrong rows as soon as either frame is missing, repeating or reordering a
# date.
metric_columns = ['total_qty_sales', 'total_sales', 'total_margin']

# Normalise both date columns to datetime64 so the merge keys are comparable
# regardless of how each source typed them.
expected_totals = (
	expected_df[['date'] + metric_columns]
	.assign(date=lambda frame: pd.to_datetime(frame['date']))
	.rename(columns={
		'total_qty_sales': 'expected_total_qty_sales',
		'total_sales': 'expected_sales',
		'total_margin': 'expected_margin',
	})
)
current_totals = (
	mismatch_df[['date'] + metric_columns]
	.assign(date=lambda frame: pd.to_datetime(frame['date']))
	.rename(columns={
		'total_qty_sales': 'cur_total_qty_sales',
		'total_sales': 'cur_sales',
		'total_margin': 'cur_margin',
	})
)

# Same column order as before: each expected metric next to its current value.
value_columns = [
	'expected_total_qty_sales', 'cur_total_qty_sales',
	'expected_sales', 'cur_sales',
	'expected_margin', 'cur_margin',
]

# Outer join: a date present on only one side is kept, with NaN on the other
# side, and is therefore reported as different below.
comparison_df = (
	expected_totals
	.merge(current_totals, on='date', how='outer')
	.sort_values('date', ignore_index=True)
	[['date'] + value_columns]
)

# Compare at two-decimal precision, as the totals are monetary/quantity figures.
comparison_df[value_columns] = comparison_df[value_columns].astype(float).round(2)

# NaN never equals anything, so rows with a missing side are flagged as well.
comparison_df['different'] = (
	comparison_df['expected_total_qty_sales'].ne(comparison_df['cur_total_qty_sales'])
	| comparison_df['expected_sales'].ne(comparison_df['cur_sales'])
	| comparison_df['expected_margin'].ne(comparison_df['cur_margin'])
)

# Keep only the dates where at least one metric differs.
final = comparison_df[comparison_df['different']]
final

Unnamed: 0,date,expected_total_qty_sales,cur_total_qty_sales,expected_sales,cur_sales,expected_margin,cur_margin,different
1,2024-08-02,18902.68,9555.87,125851.2,43142.5,29148.37,10286.87,True
2,2024-08-03,26079.34,13903.57,169499.2,56188.88,36124.83,11392.43,True
3,2024-08-04,26414.19,13970.05,183690.14,64507.47,39238.73,13068.66,True
4,2024-08-05,20538.14,11566.8,129752.49,43778.06,30793.1,10385.05,True
5,2024-08-06,17681.04,8887.9,117909.43,41002.67,28388.03,10172.05,True
6,2024-09-01,49528.28,36961.63,242350.43,130045.22,48560.8,28518.53,True
7,2024-09-02,31294.65,22319.64,155198.02,81353.18,36703.45,20537.51,True
8,2024-09-03,28378.46,19982.98,141119.85,71854.2,31294.49,15982.74,True
9,2024-09-04,29811.6,20794.01,144340.57,68739.65,33984.47,17213.83,True
10,2024-09-05,25794.2,16546.46,146966.12,63818.31,35814.37,16599.59,True
