### Step 1: Import Packages

In [1]:
import gzip
from google.cloud import bigquery
from google.cloud import bigquery_storage
import os
import pandas as pd
import json
import warnings
import datetime as dt
# Blanket filter: every warning category is ignored from this point on, for all
# cells that run afterwards in this kernel session.
warnings.filterwarnings(action="ignore")



### Step 2: Open the gzip files

In [2]:
# Every "*.gzip" export in the working directory. os.listdir() returns entries in
# arbitrary order, so sort them to make the load (and concat) order deterministic.
list_gzip_files = sorted(
    file_name for file_name in os.listdir(os.getcwd()) if file_name.endswith(".gzip")
)
if not list_gzip_files:
    # Fail with an explicit message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No .gzip files found in " + os.getcwd())

df_s3_list = []
for gzip_file_name in list_gzip_files:
    country_code = gzip_file_name[0:2]  # the first two characters of the file name are the country code

    # Each archive holds a single UTF-8 JSON document. Text mode lets json.load()
    # parse it straight from the stream; the with-block closes the handle on exit.
    with gzip.open(
        filename=os.path.join(os.getcwd(), gzip_file_name), mode="rt", encoding="utf-8"
    ) as gzip_stream:
        country_frame = pd.DataFrame(json.load(gzip_stream))

    # Add a column to the dataframe with the country_code
    country_frame["country_code"] = country_code

    # Also expose the frame as a module-level df_country_<cc> variable so that code
    # which looks the per-country frames up by name keeps working.
    globals()["df_country_" + country_code] = country_frame
    df_s3_list.append(country_frame)

# Combining the generated data frames into one
df_s3 = pd.concat(df_s3_list)

# Convert the "update_timestamp" column from epoch seconds to a readable timestamp.
# The conversion is done in UTC so the result does not depend on the time zone of
# the machine that runs the notebook.
df_s3["update_timestamp"] = pd.to_datetime(
    df_s3["update_timestamp"].map(int), unit="s", utc=True
).dt.strftime("%Y-%m-%d %H:%M:%S")

### Step 3: Query the Loved Brands (LB) BigQuery table

In [3]:
# BigQuery client bound to the project that hosts the table queried below.
client = bigquery.Client(project="dh-logistics-product-ops")

# All columns of one update_timestamp snapshot, limited to the countries
# ae / ar / bd and to rows where is_lb_lm = 'Y'.
query = """
    SELECT *
    FROM `dh-logistics-product-ops.pricing.final_vendor_list_all_data_loved_brands_scaled_code`
    WHERE update_timestamp = '2022-11-15 17:49:40.024391 UTC' AND country_code IN ('ae', 'ar', 'bd') AND is_lb_lm = 'Y'
"""

# Run the query, wait for it to finish, then download the rows through the
# BigQuery Storage read client with a tqdm progress bar.
lb_query_job = client.query(query=query)
lb_rows = lb_query_job.result()
lb_storage_client = bigquery_storage.BigQueryReadClient()
df_bq = lb_rows.to_dataframe(bqstorage_client=lb_storage_client, progress_bar_type="tqdm")

Downloading: 100%|██████████| 5421/5421 [00:03<00:00, 1768.03rows/s]


In [4]:
def bq_dataframes_func(dataframe, bq_dataframe=None):
    """Line up S3 vendors with BigQuery vendors, keeping the rows that do not match.

    The join is an outer join on (country_code, vendor_ids) == (country_code,
    vendor_code), so a vendor that exists on only one side stays in the result:

    * ``vendor_code`` is null  -> the S3 vendor has no BigQuery counterpart
    * ``vendor_ids`` is null   -> the BigQuery vendor has no S3 counterpart

    An inner join would silently drop both kinds of rows, which makes any
    null-based mismatch check on the result impossible to trigger.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        S3-side frame; must contain the columns "country_code" and "vendor_ids".
    bq_dataframe : pandas.DataFrame, optional
        BigQuery-side frame; must contain "country_code" and "vendor_code".
        Defaults to the module-level ``df_bq``.

    Returns
    -------
    pandas.DataFrame
        All columns of ``dataframe`` plus "vendor_code".
    """
    if bq_dataframe is None:
        bq_dataframe = df_bq

    # Only the join keys are needed from the BigQuery side.
    bq_keys = bq_dataframe[["country_code", "vendor_code"]]

    # Restrict BigQuery to the countries present in `dataframe`; otherwise vendors
    # of unrelated countries would show up as false "missing in S3" rows.
    bq_keys = bq_keys[bq_keys["country_code"].isin(dataframe["country_code"].unique())]

    df_check = pd.merge(
        left=dataframe,
        right=bq_keys,
        left_on=["country_code", "vendor_ids"],
        right_on=["country_code", "vendor_code"],
        how="outer",
    )
    return df_check

# Compare every per-country S3 frame loaded in Step 2 against BigQuery.
# The frames are taken from df_s3_list rather than discovered by scanning the
# namespace for "df_country_*" names: a name scan would also pick up stale
# variables left in the kernel by an earlier run, and needs eval() to resolve them.
df_check_list = []
for country_frame in df_s3_list:
    country_check = bq_dataframes_func(dataframe=country_frame)
    df_check_list.append(country_check)

    # Keep the per-country result reachable as df_<cc>_check. The code is read from
    # the frame itself; frames without exactly one country code (e.g. an empty
    # export) are still checked but get no named variable.
    country_codes = country_frame["country_code"].unique()
    if len(country_codes) == 1:
        globals()["df_" + country_codes[0] + "_check"] = country_check

# One combined frame with the comparison result of all countries
df_check = pd.concat(df_check_list)

In [5]:
# Non-null vendor_ids per country in the combined S3 frame
(
    df_s3
    .groupby("country_code", as_index=False)["vendor_ids"]
    .count()
)

Unnamed: 0,country_code,vendor_ids
0,ae,1011
1,ar,1959
2,bd,2451


In [6]:
# Non-null vendor_code values per country in the BigQuery result
(
    df_bq
    .groupby("country_code", as_index=False)["vendor_code"]
    .count()
)

Unnamed: 0,country_code,vendor_code
0,ae,1011
1,ar,1959
2,bd,2451


In [7]:
# Non-null vendor_code / vendor_ids per country in the merged comparison frame
(
    df_check
    .groupby("country_code", as_index=False)[["vendor_code", "vendor_ids"]]
    .count()
)

Unnamed: 0,country_code,vendor_code,vendor_ids
0,ae,1011,1011
1,ar,1959,1959
2,bd,2451,2451


In [8]:
# Per country, the number of non-null entries in every column of df_check.
# notnull() produces a boolean mask, so it has to be summed: calling count() on
# the mask only returns the number of rows, because True/False are never null.
df_check.groupby("country_code", as_index=False).apply(lambda group: group.notnull().sum())

Unnamed: 0,vendor_ids,update_timestamp,country_code,vendor_code
0,1011,1011,1011,1011
1,1959,1959,1959,1959
2,2451,2451,2451,2451


In [9]:
# vendor_code is the BigQuery-side column of df_check. A True bucket here flags rows
# without a BigQuery match.
# NOTE(review): such rows can only appear if df_check was built with a join that
# keeps unmatched rows - confirm against bq_dataframes_func.
df_check["vendor_code"].isna().value_counts()

False    5421
Name: vendor_code, dtype: int64

In [10]:
# vendor_ids is the S3-side column of df_check. A True bucket here flags rows
# without an S3 match.
# NOTE(review): such rows can only appear if df_check was built with a join that
# keeps unmatched rows - confirm against bq_dataframes_func.
df_check["vendor_ids"].isna().value_counts()

False    5421
Name: vendor_ids, dtype: int64