## Task 2: Creating a Local CSV of a Sample of Owners & Associated Transaction Records 

In [1]:
# Import dependencies 

from google.cloud import bigquery
import pandas as pd
import db_dtypes

In [2]:
# Define the BigQuery project/dataset identifiers & establish the client connection.
# NOTE(review): no explicit credentials are passed to the client below — presumably
# authentication comes from the environment (e.g. Application Default Credentials);
# confirm before running on a new machine.

gbq_proj_id = "dow-wedge-transactions"  # project the BigQuery client is bound to
dataset_id = "transactions"  # dataset passed to the schema-export call in a later cell

client = bigquery.Client(project=gbq_proj_id)  # shared client reused by the later cells

In [None]:
# Get the schema of the tables and save it to a text file for data type investigation

def save_table_schema_to_file(project_id: str, dataset_id: str, output_file: str) -> None:
    """Write the column names and types of every table in a dataset to a text file.

    For each table in ``project_id.dataset_id`` the file receives a header line
    ``Schema for table <table_id>:``, one ``<column>: <type>`` line per schema
    field, and a blank separator line.

    Uses the module-level ``client`` (a ``bigquery.Client``) for all API calls.

    Args:
        project_id: Project that owns the dataset.
        dataset_id: Dataset whose tables are inspected.
        output_file: Path of the text file to create (overwritten if it exists).
    """
    # Fully qualify the dataset so that listing and fetching both target
    # ``project_id``; a bare dataset id would be resolved against the client's
    # default project instead, which may differ from ``project_id``.
    dataset_ref = f"{project_id}.{dataset_id}"
    tables = client.list_tables(dataset_ref)
    # Explicit encoding keeps the output independent of the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        for table in tables:
            # list_tables yields lightweight list items; get_table is called
            # to obtain the full table object that carries the schema.
            table_obj = client.get_table(f"{dataset_ref}.{table.table_id}")
            f.write(f"Schema for table {table.table_id}:\n")
            f.writelines(
                f"{schema_field.name}: {schema_field.field_type}\n"
                for schema_field in table_obj.schema
            )
            f.write("\n")  # Add some spacing between tables

# Define the output file path and call the schema-export function
# NOTE(review): this is an absolute, machine-specific Windows path — change it before running elsewhere.
output_schema_file = r'C:\Users\mason\Desktop\Applied Data Analytics\Assignments\Wedge Project\wedge_table_schema.txt'
save_table_schema_to_file(gbq_proj_id, dataset_id, output_schema_file)

# Bare trailing expression: echoes the path as the cell output when run in a notebook
output_schema_file

In [3]:
# GBQ query to sample owners and extract their transaction records.
#   1. CTE `sampled_owners`: takes the distinct card_no values across all
#      `transArchive_*` tables (skipping card_no 3), shuffles them with
#      ORDER BY RAND() and keeps 700.
#   2. Main SELECT: returns the listed columns for every row whose card_no is
#      in that sample.
# Columns with inconsistent data types across tables (found in the schema file)
# are deliberately left out of the SELECT list.
# NOTE: the sample is drawn with RAND(), so expect a different set of owners on each run.
# NOTE(review): card_no 3 is presumably the shared non-owner card — confirm.

sample_query = """
WITH sampled_owners AS (
  SELECT card_no
  FROM (
    SELECT DISTINCT card_no
    FROM `dow-wedge-transactions.transactions.transArchive_*`
    WHERE card_no != 3
  )
  ORDER BY RAND()
  LIMIT 700  -- should be approx. 280 MB total file size 
)

-- Extract all records for the sampled owner (excluding columns with data type inconsistencies (altPrice, wicable, memType, staff, organic, volDiscType) found in schema file and not needed for task 3)
SELECT
    datetime,
    register_no,
    emp_no,
    trans_no,
    upc,
    description,
    trans_type,
    trans_subtype,
    trans_status,
    department,
    quantity,
    Scale,
    cost,
    unitPrice,
    total,
    regPrice,
    tax,
    taxexempt,
    foodstamp,
    discount,
    memDiscount,
    discountable,
    discounttype,
    voided,
    percentDiscount,
    ItemQtty,
    volume,
    VolSpecial,
    mixMatch,
    matched,
    numflag,
    itemstatus,
    tenderstatus,
    charflag,
    batchHeaderID,
    local,
    display,
    receipt,
    card_no,
    store,
    branch,
    match_id,
    trans_id
FROM `dow-wedge-transactions.transactions.transArchive_*`
WHERE card_no IN (SELECT card_no FROM sampled_owners);
"""

In [None]:
# Execute a query and dump its result set to a local tab-separated text file
def run_query_and_save_to_txt(query, client, destination_txt):
    """Run ``query`` through ``client`` and write the rows to ``destination_txt``.

    The result set is materialised as a DataFrame and written tab-separated,
    with a header row and without the DataFrame index. A confirmation line is
    printed once the file has been written.

    Args:
        query: SQL text to execute.
        client: Object exposing ``query(sql)`` whose result offers
            ``to_dataframe()`` (here, the BigQuery client).
        destination_txt: Path of the output file.
    """
    query_job = client.query(query)
    result_frame = query_job.to_dataframe()

    # Tab-delimited output, no index column
    result_frame.to_csv(destination_txt, index=False, sep='\t')

    print(f"Query results saved to {destination_txt}")

# Specify the destination .txt file (relative path, so it is resolved against the current working directory)
destination_txt = 'sampled_owner_transactions.txt'
# Run the sampling query with the shared client and write the tab-separated output
run_query_and_save_to_txt(sample_query, client, destination_txt)

In [None]:
# Optional: run the query and save the results to a local CSV file instead (uncomment this cell if CSV is preferred)
#def run_query_and_save_to_csv(query, client, destination_csv):
#    df = client.query(query).to_dataframe()
#    df.to_csv(destination_csv, index=False)
#    print(f"Query results saved to {destination_csv}")

# Specify the destination CSV file
#destination_csv = 'sampled_owner_transactions.csv'
#run_query_and_save_to_csv(sample_query, client, destination_csv)