## Connecting to GBQ

### Imports for the code

In [1]:
# Do our imports for the code
from google.cloud import bigquery
from google.oauth2 import service_account
import os

In [2]:
path_to_files = 'C:\\Users\\meand\\Documents\\Grad School\\Fall2020\\Applied Data Analytics\\Wedge\\ada-wedge\\bigwedge_clean\\'

In [3]:
clean_wedge = os.listdir("bigwedge_clean")

In [4]:
# values are specific to my machine and project 
service_path = "C:\\Users\meand\\Documents\\Grad School\Fall2020\\Applied Data Analytics\\Wedge\\ada-wedge\\"
service_file = 'Wedge Project-ed75ab1189b3.json' # this is your authentication information  
gbq_proj_id = 'wedge-project-290522'  # change this to your project_id
gbq_dataset_id = 'big_wedge_1' # and change this to your data set ID

private_key =service_path + service_file

In [5]:
# pass in our credentials so that Python has permission to access our project.
credentials = service_account.Credentials.from_service_account_file(service_path + service_file)

In [7]:
# establish our connection
client = bigquery.Client(credentials = credentials, project=gbq_proj_id)

### Creating Tables

In [7]:
def tbl_exists(client, table_ref):
    from google.cloud.exceptions import NotFound
    try:
        client.get_table(table_ref)
        return True
    except NotFound:
        return False

In [8]:
job_config = bigquery.LoadJobConfig()
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
job_config.schema_update_options = [
    bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION # This allows us to modify the table. 
]

In [9]:
job_config.schema = [
    bigquery.SchemaField("datetime", "TIMESTAMP", mode="NULLABLE"),
    bigquery.SchemaField("register_no", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("emp_no", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("trans_no", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("upc", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("description", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("trans_type", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("trans_subtype", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("trans_status", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("department", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("quantity", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("Scale", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("cost", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("unitPrice", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("total", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("regPrice", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("altPrice", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("tax", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("taxexempt", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("foodstamp", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("wicable", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("discount", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("memDiscount", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("discountable", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("discounttype", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("voided", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("percentDiscount", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("ItemQtty", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("volDiscType", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("volume", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("VolSpecial", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("mixMatch", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("matched", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("memType", "BOOLEAN", mode="NULLABLE"),
    bigquery.SchemaField("staff", "BOOLEAN", mode="NULLABLE"),
    bigquery.SchemaField("numflag", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("itemstatus", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("tenderstatus", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("charflag", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("varflag", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("batchHeaderID", "BOOLEAN", mode="NULLABLE"),
    bigquery.SchemaField("local", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("organic", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("display", "BOOLEAN", mode="NULLABLE"),
    bigquery.SchemaField("receipt", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("card_no", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("store", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("branch", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("match_id", "FLOAT", mode="NULLABLE"),
    bigquery.SchemaField("trans_id", "FLOAT", mode="NULLABLE"),
]
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1

In [10]:
for file in clean_wedge:
    my_table,junk = file.split("_clean")#splits on _clean in csv files and removes rest as junk
    table_full_name = ".".join([gbq_proj_id,gbq_dataset_id,my_table]) #creates GBQ table name
    
    if not tbl_exists(client, table_full_name) :
        table_ref = client.create_table(table = table_full_name)
    else :
        table_ref = client.get_table(table_full_name)
    
    table = client.get_table(table_ref)
    print("Table {} contains {} columns".format(table_ref.table_id,len(table.schema)))
    
    with open(path_to_files + file, "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            table_ref,
            location="US",  # Must match the destination dataset location.
            job_config=job_config,
        )  # API request
    job.result()  # Waits for table load to complete.
    print("Loaded {} rows into {}:{}.".format(job.output_rows, 'wedge_example', table_ref.table_id))
    

    # Checks the updated length of the schema
    table = client.get_table(table)
    print("Table {} now contains {} columns.".format(table_ref.table_id, len(table.schema)))

Table transArchive_201001_201003 contains 0 columns
Loaded 2998330 rows into wedge_example:transArchive_201001_201003.
Table transArchive_201001_201003 now contains 50 columns.
Table transArchive_201004_201006 contains 0 columns
Loaded 3185807 rows into wedge_example:transArchive_201004_201006.
Table transArchive_201004_201006 now contains 50 columns.
Table transArchive_201007_201009 contains 0 columns
Loaded 2992585 rows into wedge_example:transArchive_201007_201009.
Table transArchive_201007_201009 now contains 50 columns.
Table transArchive_201010_201012 contains 0 columns
Loaded 2957586 rows into wedge_example:transArchive_201010_201012.
Table transArchive_201010_201012 now contains 50 columns.
Table transArchive_201101_201103 contains 0 columns
Loaded 2920826 rows into wedge_example:transArchive_201101_201103.
Table transArchive_201101_201103 now contains 50 columns.
Table transArchive_201104 contains 0 columns
Loaded 1066334 rows into wedge_example:transArchive_201104.
Table tran

Loaded 872161 rows into wedge_example:transArchive_201607.
Table transArchive_201607 now contains 50 columns.
Table transArchive_201608 contains 0 columns
Loaded 858168 rows into wedge_example:transArchive_201608.
Table transArchive_201608 now contains 50 columns.
Table transArchive_201609 contains 0 columns
Loaded 861248 rows into wedge_example:transArchive_201609.
Table transArchive_201609 now contains 50 columns.
Table transArchive_201610 contains 0 columns
Loaded 905092 rows into wedge_example:transArchive_201610.
Table transArchive_201610 now contains 50 columns.
Table transArchive_201611 contains 0 columns
Loaded 925314 rows into wedge_example:transArchive_201611.
Table transArchive_201611 now contains 50 columns.
Table transArchive_201612 contains 0 columns
Loaded 915707 rows into wedge_example:transArchive_201612.
Table transArchive_201612 now contains 50 columns.
Table transArchive_201701 contains 0 columns
Loaded 936741 rows into wedge_example:transArchive_201701.
Table trans