# Final Project Milestone 1
# Part 4: BigQuery
## Daisy Pinaroc

In [2]:
# Notebook-wide configuration read by the cells below.
# PROJECT is the project passed to every bigquery.Client created in this notebook.
PROJECT = "XXXX"
# DATASET is the BigQuery dataset that is created first and then loaded with tables.
DATASET = "XXXX"
# REGION is used both as the client location and as the dataset location.
REGION = "us-central1"

In [4]:
from google.cloud import bigquery

### Creating final_project dataset
client = bigquery.Client(project=PROJECT, location=REGION)

# Create dataset reference.
# Built explicitly from PROJECT and DATASET: Client.dataset() is deprecated in
# google-cloud-bigquery in favour of DatasetReference / "project.dataset" strings.
final_proj_dataset_ref = bigquery.DatasetReference(PROJECT, DATASET)

final_proj_dataset = bigquery.Dataset(final_proj_dataset_ref)
final_proj_dataset.location = REGION

# create dataset
# exists_ok=True keeps this cell safe to re-run: an already existing dataset is
# left untouched, so the success message is also printed when nothing new was created.
try:
    client.create_dataset(final_proj_dataset, exists_ok=True)
    print(f"Dataset '{DATASET}' created successfully.")
except Exception as e:
    # The error is reported instead of raised so the notebook keeps running.
    print(f"Error occurred while creating dataset '{DATASET}': {e}")

Dataset 'cs327e_final_project_2023' created successfully.


In [15]:
### Importing reservations_data_postgresql.csv and ticketing_data_mongodb.csv as tables in the database
### postgres.reservations.cnt_code must exist in bigquery.currency.cntry_code
### mongodb.ticketing.curr_code must exist in bigquery.currency.curr_code

# Loading reservations_data_postgresql.csv
from google.cloud import bigquery

# Load the reservations CSV from Cloud Storage into the `reservations` table.
# Any failure is printed rather than raised so the notebook keeps running.
try:
    client = bigquery.Client(project=PROJECT, location=REGION)

    # Specify references (Client.dataset() is deprecated, so build the
    # dataset reference explicitly from PROJECT and DATASET)
    dataset_ref = bigquery.DatasetReference(PROJECT, DATASET)
    table_ref = dataset_ref.table("reservations")

    # GCS path to the CSV file
    gcs_path = 'gs://XXXX'

    # Explicit schema: every column is REQUIRED except adr_line_2, which is
    # left at the default (nullable) mode.
    schema = [
        bigquery.SchemaField("res_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("cust_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("prp_nm", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("prp_ch", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("adr_line_1", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("adr_line_2", "STRING"),
        bigquery.SchemaField("city", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("state", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("postal_cd", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("lat", "NUMERIC", mode="REQUIRED"),
        bigquery.SchemaField("long", "NUMERIC", mode="REQUIRED"),
        bigquery.SchemaField("cnt_code", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("arr_date", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("dep_date", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("pmt_amt", "NUMERIC", mode="REQUIRED"),
    ]

    # Load data from GCS into BigQuery with the specified schema.
    # WRITE_TRUNCATE replaces the table contents on every run; the default
    # disposition for load jobs appends, so re-running this cell used to
    # duplicate every reservation row.
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        skip_leading_rows=1,  # the first CSV line is the header
        source_format=bigquery.SourceFormat.CSV,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    job = client.load_table_from_uri(gcs_path, table_ref, job_config=job_config)

    # Wait for the job to complete
    job.result()

    print(f"Data loaded successfully into {DATASET}.")
except Exception as e:
    print("Error occurred while loading reservations data:", e)

Data loaded successfully into cs327e_final_project_2023.


In [54]:
# Loading ticketing_data_mongodb.csv
from google.cloud import bigquery

TABLE = "ticketing"

# Load the ticketing CSV from Cloud Storage into the `ticketing` table.
# `job` is initialised up front so the except block can report load-job
# details even when the failure happens before the job is created.
job = None

try:
    client = bigquery.Client(project=PROJECT, location=REGION)

    # Specify references (Client.dataset() is deprecated, so build the
    # dataset reference explicitly from PROJECT and DATASET)
    dataset_ref = bigquery.DatasetReference(PROJECT, DATASET)
    table_ref = dataset_ref.table(TABLE)

    # GCS path to the CSV file
    gcs_path = 'gs://XXXX'

    # Explicit schema: only _id is nullable, every other column is REQUIRED.
    schema = [
        bigquery.SchemaField("_id", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("tck_id", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("cust_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("airline", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("flight_nm", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("dep_airport", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("arr_airport", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("dep_date", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("dep_time", "DATETIME", mode="REQUIRED"),
        bigquery.SchemaField("arr_date", "DATE", mode="REQUIRED"),
        bigquery.SchemaField("arr_time", "DATETIME", mode="REQUIRED"),
        bigquery.SchemaField("stops", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("tik_amt", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("curr_code", "STRING", mode="REQUIRED"),
    ]

    # Load data from GCS into BigQuery with the specified schema.
    # WRITE_TRUNCATE replaces the table contents on every run; the default
    # disposition for load jobs appends, so re-running this cell used to
    # duplicate every ticketing row.
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        skip_leading_rows=1,  # the first CSV line is the header
        source_format=bigquery.SourceFormat.CSV,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    job = client.load_table_from_uri(gcs_path, table_ref, job_config=job_config)

    # Wait for the job to complete
    job.result()

    print(f"Data loaded successfully into {DATASET} for table {TABLE}.")
except Exception as e:
    print("Error occurred while loading ticketing data:", e)

    # Print detailed error information attached to the load job, if any.
    job_errors = getattr(job, "errors", None)
    if job_errors:
        for error in job_errors:
            print(f"Error message: {error.get('message', 'N/A')}")
            print(f"Reason: {error.get('reason', 'N/A')}")
    else:
        print("No detailed error information available.")

DEBUG:google.auth.transport.requests:Making request: GET http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/?recursive=true
DEBUG:google.auth.transport.requests:Making request: GET http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/112246414225-compute@developer.gserviceaccount.com/token?scopes=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform


Data loaded successfully into cs327e_final_project_2023 for table ticketing.


In [4]:
# Disabled helper: the whole cell is a triple-quoted string, so none of it runs.
# When enabled it issues `DELETE ... WHERE TRUE` against the currency table to
# remove every row while keeping the table itself.
# NOTE(review): the recorded output under this cell presumably comes from an
# earlier run made before the code was wrapped in quotes — confirm and clear it.
# NOTE(review): this cell sits before the cell that creates the currency table,
# so on a fresh Run All the DELETE would presumably have nothing to target — confirm.
'''
from google.cloud import bigquery

### Creating final_project dataset
client = bigquery.Client(project=PROJECT, location=REGION)

TABLE = "currency"
table_ref = client.dataset(DATASET).table(TABLE)
delete_query = f"DELETE FROM `{PROJECT}.{DATASET}.{TABLE}` WHERE TRUE"
query_job = client.query(delete_query)
query_job.result()
print(f"All data cleared from {TABLE} in {DATASET}.")
'''

All data cleared from currency in cs327e_final_project_2023.


### 1. Create the table

In [77]:
from google.cloud import bigquery

client = bigquery.Client(project=PROJECT, location=REGION)

# DDL for the currency table.
# The dataset name comes from DATASET (instead of being hard-coded) so this
# cell always targets the same dataset the earlier cells created and loaded.
# All constraints are NOT ENFORCED: they are declared for documentation and
# are not checked when rows are written.
# NOTE(review): the comments in the load cell say the reservations/ticketing
# codes must exist in currency, which would make currency the referenced
# table; here currency references those tables instead — confirm the intended
# foreign-key direction.
ddl_currency = f'''CREATE OR REPLACE TABLE {DATASET}.currency(
    curr_code STRING NOT NULL REFERENCES {DATASET}.ticketing(curr_code) NOT ENFORCED,
    curr_name STRING NOT NULL,
    cntry_code STRING NOT NULL REFERENCES {DATASET}.reservations(cnt_code) NOT ENFORCED,
    cntry_name STRING NOT NULL,
    PRIMARY KEY(curr_code) NOT ENFORCED
)
'''

# Show the exact statement that is about to run.
print(ddl_currency)

try:
    query_job = client.query(ddl_currency)
    query_job.result()  # block until the DDL job has finished
    print("Created table")
except Exception as e:
    print("Error occurred while creating currency table:", e)

DEBUG:google.auth.transport.requests:Making request: GET http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/?recursive=true
DEBUG:google.auth.transport.requests:Making request: GET http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/112246414225-compute@developer.gserviceaccount.com/token?scopes=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform


CREATE OR REPLACE TABLE cs327e_final_project_2023.currency(
    curr_code STRING NOT NULL REFERENCES cs327e_final_project_2023.ticketing(curr_code) NOT ENFORCED,
    curr_name STRING NOT NULL,
    cntry_code STRING NOT NULL REFERENCES cs327e_final_project_2023.reservations(cnt_code) NOT ENFORCED,
    cntry_name STRING NOT NULL,
    PRIMARY KEY(curr_code) NOT ENFORCED
)

Created table


### 2. Insert records into the table

In [5]:
from google.cloud import bigquery
from faker import Faker
import random

client = bigquery.Client(project=PROJECT, location=REGION)

fake = Faker()

# Upper bound on the number of currency rows generated by this cell.
NUM_CURRENCY_RECORDS = 30

# Distinct currency codes present in the ticketing table.
# DISTINCT keeps the download to one row per code instead of one per ticket.
ticketing_query = f"SELECT DISTINCT curr_code FROM {DATASET}.ticketing"
result = client.query(ticketing_query).result()
ticketing_codes = [row["curr_code"] for row in result]


def get_random_ticketing_code():
    """Return one currency code picked at random from the ticketing table.

    Not used by the record generation below (codes are sampled without
    replacement there); retained so any other cell calling it keeps working.
    """
    return random.choice(ticketing_codes)


# Distinct country codes present in the reservations table.
reservations_query = f"SELECT DISTINCT cnt_code FROM {DATASET}.reservations"
result = client.query(reservations_query).result()
reservations_codes = [row["cnt_code"] for row in result]


def get_random_reservations_code():
    """Return one country code picked at random from the reservations table."""
    return random.choice(reservations_codes)


# curr_code is the declared PRIMARY KEY of the currency table, so the codes are
# sampled WITHOUT replacement: at most one row per currency code. If ticketing
# holds fewer distinct codes than NUM_CURRENCY_RECORDS, fewer rows are written.
record_count = min(NUM_CURRENCY_RECORDS, len(ticketing_codes))
currency_records = []
if reservations_codes:  # random.choice would raise on an empty list
    for unique_curr_code in random.sample(ticketing_codes, record_count):
        currency_records.append(
            (
                unique_curr_code,
                fake.currency_name(),
                get_random_reservations_code(),
                fake.country(),
            )
        )

if not currency_records:
    print("No records written: ticketing or reservations returned no codes")
else:
    # Pass the rows as an ARRAY<STRUCT> query parameter instead of pasting the
    # values into the SQL text, so names containing quotes cannot break the query.
    row_params = [
        bigquery.StructQueryParameter(
            None,
            bigquery.ScalarQueryParameter("curr_code", "STRING", curr_code),
            bigquery.ScalarQueryParameter("curr_name", "STRING", curr_name),
            bigquery.ScalarQueryParameter("cntry_code", "STRING", cntry_code),
            bigquery.ScalarQueryParameter("cntry_name", "STRING", cntry_name),
        )
        for curr_code, curr_name, cntry_code, cntry_name in currency_records
    ]
    insert_job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("currency_rows", "STRUCT", row_params)
        ]
    )

    # SQL query: UNNEST turns the array parameter into one row per struct.
    sql = f'''INSERT INTO {DATASET}.currency(curr_code, curr_name, cntry_code, cntry_name)
SELECT curr_code, curr_name, cntry_code, cntry_name
FROM UNNEST(@currency_rows)'''

    try:
        query_job = client.query(sql, job_config=insert_job_config)
        query_job.result()  # block until the INSERT has finished
        print(f"{len(currency_records)} records written into currency table")
    except Exception as e:
        print("Error occurred while writing to table:", e)

30 records written into currency table
