# Load the Pickled Dataframes into memory

In [1]:
import os
import pickle
import pandas as pd

# Folder that holds the pickled tables, relative to this notebook.
directory = '../.pickles'

# NOTE: pickle.load can execute arbitrary code while deserializing, so this
# folder must only ever contain files from a trusted source.
for entry in os.listdir(directory):
    # Anything that is not a .pkl file is ignored.
    if not entry.endswith('.pkl'):
        continue

    # The file name without its extension becomes the variable name.
    stem = os.path.splitext(entry)[0]

    with open(os.path.join(directory, entry), 'rb') as handle:
        # Wrap the unpickled object in a DataFrame and publish it as a
        # notebook-level variable named after the file.
        globals()[stem] = pd.DataFrame(pickle.load(handle))

# After the loop, each .pkl file is available as a DataFrame variable that
# shares the file's base name.


# Create a connection to BigQuery

In [3]:
from google.cloud import bigquery
from google.oauth2 import service_account

import os

# Path to the service account key file.
# Prefer the GOOGLE_APPLICATION_CREDENTIALS environment variable so the
# notebook is not tied to a single machine; the original local path is kept
# only as a fallback so existing setups keep working unchanged.
key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/mike/Develop/Conf/GCP Service Keys/mikecancell-development-0bcca41f8486.json",
)

# Fail early with an actionable message instead of a bare "No such file"
# raised from inside the auth library.
if not os.path.isfile(key_path):
    raise FileNotFoundError(
        f"Service account key file not found: {key_path!r}. "
        "Set GOOGLE_APPLICATION_CREDENTIALS to the path of a valid key file."
    )

# Create credentials using the service account key file
credentials = service_account.Credentials.from_service_account_file(key_path)

# Create a BigQuery client using the credentials; the project is taken from
# the key file itself.
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# Test the connection by listing datasets (this makes an API request).
datasets = list(client.list_datasets())
if datasets:
    print("Datasets in project {}:".format(client.project))
    for dataset in datasets:
        print("\t{}".format(dataset.dataset_id))
else:
    print("{} project does not contain any datasets.".format(client.project))

Datasets in project mikecancell-development:
	Datasets
	VC_data_job_postings_data_api
	uber_data


## Create a Dataset to store the tables (if needed)

In [4]:
from google.cloud.exceptions import NotFound

# Fully-qualified dataset ID: "<project>.Datasets"
dataset_id = "{}.Datasets".format(client.project)

# Check if the dataset already exists.
# Only a NotFound response means "the dataset is missing". Any other failure
# (authentication, permissions, network) must surface as-is rather than
# triggering a creation attempt.
try:
    client.get_dataset(dataset_id)  # Make an API request.
except NotFound:
    # Construct a full Dataset object to send to the API
    dataset = bigquery.Dataset(dataset_id)

    # Specify the geographic location where the dataset should reside
    dataset.location = "US"

    # Send the dataset to the API for creation
    dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.

    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
else:
    print("Dataset {} already exists.".format(dataset_id))

Dataset mikecancell-development.Datasets already exists.


# Load the Tables to the Cloud

### Load to BigQuery

In [5]:
# Install the pandas-gbq package
# %pip install pandas-gbq

from pandas_gbq import to_gbq

# Short ID (no project prefix) of the dataset that receives the tables.
dataset_id = "Datasets"

# DataFrames to upload, keyed by name. These variables were created by the
# pickle-loading cell at the top of the notebook.
dataframes = {
    "dim_location": dim_location,
    "dim_payment_type": dim_payment_type,
    "dim_taxi": dim_taxi,
    "dim_time": dim_time,
    "fact_trips": fact_trips,
    "grouped_fact_trips_by_wk": grouped_fact_trips_by_wk,
}

# DataFrame name -> destination table name. Every table keeps the name of the
# DataFrame it is built from, so this is an identity mapping.
table_names = {frame_key: frame_key for frame_key in dataframes}

# Upload each DataFrame, replacing any table that already exists under the
# same name.
destination_prefix = f"{client.project}.{dataset_id}"
for frame_key, target_table in table_names.items():
    table_id = f"{destination_prefix}.{target_table}"
    to_gbq(
        dataframe=dataframes[frame_key],
        destination_table=table_id,
        project_id=client.project,
        if_exists='replace',
        credentials=credentials,
    )
    print(f"Created table {table_id}")

100%|██████████| 1/1 [00:00<00:00, 9137.92it/s]


Created table mikecancell-development.Datasets.dim_location


100%|██████████| 1/1 [00:00<00:00, 24672.38it/s]


Created table mikecancell-development.Datasets.dim_payment_type


100%|██████████| 1/1 [00:00<00:00, 21845.33it/s]


Created table mikecancell-development.Datasets.dim_taxi


100%|██████████| 1/1 [00:00<00:00, 11459.85it/s]


Created table mikecancell-development.Datasets.dim_time


100%|██████████| 1/1 [00:00<00:00, 24818.37it/s]


Created table mikecancell-development.Datasets.fact_trips


100%|██████████| 1/1 [00:00<00:00, 12192.74it/s]

Created table mikecancell-development.Datasets.grouped_fact_trips_by_wk





# Let's Delete the Tables from BQ to avoid cost

In [6]:
# Short ID (no project prefix) of the dataset whose tables are removed.
dataset_id = "Datasets"

# Tables to drop: the same six that the upload step created.
table_names = [
    "dim_location", "dim_payment_type", "dim_taxi",
    "dim_time", "fact_trips", "grouped_fact_trips_by_wk",
]

# Drop each table in turn. not_found_ok=True makes the call succeed even when
# the table is already gone, so this cell can safely be re-run.
table_prefix = f"{client.project}.{dataset_id}"
for doomed_table in table_names:
    table_id = f"{table_prefix}.{doomed_table}"
    client.delete_table(table_id, not_found_ok=True)  # Make an API request.
    print(f"Deleted table {table_id}")

Deleted table mikecancell-development.Datasets.dim_location
Deleted table mikecancell-development.Datasets.dim_payment_type
Deleted table mikecancell-development.Datasets.dim_taxi
Deleted table mikecancell-development.Datasets.dim_time
Deleted table mikecancell-development.Datasets.fact_trips
Deleted table mikecancell-development.Datasets.grouped_fact_trips_by_wk


# Done for Now!