# Load Conf and Credentials

## Load Directory Locations

In [1]:
import json
import os

# Load the directory/credential locations from the JSON config into `locations_data`.
# NOTE(review): this absolute, machine-specific path makes the notebook non-portable;
# consider reading it from an environment variable or a path relative to the notebook.
file_path = r'C:\Users\mike\Develop\Projects\Code Notebook\Credentials\locations_conf.json'
if os.path.exists(file_path):
    # Read as UTF-8 explicitly: without `encoding`, open() uses the locale default
    # (e.g. cp1252 on Windows), which can mis-decode non-ASCII characters in the JSON.
    with open(file_path, 'r', encoding='utf-8') as f:
        locations_data = json.load(f)
    print(locations_data)
else:
    # `locations_data` is left undefined on this path, so later cells that read it
    # will fail until the path above is corrected.
    print(f"File not found: {file_path}")

{'Common_Funcs_Dir': '/Users/mike/Develop/Projects/Code Notebook/Common/Functions', 'Credentials_Dir': '/Users/mike/Develop/Projects/Code Notebook/Credentials', 'Rel_Pickes_Dir': '../.pickles', 'Pub_Data_Dir': "'/Users/mike/Data/Public", 'BQ_Service_Key': '/Users/mike/Develop/Conf/GCP Service Keys/mikecancell-development-0bcca41f8486.json'}


# Load the Pickled Dataframes into memory

In [2]:
import os
import pickle
import pandas as pd
import zipfile

# Directory that holds the pickled DataFrames (path comes from the locations config).
directory = locations_data['Rel_Pickes_Dir']

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only point this loop at pickle files you created yourself or otherwise trust.
for filename in os.listdir(directory):
    is_zipped = filename.endswith('.pkl.zip')
    if not (is_zipped or filename.endswith('.pkl')):
        continue

    filepath = os.path.join(directory, filename)
    try:
        if is_zipped:
            # "name.pkl.zip" -> "name": strip both extensions once, outside the member loop.
            df_name = os.path.splitext(os.path.splitext(filename)[0])[0]
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                for zip_info in zip_ref.infolist():
                    # Every .pkl member is bound to the same df_name, so with several
                    # members in one archive the last one loaded wins.
                    if zip_info.filename.endswith('.pkl'):
                        with zip_ref.open(zip_info) as file:
                            globals()[df_name] = pd.DataFrame(pickle.load(file))
                        print(f"Loaded DataFrame from zip: {df_name}")
                        # info() writes its summary to stdout and returns None, so it
                        # must not be wrapped in print() (that emitted a stray "None").
                        globals()[df_name].info()
        else:
            df_name = os.path.splitext(filename)[0]
            with open(filepath, 'rb') as file:
                globals()[df_name] = pd.DataFrame(pickle.load(file))
            print(f"Loaded DataFrame: {df_name}")
            globals()[df_name].info()
    except Exception as e:
        # Best effort: report the file that failed and keep loading the others.
        print(f"Error loading {filename}: {e}")

# Each pickle file is now bound to a module-level DataFrame variable named after the file.

Loaded DataFrame from zip: dim_age
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Age        45 non-null     int64   
 1   Age_Group  45 non-null     category
dtypes: category(1), int64(1)
memory usage: 601.0 bytes
None
Loaded DataFrame from zip: dim_contact
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Contacts_Count        7 non-null      int64   
 1   Contacts_Count_Group  7 non-null      category
dtypes: category(1), int64(1)
memory usage: 227.0 bytes
None
Loaded DataFrame from zip: dim_credit_limits
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6205 entries, 0 to 6204
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              ----

# Create a Connection to BigQuery

In [3]:
from google.cloud import bigquery
from google.oauth2 import service_account

# The service-account key location is read from the locations config loaded earlier.
key_path = locations_data['BQ_Service_Key']

# Build credentials from the key file, then a client bound to the key's own project.
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(project=credentials.project_id, credentials=credentials)

# Smoke-test the connection by listing the datasets visible in the project.
datasets = list(client.list_datasets())
if not datasets:
    print("{} project does not contain any datasets.".format(client.project))
else:
    print("Datasets in project {}:".format(client.project))
    for dataset in datasets:
        print("\t{}".format(dataset.dataset_id))

Datasets in project mikecancell-development:
	Datasets
	VC_data_job_postings_data_api
	data_commons
	uber_data


## Create a Dataset to Store the Tables (if needed)

In [4]:
import google.api_core.exceptions

# Fully qualified ID ("<project>.Datasets") of the dataset that will hold the tables.
dataset_id = "{}.Datasets".format(client.project)

# Create the dataset only when the lookup reports that it is missing.
try:
    client.get_dataset(dataset_id)  # Make an API request.
    print("Dataset {} already exists.".format(dataset_id))
except google.api_core.exceptions.NotFound:
    # Only "not found" should trigger creation. Authentication, permission or network
    # errors now propagate instead of being mistaken for a missing dataset.
    dataset = bigquery.Dataset(dataset_id)

    # Specify the geographic location where the dataset should reside
    dataset.location = "US"

    # Send the dataset to the API for creation
    dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.

    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

Dataset mikecancell-development.Datasets already exists.


# Load the Tables to the Cloud

## Ensure the BQ Libs are Loaded

In [31]:
# Import necessary modules
from google.cloud import bigquery
import google.api_core.exceptions

## Set the Dataframe Names to Banking Data

In [32]:
# Map every DataFrame name to its BigQuery table name: the same name with a
# "Banking_Data_" prefix.
table_names = {
    name: f'Banking_Data_{name}'
    for name in (
        'fact_table',
        'dim_age',
        'dim_contact',
        'dim_credit_limits',
        'dim_inactive',
        'dim_naive_bayes',
        'dim_revolving_bal',
        'dim_trans_amt',
        'dim_trans_cnt',
        'dim_utilization',
    )
}


## Set the Data Dict

In [33]:
# Collect the loaded DataFrames under the same keys that `table_names` uses, so each
# frame can be paired with its BigQuery table name when uploading.
dataframes = dict(
    fact_table=fact_table,
    dim_age=dim_age,
    dim_contact=dim_contact,
    dim_credit_limits=dim_credit_limits,
    dim_inactive=dim_inactive,
    dim_naive_bayes=dim_naive_bayes,
    dim_revolving_bal=dim_revolving_bal,
    dim_trans_amt=dim_trans_amt,
    dim_trans_cnt=dim_trans_cnt,
    dim_utilization=dim_utilization,
)


## Convert DataFrame Types to Appropriate BQ Types

In [34]:
# Normalise column dtypes before upload. The frames are modified in place, so the
# module-level DataFrame variables see the same changes.
#
# - object columns: list values are rendered with str(), then the whole column
#   becomes pandas' "string" dtype.
# - category columns: converted to "string".
# - int64 / float64 / datetime64[ns] columns are intentionally left untouched.
#   Re-casting float64 -> 'float' and datetime64[ns] -> 'datetime64[ns]' is a no-op,
#   and int64 -> 'int' is platform-dependent: with NumPy < 2 on Windows, 'int' means
#   int32 and would silently downcast the column.
for df_name, df in dataframes.items():
    for column in df.columns:
        col_dtype = df[column].dtype
        if col_dtype == 'object':
            # Python lists can only live in object columns, so this is the only place
            # they need to be flattened to text.
            df[column] = (
                df[column]
                .apply(lambda x: str(x) if isinstance(x, list) else x)
                .astype('string')
            )
        elif col_dtype == 'category':
            df[column] = df[column].astype('string')


## Define a Function to Create a Table

In [35]:
# Define a function to create a BigQuery table with partitioning if it does not exist
def create_table_if_not_exists(table_id, schema=None, partition_field=None):
    """
    Ensure a BigQuery table exists, optionally with daily time partitioning.

    Uses the module-level `client` for all API calls.

    Parameters:
    - table_id (str): The fully qualified BigQuery table ID (e.g., 'project_id.dataset_id.table_id').
    - schema (list[google.cloud.bigquery.SchemaField], optional): The schema for the table.
      If None, the table is created without an explicit schema.
    - partition_field (str, optional): The column name to use for daily time-based
      partitioning. If None, no partitioning is applied.
      NOTE(review): BigQuery time partitioning expects a date/time typed column —
      confirm the column type before passing a field here.

    Behavior:
    - Table missing: it is created with the given schema and partitioning.
    - Table present and no recreate needed: nothing is changed.
    - Table present without time partitioning while `partition_field` is given:
      the table is DELETED (its data is lost) and recreated with partitioning.
    - If creation reports a conflict (the table appeared in the meantime), a
      message is printed and the function returns without raising.

    Returns:
    - None. Progress is reported via print().

    Raises:
    - google.api_core.exceptions.NotFound: propagated from table creation
      (presumably when the dataset itself is missing — verify).
    - Other google.api_core.exceptions.GoogleAPICallError subclasses raised by the
      client calls are not handled here.

    Example:
    create_table_if_not_exists(
        table_id="project_id.dataset_id.table_name",
        schema=fact_table_schema,
        partition_field="some_date_column"
    )
    """
    def _new_table():
        # Build the local table definition (schema plus optional daily partitioning).
        table = bigquery.Table(table_id, schema=schema)
        if partition_field:
            table.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field=partition_field
            )
        return table

    try:
        existing = client.get_table(table_id)  # Make an API request.
    except google.api_core.exceptions.NotFound:
        existing = None

    # Recreate only when partitioning was requested and the existing table has none.
    needs_partitioning = (
        existing is not None
        and bool(partition_field)
        and existing.time_partitioning is None
    )

    if existing is not None and not needs_partitioning:
        print(f"Table {table_id} already exists.")
        return

    try:
        if needs_partitioning:
            print(f"Table {table_id} exists without partitioning, but partitioning is required. Recreating the table with partitioning.")
            # not_found_ok: a table that vanished in the meantime is fine, we recreate it anyway.
            client.delete_table(table_id, not_found_ok=True)
        client.create_table(_new_table())  # Make an API request.
    except google.api_core.exceptions.Conflict:
        # Covers both the create and the recreate path.
        print(f"Table {table_id} already exists and cannot be created again.")
        return

    if needs_partitioning:
        print(f"Recreated table {table_id} with partitioning.")
    else:
        print(f"Created table {table_id}")


In [36]:
# Define a function to load a DataFrame into a BigQuery table with partitioning
def load_to_bigquery(df, table_id, schema=None, partition_field=None):
    """
    Upload a pandas DataFrame to a BigQuery table, replacing the table's contents.

    The destination is first passed to `create_table_if_not_exists`, then a load job
    is submitted through the module-level `client` and awaited.

    Parameters:
    - df (pandas.DataFrame): The frame to upload; its row count is reported afterwards.
    - table_id (str): Fully qualified table ID (e.g., 'project_id.dataset_id.table_id').
    - schema (list[google.cloud.bigquery.SchemaField], optional): Schema handed to both
      the table creation and the load job. None means no explicit schema is supplied.
    - partition_field (str, optional): Column used for daily time partitioning.
      None means the load job is configured without partitioning.

    Behavior:
    - The load job uses WRITE_TRUNCATE, so existing rows in the table are replaced.
    - The call blocks until the load job finishes, then prints the number of rows loaded.

    Returns:
    - None.

    Raises:
    - Exceptions raised by `create_table_if_not_exists`, by submitting the load job,
      or by waiting on it are not caught here.

    Example:
    load_to_bigquery(
        df=fact_table,
        table_id="project_id.dataset_id.fact_table",
        schema=fact_table_schema
    )
    """
    # Make sure the destination table is in place before loading.
    create_table_if_not_exists(table_id, schema, partition_field)

    # Daily partitioning is configured only when a partition column was requested.
    if partition_field:
        partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=partition_field
        )
    else:
        partitioning = None

    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        schema=schema,
        time_partitioning=partitioning
    )

    load_job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
    load_job.result()  # Block until the load job has completed
    print(f"Loaded {df.shape[0]} rows into {table_id}")


## Define a Schema for the Fact Table

In [37]:
# Define the schema for the fact_table DataFrame
# Explicit BigQuery schema for fact_table, built from (column name, BigQuery type)
# pairs in column order.
fact_table_schema = [
    bigquery.SchemaField(column_name, bq_type)
    for column_name, bq_type in (
        ("Attrition_Flag", "STRING"),
        ("Age", "INTEGER"),
        ("Gender", "STRING"),
        ("Dependent_count", "INTEGER"),
        ("Education_Level", "STRING"),
        ("Marital_Status", "STRING"),
        ("Income_Category", "STRING"),
        ("Card_Category", "STRING"),
        ("Months_on_book", "INTEGER"),
        ("Total_Relationship_Count", "INTEGER"),
        ("Months_Inactive_12_mon", "INTEGER"),
        ("Contacts_Count_12_mon", "INTEGER"),
        ("Credit_Limit", "FLOAT"),
        ("Total_Revolving_Bal", "INTEGER"),
        ("Avg_Open_To_Buy", "FLOAT"),
        ("Total_Amt_Chng_Q4_Q1", "FLOAT"),
        ("Total_Trans_Amt", "INTEGER"),
        ("Total_Trans_Ct", "INTEGER"),
        ("Total_Ct_Chng_Q4_Q1", "FLOAT"),
        ("Avg_Utilization_Ratio", "FLOAT"),
        ("is_Attrited", "BOOLEAN"),
    )
]


## Load All Tables to BQ

In [25]:
# Upload every DataFrame to its BigQuery table (no partitioning). Only fact_table is
# given an explicit schema; the other tables are loaded without one.
for df_name, df in dataframes.items():
    table_id = f"{dataset_id}.{table_names[df_name]}"
    if df_name == 'fact_table':
        schema = fact_table_schema
    else:
        schema = None
    load_to_bigquery(df, table_id, schema=schema)


Created table mikecancell-development.Datasets.Banking_Data_fact_table
Loaded 10127 rows into mikecancell-development.Datasets.Banking_Data_fact_table
Created table mikecancell-development.Datasets.Banking_Data_dim_age
Loaded 45 rows into mikecancell-development.Datasets.Banking_Data_dim_age
Created table mikecancell-development.Datasets.Banking_Data_dim_contact
Loaded 7 rows into mikecancell-development.Datasets.Banking_Data_dim_contact
Created table mikecancell-development.Datasets.Banking_Data_dim_credit_limits
Loaded 6205 rows into mikecancell-development.Datasets.Banking_Data_dim_credit_limits
Created table mikecancell-development.Datasets.Banking_Data_dim_inactive
Loaded 7 rows into mikecancell-development.Datasets.Banking_Data_dim_inactive
Created table mikecancell-development.Datasets.Banking_Data_dim_naive_bayes
Loaded 1591 rows into mikecancell-development.Datasets.Banking_Data_dim_naive_bayes
Created table mikecancell-development.Datasets.Banking_Data_dim_revolving_bal
Loade

# Delete the Tables to Save Space

In [26]:
# Destructive step: ask for explicit confirmation before deleting anything.
user_input = input("Do you really want to delete the tables? (y to proceed, n or any other key to cancel): ").strip().lower()

if user_input == 'y':
    for table_name in table_names.values():
        # Build the ID from `dataset_id` (the same value used when the tables were
        # uploaded) instead of a hard-coded project/dataset, so the delete always
        # targets the tables this notebook created.
        table_id = f"{dataset_id}.{table_name}"
        try:
            client.delete_table(table_id)
            print(f"Deleted table {table_id}")
        except Exception as e:
            # Best effort: report the failure and continue with the remaining tables.
            print(f"Error deleting table {table_id}: {e}")
else:
    print("Table deletion canceled.")

Table deletion canceled.
