# Load Conf and Credentials

## Load Directory Locations

In [1]:
import json
import os

# Load the directory/credential locations from the JSON configuration file.
# Every later cell reads `locations_data`, so a missing file is a hard error:
# stopping here is clearer than a NameError (or silently reusing a stale
# value from a previous run) further down the notebook.
file_path = r'C:\Users\mike\Develop\Projects\Code Notebook\Credentials\locations_conf.json'
try:
    # Open directly instead of exists()-then-open so there is no window in
    # which the file can disappear; JSON text is read as UTF-8 rather than
    # with the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        locations_data = json.load(f)
except FileNotFoundError as exc:
    raise FileNotFoundError(f"File not found: {file_path}") from exc
print(locations_data)

{'Common_Funcs_Dir': '/Users/mike/Develop/Projects/Code Notebook/Common/Functions', 'Credentials_Dir': '/Users/mike/Develop/Projects/Code Notebook/Credentials', 'Rel_Pickes_Dir': '../.pickles', 'Pub_Data_Dir': "'/Users/mike/Data/Public", 'BQ_Service_Key': '/Users/mike/Develop/Conf/GCP Service Keys/mikecancell-development-0bcca41f8486.json'}


# Load the Pickled Dataframes into memory

In [2]:
import os
import pickle
import pandas as pd
import zipfile

directory = locations_data['Rel_Pickes_Dir']

# Load every "*.pkl" / "*.pkl.zip" file in `directory` into a notebook-level
# DataFrame variable named after the file (e.g. "crime_facts.pkl.zip" ->
# `crime_facts`).
#
# SECURITY: pickle.load() can execute arbitrary code embedded in the file.
# Only point this cell at pickle files you created yourself or fully trust.
for filename in os.listdir(directory):
    is_zipped = filename.endswith('.pkl.zip')
    if not (is_zipped or filename.endswith('.pkl')):
        continue

    filepath = os.path.join(directory, filename)
    try:
        if is_zipped:
            # Strip both extensions: "name.pkl.zip" -> "name".
            df_name = os.path.splitext(os.path.splitext(filename)[0])[0]
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                # NOTE: the variable name comes from the archive name, so if
                # an archive holds several .pkl members the last one wins.
                for zip_info in zip_ref.infolist():
                    if not zip_info.filename.endswith('.pkl'):
                        continue
                    with zip_ref.open(zip_info) as file:
                        globals()[df_name] = pd.DataFrame(pickle.load(file))
                    print(f"Loaded DataFrame from zip: {df_name}")
                    # info() prints its own report and returns None, so it
                    # must not be wrapped in print() (that adds a stray "None").
                    globals()[df_name].info()
        else:
            df_name = os.path.splitext(filename)[0]
            with open(filepath, 'rb') as file:
                globals()[df_name] = pd.DataFrame(pickle.load(file))
            print(f"Loaded DataFrame: {df_name}")
            globals()[df_name].info()
    except Exception as e:
        # Best effort: report the failing file and keep loading the others.
        print(f"Error loading {filename}: {e}")

# Now each pickle file is loaded into its own respective DataFrame variable

Loaded DataFrame from zip: crime_facts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 585000 entries, 0 to 584999
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   dr_no           585000 non-null  object        
 1   date_rptd       585000 non-null  datetime64[ns]
 2   datetime_occ    585000 non-null  datetime64[ns]
 3   rpt_dist_no     585000 non-null  int16         
 4   vict_age        585000 non-null  int16         
 5   lat             585000 non-null  float64       
 6   lon             585000 non-null  float64       
 7   area            585000 non-null  int16         
 8   premis_cd       585000 non-null  int16         
 9   crm_cd          585000 non-null  int16         
 10  vict_sex        585000 non-null  category      
 11  vict_descent    585000 non-null  category      
 12  weapon_used_cd  585000 non-null  int16         
 13  status          585000 non-null  category      
dt

# Create a connection to BigQuery

In [3]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Service-account key location comes from the configuration loaded earlier
key_path = locations_data['BQ_Service_Key']

# Build credentials from the key file, then a client bound to the key's own project
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# Smoke-test the connection: list the datasets visible in the project
datasets = list(client.list_datasets())
if not datasets:
    print("{} project does not contain any datasets.".format(client.project))
else:
    print("Datasets in project {}:".format(client.project))
    for dataset in datasets:
        print("\t{}".format(dataset.dataset_id))

Datasets in project mikecancell-development:
	Datasets
	VC_data_job_postings_data_api
	uber_data


## Create a Dataset to Store the Tables (if needed)

In [4]:
# Create the "<project>.Datasets" dataset unless it is already there.
# Imported here so this cell runs on a fresh kernel without relying on a later cell.
from google.api_core.exceptions import NotFound

# Define the dataset ID
dataset_id = "{}.Datasets".format(client.project)

try:
    client.get_dataset(dataset_id)  # Make an API request.
except NotFound:
    # Only a genuine "dataset does not exist" answer leads to creation.
    # Authentication, permission and network errors now propagate instead of
    # being mistaken for a missing dataset.
    dataset = bigquery.Dataset(dataset_id)

    # Specify the geographic location where the dataset should reside
    dataset.location = "US"

    # Send the dataset to the API for creation
    dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.

    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
else:
    print("Dataset {} already exists.".format(dataset_id))

Dataset mikecancell-development.Datasets already exists.


# Load the Tables to the Cloud

In [12]:
# Import necessary modules
from google.cloud import bigquery
import google.api_core.exceptions

# Map each local DataFrame name to its BigQuery table name ("LA_Crime_" + name)
table_names = {
    name: "LA_Crime_" + name
    for name in (
        'crime_facts',
        'dim_area',
        'dim_crime',
        'dim_location',
        'dim_premise',
        'dim_status',
        'dim_victim',
        'dim_weapon',
    )
}

# Registry of the loaded DataFrames, keyed by the same names used in `table_names`
dataframes = dict(
    crime_facts=crime_facts,
    dim_area=dim_area,
    dim_crime=dim_crime,
    dim_location=dim_location,
    dim_premise=dim_premise,
    dim_status=dim_status,
    dim_victim=dim_victim,
    dim_weapon=dim_weapon,
)

# Give every text-like column an explicit pandas 'string' dtype before upload.
#
# NOTE: the frames are modified in place, so the notebook-level variables
# (crime_facts, dim_area, ...) change as well.
#
# * Python lists can only live in object-dtype columns, so only those are
#   scanned; scanning numeric/datetime columns with a Python-level apply was
#   wasted work on large frames.
# * Integer, float and datetime columns are deliberately left untouched.
#   Re-casting float64/datetime64[ns] to themselves does nothing, and
#   astype('int') maps to the platform default integer, which is 32-bit on
#   Windows with NumPy < 2 and would narrow int64 columns.
for df_name, df in dataframes.items():
    for column in df.columns:
        col_dtype = df[column].dtype
        if col_dtype == 'object':
            values = df[column]
            # Lists are replaced by their string representation.
            if values.apply(lambda x: isinstance(x, list)).any():
                values = values.apply(lambda x: str(x) if isinstance(x, list) else x)
            df[column] = values.astype('string')
        elif col_dtype == 'category':
            df[column] = df[column].astype('string')

# Define a function to create a BigQuery table with partitioning if it does not exist
def create_table_if_not_exists(table_id, schema, partition_field=None):
    """Make sure ``table_id`` exists, day-partitioned when ``partition_field`` is given.

    Uses the notebook-level ``client``. Exactly one outcome is reported:

    * the table exists and is acceptable -> "already exists", nothing changes;
    * the table exists without time partitioning while ``partition_field`` is
      set -> the table is DELETED (its rows are lost) and recreated with
      day partitioning on ``partition_field``;
    * the table does not exist -> it is created.

    Only the presence of time partitioning is checked on an existing table,
    not which field it is partitioned on.

    Args:
        table_id: Fully qualified table id passed to the BigQuery client.
        schema: Schema passed to ``bigquery.Table``; may be ``None``.
        partition_field: Column to day-partition on, or ``None`` for no
            partitioning.

    Returns:
        None. Progress is reported with ``print``.
    """
    try:
        existing = client.get_table(table_id)  # Make an API request.
    except google.api_core.exceptions.NotFound:
        existing = None

    needs_recreate = (
        existing is not None
        and bool(partition_field)
        and existing.time_partitioning is None
    )

    if existing is not None and not needs_recreate:
        print(f"Table {table_id} already exists.")
        return

    if needs_recreate:
        print(f"Table {table_id} exists without partitioning, but partitioning is required. Recreating the table with partitioning.")
        # not_found_ok: if the table vanished in the meantime we simply create it.
        client.delete_table(table_id, not_found_ok=True)  # Delete the existing table

    table = bigquery.Table(table_id, schema=schema)
    if partition_field:
        table.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=partition_field,
        )

    try:
        client.create_table(table)  # Make an API request.
    except google.api_core.exceptions.Conflict:
        # The table appeared between the lookup and the create call.
        print(f"Table {table_id} already exists and cannot be created again.")
        return

    # Report the one thing that actually happened (previously the recreate
    # path also printed "already exists").
    if needs_recreate:
        print(f"Recreated table {table_id} with partitioning.")
    else:
        print(f"Created table {table_id}")

# Upload helper: ensure the target table exists, then replace its contents with a DataFrame
def load_to_bigquery(df, table_id, schema=None, partition_field=None):
    """Load ``df`` into the BigQuery table ``table_id``, replacing existing rows.

    The table is first ensured via ``create_table_if_not_exists``. The load job
    uses WRITE_TRUNCATE, passes ``schema`` through unchanged, and requests day
    partitioning on ``partition_field`` when one is given. The call blocks
    until the job finishes and then prints the number of rows in ``df``.
    """
    create_table_if_not_exists(table_id, schema, partition_field)

    # Day partitioning is only configured when a partition column was supplied
    partitioning = None
    if partition_field:
        partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=partition_field,
        )

    load_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        schema=schema,
        time_partitioning=partitioning,
    )

    load_job = client.load_table_from_dataframe(df, table_id, job_config=load_config)
    load_job.result()  # block until the load job has completed
    print(f"Loaded {df.shape[0]} rows into {table_id}")

# Explicit BigQuery schema for crime_facts: one (column, BigQuery type) pair per field
crime_facts_schema = [
    bigquery.SchemaField(column_name, bq_type)
    for column_name, bq_type in (
        ("dr_no", "STRING"),
        ("date_rptd", "TIMESTAMP"),
        ("datetime_occ", "TIMESTAMP"),
        ("rpt_dist_no", "INTEGER"),
        ("vict_age", "INTEGER"),
        ("lat", "FLOAT"),
        ("lon", "FLOAT"),
        ("area", "INTEGER"),
        ("premis_cd", "INTEGER"),
        ("crm_cd", "INTEGER"),
        ("vict_sex", "STRING"),
        ("vict_descent", "STRING"),
        ("weapon_used_cd", "INTEGER"),
        ("status", "STRING"),
    )
]

# Upload every DataFrame to its table. Only crime_facts gets an explicit schema
# and day partitioning on 'date_rptd'; the dimension tables pass None for both.
for df_name, df in dataframes.items():
    table_id = f"mikecancell-development.Datasets.{table_names[df_name]}"
    if df_name == 'crime_facts':
        schema, partition_field = crime_facts_schema, 'date_rptd'
    else:
        schema, partition_field = None, None
    load_to_bigquery(df, table_id, schema=schema, partition_field=partition_field)

Table mikecancell-development.Datasets.LA_Crime_crime_facts exists without partitioning, but partitioning is required. Recreating the table with partitioning.
Recreated table mikecancell-development.Datasets.LA_Crime_crime_facts with partitioning.
Table mikecancell-development.Datasets.LA_Crime_crime_facts already exists.
Loaded 585000 rows into mikecancell-development.Datasets.LA_Crime_crime_facts
Table mikecancell-development.Datasets.LA_Crime_dim_area already exists.
Loaded 21 rows into mikecancell-development.Datasets.LA_Crime_dim_area
Table mikecancell-development.Datasets.LA_Crime_dim_crime already exists.
Loaded 139 rows into mikecancell-development.Datasets.LA_Crime_dim_crime
Table mikecancell-development.Datasets.LA_Crime_dim_location already exists.
Loaded 52528 rows into mikecancell-development.Datasets.LA_Crime_dim_location
Table mikecancell-development.Datasets.LA_Crime_dim_premise already exists.
Loaded 310 rows into mikecancell-development.Datasets.LA_Crime_dim_premise
T

# Delete the Tables to Save Space

In [12]:
# Drop every uploaded table; a failure is reported and the loop moves on to the next table
for table_name in table_names.values():
    table_id = f"mikecancell-development.Datasets.{table_name}"
    try:
        client.delete_table(table_id)
    except Exception as e:
        print(f"Error deleting table {table_id}: {e}")
    else:
        print(f"Deleted table {table_id}")

Deleted table mikecancell-development.Datasets.crime_facts
Deleted table mikecancell-development.Datasets.dim_area
Deleted table mikecancell-development.Datasets.dim_crime
Deleted table mikecancell-development.Datasets.dim_location
Deleted table mikecancell-development.Datasets.dim_premise
Deleted table mikecancell-development.Datasets.dim_status
Deleted table mikecancell-development.Datasets.dim_victim
Deleted table mikecancell-development.Datasets.dim_weapon
