# Load Conf and Credentials

## Load Directory Locations

In [7]:
import json
import os

# Load the JSON file that maps logical names (e.g. 'Rel_Pickes_Dir',
# 'BQ_Service_Key') to the paths used by later cells.
# The location can be overridden with the LOCATIONS_CONF_PATH environment
# variable so the notebook is not tied to a single machine; the original
# hard-coded path remains the default.
file_path = os.environ.get(
    'LOCATIONS_CONF_PATH',
    r'C:\Users\mike\Develop\Projects\Code Notebook\Credentials\locations_conf.json',
)
if os.path.exists(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        locations_data = json.load(f)
    print(locations_data)
else:
    # Fail here rather than only printing: every later cell reads
    # `locations_data`, so continuing would surface as an unrelated NameError.
    raise FileNotFoundError(f"File not found: {file_path}")

{'Common_Funcs_Dir': '/Users/mike/Develop/Projects/Code Notebook/Common/Functions', 'Credentials_Dir': '/Users/mike/Develop/Projects/Code Notebook/Credentials', 'Rel_Pickes_Dir': '../.pickles', 'Pub_Data_Dir': "'/Users/mike/Data/Public", 'BQ_Service_Key': '/Users/mike/Develop/Conf/GCP Service Keys/mikecancell-development-0bcca41f8486.json'}


# Load the Pickled DataFrames into Memory

In [None]:
import os
import pickle
import pandas as pd
import zipfile

# Directory that holds the pickled DataFrames (path comes from the locations config).
directory = locations_data['Rel_Pickes_Dir']


def _register_dataframe(df_name, pickled_file, from_zip):
    """Unpickle ``pickled_file`` and publish the result as the global ``df_name``.

    Args:
        df_name: Name of the global variable that will hold the DataFrame.
        pickled_file: Open binary file object positioned at the pickle data.
        from_zip: True when the pickle was read out of a ``.pkl.zip`` archive;
            only affects the progress message.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only point this notebook at pickle files from a trusted source.
    frame = pd.DataFrame(pickle.load(pickled_file))
    globals()[df_name] = frame
    if from_zip:
        print(f"Loaded DataFrame from zip: {df_name}")
    else:
        print(f"Loaded DataFrame: {df_name}")
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() would emit a stray "None" line after every summary.
    frame.info()


# sorted() makes the load order (and therefore the printed log) deterministic.
for filename in sorted(os.listdir(directory)):
    filepath = os.path.join(directory, filename)
    try:
        if filename.endswith('.pkl.zip'):
            # Strip both extensions: "name.pkl.zip" -> "name".
            df_name = os.path.splitext(os.path.splitext(filename)[0])[0]
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                for zip_info in zip_ref.infolist():
                    if zip_info.filename.endswith('.pkl'):
                        with zip_ref.open(zip_info) as file:
                            _register_dataframe(df_name, file, from_zip=True)
        elif filename.endswith('.pkl'):
            df_name = os.path.splitext(filename)[0]
            with open(filepath, 'rb') as file:
                _register_dataframe(df_name, file, from_zip=False)
    except Exception as e:
        # Best effort: report the failing file and keep loading the others.
        print(f"Error loading {filename}: {e}")

# Each pickle file is now available as a global DataFrame named after the file

Loaded DataFrame from zip: crime_facts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 585000 entries, 0 to 584999
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   dr_no           585000 non-null  object        
 1   date_rptd       585000 non-null  datetime64[ns]
 2   datetime_occ    585000 non-null  datetime64[ns]
 3   rpt_dist_no     585000 non-null  int16         
 4   vict_age        585000 non-null  int16         
 5   lat             585000 non-null  float64       
 6   lon             585000 non-null  float64       
 7   area            585000 non-null  int16         
 8   premis_cd       585000 non-null  int16         
 9   crm_cd          585000 non-null  int16         
 10  vict_sex        585000 non-null  category      
 11  vict_descent    585000 non-null  category      
 12  weapon_used_cd  585000 non-null  int16         
 13  status          585000 non-null  category      
dt

# Create a connection to Big Query

In [8]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Location of the service-account JSON key, taken from the locations config
key_path = locations_data['BQ_Service_Key']

# Build service-account credentials from that key file
credentials = service_account.Credentials.from_service_account_file(key_path)

# Instantiate the BigQuery client, using the project id carried by the credentials
client = bigquery.Client(project=credentials.project_id, credentials=credentials)

# Smoke-test the connection by enumerating the datasets in the project
datasets = list(client.list_datasets())
if not datasets:
    print(f"{client.project} project does not contain any datasets.")
else:
    print(f"Datasets in project {client.project}:")
    for dataset in datasets:
        print(f"\t{dataset.dataset_id}")

Datasets in project mikecancell-development:
	Datasets
	VC_data_job_postings_data_api
	uber_data


## Create a Dataset to Store the Tables (if needed)

In [9]:
# Imported in this cell so the cell carries its own dependency.
from google.cloud.exceptions import NotFound

# Fully-qualified dataset ID in the form "<project>.Datasets"
dataset_id = "{}.Datasets".format(client.project)

# Create the dataset only when the lookup reports that it does not exist.
# Catching NotFound specifically (instead of every Exception) lets
# authentication, permission, and network errors surface instead of being
# mistaken for "dataset missing" and followed by a creation attempt.
try:
    client.get_dataset(dataset_id)  # Make an API request.
except NotFound:
    # Construct a full Dataset object to send to the API
    dataset = bigquery.Dataset(dataset_id)

    # Specify the geographic location where the dataset should reside
    dataset.location = "US"

    # Send the dataset to the API for creation
    dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.

    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
else:
    print("Dataset {} already exists.".format(dataset_id))

Dataset mikecancell-development.Datasets already exists.


# Load the Tables to the Cloud

In [13]:
# Install the pandas-gbq package
# %pip install pandas-gbq

from pandas_gbq import to_gbq

# Short dataset name; combined with the project id to build each table id below
dataset_id = "Datasets"

# Map every DataFrame held in a global variable to its BigQuery table name.
# - list(...) snapshots the namespace so it cannot change size while scanned.
# - Underscore-prefixed names are skipped: IPython's output cache (`_`, `__`,
#   `_N`) can alias a DataFrame that is already present under its real name,
#   which would otherwise be uploaded a second time as a bogus table.
table_names = {
    df_name: f"LA_Crime_{df_name}"
    for df_name, candidate in list(globals().items())
    if isinstance(candidate, pd.DataFrame) and not df_name.startswith('_')
}

# Cast every column to str before upload.
# NOTE(review): this is lossy -- every column is uploaded as text, including
# the numeric and datetime ones. Presumably done to sidestep dtype issues
# during upload; confirm whether typed columns are wanted.
dataframes = {df_name: globals()[df_name].astype(str) for df_name in table_names}

# Create (or replace) one BigQuery table per DataFrame
for df_name, table_name in table_names.items():
    table_id = f"{client.project}.{dataset_id}.{table_name}"
    to_gbq(dataframes[df_name], table_id, project_id=client.project, if_exists='replace', credentials=credentials)
    print(f"Created table {table_id}")

100%|██████████| 1/1 [00:00<00:00, 14513.16it/s]


Created table mikecancell-development.Datasets.LA_Crime_crime_facts


100%|██████████| 1/1 [00:00<00:00, 12633.45it/s]


Created table mikecancell-development.Datasets.LA_Crime_dim_area


100%|██████████| 1/1 [00:00<00:00, 25115.59it/s]


Created table mikecancell-development.Datasets.LA_Crime_dim_crime


100%|██████████| 1/1 [00:00<00:00, 13751.82it/s]


Created table mikecancell-development.Datasets.LA_Crime_dim_location


100%|██████████| 1/1 [00:00<00:00, 27594.11it/s]


Created table mikecancell-development.Datasets.LA_Crime_dim_premise


100%|██████████| 1/1 [00:00<00:00, 35544.95it/s]


Created table mikecancell-development.Datasets.LA_Crime_dim_status


100%|██████████| 1/1 [00:00<00:00, 11037.64it/s]


Created table mikecancell-development.Datasets.LA_Crime_dim_victim


100%|██████████| 1/1 [00:00<00:00, 5454.23it/s]

Created table mikecancell-development.Datasets.LA_Crime_dim_weapon





# Delete the Tables to Save Space

In [12]:
# Remove every table listed in `table_names` from the dataset.
# A failure on one table is reported and does not stop the remaining deletions.
for table_name in table_names.values():
    table_id = f"{client.project}.{dataset_id}.{table_name}"
    try:
        client.delete_table(table_id)  # Make an API request.
    except Exception as e:
        print(f"Error deleting table {table_id}: {e}")
    else:
        print(f"Deleted table {table_id}")

Deleted table mikecancell-development.Datasets.crime_facts
Deleted table mikecancell-development.Datasets.dim_area
Deleted table mikecancell-development.Datasets.dim_crime
Deleted table mikecancell-development.Datasets.dim_location
Deleted table mikecancell-development.Datasets.dim_premise
Deleted table mikecancell-development.Datasets.dim_status
Deleted table mikecancell-development.Datasets.dim_victim
Deleted table mikecancell-development.Datasets.dim_weapon
