# Load Conf and Credentials

## Load Directory Locations

In [1]:
import json
import os

# Check if the file exists and load the JSON file into a dictionary
file_path = r'C:\Users\mike\Develop\Projects\Code Notebook\Credentials\locations_conf.json'
if os.path.exists(file_path):
    with open(file_path, 'r') as f:
        locations_data = json.load(f)
    print(locations_data)
else:
    print(f"File not found: {file_path}")

{'Common_Funcs_Dir': '/Users/mike/Develop/Projects/Code Notebook/Common/Functions', 'Credentials_Dir': '/Users/mike/Develop/Projects/Code Notebook/Credentials', 'Rel_Pickes_Dir': '../.pickles', 'Pub_Data_Dir': "'/Users/mike/Data/Public", 'BQ_Service_Key': '/Users/mike/Develop/Conf/GCP Service Keys/mikecancell-development-0bcca41f8486.json'}


# Load the Pickled Dataframes into memory

In [2]:
import os
import pickle
import pandas as pd
import zipfile

directory = locations_data['Rel_Pickes_Dir']

for filename in os.listdir(directory):
    if filename.endswith('.pkl') or filename.endswith('.pkl.zip'):
        filepath = os.path.join(directory, filename)
        try:
            if filename.endswith('.pkl.zip'):
                with zipfile.ZipFile(filepath, 'r') as zip_ref:
                    for zip_info in zip_ref.infolist():
                        if zip_info.filename.endswith('.pkl'):
                            with zip_ref.open(zip_info) as file:
                                df_name = os.path.splitext(os.path.splitext(filename)[0])[0]
                                globals()[df_name] = pd.DataFrame(pickle.load(file))
                                print(f"Loaded DataFrame from zip: {df_name}")
                                print(globals()[df_name].info())
            else:
                with open(filepath, 'rb') as file:
                    df_name = os.path.splitext(filename)[0]
                    globals()[df_name] = pd.DataFrame(pickle.load(file))
                    print(f"Loaded DataFrame: {df_name}")
                    print(globals()[df_name].info())
        except Exception as e:
            print(f"Error loading {filename}: {e}")

# Now each pickle file is loaded into its own respective DataFrame variable

Loaded DataFrame from zip: crime_facts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 585000 entries, 0 to 584999
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   dr_no           585000 non-null  object        
 1   date_rptd       585000 non-null  datetime64[ns]
 2   datetime_occ    585000 non-null  datetime64[ns]
 3   rpt_dist_no     585000 non-null  int16         
 4   vict_age        585000 non-null  int16         
 5   lat             585000 non-null  float64       
 6   lon             585000 non-null  float64       
 7   area            585000 non-null  int16         
 8   premis_cd       585000 non-null  int16         
 9   crm_cd          585000 non-null  int16         
 10  vict_sex        585000 non-null  category      
 11  vict_descent    585000 non-null  category      
 12  weapon_used_cd  585000 non-null  int16         
 13  status          585000 non-null  category      
dt

# Create a connection to Big Query

In [3]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Path to the service account key file
key_path = locations_data['BQ_Service_Key']

# Create credentials using the service account key file
credentials = service_account.Credentials.from_service_account_file(key_path)

# Create a BigQuery client using the credentials
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# Test the connection by listing datasets
datasets = list(client.list_datasets())
if datasets:
    print("Datasets in project {}:".format(client.project))
    for dataset in datasets:
        print("\t{}".format(dataset.dataset_id))
else:
    print("{} project does not contain any datasets.".format(client.project))

Loaded DataFrame from zip: crime_facts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 585000 entries, 0 to 584999
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   dr_no           585000 non-null  object        
 1   date_rptd       585000 non-null  datetime64[ns]
 2   datetime_occ    585000 non-null  datetime64[ns]
 3   rpt_dist_no     585000 non-null  int16         
 4   vict_age        585000 non-null  int16         
 5   lat             585000 non-null  float64       
 6   lon             585000 non-null  float64       
 7   area            585000 non-null  int16         
 8   premis_cd       585000 non-null  int16         
 9   crm_cd          585000 non-null  int16         
 10  vict_sex        585000 non-null  category      
 11  vict_descent    585000 non-null  category      
 12  weapon_used_cd  585000 non-null  int16         
 13  status          585000 non-null  category      
dt

## Create a Dataset to Store the Tables (if needed)

In [4]:
# Define the dataset ID
dataset_id = "{}.Datasets".format(client.project)

# Check if the dataset already exists
try:
	client.get_dataset(dataset_id)  # Make an API request.
	print("Dataset {} already exists.".format(dataset_id))
except Exception:
	# Construct a full Dataset object to send to the API
	dataset = bigquery.Dataset(dataset_id)

	# Specify the geographic location where the dataset should reside
	dataset.location = "US"

	# Send the dataset to the API for creation
	dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.

	print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

Datasets in project mikecancell-development:
	Datasets
	VC_data_job_postings_data_api
	uber_data


# Load the Tables to the Clould

## Create a Mapping Dict between the DFs and Tables

In [5]:
import pprint

schema_mappings = []

for df_name, table_name in table_names.items():
    for column in dataframes[df_name].columns:
        field_type = 'STRING'
        if dataframes[df_name][column].dtype.name == 'datetime64[ns]':
            field_type = 'TIMESTAMP'
        elif dataframes[df_name][column].dtype.name == 'float64':
            field_type = 'FLOAT'
        elif dataframes[df_name][column].dtype.name in ['int64', 'int16']:
            field_type = 'INTEGER'
        
        description = column_descriptions.get(df_name, {}).get(column, f"Description for {column}")
        
        schema_mappings.append({
            'dataframe': df_name,
            'table_name': table_name,
            'column_name': column,
            'data_type': field_type,
            'description': description
        })

schema_mappings.extend([
    {
        'dataframe': 'dim_area',
        'table_name': 'LA_Crime_dim_area',
        'columns': [
            {'column_name': 'fk_area', 'data_type': 'INT64', 'description': 'Area Code'},
            {'column_name': 'area_name', 'data_type': 'STRING', 'description': 'Area Name'}
        ]
    },
    {
        'dataframe': 'dim_crime',
        'table_name': 'LA_Crime_dim_crime',
        'columns': [
            {'column_name': 'fk_crm_cd', 'data_type': 'INT64', 'description': 'Crime Code'},
            {'column_name': 'crm_cd_desc', 'data_type': 'STRING', 'description': 'Crime Description'}
        ]
    },
    {
        'dataframe': 'dim_location',
        'table_name': 'LA_Crime_dim_location',
        'columns': [
            {'column_name': 'lat'               , 'data_type': 'FLOAT64', 'description': 'Latitude'},
            {'column_name': 'lon'               , 'data_type': 'FLOAT64', 'description': 'Longitude'},
            {'column_name': 'geo_place_id'      , 'data_type': 'INT64', 'description': 'Geo Place ID'},
            {'column_name': 'geo_osm_type'      , 'data_type': 'STRING' , 'description': 'Geo OSM Type'},
            {'column_name': 'geo_osm_id'        , 'data_type': 'INT64', 'description': 'Geo OSM ID'},
            {'column_name': 'geo_display_name'  , 'data_type': 'STRING' , 'description': 'Geo Display Name'},
            {'column_name': 'geo_road'          , 'data_type': 'STRING' , 'description': 'Geo Road'},
            {'column_name': 'geo_neighbourhood' , 'data_type': 'STRING' , 'description': 'Geo Neighbourhood'},
            {'column_name': 'geo_suburb'        , 'data_type': 'STRING' , 'description': 'Geo Suburb'},
            {'column_name': 'geo_city'          , 'data_type': 'STRING' , 'description': 'Geo City'},
            {'column_name': 'geo_state'         , 'data_type': 'STRING' , 'description': 'Geo State'},
            {'column_name': 'geo_ISO3166-2-lvl4', 'data_type': 'STRING' , 'description': 'Geo ISO3166-2 Level 4'},
            {'column_name': 'geo_postcode'      , 'data_type': 'STRING' , 'description': 'Geo Postcode'},
            {'column_name': 'geo_country'       , 'data_type': 'STRING' , 'description': 'Geo Country'},
            {'column_name': 'geo_country_code'  , 'data_type': 'STRING' , 'description': 'Geo Country Code'},
            {'column_name': 'geo_boundingbox'   , 'data_type': 'STRING' , 'description': 'Geo Bounding Box'}
        ]
    },
    {
        'dataframe': 'dim_premise',
        'table_name': 'LA_Crime_dim_premise',
        'columns': [
            {'column_name': 'fk_premis_cd'      , 'data_type': 'INT64', 'description': 'Premise Code'},
            {'column_name': 'premis_desc'       , 'data_type': 'STRING'  , 'description': 'Premise Description'}
        ]
    },
    {
        'dataframe': 'dim_status',
        'table_name': 'LA_Crime_dim_status',
        'columns': [
            {'column_name': 'fk_status'     , 'data_type': 'STRING', 'description': 'Status Code'},
            {'column_name': 'status_desc'   , 'data_type': 'STRING', 'description': 'Status Description'}
        ]
    },
    {
        'dataframe': 'dim_victim',
        'table_name': 'LA_Crime_dim_victim',
        'columns': [
            {'column_name': 'vict_descent', 'data_type': 'STRING', 'description': 'Victim Descent'},
            {'column_name': 'descent_desc', 'data_type': 'STRING', 'description': 'Descent Description'}
        ]
    },
    {
        'dataframe': 'dim_weapon',
        'table_name': 'LA_Crime_dim_weapon',
        'columns': [
            {'column_name': 'fk_weapon_used_cd' , 'data_type': 'INT64', 'description': 'Weapon Used Code'},
            {'column_name': 'weapon_desc'       , 'data_type': 'STRING'  , 'description': 'Weapon Description'}
        ]
    },
    {
        'dataframe': 'crime_facts',
        'table_name': 'LA_Crime_crime_facts',
        'columns': [
            {'column_name': 'dr_no'         , 'data_type': 'STRING'  , 'description': 'DR Number'},
            {'column_name': 'date_rptd'     , 'data_type': 'DATE'    , 'description': 'Date Reported'},
            {'column_name': 'datetime_occ'  , 'data_type': 'DATETIME', 'description': 'Date and Time of Occurrence'},
            {'column_name': 'rpt_dist_no'   , 'data_type': 'INT64', 'description': 'Report District Number'},
            {'column_name': 'vict_age'      , 'data_type': 'INT64', 'description': 'Victim Age'},
            {'column_name': 'lat'           , 'data_type': 'FLOAT'   , 'description': 'Latitude'},
            {'column_name': 'lon'           , 'data_type': 'FLOAT'   , 'description': 'Longitude'},
            {'column_name': 'area'          , 'data_type': 'INT64', 'description': 'Area'},
            {'column_name': 'premis_cd'     , 'data_type': 'INT64', 'description': 'Premise Code'},
            {'column_name': 'crm_cd'        , 'data_type': 'INT64', 'description': 'Crime Code'},
            {'column_name': 'vict_sex'      , 'data_type': 'STRING'  , 'description': 'Victim Sex'},
            {'column_name': 'vict_descent'  , 'data_type': 'STRING'  , 'description': 'Victim Descent'},
            {'column_name': 'weapon_used_cd', 'data_type': 'INT64', 'description': 'Weapon Used Code'},
            {'column_name': 'status'        , 'data_type': 'STRING'  , 'description': 'Status'}
        ]
    }
])

#pprint.pprint(schema_mappings)

## Create Empty Schemas in BQ

In [6]:
from google.cloud import bigquery

# Create empty tables in BigQuery with schema descriptions
for mapping in schema_mappings:
    table_id = f"{client.project}.{dataset_id}.{mapping['table_name']}"
    
    # Define the schema with descriptions
    schema = []
    if 'columns' in mapping:
        for column_info in mapping['columns']:
            schema_field = bigquery.SchemaField(
                column_info['column_name'],
                column_info['data_type'],
                mode='NULLABLE',
                description=column_info['description']
            )
            schema.append(schema_field)
    else:
        for column in dataframes[mapping['dataframe']].columns:
            field_type = 'STRING'
            if dataframes[mapping['dataframe']][column].dtype.name == 'datetime64[ns]':
                field_type = 'TIMESTAMP'
            elif dataframes[mapping['dataframe']][column].dtype.name == 'float64':
                field_type = 'FLOAT'
            elif dataframes[mapping['dataframe']][column].dtype.name in ['int64', 'int16']:
                field_type = 'INTEGER'
            
            description = column_descriptions.get(mapping['dataframe'], {}).get(column, f"Description for {column}")
            
            schema_field = bigquery.SchemaField(
                column,
                field_type,
                mode='NULLABLE',
                description=description
            )
            schema.append(schema_field)
    
    # Create a table object with the schema
    table = bigquery.Table(table_id, schema=schema)
    
    # Create or update the table in BigQuery
    table = client.create_table(table, exists_ok=True)
    print(f"Created table {table_id} with schema descriptions")

## Now Load the Frame Data

In [8]:
import pandas_gbq

# Load the data into the BigQuery tables
for mapping in schema_mappings:
    table_id = f"{client.project}.{dataset_id}.{mapping['table_name']}"
    
    # Load the data into the table using pandas_gbq
    pandas_gbq.to_gbq(dataframes[mapping['dataframe']], table_id, project_id=client.project, if_exists='replace', credentials=credentials)
    print(f"Loaded data into table {table_id}")

# Delete the Tables to Save Space

In [10]:
# Define dataset_id
dataset_id = "{}.Datasets".format(client.project)

# Define table_names dictionary
table_names = {
    'dim_area': 'LA_Crime_dim_area',
    'dim_crime': 'LA_Crime_dim_crime',
    'dim_location': 'LA_Crime_dim_location',
    'dim_premise': 'LA_Crime_dim_premise',
    'dim_status': 'LA_Crime_dim_status',
    'dim_victim': 'LA_Crime_dim_victim',
    'dim_weapon': 'LA_Crime_dim_weapon',
    'crime_facts': 'LA_Crime_crime_facts'
}

# Delete tables in BigQuery
for table_name in table_names.values():
    table_id = f"{client.project}.{dataset_id}.{table_name}"
    try:
        client.delete_table(table_id)  # Make an API request.
        print(f"Deleted table {table_id}")
    except Exception as e:
        print(f"Error deleting table {table_id}: {e}")

Error deleting table mikecancell-development.mikecancell-development.Datasets.LA_Crime_dim_area: table_id must be a fully-qualified ID in standard SQL format, e.g., "project.dataset.table_id", got mikecancell-development.mikecancell-development.Datasets.LA_Crime_dim_area
Error deleting table mikecancell-development.mikecancell-development.Datasets.LA_Crime_dim_crime: table_id must be a fully-qualified ID in standard SQL format, e.g., "project.dataset.table_id", got mikecancell-development.mikecancell-development.Datasets.LA_Crime_dim_crime
Error deleting table mikecancell-development.mikecancell-development.Datasets.LA_Crime_dim_location: table_id must be a fully-qualified ID in standard SQL format, e.g., "project.dataset.table_id", got mikecancell-development.mikecancell-development.Datasets.LA_Crime_dim_location
Error deleting table mikecancell-development.mikecancell-development.Datasets.LA_Crime_dim_premise: table_id must be a fully-qualified ID in standard SQL format, e.g., "proje