In [None]:
# ------------------------------------------------------------------------BACK UP JOBS IN DATABRICKS--------------------------------------------------------------------------
# This notebook calls the Databricks API to get the list of current Databricks jobs and saves each job's settings to the AWS S3 bucket s3://use1-s3-bcq-prod-elt-logs/databricks-jobs-settings-weekly/. A subset of the settings data for each job is also saved to a Google Sheet, https://docs.google.com/spreadsheets/d/1C_5SaXWytuWL_7GlNVVypMLTrUza6kmn7v746-0omDg/edit#gid=156737717, for a quick view of the jobs.

In [None]:
import pyspark.sql.functions as F
import pyspark.sql.types as t
import requests
import json
import pandas as pd
import datetime
import boto3

In [None]:
# Secret scope and key under which the Databricks API token is stored
DBATSECRETSCOPE = "Production-ELT"
DBPATKEY = "DB_API_Token"

# Read the token from Databricks secrets so it is never written in the notebook source
host_token = dbutils.secrets.get(DBATSECRETSCOPE, DBPATKEY)

# Authorization header (Bearer token) used to authenticate with the Databricks REST API
headers = {"Authorization": "Bearer " + host_token}

# Manually configure the Databricks workspace URL
# NOTE(review): the value below looks like a placeholder - confirm it is set to the real workspace host before running
host_name = "<nbwork-host-name>"

# S3 destination for the job-settings backups: bucket name and folder (key prefix)
# NOTE(review): the bucket value also looks like a placeholder - confirm
bucket = '<work-s3-bucket>'
folder = 'databricks-jobs-settings-weekly/'

In [None]:
%run "/PROD/Reuseable_Code/nb_prod_functions_job_backup_rebuild"

In [None]:
# Get the job ids by calling get_jobs_list with the workspace host and auth headers
# (presumably defined in the helper notebook pulled in with %run - confirm)
job_ids = get_jobs_list(host_name, headers)

# Retrieve the settings data for those job ids
jobs_data = retrieve_jobs_data(host_name, headers, job_ids)

In [None]:
# Write every job's settings record to the AWS S3 bucket/folder, one save per job id
for job_record in jobs_data:
    save_data_to_s3(bucket, folder, job_record['job_id'], job_record)

In [None]:
# Build a quick-overview table (pandas DataFrame) of the jobs in Databricks
def extract_job_data(jobs):
    """Flatten job records into a quick-overview DataFrame.

    One row is produced per entry in a job's ``settings['job_clusters']``.
    A job with no job clusters still gets a single row, with the three
    cluster columns left empty.

    Args:
        jobs: Iterable of dicts. Each dict is read with ``.get`` for the keys
            ``job_id``, ``last_run_url`` and ``settings``; missing keys fall
            back to empty values.

    Returns:
        pandas.DataFrame with the columns ``job_id``, ``name``, ``schedule``,
        ``notebooks``, ``node_type``, ``spark_version``,
        ``instance_profile_arn`` and ``last_run_url``. ``job_id`` is always a
        string and ``notebooks`` is the string form of the list of notebook
        paths.
    """
    columns = ['job_id', 'name', 'schedule', 'notebooks', 'node_type',
               'spark_version', 'instance_profile_arn', 'last_run_url']
    rows = []

    for job in jobs:
        # Always store the id as a string so the job_id column has a single
        # type whether or not the job defines clusters.
        job_id = str(job.get('job_id'))
        last_run_url = job.get('last_run_url', '')
        settings = job.get('settings', {})

        name = settings.get('name', '')

        # The schedule is shown only when both the cron expression and the
        # timezone are present; otherwise it is left blank.
        schedule_data = settings.get('schedule', {})
        quartz_expr = schedule_data.get('quartz_cron_expression', '')
        timezone = schedule_data.get('timezone_id', '')
        schedule = f"{quartz_expr} {timezone}" if quartz_expr and timezone else ''

        # Only tasks that carry a 'notebook_task' entry contribute a path.
        notebooks = str([
            task['notebook_task'].get('notebook_path', '')
            for task in settings.get('tasks', [])
            if 'notebook_task' in task
        ])

        clusters = settings.get('job_clusters', [])
        if not clusters:
            # No job clusters: emit one row with blank cluster columns.
            rows.append([job_id, name, schedule, notebooks, '', '', '', last_run_url])
            continue

        for cluster in clusters:
            try:
                new_cluster = cluster.get('new_cluster', {})
                aws_attributes = new_cluster.get('aws_attributes', {})
                rows.append([
                    job_id,
                    name,
                    schedule,
                    notebooks,
                    new_cluster.get('node_type_id', ''),
                    new_cluster.get('spark_version', ''),
                    aws_attributes.get('instance_profile_arn', ''),
                    last_run_url,
                ])
            except Exception as e:
                # Best-effort: a malformed cluster entry is reported and
                # skipped so the remaining jobs are still listed.
                print(f"Error processing cluster for job_id {job_id}: {e}")

    return pd.DataFrame(rows, columns=columns)

# Build the quick-view DataFrame from the job settings held in jobs_data
df = extract_job_data(jobs_data)


In [None]:
# Sanity check: df is a pandas DataFrame, so count() shows the number of non-null values per column (not a single row count)
df.count()

In [None]:
%run "/Users/aletia@bondcliq.com/nb_df_googlesheet"

In [None]:
# Save the quick view to the Google Sheet, along with the previous week's quick view,
# so the sheet always holds 2 weeks' worth of quick views.
# NOTE(review): rotate_and_save is not defined in this notebook - presumably it comes
# from the notebook pulled in with %run; confirm how it uses the two worksheet names.
sheet_id = 'google-sheet-id'
worksheet_name_A = 'jobs'
worksheet_name_B = 'jobs2'
try:
    rotate_and_save(df, sheet_id, worksheet_name_A, worksheet_name_B)
    print('Google Sheet Jobs Data was successfully rotated and updated for Sheet 2.')
except Exception as e:
    # Kept best-effort (the notebook does not fail if the sheet update fails), but the
    # failure is labelled with the exception type so it cannot be mistaken for normal
    # output - a bare print(e) of a KeyError, for example, shows only the missing key.
    # NOTE(review): re-raise here if a failed sheet update should fail the job.
    print(f"Google Sheet Jobs Data update FAILED ({type(e).__name__}): {e}")