Get the environment value from the workflow using dbutils.widgets. Any value other than 'qa'/'QA' or 'prod'/'PROD' falls back to 'DEV'.

In [None]:
# Register the 'env' text widget (empty default) and read the value supplied by the workflow.
dbutils.widgets.text('env','')
env=dbutils.widgets.get('env')

# Pick the catalog for the requested environment. Only the all-lowercase and
# all-uppercase spellings of QA/PROD are recognised; anything else means DEV.
if env in ('qa','QA'):
    job_catalog='_qa'
elif env in ('prod','PROD'):
    job_catalog='_prod'
else:
    env='DEV'
    job_catalog='_dev'


- Checks the metadata table for any records that have a failed status and verifies whether incidents have already been created for those records.
- If incidents have been created, ignore them and exit; otherwise, check the incident table for any incident created for other tasks from the same job run.
- If such an incident exists, get the incident number and check the status of the incident to ensure it is still active.
- If the incident is closed, create a new incident; otherwise, update the activity log of the existing incident with the failed task info of the same job run.

In [None]:
import json

def retrieve_taskinfo_from_metadata_table_check_incident(incident_url,access_token_url,client_id,client_secret,resource,urgency,impact,cmdb_ci,caller_id,location,assignment_group):
    """Create or update an incident for every failed task that has none yet.

    Selects executions from the last 6 days with JOB_STATUS 'Failed' that have
    no matching row in the incidents table. For each of them:

    * if another task of the same job run already has an incident and its
      state is neither '800' nor '900', the task details are added to that
      incident and the task is recorded in the incidents table;
    * otherwise (no such incident, or its state is '800'/'900') a new
      incident is created.

    When the first query returns nothing, a message is printed and exit() is
    called. The connection and incident arguments are handed on unchanged to
    getincidentstatus, update_existing_incident and create_servicenow_incident.
    The function itself returns nothing.
    """
    failed_without_incident_sql=(f'''

                select distinct * from
                {job_catalog}._utils.metadata__execution m
                left join {job_catalog}._utils.metadata___incidents i
                    on m.job_id =i.job_id
                    and m.run_id = i.run_id
                    and m.task_id = i.task_id
                where 
                m.LOAD_START_DT>=now()- INTERVAL 6 days 
                and m.JOB_STATUS='Failed'
                and i.incident_number is  null
                    
                ''')

    failed_rows = spark.sql(failed_without_incident_sql).collect()

    # Nothing to report: either no task failed or every failure already has an incident.
    if not failed_rows:
        print('No failed taks found or the incidents already exists for the failed tasks')
        exit()

    for row in failed_rows:
        job_id=row['JOB_ID']
        run_id=row['RUN_ID']
        task_id=row['TASK_ID']

        # Full payload goes into the incident description / comment.
        task_info={
            'job_id':job_id,
            'run_id':run_id,
            'task_id':task_id,
            'task_name':row['TASK_NAME'],
            'Workspace_URL':row['WORKSPACE_URL'],
            'notebook_path':row['NOTEBOOK_PATH'],
            'result_state':row['JOB_STATUS'],
            'state_message':row['EXCEPTION'],
        }
        # Short payload is used as the incident's short description.
        task_info_short={field:task_info[field] for field in ('task_name','result_state')}

        description=json.dumps(task_info)
        short_description=json.dumps(task_info_short)

        # Same argument list is needed by both "create" paths below.
        create_args=(incident_url,access_token_url,client_id,client_secret,resource,urgency,short_description,impact,cmdb_ci,caller_id,location,assignment_group,description,job_id,run_id,task_id)

        print("Incident is not yet created for the provided task. Check if a task already exists with the job_id and run_id")
        sibling_incident_sql=(f'''
                select distinct
                    *
                from
                    {job_catalog}._utils.metadata__execution m
                    left join {job_catalog}._utils.metadata___incidents i
                    on m.job_id =i.job_id
                    and m.run_id = i.run_id
                    and i.incident_number is not null
                where 
                i.job_id='{job_id}' and i.run_id='{run_id}' and i.task_id !='{task_id}'
                    order by i.incident_number desc
                    limit 1

                ''')

        sibling_rows = spark.sql(sibling_incident_sql).collect()

        # No other task of this job run has an incident: open a fresh one.
        if not sibling_rows:
            print("No incident exists with the smae job_id and run_id. proceed with incident creation")
            create_status=create_servicenow_incident(*create_args)
            print(create_status)
            print(task_info)
            continue

        print("Incident already exists with given job_id and run_id, retrieve incident number;check the incident status")
        incident_number=sibling_rows[0]['incident_number']
        print(incident_number)

        state=getincidentstatus(incident_url,incident_number,access_token_url,client_id,client_secret,resource)
        print(state)

        # States '800' and '900' are treated as closed: the old incident cannot be reused.
        if state in ('800','900'):
            print('Incident is closed.Create new incident for the new task')
            create_servicenow_incident(*create_args)
            continue

        print("update the existing incident")
        updation_status=update_existing_incident(incident_url,incident_number,task_info,access_token_url,client_id,client_secret,resource)
        print(updation_status)

        if updation_status[0] ==200:
            print('--------incident updated-------------')
            # Remember that this task is now covered by the existing incident.
            insert_into_metadata__execution_failed_runs(job_id,run_id,task_id,incident_number)
            print('Incident details updated in incidents table')
        else:
            print('Error occured while updating the incident.')
            print(updation_status[1])
            print(task_info)
    

Check the status of the incident to determine whether it has been closed/resolved or is still active.

In [None]:
import requests,json

def getincidentstatus(incident_url,incident_number,access_token_url,client_id,client_secret,resource):
    """Return the ``state`` field of the incident with the given number.

    Sends a GET to ``<incident_url>?sysparm_query=number=<incident_number>``
    (a trailing '/' on ``incident_url`` is dropped first), authenticated with
    a bearer token obtained from get_access_token.

    Args:
        incident_url: Base URL of the incident endpoint.
        incident_number: Incident number to look up (string).
        access_token_url, client_id, client_secret, resource: Passed through
            to get_access_token.

    Returns:
        The ``state`` value of the first record in the response's ``result``
        list.

    Raises:
        IndexError: The service answered 200 but returned no record for the
            requested incident number.

    On any non-200 response the status code and body are printed and exit()
    is called.
    """
    url=incident_url.rstrip('/')+'?sysparm_query=number='+incident_number
    access_token=get_access_token(access_token_url,client_id,client_secret,resource)

    headers={
        'Accept':'application/json',
        'Content-Type':'application/json',
        'Authorization':f'Bearer {access_token}'
    }

    response=requests.get(url,headers=headers)

    if response.status_code==200:
        data=response.json().get('result')
        print(data)
        # An empty or missing result list would otherwise surface as an
        # unexplained subscript error; say which incident could not be found.
        if not data:
            raise IndexError(f'No incident record returned for incident number {incident_number}')
        return data[0]['state']

    else:
        print("error:",response.status_code,response.text)
        exit()
        

If the incident is still active, then go ahead with the update.

In [None]:
import requests,json
 

def update_existing_incident(incident_url,incident_number,task_info,access_token_url,client_id,client_secret,resource):
    """Add the failed-task details to an existing incident as a comment.

    Sends a PUT to ``<incident_url>/<incident_number>`` whose JSON body carries
    ``task_info`` (serialised with json.dumps) in ``incident_object.comments``.

    Args:
        incident_url: Base URL of the incident endpoint, with or without a
            trailing '/'.
        incident_number: Number of the incident to update (string).
        task_info: JSON-serialisable details of the failed task.
        access_token_url, client_id, client_secret, resource: Passed through
            to get_access_token.

    Returns:
        A ``(status_code, response_json)`` tuple when the service answers 200.

    On any other status the status code, headers and error body are printed
    and exit() is called.
    """
    # Join base URL and incident number with exactly one '/', so the request
    # works whether or not the configured URL ends with a slash
    # (getincidentstatus already tolerates both forms).
    api_url=incident_url.rstrip('/')+'/'+incident_number
    access_token=get_access_token(access_token_url,client_id,client_secret,resource)

    # Only the comments field is sent; the rest of the incident is left as is.
    incident_data={
        "incident_object":{
            "comments":json.dumps(task_info)
        }
    }

    headers={'Authorization': f'Bearer {access_token}',"Accept":"application/json"}

    response=requests.put(api_url,headers=headers,json=incident_data)

    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. a gateway error
        # page); fall back to the raw text so the diagnostic is still printed.
        try:
            error_body=response.json()
        except ValueError:
            error_body=response.text
        print('Status:', response.status_code, 'Headers:', response.headers, 'Error Response:',error_body)
        exit()

    else:
        data=response.json()
        print("Incident updated successfully")
        return response.status_code,data


Once the incident is successfully updated, insert the data into incidents table.

In [None]:
def insert_into_metadata__execution_failed_runs(job_id,run_id,task_id,incident_number):
    """Record which incident covers a failed task.

    Inserts one row (job_id, run_id, task_id, incident_number and the current
    timestamp) into the metadata___incidents table. Any error raised while
    building or running the statement is printed and otherwise ignored.
    Returns None.
    """
    try:
        # Assembled from two pieces; the resulting statement text is a single
        # INSERT ... VALUES command spread over two lines.
        insert_sql=(
            f'insert into {job_catalog}._utils.metadata___incidents(job_id,run_id,task_id,incident_number,insert_ts)\n'
            f"        values('{job_id}','{run_id}','{task_id}','{incident_number}',CURRENT_TIMESTAMP()) "
        )
        spark.sql(insert_sql)
    except Exception as insert_error:
        print(insert_error)
      

If an incident was not created and there are no incidents from the same job run, then create a new incident. Also, if there exists an incident, but it is closed or resolved, then go ahead with creating a new incident.


In [None]:
import requests, json

def create_servicenow_incident(url,access_token_url,client_id,client_secret,resource,urgency,short_description,impact,cmdb_ci,caller_id,location,assignment_group,description,job_id,run_id,task_id):
    """Create a new incident and record it against the failed task.

    POSTs an ``incident_object`` built from the given fields to ``url`` and,
    on HTTP 200, stores the returned incident number together with
    job_id/run_id/task_id via insert_into_metadata__execution_failed_runs.

    Args:
        url: Incident endpoint to POST to.
        access_token_url, client_id, client_secret, resource: Passed through
            to get_access_token.
        urgency, short_description, impact, cmdb_ci, caller_id, location,
        assignment_group, description: Field values placed in the
            ``incident_object`` payload.
        job_id, run_id, task_id: Identify the failed task the incident is for.

    Returns:
        The decoded JSON body of the create response.

    On any status other than 200 the status code, headers and error body are
    printed and exit() is called.
    """
    access_token=get_access_token(access_token_url,client_id,client_secret,resource)

    headers={
        'Authorization': f'Bearer {access_token}',
        'Content-Type':'application/json',
        'Accept':'application/json',
    }

    payload={
        "incident_object":{
            "urgency":urgency,
            "short_description":short_description,
            "impact":impact,
            "cmdb_ci":cmdb_ci,
            "caller_id":caller_id,
            "location": location,
            "assignment_group":assignment_group,
            "description":description
        }
    }
    response=requests.post(url,headers=headers,data=json.dumps(payload))
    print(response)

    if response.status_code!=200:
        # Error bodies are not guaranteed to be JSON (e.g. a gateway error
        # page); fall back to the raw text so the diagnostic is still printed.
        try:
            error_body=response.json()
        except ValueError:
            error_body=response.text
        print('Status:',response.status_code,'Headers:', response.headers, 'Error Response:',error_body)
        exit()

    data=response.json()
    # The incident number is read from result.return_response.number of the reply.
    incident=data['result']['return_response']['number']
    print('---------------Inserting incident data into metadata__execution_failed_runs table ')
    insert_into_metadata__execution_failed_runs(job_id,run_id,task_id,incident)
    return data



Get an access token for the API. A new access token is requested on every call.

In [None]:


def get_access_token(access_token_url,client_id,client_secret,resource):
    """Request an access token using the client_credentials grant.

    POSTs the client id, client secret and resource as form fields to
    ``access_token_url``.

    Returns:
        The ``access_token`` value from the JSON response when the status is
        200. Otherwise the status code and response content are printed and
        None is returned. Any exception raised by the request or while reading
        the response is printed and None is returned.
    """
    request_headers={'Content-Type':'application/x-www-form-urlencoded'}
    form_fields=dict(
        f='pjson',
        grant_type='client_credentials',
        client_id=client_id,
        client_secret=client_secret,
        resource=resource,
    )

    try:
        response=requests.post(access_token_url,headers=request_headers,data=form_fields)
        if response.status_code!=200:
            print(f'Failed to obtain access token. Status Code:{response.status_code}')
            print(f'Response content: {response.content.decode()}')
            return None
        return response.json().get('access_token')
    except Exception as e:
        print(f'Error:{e!s}')




All the details required to create or update an incident can be retrieved from the _workflow_integration_parameters table.

In [None]:
def retrieve_workflow_details(environment,var_name):
    """Look up one integration parameter for the given environment.

    Queries the _workflow_integration_parameters table for rows whose _Env
    matches ``environment`` (compared in upper case) and whose variable_name
    equals ``var_name``.

    Returns:
        The third column of the first matching row, which is also printed.
        If anything goes wrong, including no matching row, the error is
        printed and None is returned.
    """
    try:
        Query = (f'''
      select 
        *
      from
        {job_catalog}._utils._workflow_integration_parameters 
      where 
           upper(_Env) = upper('{environment}') and variable_name=('{var_name}')
      ''')

        matching_rows = spark.sql(Query).collect()
        # The value is read by position: third column of the first row.
        parameter_value = matching_rows[0][2]
        print(parameter_value)
        return parameter_value
    except Exception as lookup_error:
        print(lookup_error)


In [None]:

    # Endpoint and credential-lookup settings, fetched in this order from the
    # parameters table for the current environment.
    (incident_url,access_token_url,client_id,resource,
     scope_name,secret_name)=[
        retrieve_workflow_details(env,parameter)
        for parameter in ('incident_url','access_token_url','client_id','resource','secret_scope_name','secret_name')
    ]

    # The client secret itself is read from the secret scope, not from the table.
    client_secret=dbutils.secrets.get(scope_name,secret_name)

    # Field values used when a new incident is created.
    (urgency,impact,cmdb_ci,
     caller_id,location,assignment_group)=[
        retrieve_workflow_details(env,parameter)
        for parameter in ('urgency','impact','cmdb_ci','caller_id','location','assignment_group')
    ]




In [None]:

# Run the check. The function has no return statement, so tasks_list ends up as None.
tasks_list=retrieve_taskinfo_from_metadata_table_check_incident(
    incident_url=incident_url,
    access_token_url=access_token_url,
    client_id=client_id,
    client_secret=client_secret,
    resource=resource,
    urgency=urgency,
    impact=impact,
    cmdb_ci=cmdb_ci,
    caller_id=caller_id,
    location=location,
    assignment_group=assignment_group,
)
