In [287]:
from constants import common
import pandas as pd
import sys

In [288]:
URL = 'https://api-dev.prison.service.justice.gov.uk/v3/api-docs'
URLS = [
#API Docs
  "https://manage-adjudications-api-dev.hmpps.service.justice.gov.uk/v3/api-docs",
  "https://assess-risks-and-needs-dev.hmpps.service.justice.gov.uk/v3/api-docs",
  "https://raw.githubusercontent.com/ministryofjustice/hmpps-complexity-of-need/main/Complexity%20Of%20Need%20API%20Specification.yaml",
  "https://court-register-dev.hmpps.service.justice.gov.uk/v3/api-docs",
  "https://raw.githubusercontent.com/ministryofjustice/curious-API/main/curious-api-specification.yaml",
  "https://sign-in-dev.hmpps.service.justice.gov.uk/auth/v3/api-docs",
#  "https://keyworker-api-dev.prison.service.justice.gov.uk/v3/api-docs",
#  "https://allocation-manager-staging.apps.live-1.cloud-platform.service.justice.gov.uk/api-docs/index.html",
  "https://community-api.test.probation.service.justice.gov.uk/v3/api-docs/Community%20API",
#  "https://probation-offender-events-dev.hmpps.service.justice.gov.uk/swagger-ui.html",
  "https://probation-offender-search-dev.hmpps.service.justice.gov.uk/v3/api-docs",
  "https://api-dev.prison.service.justice.gov.uk/v3/api-docs",
  "https://offender-events-dev.prison.service.justice.gov.uk/v3/api-docs",
  "https://prisoner-offender-search-dev.prison.service.justice.gov.uk/v3/api-docs",
  "https://offender-dev.aks-dev-1.studio-hosting.service.justice.gov.uk/v3/api-docs",
  "https://prison-register-dev.hmpps.service.justice.gov.uk/v3/api-docs",
  "https://hmpps-allocations-dev.hmpps.service.justice.gov.uk/v3/api-docs",
  "https://probation-teams-dev.prison.service.justice.gov.uk/v3/api-docs",
  "https://probation-teams-dev.prison.service.justice.gov.uk/v3/api-docs",
  "https://hmpps-interventions-service-dev.apps.live-1.cloud-platform.service.justice.gov.uk/v3/api-docs",
  "https://restricted-patients-api-dev.hmpps.service.justice.gov.uk/v3/api-docs",
  "https://hmpps-staff-lookup-service-dev.hmpps.service.justice.gov.uk/v3/api-docs",
  "https://hmpps-tier-dev.hmpps.service.justice.gov.uk/v3/api-docs",
  "https://token-verification-api-dev.prison.service.justice.gov.uk/v3/api-docs",
  "https://hmpps-workload-dev.hmpps.service.justice.gov.uk/v3/api-docs",

#Microservices
  "https://raw.githubusercontent.com/ministryofjustice/hmpps-complexity-of-need/main/Complexity%20Of%20Need%20API%20Specification.yaml"
]
SEARCH_TERM = 'additional days'
SCHEMA = ''
SCHEMA_RESULT_COLS = ["Schema", "Field", "Field_metadata"]
PATH_RESULT_COLS = ["Path", "Http_method", "Path_metadata"]


## Search scripts
I envisage this section being split into several parts, with the outcome to return a dataframe of results. This could then be interpreted into a report format.
Loose plan:
* Retrieve all schemas and fields, and metadata
* Retrieve all endpoints and metadata
* Devise search logic
  * needs to lower case and remove spaces to be able to catch phrases. No fuzzy searching, just substring searching.
  * we can search dataframes of this kind by applying a mask via the 'df.applymap' to run a string search at cell level. By locating (df.loc[mask.any(axis=1)]) within the dataframe based on this mask we can then return only rows that match the search criteria
  * We can do this search on a single search criteria one at a time very quickly. 
  * The resultant dataframes need an additional column or some kind of identifier to showcase what's been searched on
* Iterate over a list of search terms?
  * 2 seperate reports, schemas versus endpoint search?

In [277]:
#flatten = lambda l: [item for sublist in l for item in sublist] ## only works when its a straight list of list [[a,b,c]] -> [a,b,c]

# Define a search function
def search_string(string_to_search, search_phrase) -> str:
    return str(search_phrase).lower().replace(" ", "") in str(string_to_search).lower().replace(" ", "")

def get_schema_or_path_data(dict_resp, 
                            dict_keys_list=["components","schemas"], 
                            schema_bool=True, 
                            metadata_divider='|') -> pd.DataFrame:
    """ 
    TODO
    Currently only returns all schemas, fields and nested metadata values (not metadata keys).
    Outputs as a dataframe. If errenous schema or field data is attempted to be appended the row will just be empty.
    Needs to handle an empty dictionary.
    """
    if schema_bool:
        df_cols = SCHEMA_RESULT_COLS
    else:
        df_cols = PATH_RESULT_COLS
    list_of_lists = []
    try:
        nested_dict = common.get_nested_dictionary_or_value(dict_resp, dict_keys_list, return_value={})
    except TypeError as t_e:
        print("You need to provide a dictionary object", "\n")
        print(f"Unexpected {t_e=}, {type(t_e)=}")
        sys.exit()
    except KeyError as k_e:
        print("Response is not compatable with this function")
        print("Check it is standardised swagger spec","\n")
        print(f"Unexpected {k_e=}, {type(k_e)=}")
        sys.exit()
    
    try:
        for key_a in nested_dict:
            if isinstance(common.get_nested_dictionary_or_value(nested_dict, [key_a, "properties"]), int) is False:
                modelled_nested_dict = nested_dict[key_a]["properties"]
            else:
                modelled_nested_dict = nested_dict[key_a]            
            for key_b in modelled_nested_dict:
                try:   
                    row_builder = []
                    row_builder.append(key_a)
                    row_builder.append(key_b)
                    string_to_build = str(metadata_divider)
                    for metadata_value in modelled_nested_dict[key_b].values():
                        string_to_build = f"{string_to_build} {metadata_value} |"
                    row_builder.append(string_to_build)
                    list_of_lists.append(row_builder)
                except AttributeError:
                    row_builder = [None, None, None]
        df = pd.DataFrame(list_of_lists, columns=df_cols)

    except KeyError:    
        sys.exit()
    return df     

def search_df_for_phrase(data_frame, search_phrase) -> pd.DataFrame:
    """
    TODO
    """
    mask = data_frame.applymap(lambda df_cell: search_string(df_cell, search_phrase))
    filtered_data_frame = data_frame.loc[mask.any(axis=1)]
    return filtered_data_frame

def add_column_value(data_frame, column_name, value) -> pd.DataFrame:
    """ 
    #TODO - this function is just responsible for adding an extra column to a dataframe for a phrase
    """
    data_frame[column_name] = value
    return data_frame

def find_context(target_str, search_str, col_sep='|') -> str:
    """ 
    TODO
    #Currently doesn't work for enums
    """
    context = ''
    formatted_target = str.lower(target_str)
    formatted_search = str.lower(search_str)
    index_pos = formatted_target.find(formatted_search)
    if index_pos != -1:
        context_left = formatted_target.rfind(col_sep, 0, index_pos) 
        context_right = formatted_target.find(col_sep,index_pos) #if -1, then don't populate?
        if context_left != -1 and context_right != -1:
            context = target_str[context_left+1:context_right] #So then here, if both -1, just return the string.
    #If the term doesn't exist this will actually default to [0,-1], which is everything but the last character
    return context

def search_api_for_phrase(url=URL, search_phrase=SEARCH_TERM) -> (pd.DataFrame, pd.DataFrame):
    json_extract = common.extract_data(url)
    schema_df = get_schema_or_path_data(
                json_extract,
                dict_keys_list=["components","schemas"], 
                schema_bool=True, 
                metadata_divider='|'
            )
    path_df   = get_schema_or_path_data(
                json_extract,
                dict_keys_list=["paths"], 
                schema_bool=False, 
                metadata_divider='|'
            )
    
    filtered_schema_df = search_df_for_phrase(schema_df, search_phrase)
    filtered_path_df = search_df_for_phrase(path_df, search_phrase)
        
    return filtered_schema_df, filtered_path_df
    
#TODO now take this output and handle it for multiple APIs
#TODO doesn't seem to work for other APIs... investigate

def retrieve_context(df, column_name, search_phrase, col_sep='|') -> pd.DataFrame:

    """ 
    #TODO returns a dataframe with a Context column added
    """
    ##Currently not filtering context properly, look into this.
    
    copy_df = df.copy()
    copy_df["Context"] = ''
    column_index = copy_df.columns.get_loc(column_name)
    context_series = copy_df.loc[:, column_name].apply(lambda x: find_context(x, search_phrase, col_sep))#started happening since this was .loc
    copy_df["Context"] = context_series
    for index, row in copy_df.iterrows():
        if str.lower(search_phrase) in str.lower(row[column_index]):
            copy_df.at[index, 'Context'] = str.lower(row[column_index]).replace(search_phrase, f">>>{search_phrase}<<<")
    return copy_df

In [286]:
## emulate main method
common.prepare_directory("outputs/schema_report.csv")
common.prepare_directory("outputs/path_report.csv")
### Need to extract this to a variable possibly
empty_schema_report = pd.DataFrame(columns=["Schema", "Field", "Field_metadata", "Context", "Search Phrase", "API"])
empty_path_report = pd.DataFrame(columns=["Path", "Http_method", "Path_metadata", "Context", "Search Phrase", "API"])
empty_schema_report.to_csv("outputs/schema_report.csv", index=False)
empty_path_report.to_csv("outputs/path_report.csv", index=False)

for url in URLS:
    schema_df, path_df = search_api_for_phrase(url, SEARCH_TERM)
    #Schema search
    schema_w_context = retrieve_context(schema_df, "Field_metadata", SEARCH_TERM)
    full_schema_w_search = add_column_value(schema_w_context, "Search Phrase", SEARCH_TERM)
    full_schema_w_search = add_column_value(schema_w_context, "API", url)
    full_schema_w_search.to_csv("outputs/schema_report.csv", mode='a', header=False, index=False)
    #Path search
    path_w_context = retrieve_context(path_df, "Path_metadata", SEARCH_TERM)
    full_path_w_search = add_column_value(path_w_context, "Search Phrase", SEARCH_TERM)
    full_path_w_search = add_column_value(path_w_context, "API", url)
    full_path_w_search.to_csv("outputs/path_report.csv", mode='a', header=False, index=False)
    

https://assess-risks-and-needs-dev.hmpps.service.justice.gov.uk/v3/api-docs  responded with  403  returning empty dictionary
https://keyworker-api-dev.prison.service.justice.gov.uk/v3/api-docs  responded with  401  returning empty dictionary
https://hmpps-allocations-dev.hmpps.service.justice.gov.uk/v3/api-docs  responded with  403  returning empty dictionary
https://hmpps-staff-lookup-service-dev.hmpps.service.justice.gov.uk/v3/api-docs  responded with  403  returning empty dictionary


In [254]:
common.prepare_directory("outputs/schema_report.csv")
common.prepare_directory("outputs/path_report.csv")

empty_schema_report = pd.DataFrame(columns=["Schema", "Field", "Field_metadata", "Context", "Search Phrase"])
empty_path_report = pd.DataFrame(columns=["Path", "Http_method", "Path_metadata", "Context", "Search Phrase"])
empty_schema_report.to_csv("outputs/schema_report.csv")
empty_path_report.to_csv("outputs/path_report.csv")

In [246]:
schema_w_context = retrieve_context(schema_df, "Field_metadata", SEARCH_TERM)
schema_w_context

Unnamed: 0,Schema,Field,Field_metadata,Context
225,SentenceCalcDates,additionalDaysAwarded,| integer | ADA - days added to sentence term ...,
1142,KeyDates,additionalDaysAwarded,| integer | ADA - days added to sentence term ...,
1186,SentenceAdjustmentDetail,additionalDaysAwarded,| integer | Number of additional days awarded ...,| integer | number of >>>additional days<<< aw...
1189,SentenceAdjustmentDetail,restoredAdditionalDaysAwarded,| integer | Number of restored additional days...,| integer | number of restored >>>additional d...


In [241]:
full_schema_w_search = add_phrase_column(schema_w_context, SEARCH_TERM)
full_schema_w_search

Unnamed: 0,Schema,Field,Field_metadata,Context,Search Term
225,SentenceCalcDates,additionalDaysAwarded,| integer | ADA - days added to sentence term ...,,additional days
1142,KeyDates,additionalDaysAwarded,| integer | ADA - days added to sentence term ...,,additional days
1186,SentenceAdjustmentDetail,additionalDaysAwarded,| integer | Number of additional days awarded ...,| integer | number of >>>additional days<<< aw...,additional days
1189,SentenceAdjustmentDetail,restoredAdditionalDaysAwarded,| integer | Number of restored additional days...,| integer | number of restored >>>additional d...,additional days


In [234]:
path_df

Unnamed: 0,Path,Http_method,Path_metadata
251,/api/bookings/{bookingId}/sentenceDetail,get,| ['bookings'] | Offender sentence detail (key...


### Misc Scripts
This section is for other logic used to test functionality

In [110]:
## Schema metadata analysis    
df_cols = ["Schema", "Field", "Field_metadata"]
metadata_key_list = []
schema_source = json_extract["components"]["schemas"]
for schema in schema_source:
    for field in schema_source[schema]["properties"]:
        for metadata_key in schema_source[schema]["properties"][field]:
            metadata_key_list.append(metadata_key)
metadata_cols = list(set(metadata_key_list))
for item in metadata_cols:
    df_cols.append(item)
print(df_cols)

## Endpoint metadata analysis

df_cols = ["path", "http_method"]
metadata_key_list = []
path_source = json_extract["paths"]
for path in path_source:
    for http_method in path_source[path]:
        for metadata_key in path_source[path][http_method]:
            metadata_key_list.append(metadata_key)
metadata_cols = list(set(metadata_key_list))
for item in metadata_cols:
    df_cols.append(item)
print(df_cols)

['Schema', 'Field', 'Field_metadata', 'items', 'description', 'enum', 'pattern', 'type', 'minLength', 'format', '$ref', 'maxLength', 'maximum', 'example', 'minimum', 'uniqueItems', 'deprecated']
['path', 'http_method', 'parameters', 'tags', 'requestBody', 'description', 'responses', 'summary', 'deprecated', 'operationId']


In [111]:
mask = df.applymap(lambda x: search_string(x, 'sentence'))
filtered_df = df.loc[mask.any(axis=1)]
filtered_df

Unnamed: 0,Path,Http_method,Path_metadata
17,/api/offender-sentences/booking/{bookingId}/ho...,put,| ['offender-sentences'] | Set the HDC checks ...
18,/api/offender-sentences/booking/{bookingId}/ho...,delete,| ['offender-sentences'] | Clear the HDC check...
19,/api/offender-sentences/booking/{bookingId}/ho...,put,| ['offender-sentences'] | Set the HDC approva...
20,/api/offender-sentences/booking/{bookingId}/ho...,delete,| ['offender-sentences'] | Clear the HDC appro...
65,/api/offender-sentences,get,| ['offender-sentences'] | List of offenders (...
66,/api/offender-sentences,post,| ['offender-sentences'] | Retrieves list of o...
67,/api/offender-sentences/home-detention-curfews...,post,| ['offender-sentences'] | Retrieve the latest...
68,/api/offender-sentences/bookings,post,| ['offender-sentences'] | Retrieves list of o...
69,/api/offender-dates/{bookingId},get,| ['offender-dates'] | Get the key dates for a...
70,/api/offender-dates/{bookingId},post,| ['offender-dates'] | Update the key dates fo...
