In [None]:
import json
import os
import re

import pandas as pd
import psycopg2
from psycopg2 import extras
from tabulate import tabulate

In [None]:
# Connection settings for the source database. Each value can be overridden
# with the corresponding libpq environment variable (PGHOST, PGDATABASE,
# PGUSER, PGPASSWORD) so real credentials never have to be saved in the
# notebook; the fallbacks are the values that were previously hard-coded here.
postgres = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'tenshi'),
    'username': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'postgres'),
}

In [None]:
# Open the connection to the source database. The settings are passed as
# keyword arguments instead of being interpolated into a DSN string, so a
# value containing spaces, quotes or backslashes (typically the password)
# cannot corrupt the connection string.
source_connection = psycopg2.connect(
    host=postgres['host'],
    dbname=postgres['database'],
    user=postgres['username'],
    password=postgres['password'],
)

In [None]:
# Bind values shared by the extraction queries below: the checklists to
# migrate and the use cases they must belong to. Both are tuples because the
# queries use them in `IN %(...)s` clauses.
params = dict(
    checklist_ids=(373013004340813824, 1),
    use_case_ids=(1660291903, 1660291904),
)

def get_next_id(df: pd.DataFrame, column: str):
    """Return the next identifier to use for ``column`` of ``df``.

    The result is the column maximum plus one, or 1 when the maximum is
    missing (``pd.isna``), e.g. because the column holds no values yet.
    """
    highest = df[column].max()
    return 1 if pd.isna(highest) else highest + 1

In [None]:
# ETL: Facility
# Read every facility except the row with id -1 and expose it under the
# target column names (facility_id, facility_name).

FACILITY_QUERY = """
SELECT id, name FROM facilities WHERE id != -1;
"""
facility_df = pd.read_sql(sql=FACILITY_QUERY, con=source_connection, params=params)
# rename() returns a new frame, so facility_df keeps the source column names.
new_facility_df = facility_df.rename(
    columns={
        'id': 'facility_id',
        'name': 'facility_name',
    }
)
print(tabulate(new_facility_df, tablefmt='pretty', headers='keys'))

In [None]:

# ETL: Process
# Read the selected checklists (published, not archived, restricted to the
# configured use cases) together with their use case and facility mapping.
CHECKLIST_QUERY = """
SELECT c.id as checklist_id, c.name, c.code, uc.id as use_case_id, uc.name as use_case_name, cfm.facilities_id  
FROM checklists c JOIN use_cases uc ON uc.id = c.use_cases_id JOIN checklist_facility_mapping cfm ON cfm.checklists_id = c.id
WHERE c.state = 'PUBLISHED' AND c.archived = FALSE AND c.use_cases_id IN %(use_case_ids)s AND c.id IN %(checklist_ids)s
"""
checklist_df = pd.read_sql(sql=CHECKLIST_QUERY, con=source_connection, params=params)
# Build the process frame as a new object: rename to the target column names
# and discard the two source columns that are not carried over.
new_process_df = (
    checklist_df
    .rename(columns={
        'checklist_id': 'id',
        'use_case_name': 'process_type',
        'name': 'process_name',
        'facilities_id': 'facility_id',
    })
    .drop(columns=['code', 'use_case_id'])
)
print(tabulate(new_process_df, tablefmt='pretty', headers='keys'))

In [None]:
# ETL: Stage
# Read the non-archived stages of the selected checklists, ordered by
# checklist and then by the stage's order_tree.
STAGE_QUERY = """
SELECT s.id as stage_id, s."name", s.checklists_id as checklist_id, s.order_tree FROM stages s JOIN checklists c ON c.id = s.checklists_id WHERE s.archived = FALSE AND c.id IN %(checklist_ids)s ORDER BY c.id, s.order_tree
"""
stage_df = pd.read_sql(sql=STAGE_QUERY, con=source_connection, params=params)
# Target shape: id, stage_name, process_id plus a stage_type column that is
# filled with empty strings.
new_stage_df = (
    stage_df
    .rename(columns={
        'stage_id': 'id',
        'name': 'stage_name',
        'checklist_id': 'process_id',
    })
    .drop(columns='order_tree')
    .assign(stage_type='')
)
print(tabulate(new_stage_df, tablefmt='pretty', headers='keys'))

In [None]:
# ETL: Step (source side)
# Read the non-archived tasks of non-archived stages for the selected
# checklists, ordered by checklist, stage order and task order.
TASK_QUERY = """
SELECT t.id as task_id, t.name, t.order_tree, t.stages_id as stage_id, s.checklists_id as checklist_id FROM tasks t JOIN stages s ON s.id = t.stages_id JOIN checklists c ON c.id = s.checklists_id WHERE t.archived = FALSE AND s.archived = FALSE AND c.id IN %(checklist_ids)s ORDER BY c.id, s.order_tree, t.order_tree 
"""
task_df = pd.read_sql(sql=TASK_QUERY, con=source_connection, params=params)
# Independent working copy; it is reshaped into the step table further down
# while task_df keeps the raw query result.
new_step_df = task_df.copy(deep=True)

print(tabulate(task_df, tablefmt='pretty', headers='keys'))

In [None]:
# ETL: Step attributes (source side)
# Read the non-archived parameters of non-archived tasks and stages for the
# selected checklists, ordered by checklist, stage, task and parameter order.
PARAMETER_QUERY = """
SELECT p.id as parameter_id, p."label" AS name, p."data", p."type", p.order_tree, p.tasks_id as task_id, t.stages_id as stage_id, s.checklists_id as checklist_id FROM parameters p JOIN tasks t ON t.id = p.tasks_id JOIN stages s ON s.id = t.stages_id JOIN checklists c ON c.id = s.checklists_id WHERE p.archived = FALSE  AND t.archived = FALSE AND s.archived = FALSE AND c.id IN %(checklist_ids)s ORDER BY c.id, s.order_tree, t.order_tree, p.order_tree
"""
parameter_df = pd.read_sql(PARAMETER_QUERY, source_connection, params=params)

# Empty target frames that the attribute rows are appended to later on.
# 'reference_id' is declared up front because every attribute row carries
# that key; without it the frame's schema would depend on whether any row
# was ever appended. It is listed last so the resulting column order is the
# same as when the column is introduced by the first appended row.
new_step_attribute_df = pd.DataFrame(
    columns=['id', 'step_id', 'data_type_id', 'attribute_label', 'resource_id', 'expected_value1', 'expected_value2',
             'comparison_operator', 'resource_type', 'reference_id'])

new_step_attribute_data_types_df = pd.DataFrame(
    columns=['data_type_id', 'measurement_type', 'measurement_unit', 'measurement_description'])

print(tabulate(parameter_df, headers='keys', tablefmt='pretty'))

In [None]:
parameter_types = ('INSTRUCTION', 'MATERIAL')

# Keep only the instruction parameters. The explicit copy makes this an
# independent frame, so adding a column below neither triggers pandas'
# SettingWithCopyWarning nor risks writing through to parameter_df.
instruction_df = parameter_df[parameter_df['type'] == 'INSTRUCTION'].copy()

# Compiled once; strips anything that looks like an HTML tag. '.' does not
# match newlines, so a tag spanning several lines is left in place.
regex = re.compile(r'<.*?>')

# Store the tag-free instruction text in a new 'instruction' column.
instruction_df['instruction'] = instruction_df['data'].apply(lambda data: regex.sub('', data['text']))

# Concatenate the instructions of each task, one per line.
grouped = instruction_df.groupby(['task_id', 'type'])['instruction'].apply('\n'.join).reset_index()

# Only 'INSTRUCTION' rows can be present here; the filter is kept so that
# `instructions` stays correct if the selection above is ever widened.
instructions = grouped[grouped['type'] == 'INSTRUCTION']

# Build the step table from task_df rather than mutating new_step_df in
# place: the previous in-place rename/drop made this cell fail on a second
# run because 'task_id' no longer existed. Tasks without instructions keep a
# missing value in 'instruction' (left join).
new_step_df = (
    task_df
    .merge(instructions[['task_id', 'instruction']], on='task_id', how='left')
    .rename(columns={'task_id': 'id', 'name': 'step_name'})
    .drop(columns=['order_tree', 'stage_id', 'checklist_id'])
)

print(tabulate(new_step_df, headers='keys', tablefmt='pretty'))

In [24]:
def append_to_df(new_rows_attribute_data_types, new_rows_attribute, attribute_data_types_df, attribute_df):
    """Append row records to the two attribute frames.

    Each list of row dicts is turned into a DataFrame and concatenated below
    the matching existing frame with a fresh 0..n-1 index. The inputs are not
    modified.

    Returns:
        Tuple ``(attribute_data_types_df, attribute_df)`` of the extended frames.
    """
    data_type_frames = [attribute_data_types_df, pd.DataFrame(new_rows_attribute_data_types)]
    attribute_frames = [attribute_df, pd.DataFrame(new_rows_attribute)]
    extended_data_types = pd.concat(data_type_frames, ignore_index=True)
    extended_attributes = pd.concat(attribute_frames, ignore_index=True)
    return extended_data_types, extended_attributes

def create_row_attribute_data_type(data_type_id, measurement_type, measurement_unit, measurement_description):
    """Build one attribute-data-type row as a dict keyed by column name."""
    return dict(
        data_type_id=data_type_id,
        measurement_type=measurement_type,
        measurement_unit=measurement_unit,
        measurement_description=measurement_description,
    )

def create_row_attribute(attribute_id, step_id, data_type_id, attribute_label, expected_value1, expected_value2, comparison_operator, resource_id, resource_type, reference_id):
    """Build one step-attribute row as a dict keyed by column name.

    ``attribute_id`` is stored under the ``'id'`` key; every other argument is
    stored under its own name. Key order is fixed because it determines the
    column order of frames built from these rows.
    """
    column_names = (
        'id',
        'step_id',
        'data_type_id',
        'attribute_label',
        'expected_value1',
        'expected_value2',
        'comparison_operator',
        'resource_id',
        'resource_type',
        'reference_id',
    )
    column_values = (
        attribute_id,
        step_id,
        data_type_id,
        attribute_label,
        expected_value1,
        expected_value2,
        comparison_operator,
        resource_id,
        resource_type,
        reference_id,
    )
    return dict(zip(column_names, column_values))

def create_rows(parameter, identifier):
    """Translate one source parameter into step-attribute rows.

    Args:
        parameter: Mapping-like record (a dict or a DataFrame row) providing
            'type', 'task_id' and 'name'. 'data' is additionally read for
            SHOULD_BE, RESOURCE and the choice types.
        identifier: First id to assign. It is used both as the attribute
            ``id`` and as its ``data_type_id``.

    Returns:
        Tuple ``(data_type_rows, attribute_rows)`` of two equally long lists
        of dicts. Choice parameters (SINGLE_SELECT, CHECKLIST, MULTISELECT)
        yield one boolean row per choice with consecutive ids starting at
        ``identifier``; every other supported type yields exactly one row;
        an unsupported type prints a notice and yields no rows.
    """
    choice_types = ('SINGLE_SELECT', 'CHECKLIST', 'MULTISELECT')
    measurement_types = {
        'NUMBER': 'integer',
        'SHOULD_BE': 'float',
        'SINGLE_LINE': 'text',
        'MULTI_LINE': 'text',
        'DATE': 'date',
        'DATE_TIME': 'date_time',
        'YES_NO': 'boolean',
        'RESOURCE': 'text',
    }
    # SHOULD_BE operators that compare against the single 'value' entry.
    comparison_symbols = {
        'EQUAL_TO': '=',
        'LESS_THAN': '<',
        'LESS_THAN_EQUAL_TO': '<=',
        'MORE_THAN': '>',
        'MORE_THAN_EQUAL_TO': '>=',
    }

    new_rows_attribute_data_type = []
    new_rows_attribute = []
    parameter_type = parameter['type']
    name = parameter['name']
    # Steps are built from tasks (the task id becomes the step id), so the
    # attribute must reference the task the parameter belongs to. Using the
    # stage id here would point at a row of the stage table instead.
    step_id = parameter['task_id']

    def add_row(row_id, label, measurement_type, expected_value1=None, expected_value2=None,
                comparison_operator=None, resource_type=None, reference_id=None):
        # measurement_unit and resource_id are not derived from the source
        # data, so they are always left empty.
        data_type_row = create_row_attribute_data_type(row_id, measurement_type, None, label)
        attribute_row = create_row_attribute(
            row_id, step_id, data_type_row['data_type_id'], label,
            expected_value1, expected_value2, comparison_operator,
            None, resource_type, reference_id)
        new_rows_attribute_data_type.append(data_type_row)
        new_rows_attribute.append(attribute_row)

    if parameter_type in choice_types:
        # Read the choices from the argument, not from a global loop
        # variable, so the function also works outside the driver loop.
        for choice in parameter['data']:
            add_row(identifier, name + ' - ' + choice['name'], 'boolean', reference_id=choice['id'])
            identifier += 1
        return new_rows_attribute_data_type, new_rows_attribute

    measurement_type = measurement_types.get(parameter_type)
    if measurement_type is None:
        print(f"Parameter type: {parameter_type} is not implemented")
        return new_rows_attribute_data_type, new_rows_attribute

    extra_fields = {}
    if parameter_type == 'SHOULD_BE':
        data = parameter['data']
        operator = data['operator']
        if operator == 'BETWEEN':
            extra_fields = {
                'expected_value1': data['lowerValue'],
                'expected_value2': data['upperValue'],
                'comparison_operator': 'between',
            }
        elif operator in comparison_symbols:
            extra_fields = {
                'expected_value1': data['value'],
                'comparison_operator': comparison_symbols[operator],
            }
        else:
            # The row is still emitted, just without an expectation; say so
            # instead of dropping the operator silently.
            print(f"Operator: {operator} is not implemented for parameter type: {parameter_type}")
    elif parameter_type == 'RESOURCE':
        extra_fields['resource_type'] = parameter['data']['collection']

    add_row(identifier, name, measurement_type, **extra_fields)
    return new_rows_attribute_data_type, new_rows_attribute

# Convert every parameter that can become a step attribute. Ids continue
# after the highest data_type_id already present in the target frame.
next_id = get_next_id(new_step_attribute_data_types_df, 'data_type_id')
relevant_parameter_df = parameter_df[~parameter_df['type'].isin(['INSTRUCTION', 'MATERIAL', 'MEDIA', 'SIGNATURE', 'FILE_UPLOAD'])]

# Collect the row dicts first and append them in one go: concatenating onto
# the frames inside the loop copies the whole frame on every iteration.
all_attribute_data_types = []
all_attributes = []
for _, row in relevant_parameter_df.iterrows():
    attribute_data_types, attributes = create_rows(row, next_id)
    all_attribute_data_types.extend(attribute_data_types)
    all_attributes.extend(attributes)
    # One id is consumed per generated row (choice parameters produce several).
    next_id += len(attribute_data_types)

# Leave the target frames untouched when nothing was generated.
if all_attribute_data_types and all_attributes:
    new_step_attribute_data_types_df, new_step_attribute_df = append_to_df(
        all_attribute_data_types, all_attributes, new_step_attribute_data_types_df, new_step_attribute_df)

print(tabulate(new_step_attribute_data_types_df, headers='keys', tablefmt='pretty'))
print(tabulate(new_step_attribute_df, headers='keys', tablefmt='pretty'))

+----+--------------+------------------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    | data_type_id | measurement_type | measurement_unit |                                                                        measurement_description                                                                         |
+----+--------------+------------------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 0  |      1       |       text       |                  |                                                                    Enter the previous product name                                                                     |
| 1  |      2       |       text       |                  |                         

  parameter_df = pd.read_sql(PARAMETER_QUERY, source_connection, params=params)
