In [18]:
import psycopg2
from psycopg2 import extras
import pandas as pd
import json
import urllib.parse
from sqlalchemy import create_engine
import re
from tabulate import tabulate

In [12]:
db_config = {
    'postgres': {
        'source': {
            'host': 'localhost',
            'database': 'tenshi',
            'username': 'postgres',
            'password': 'postgres',
        },
        'destination': {
            'host': 'localhost',
            'database': 'report_tenshi',
            'username': 'postgres',
            'password': 'postgres',
        }
    }
}

table_names = {
    'facilities': 'facilities'
}

In [20]:
def get_postgres_connection(postgres_config):
    username = urllib.parse.quote_plus(postgres_config['username'])
    password = urllib.parse.quote_plus(postgres_config['password'])
    return psycopg2.connect(f"host={postgres_config['host']} dbname={postgres_config['database']} user={username} password={password}")


def get_postgres_engine(postgres_config):
    username = urllib.parse.quote_plus(postgres_config['username'])
    password = urllib.parse.quote_plus(postgres_config['password'])
    connection_string = (f"postgresql+psycopg2://{username}:{password}@"
                         f"{postgres_config['host']}/"
                         f"{postgres_config['database']}")
    return create_engine(connection_string, echo=True)


source_connection = get_postgres_connection(db_config['postgres']['source'])
# destination_connection = get_postgres_connection(db_config['postgres']['destination'])
destination_engine = get_postgres_engine(db_config['postgres']['destination'])

In [14]:
params = {
    'checklist_ids': (373013004340813824, 1),
    'use_case_ids': (1660291903, 1660291904)
}

def get_next_id(df: pd.DataFrame, column: str):
    max_df_id = df[column].max()
    if pd.isna(max_df_id):
        return 1
    return max_df_id + 1

In [22]:
FACILITY_QUERY = """
SELECT id, name FROM facilities WHERE id != -1;
"""
facility_df = pd.read_sql(FACILITY_QUERY, source_connection, params=params)
new_facility_df = facility_df.copy()
new_facility_df.rename(columns={'id': 'facility_id', 'name': 'facility_name'}, inplace=True)
new_facility_df.to_sql(table_names['facilities'], destination_engine, if_exists='replace', index=False)
print(tabulate(new_facility_df, headers='keys', tablefmt='pretty'))

2023-10-21 15:12:48,108 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-10-21 15:12:48,111 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s
2023-10-21 15:12:48,112 INFO sqlalchemy.engine.Engine [cached since 66.62s ago] {'table_name': 'facilities', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2023-10-21 15:12:48,116 INFO sqlalchemy.engine.Engine 
CREATE TABLE facilities (
	facility_id BIGINT, 
	facility_name TEXT
)


2023-10-21 15:12:48,116 INFO sqlalchemy.engine.Engine [no key 0.00083s] {}
2023-10-21 15:12:48,123 I

  facility_df = pd.read_sql(FACILITY_QUERY, source_connection, params=params)


In [7]:
CHECKLIST_QUERY = """
SELECT c.id as checklist_id, c.name, c.code, uc.id as use_case_id, uc.name as use_case_name, cfm.facilities_id  
FROM checklists c JOIN use_cases uc ON uc.id = c.use_cases_id JOIN checklist_facility_mapping cfm ON cfm.checklists_id = c.id
WHERE c.state = 'PUBLISHED' AND c.archived = FALSE AND c.use_cases_id IN %(use_case_ids)s AND c.id IN %(checklist_ids)s
"""
checklist_df = pd.read_sql(CHECKLIST_QUERY, source_connection, params=params)
new_process_df = checklist_df.copy()
new_process_df.rename(columns={'checklist_id': 'id', 'use_case_name': 'process_type', 'name': 'process_name',
                               'facilities_id': 'facility_id'}, inplace=True)
new_process_df.drop('code', axis=1, inplace=True)
new_process_df.drop('use_case_id', axis=1, inplace=True)
print(tabulate(new_process_df, headers='keys', tablefmt='pretty'))

+---+--------------------+---------------------------------------------------+--------------------+-------------+
|   |         id         |                   process_name                    |    process_type    | facility_id |
+---+--------------------+---------------------------------------------------+--------------------+-------------+
| 0 | 373013004340813824 | Cleaning checklist for Sampling/ Dispensing booth | Equipment Cleaning | 1665458801  |
+---+--------------------+---------------------------------------------------+--------------------+-------------+


  checklist_df = pd.read_sql(CHECKLIST_QUERY, source_connection, params=params)


In [8]:
STAGE_QUERY = """
SELECT s.id as stage_id, s."name", s.checklists_id as checklist_id, s.order_tree FROM stages s JOIN checklists c ON c.id = s.checklists_id WHERE s.archived = FALSE AND c.id IN %(checklist_ids)s ORDER BY c.id, s.order_tree
"""
stage_df = pd.read_sql(STAGE_QUERY, source_connection, params=params)
new_stage_df = stage_df.copy()
new_stage_df.rename(columns={'stage_id': 'id', 'name': 'stage_name', 'checklist_id': 'process_id'}, inplace=True)
new_stage_df.drop('order_tree', axis=1, inplace=True)
new_stage_df['stage_type'] = ''
print(tabulate(new_stage_df, headers='keys', tablefmt='pretty'))

+---+--------------------+-------------------------------+--------------------+------------+
|   |         id         |          stage_name           |     process_id     | stage_type |
+---+--------------------+-------------------------------+--------------------+------------+
| 0 | 373013004378562561 | Product and Equipment details | 373013004340813824 |            |
| 1 | 373013142618628096 |         Prerequisite          | 373013004340813824 |            |
| 2 | 373013143281328128 |      Cleaning procedure       | 373013004340813824 |            |
| 3 | 373013143948222464 |     Verification by PR/WH     | 373013004340813824 |            |
| 4 | 373013144690614272 |     Verification by IPQA      | 373013004340813824 |            |
| 5 | 373013146125066240 | Cleaning verification result  | 373013004340813824 |            |
+---+--------------------+-------------------------------+--------------------+------------+


  stage_df = pd.read_sql(STAGE_QUERY, source_connection, params=params)


In [9]:
TASK_QUERY = """
SELECT t.id as task_id, t.name, t.order_tree, t.stages_id as stage_id, s.checklists_id as checklist_id FROM tasks t JOIN stages s ON s.id = t.stages_id JOIN checklists c ON c.id = s.checklists_id WHERE t.archived = FALSE AND s.archived = FALSE AND c.id IN %(checklist_ids)s ORDER BY c.id, s.order_tree, t.order_tree 
"""
task_df = pd.read_sql(TASK_QUERY, source_connection, params=params)
new_step_df = task_df.copy()

print(tabulate(task_df, headers='keys', tablefmt='pretty'))

+----+--------------------+-------------------------------------------------------------------------------------+------------+--------------------+--------------------+
|    |      task_id       |                                        name                                         | order_tree |      stage_id      |    checklist_id    |
+----+--------------------+-------------------------------------------------------------------------------------+------------+--------------------+--------------------+
| 0  | 373013004378562562 |                            Product and Equipment details                            |     1      | 373013004378562561 | 373013004340813824 |
| 1  | 373039997950222336 |                                    Prerequisites                                    |     1      | 373013142618628096 | 373013004340813824 |
| 2  | 373042088244535296 |                            Pre-preparation for cleaning                             |     1      | 373013143281328128 | 3730130

  task_df = pd.read_sql(TASK_QUERY, source_connection, params=params)


In [10]:
PARAMETER_QUERY = """
SELECT p.id as parameter_id, p."label" AS name, p."data", p."type", p.order_tree, p.tasks_id as task_id, t.stages_id as stage_id, s.checklists_id as checklist_id FROM parameters p JOIN tasks t ON t.id = p.tasks_id JOIN stages s ON s.id = t.stages_id JOIN checklists c ON c.id = s.checklists_id WHERE p.archived = FALSE  AND t.archived = FALSE AND s.archived = FALSE AND c.id IN %(checklist_ids)s ORDER BY c.id, s.order_tree, t.order_tree, p.order_tree
"""
parameter_df = pd.read_sql(PARAMETER_QUERY, source_connection, params=params)

print(tabulate(parameter_df, headers='keys', tablefmt='pretty'))

+----+--------------------+----------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+------------+--------------------+--------------------+-

  parameter_df = pd.read_sql(PARAMETER_QUERY, source_connection, params=params)


In [11]:
TASK_EXECUTION_QUERY = """
SELECT te.id AS id, t.id AS task_id, te.reason, TO_TIMESTAMP(te.started_at) AS started_at, TO_TIMESTAMP(te.ended_at) AS ended_at, te.state AS state, concat( tsu.first_name, ' ', tsu.last_name, ' ( ID: ', tsu.employee_id, ')' ) AS started_by FROM task_executions te JOIN tasks t ON t.id = te.tasks_id JOIN stages s ON s.id = t.stages_id JOIN checklists c ON c.id = s.checklists_id JOIN jobs j ON j.id = te.jobs_id JOIN users tsu ON tsu.id = te.started_by WHERE t.archived = FALSE AND s.archived = FALSE AND c.id IN %(checklist_ids)s ORDER BY c.id, s.order_tree, t.order_tree
"""
task_execution_df = pd.read_sql(TASK_EXECUTION_QUERY, source_connection, params=params)
executed_step_df = task_execution_df.copy()
print(tabulate(task_execution_df, headers='keys', tablefmt='pretty'))

+------+--------------------+--------------------+--------------------------------+---------------------------+---------------------------+--------------------------+-----------------------------------+
|      |         id         |      task_id       |             reason             |        started_at         |         ended_at          |          state           |            started_by             |
+------+--------------------+--------------------+--------------------------------+---------------------------+---------------------------+--------------------------+-----------------------------------+
|  0   | 391874377808207872 | 373013004378562562 |                                | 2023-08-18 08:52:05+00:00 | 2023-08-18 08:55:12+00:00 |        COMPLETED         |   Vinoth Venkatesan ( ID: 3309)   |
|  1   | 410765324685107201 | 373013004378562562 |                                | 2023-10-09 11:54:54+00:00 | 2023-10-09 11:55:25+00:00 |        COMPLETED         |   Vinoth Venkatesan (

  task_execution_df = pd.read_sql(TASK_EXECUTION_QUERY, source_connection, params=params)


In [12]:
PARAMETER_EXECUTION_QUERY = """
SELECT pv.id, te.id AS task_execution_id, pv.parameters_id, pv.value, pv.choices, p."type" AS parameter_type, to_timestamp(pv.modified_at) AS modified_at, te.tasks_id as task_id FROM parameter_values pv JOIN parameters p ON p.id = pv.parameters_id LEFT JOIN tasks t ON t.id = p.tasks_id LEFT JOIN task_executions te ON te.tasks_id = t.id AND te.jobs_id = pv.jobs_id LEFT JOIN jobs j ON j.id = te.jobs_id AND j.id = pv.jobs_id WHERE p.archived = FALSE AND t.archived = FALSE AND j.checklists_id IN %(checklist_ids)s
"""
parameter_execution_df = pd.read_sql(PARAMETER_EXECUTION_QUERY, source_connection, params=params)
executed_step_measurement_df = parameter_execution_df.copy()
print(tabulate(parameter_execution_df, headers='keys', tablefmt='pretty'))

  parameter_execution_df = pd.read_sql(PARAMETER_EXECUTION_QUERY, source_connection, params=params)


+------+--------------------+--------------------+--------------------+------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+---------------------------+--------------------+
|      |         id         | task_execution_id  |   parameters_id    |               value                |                                                                                                                                                               choices                                                                                                                                                                | parameter_type |        modified_at        |      task_id     

In [13]:
parameter_types = ('INSTRUCTION', 'MATERIAL')
# Filter to just instructions 
instruction_df = parameter_df[parameter_df['type'] == 'INSTRUCTION']

# Compile regex once 
regex = re.compile(r'<.*?>')

# Remove HTML tags in a vectorized manner and create a new 'clean_text' column
instruction_df['instruction'] = instruction_df['data'].apply(lambda x: re.sub(regex, '', x['text']))

# Group by 'task_id' and 'type', then join the texts together
grouped = instruction_df.groupby(['task_id', 'type'])['instruction'].apply('\n'.join).reset_index()

# Filter out only the 'INSTRUCTION' type
instructions = grouped[grouped['type'] == 'INSTRUCTION']

new_step_df = new_step_df.merge(instructions[['task_id', 'instruction']], on='task_id', how='left')

new_step_df.rename(columns={'task_id': 'id', 'name': 'step_name'}, inplace=True)
new_step_df.drop('order_tree', axis=1, inplace=True)
new_step_df.drop('stage_id', axis=1, inplace=True)
new_step_df.drop('checklist_id', axis=1, inplace=True)

print(tabulate(new_step_df, headers='keys', tablefmt='pretty'))

+----+--------------------+-------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------+
|    |         id         |                                      step_name                                      |                                                       instruction                                                       |
+----+--------------------+-------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------+
| 0  | 373013004378562562 |                            Product and Equipment details                            |                                                           nan                                                           |
| 1  | 373039997950222336 |                             

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  instruction_df['instruction'] = instruction_df['data'].apply(lambda x: re.sub(regex, '', x['text']))


In [14]:
new_step_attribute_df = pd.DataFrame(
    columns=['id', 'step_id', 'data_type_id', 'attribute_label', 'resource_id', 'expected_value1', 'expected_value2',
             'comparison_operator', 'resource_type', 'parameter_id', 'reference_id'])

new_step_attribute_data_types_df = pd.DataFrame(
    columns=['data_type_id', 'measurement_type', 'measurement_unit', 'measurement_description'])

def append_to_attribute_related_df(new_rows_attribute_data_types, new_rows_attribute, attribute_data_types_df, attribute_df):
    new_rows_data_types_df = pd.DataFrame(new_rows_attribute_data_types)
    new_rows_attribute_df = pd.DataFrame(new_rows_attribute)
    attribute_data_types_df = pd.concat([attribute_data_types_df, new_rows_data_types_df], ignore_index=True)
    attribute_df = pd.concat([attribute_df, new_rows_attribute_df], ignore_index=True)
    return  attribute_data_types_df, attribute_df

def create_row_attribute_data_type(data_type_id, measurement_type, measurement_unit, measurement_description):
    return {
        'data_type_id': data_type_id,
        'measurement_type': measurement_type,
        'measurement_unit': measurement_unit,
        'measurement_description': measurement_description
    }

def create_row_attribute(attribute_id, parameter_id, step_id, data_type_id, attribute_label, expected_value1, expected_value2, comparison_operator, resource_id, resource_type, reference_id):
    return {
        'id': attribute_id,
        'step_id': step_id,
        'data_type_id': data_type_id,
        'attribute_label': attribute_label,
        'expected_value1': expected_value1,
        'expected_value2': expected_value2,
        'comparison_operator': comparison_operator,
        'resource_id': resource_id,
        'resource_type': resource_type,
        'parameter_id': parameter_id,
        'reference_id': reference_id
    }

def create_rows_for_attribute_related_data(parameter, identifier):
    new_rows_attribute_data_type = []
    new_rows_attribute = []
    measurement_unit = expected_value1 = expected_value2 = comparison_operator = resource_id = resource_type = reference_id = measurement_type = None
    parameter_type, step_id, name = parameter['type'], parameter['task_id'], parameter['name']
    if parameter_type not in (['SINGLE_SELECT', 'CHECKLIST', 'MULTISELECT']):
        is_parameter_type_handled = True
        if parameter_type == 'NUMBER':
            measurement_type = 'integer'
        elif parameter_type == 'SHOULD_BE':
            measurement_type = 'float'
            operator = parameter['data']['operator']
            if operator == 'EQUAL_TO':
                expected_value1 = parameter['data']['value']
                comparison_operator = '='
            elif operator == 'LESS_THAN':
                expected_value1 = parameter['data']['value']
                comparison_operator = '<'
            elif operator == 'LESS_THAN_EQUAL_TO':
                expected_value1 = parameter['data']['value']
                comparison_operator = '<='
            elif operator == 'MORE_THAN':
                expected_value1 = parameter['data']['value']
                comparison_operator = '>'
            elif operator == 'MORE_THAN_EQUAL_TO':
                expected_value1 = parameter['data']['value']
                comparison_operator = '>='
            elif operator == 'BETWEEN':                
                expected_value1 = parameter['data']['lowerValue']
                expected_value2 = parameter['data']['upperValue']
                comparison_operator = 'between'
        elif parameter_type == 'SINGLE_LINE' or parameter_type == 'MULTI_LINE':
            measurement_type = 'text'
        elif parameter_type == 'DATE' or parameter_type == 'DATE_TIME':
            measurement_type = parameter_type.lower()
        elif parameter_type == 'YES_NO':
            measurement_type = 'boolean'
        elif parameter_type == 'RESOURCE':
            measurement_type = 'text'
            resource_type = parameter['data']['collection']
        else:
            print(f"Parameter type: {parameter_type} is not implemented")
            is_parameter_type_handled = False
        if is_parameter_type_handled:    
            new_row_attribute_data_type = create_row_attribute_data_type(identifier, measurement_type, measurement_unit, name)
            new_row_attribute = create_row_attribute(identifier, parameter['parameter_id'], step_id, new_row_attribute_data_type['data_type_id'], name, expected_value1, expected_value2, comparison_operator, resource_id, resource_type, reference_id)
            new_rows_attribute_data_type.append(new_row_attribute_data_type)
            new_rows_attribute.append(new_row_attribute)
    else:
        if parameter_type == 'SINGLE_SELECT' or parameter_type == 'CHECKLIST' or parameter_type == 'MULTISELECT':
            measurement_type = 'boolean'
            for choice in row['data']:
                name = parameter['name'] + ' - ' + choice['name']
                reference_id = choice['id']
                new_row_attribute_data_type = create_row_attribute_data_type(identifier, measurement_type, measurement_unit, name)
                new_row_attribute = create_row_attribute(identifier, parameter['parameter_id'], step_id, new_row_attribute_data_type['data_type_id'], name, expected_value1, expected_value2, comparison_operator, resource_id, resource_type, reference_id)
                new_rows_attribute_data_type.append(new_row_attribute_data_type)
                new_rows_attribute.append(new_row_attribute)
                identifier += 1
        else:
            print(f"Parameter type: {parameter_type} is not implemented.")
                
    return new_rows_attribute_data_type, new_rows_attribute

next_id = get_next_id(new_step_attribute_data_types_df, 'data_type_id')
relevant_parameter_df = parameter_df[~parameter_df['type'].isin(['INSTRUCTION', 'MATERIAL', 'MEDIA', 'SIGNATURE', 'FILE_UPLOAD'])]
# relevant_parameter_df.rename(columns={'parameter_id' : 'id'}, inplace=True)
for index, row in relevant_parameter_df.iterrows():
    attribute_data_types, attributes = create_rows_for_attribute_related_data(row, next_id)
    if len(attribute_data_types) != 0 and len(attributes) != 0:
        new_step_attribute_data_types_df, new_step_attribute_df = append_to_attribute_related_df(attribute_data_types, attributes, new_step_attribute_data_types_df, new_step_attribute_df)
    next_id += len(attribute_data_types)

# print(tabulate(new_step_attribute_data_types_df, headers='keys', tablefmt='pretty'))
print(tabulate(new_step_attribute_df, headers='keys', tablefmt='pretty'))

+----+----+--------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-----------------+-----------------+---------------------+---------------+--------------------+--------------------------------------+
|    | id |      step_id       | data_type_id |                                                                            attribute_label                                                                             | resource_id | expected_value1 | expected_value2 | comparison_operator | resource_type |    parameter_id    |             reference_id             |
+----+----+--------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-----------------+-----------------+----

In [15]:
executed_step_df.rename(columns={'id': 'execution_id', 'task_id': 'step_id', 'started_at': 'execution_start_time', 'ended_at': 'execution_end_time', 'state': 'status', 'started_by': 'executed_by_employee_id'  }, inplace=True)
executed_step_df['batch_id'] = None
executed_step_df.drop('reason', axis=1, inplace=True)
print(tabulate(executed_step_df, headers='keys', tablefmt='pretty'))

+------+--------------------+--------------------+---------------------------+---------------------------+--------------------------+-----------------------------------+----------+
|      |    execution_id    |      step_id       |   execution_start_time    |    execution_end_time     |          status          |      executed_by_employee_id      | batch_id |
+------+--------------------+--------------------+---------------------------+---------------------------+--------------------------+-----------------------------------+----------+
|  0   | 391874377808207872 | 373013004378562562 | 2023-08-18 08:52:05+00:00 | 2023-08-18 08:55:12+00:00 |        COMPLETED         |   Vinoth Venkatesan ( ID: 3309)   |          |
|  1   | 410765324685107201 | 373013004378562562 | 2023-10-09 11:54:54+00:00 | 2023-10-09 11:55:25+00:00 |        COMPLETED         |   Vinoth Venkatesan ( ID: 3309)   |          |
|  2   | 410764330869940225 | 373013004378562562 | 2023-10-09 11:51:20+00:00 | 2023-10-09 11:51

In [16]:
# executed_step_exception_df = pd.DataFrame(columns=['exception_id', 'execution_id', 'description', 'exception_time'])
executed_step_exception_df = task_execution_df.copy()
executed_step_exception_df = executed_step_exception_df[executed_step_exception_df['state'] == 'COMPLETED_WITH_EXCEPTION']
executed_step_exception_df['exception_id'] = executed_step_exception_df['id']
executed_step_exception_df.rename(columns={'id': 'execution_id', 'reason': 'description', 'ended_at': 'exception_time'}, inplace=True)
executed_step_exception_df.drop('task_id', axis=1, inplace=True)
executed_step_exception_df.drop('state', axis=1, inplace=True)
executed_step_exception_df.drop('started_at', axis=1, inplace=True)
executed_step_exception_df.drop('started_by', axis=1, inplace=True)
print(tabulate(executed_step_exception_df, headers='keys', tablefmt='pretty'))

+------+--------------------+--------------------------------+---------------------------+--------------------+
|      |    execution_id    |          description           |      exception_time       |    exception_id    |
+------+--------------------+--------------------------------+---------------------------+--------------------+
| 106  | 404868767976742914 |   nylon brush not applicable   | 2023-09-23 05:24:55+00:00 | 404868767976742914 |
| 107  | 404877899999961090 |   nylon brush not applicable   | 2023-09-23 06:02:18+00:00 | 404877899999961090 |
| 108  | 398495534738268162 |   Nylon brush not applicable   | 2023-09-05 16:24:44+00:00 | 398495534738268162 |
| 109  | 398495429943582722 |   Nylon brush not applicable   | 2023-09-05 15:22:48+00:00 | 398495429943582722 |
| 110  | 404923513861267458 |   nylon brush not applicable   | 2023-09-23 09:04:32+00:00 | 404923513861267458 |
| 111  | 398824643053969410 |   nylon brush not applicable   | 2023-09-06 13:15:06+00:00 | 3988246430539

In [20]:

parameter_execution_df = pd.read_sql(PARAMETER_EXECUTION_QUERY, source_connection, params=params)
executed_step_measurement_df = pd.DataFrame(columns=['measurement_id', 'execution_id', 'step_attribute_id', 'resource_id', 'measurement_value', 'recorded_time', 'resource_type'])

def create_row_executed_step_measurement(measurement_id, execution_id, step_attribute_id, resource_id, measurement_value, recorded_time, resource_type): 
    return {
        'measurement_id': measurement_id,
        'execution_id': execution_id,
        'step_attribute_id': step_attribute_id,
        'resource_id': resource_id,
        'measurement_value': measurement_value,
        'recorded_time': recorded_time,
        'resource_type': resource_type
    }    

new_rows_executed_step_measurement = []
for index, row in parameter_execution_df.iterrows():
    parameter_type = row['parameter_type']
    measurement_id = row['id']
    execution_id = row['task_execution_id']
    recorded_time = row['modified_at']
    step_id = row['task_id']
    step_attribute_id = resource_id = measurement_value = resource_type = None
    if parameter_type in (['SINGLE_SELECT', 'CHECKLIST', 'MULTISELECT', 'YES_NO', 'RESOURCE']):
        if parameter_type == 'RESOURCE':
            resource_type = row['choices']
            pass
        else:
            if row['choices'] is not None:
                for choice in row['choices']:
                    # Very complicated
                    pass
    else:
        step_attribute_id = new_step_attribute_df.loc[(new_step_attribute_df['step_id'] == step_id)]['id']
        measurement_value = row['value']
        # step_attribute_id = step_id
        new_row_executed_step_measurement = create_row_executed_step_measurement(measurement_id, execution_id, step_attribute_id, resource_id, measurement_value, recorded_time, resource_type)
        new_rows_executed_step_measurement.append(new_row_executed_step_measurement)

if len(new_rows_executed_step_measurement) != 0:
    new_rows_executed_step_measurement_df = pd.DataFrame(new_rows_executed_step_measurement)
    executed_step_measurement_df = pd.concat([executed_step_measurement_df, new_rows_executed_step_measurement_df], ignore_index=True)
#             
# executed_step_measurement_df
new_rows_executed_step_measurement

  parameter_execution_df = pd.read_sql(PARAMETER_EXECUTION_QUERY, source_connection, params=params)
  executed_step_measurement_df = pd.concat([executed_step_measurement_df, new_rows_executed_step_measurement_df], ignore_index=True)


[{'measurement_id': 381651668633509917,
  'execution_id': 381651668427988992,
  'step_attribute_id': 0    1
  1    2
  2    3
  3    4
  Name: id, dtype: object,
  'resource_id': None,
  'measurement_value': '1701108',
  'recorded_time': Timestamp('2023-07-22 02:55:18+0000', tz='UTC'),
  'resource_type': None},
 {'measurement_id': 381651668633509925,
  'execution_id': 381651668427988992,
  'step_attribute_id': 0    1
  1    2
  2    3
  3    4
  Name: id, dtype: object,
  'resource_id': None,
  'measurement_value': 'loperamide hcl odt 2mg ',
  'recorded_time': Timestamp('2023-07-22 02:55:15+0000', tz='UTC'),
  'resource_type': None},
 {'measurement_id': 381651668633509896,
  'execution_id': 381651668427988992,
  'step_attribute_id': 0    1
  1    2
  2    3
  3    4
  Name: id, dtype: object,
  'resource_id': None,
  'measurement_value': None,
  'recorded_time': NaT,
  'resource_type': None},
 {'measurement_id': 381651668633509921,
  'execution_id': 381651668528652288,
  'step_attribut