Generalize upload functions given a relation dictionary

In [1]:
import os 
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import date, timedelta

In [2]:
# Make the project's own modules importable from this notebook: the parent of
# the current working directory is appended to the module search path.
project_root = Path.cwd().parent
sys.path.append(str(project_root))

In [3]:
import dependencies as dp
from db.engines import engine_silver
from utils.functions import create_combined_key, build_upload_query


2024-11-28 12:52:20,937 - INFO - Successful connection to schema bronze
2024-11-28 12:52:20,972 - INFO - Successful connection to schema silver
2024-11-28 12:52:21,012 - INFO - Successful connection to schema gold


In [4]:
from processes.transform.silver import bronze_df

2024-11-28 12:52:23,319 - INFO - Mapping database tables in schema 'bronze' started.
2024-11-28 12:52:23,406 - INFO - Tables in schema 'bronze' were successfully mapped.
2024-11-28 12:52:23,916 - INFO - Data from all tables has been successfully loaded into DataFrames.
2024-11-28 12:52:23,916 - INFO - Extract Process executed successfully in bronze schema.
2024-11-28 12:52:23,928 - INFO - Columns successfully updated for 'U6321303_Almacenes'.
2024-11-28 12:52:23,928 - INFO - Columns successfully updated for 'U6301303_Articulos'.
2024-11-28 12:52:23,928 - INFO - Columns successfully updated for 'U6301303_Clientes'.
2024-11-28 12:52:23,928 - INFO - Columns successfully updated for 'U6311303_Empresas'.
2024-11-28 12:52:23,928 - INFO - Columns successfully updated for 'U6331303_Operarios'.
2024-11-28 12:52:23,928 - INFO - Columns successfully updated for 'U6311303_Talleres'.
2024-11-28 12:52:23,928 - INFO - Columns successfully updated for 'ULSTTHPT_TiposHoras'.
2024-11-28 12:52:23,932 - I

In [8]:
from model.table_relations import related_bron_silv

In [5]:
def get_new_data(df, table_name, key_columns, date_column, engine):
    '''
    Return the rows of ``df`` that are not yet present in the database table.

    Steps:
        - Read the existing rows with the query built by ``build_upload_query``.
        - Reduce both the existing rows and ``df`` to a combined key with
          ``create_combined_key``.
        - Keep the rows of ``df`` whose combined key is absent from the
          existing rows.

    input:
        - df (DataFrame): dataframe to be compared with existing values.
        - table_name (str): name of the table in the database.
        - key_columns (list): list of key column names, present both in the
          dataframe and in the database table.
        - date_column (str): date column name, present both in the dataframe
          and in the database table.
        - engine: database connection handed to ``pd.read_sql`` (the notebook
          passes a SQLAlchemy engine).

    output:
        - df_new_data (DataFrame): rows of ``df`` identified as new, with the
          same columns as ``df``.
    '''
    # Get existing values from the database
    query = build_upload_query(
        table_name=table_name,
        key_columns=key_columns,
        date_column=date_column,
    )
    df_existing_data = pd.read_sql(query, con=engine)

    # Combined keys of the rows already stored in the database
    existing_keys = create_combined_key(
        df=df_existing_data,
        key_columns=key_columns,
        date_column=date_column,
    )

    # Combined keys of the candidate rows. They are kept in a standalone Series
    # aligned on df's index instead of a temporary 'combined_key' column, so an
    # input column that happens to have that name is never overwritten or dropped.
    candidate_keys = pd.Series(
        create_combined_key(
            df=df,
            key_columns=key_columns,
            date_column=date_column,
        ),
        index=df.index,
    )

    # Keep only the rows whose key is not among the existing ones
    df_new_data = df[~candidate_keys.isin(existing_keys)]

    return df_new_data

In [6]:
def upload_new_data(df_new_data, table_name, engine, date_column=None):
    '''
    Append the rows of a dataframe to a database table.

    input:
        - df_new_data (DataFrame): rows to insert.
        - table_name (str): name of the destination table.
        - engine: database connection handed to ``DataFrame.to_sql``.
        - date_column (str, optional): when given (and truthy), rows are
          sorted ascending by this column before being inserted.

    output:
        - None. The rows are written with ``if_exists='append'`` and without
          the dataframe index.
    '''
    rows_to_insert = (
        df_new_data.sort_values(by=date_column, ascending=True)
        if date_column
        else df_new_data
    )
    rows_to_insert.to_sql(name=table_name, con=engine, if_exists='append', index=False)

In [26]:
# NOTE(review): this literal rebinds the `related_bron_silv` name that an earlier
# cell imports from model.table_relations; it holds a single table mapping.
# Confirm whether the override is intentional before running the upload loop.
related_bron_silv = dict(
    fact_table_mult=[
        dict(
            tbl_bron='U533_OrdenesReparacion',
            tbl_silv='OrdenesReparacion',
            key_columns=['ReferenciaOR'],
            date_column='FechaApertura',
        ),
    ],
)

In [9]:
# Incremental load: for every table mapping in related_bron_silv, find the rows of
# the bronze dataframe that are not yet in the silver table and append them.
for key, tables in related_bron_silv.items():
    for table in tables:
        table_name = table['tbl_silv']
        # The message is built from table_name rather than indexing the dict
        # inside the f-string: reusing the enclosing quote character inside a
        # replacement field is a SyntaxError before Python 3.12.
        dp.logger.info(f'Processing updloading data process fot table: "{table_name}"')

        # Read the remaining settings of this mapping entry
        df = bronze_df[table['tbl_bron']]
        key_columns = table['key_columns']
        date_column = table['date_column']

        # Rows of the bronze dataframe that are not yet in the silver table
        df_new_data = get_new_data(
            df=df,
            table_name=table_name,
            key_columns=key_columns,
            date_column=date_column,
            engine=engine_silver,
        )

        if df_new_data.empty:
            dp.logger.info(f'There is not update data to insert into table "{table_name}"')
            continue

        dp.logger.info(f'There is new data to upload in table "{table_name}"')
        try:
            upload_new_data(
                df_new_data=df_new_data,
                table_name=table_name,
                date_column=date_column,
                engine=engine_silver,
            )
            dp.logger.info(f'New data has been uploaded succesfully into table "{table_name}"')
        except Exception as e:
            # Best effort: a failed insert is logged and the loop moves on to
            # the next table instead of aborting the whole run.
            dp.logger.error(f"An error has occurred trying to insert new data into table '{table_name}': {e}")
    

2024-11-28 12:53:11,666 - INFO - Processing updloading data process fot table: "Almacenes"
2024-11-28 12:53:11,702 - INFO - There is not update data to insert into table "Almacenes"
2024-11-28 12:53:11,702 - INFO - Processing updloading data process fot table: "Clientes"
2024-11-28 12:53:11,716 - INFO - There is not update data to insert into table "Clientes"
2024-11-28 12:53:11,716 - INFO - Processing updloading data process fot table: "Empresas"
2024-11-28 12:53:11,716 - INFO - There is not update data to insert into table "Empresas"
2024-11-28 12:53:11,716 - INFO - Processing updloading data process fot table: "Operarios"
2024-11-28 12:53:11,716 - INFO - There is not update data to insert into table "Operarios"
2024-11-28 12:53:11,716 - INFO - Processing updloading data process fot table: "Talleres"
2024-11-28 12:53:11,716 - INFO - There is not update data to insert into table "Talleres"
2024-11-28 12:53:11,716 - INFO - Processing updloading data process fot table: "TiposHoras"
2024