# New Incremental Data

This file assumes that the first (initial) load has already been done and that its data is in the data warehouse.
For new incremental data (the second and subsequent runs), we run this file. It checks the incoming data for dimension values that are not yet in the warehouse, adds them with new surrogate keys, and then loads the new fact rows into the data warehouse.

In [293]:
import pandas as pd
import os
from datetime import datetime

# Function to save DataFrame to CSV
def save_to_csv(df, file_name):
    """Write ``df`` to ``file_name`` in CSV format, omitting the index column."""
    df.to_csv(path_or_buf=file_name, index=False)


def add_scd2_columns(df):
    """Return a copy of ``df`` extended with SCD Type 2 tracking columns.

    Added columns:
      * ``start_date``  - the current local timestamp, identical for every row.
      * ``end_date``    - 2262-04-11, used as an open-ended "far future" marker
                          (the last calendar date a nanosecond-precision pandas
                          Timestamp can represent).
      * ``active_flag`` - ``'Y'``, marking the row as the current version.

    The input frame is left untouched; callers must use the returned frame.
    """
    # assign() builds a new frame, so the caller's DataFrame is never mutated
    # (and no chained-assignment warning is triggered on filtered slices).
    return df.assign(
        start_date=datetime.now(),
        end_date=pd.to_datetime('2262-04-11'),
        active_flag='Y',
    )

In [294]:


# Dimension tables already persisted in the data warehouse
(
    existing_customers_df,
    existing_countries_df,
    existing_currencies_df,
    existing_account_names_df,
    existing_date_dimension_df,
    existing_loan_type_df,
    existing_deposit_type_df,
) = [
    pd.read_csv(dimension_csv)
    for dimension_csv in (
        'Data_warehouse/unique_customers.csv',
        'Data_warehouse/unique_countries.csv',
        'Data_warehouse/unique_currencies.csv',
        'Data_warehouse/unique_account_names.csv',
        'Data_warehouse/date_dimension.csv',
        'Data_warehouse/loan_type.csv',
        'Data_warehouse/deposit_type.csv',
    )
]

# Fact tables already persisted in the data warehouse
(
    existing_fact_deposits_df,
    existing_fact_accounts_df,
    existing_fact_loans_df,
) = [
    pd.read_csv(fact_csv)
    for fact_csv in (
        'Data_warehouse/fact_deposits.csv',
        'Data_warehouse/fact_accounts.csv',
        'Data_warehouse/fact_loans.csv',
    )
]

# New (incremental) data produced by the cleaning step.
# Forward slashes are used on purpose: in the backslash form, "\C" and "\c"
# are invalid string escape sequences (Python warns about them), and a
# backslash-separated path only resolves on Windows.
new_accounts_df = pd.read_excel("Processed_file/Cleaned_data/cleaned_accounts.xlsx", sheet_name='Sheet1')
new_deposits_df = pd.read_excel("Processed_file/Cleaned_data/cleaned_deposits.xlsx", sheet_name='Sheet1')
new_loans_df = pd.read_excel("Processed_file/Cleaned_data/cleaned_loans.xlsx", sheet_name='Sheet1')


# Give each generic 'amount' column a fact-specific name
new_accounts_df.rename(columns={'amount': 'account_amount'}, inplace=True)
new_deposits_df.rename(columns={'amount': 'deposit_amount'}, inplace=True)
new_loans_df.rename(columns={'amount': 'loan_amount'}, inplace=True)

# Update Dimension Tables

## Update new Customer Dimension

In [295]:
import pandas as pd
import numpy as np



# Identify unique (customer, customer_type) pairs from new deposits and loans
unique_new_customers_deposits = new_deposits_df[['customer', 'customer_type']].drop_duplicates()
unique_new_customers_loans = new_loans_df[['customer', 'customer_type']].drop_duplicates()

# Combine the new unique customers and remove duplicates
all_unique_new_customers = pd.concat([unique_new_customers_deposits, unique_new_customers_loans]).drop_duplicates()

# Find new customers that are not already in the existing customers table
# (anti-join: keep only the rows that have no match in the warehouse)
new_customers = pd.merge(all_unique_new_customers, existing_customers_df[['customer', 'customer_type']],
                         on=['customer', 'customer_type'], how='left', indicator=True)
new_customers = new_customers[new_customers['_merge'] == 'left_only'].drop(columns='_merge')

# Assign new surrogate keys and SCD Type 2 columns to the new customers
if not new_customers.empty:
    # Start from 0 when the dimension has no rows yet (max() would be NaN),
    # matching the guard used for the other dimensions.
    max_existing_key = int(existing_customers_df['customer_key'].max()) if not existing_customers_df.empty else 0
    new_customers['customer_key'] = range(max_existing_key + 1, max_existing_key + 1 + len(new_customers))

    new_customers = add_scd2_columns(new_customers)

    # The append below writes no header, so values land by position. When the
    # new rows carry exactly the warehouse file's columns, put them in the
    # file's column order so every value ends up under the right header.
    if set(new_customers.columns) == set(existing_customers_df.columns):
        new_customers = new_customers[list(existing_customers_df.columns)]

    # Append the new customers to the existing dimension table in the data warehouse
    new_customers.to_csv("Data_warehouse/unique_customers.csv", mode='a', header=False, index=False)



## Update new Country Dimension

In [296]:
import pandas as pd
import numpy as np


# Identify unique countries from new deposits and loans
unique_countries_deposits = new_deposits_df['country'].unique()
unique_countries_loans = new_loans_df['country'].unique()

# Combine the unique countries, remove duplicates and sort them.
# pandas is used instead of np.unique because np.unique raises TypeError when
# the values mix strings with missing values (NaN); sort_values() places any
# missing value last and otherwise yields the same sorted order.
all_unique_countries = (
    pd.concat(
        [pd.Series(unique_countries_deposits), pd.Series(unique_countries_loans)],
        ignore_index=True,
    )
    .drop_duplicates()
    .sort_values()
    .to_numpy()
)


# Create a DataFrame for unique new countries
unique_countries_df = pd.DataFrame(all_unique_countries, columns=['country'])


# Find new countries that are not already in the existing countries table
# (anti-join: keep only the rows that have no match in the warehouse)
new_countries = pd.merge(unique_countries_df, existing_countries_df[['country']],
                         on='country', how='left', indicator=True)
new_countries = new_countries[new_countries['_merge'] == 'left_only'].drop(columns='_merge')


# Assign new surrogate keys to the new countries
if not new_countries.empty:
    max_existing_key = int(existing_countries_df['country_key'].max()) if not existing_countries_df.empty else 0
    new_countries['country_key'] = range(max_existing_key + 1, max_existing_key + 1 + len(new_countries))

    new_countries = add_scd2_columns(new_countries)

    # The append below writes no header, so values land by position. When the
    # new rows carry exactly the warehouse file's columns, put them in the
    # file's column order so every value ends up under the right header.
    if set(new_countries.columns) == set(existing_countries_df.columns):
        new_countries = new_countries[list(existing_countries_df.columns)]

    new_countries.to_csv("Data_warehouse/unique_countries.csv", mode='a', header=False, index=False)



##  Update new Currency Dimension

In [297]:
import pandas as pd
import numpy as np


# Identify unique currencies from new deposits and loans
unique_currencies_deposits = new_deposits_df['currency'].unique()
unique_currencies_loans = new_loans_df['currency'].unique()

# Combine the unique currencies, remove duplicates and sort them.
# pandas is used instead of np.unique because np.unique raises TypeError when
# the values mix strings with missing values (NaN); sort_values() places any
# missing value last and otherwise yields the same sorted order.
all_unique_currencies = (
    pd.concat(
        [pd.Series(unique_currencies_deposits), pd.Series(unique_currencies_loans)],
        ignore_index=True,
    )
    .drop_duplicates()
    .sort_values()
    .to_numpy()
)

# Create a DataFrame for unique new currencies
unique_currencies_df = pd.DataFrame(all_unique_currencies, columns=['currency'])

# Find new currencies that are not already in the existing currencies table
# (anti-join: keep only the rows that have no match in the warehouse)
new_currencies = pd.merge(unique_currencies_df, existing_currencies_df[['currency']],
                          on='currency', how='left', indicator=True)
new_currencies = new_currencies[new_currencies['_merge'] == 'left_only'].drop(columns='_merge')

# Assign new surrogate keys to the new currencies
if not new_currencies.empty:
    max_existing_key = int(existing_currencies_df['currency_key'].max()) if not existing_currencies_df.empty else 0
    new_currencies['currency_key'] = range(max_existing_key + 1, max_existing_key + 1 + len(new_currencies))

    new_currencies = add_scd2_columns(new_currencies)

    # The append below writes no header, so values land by position. When the
    # new rows carry exactly the warehouse file's columns, put them in the
    # file's column order so every value ends up under the right header.
    if set(new_currencies.columns) == set(existing_currencies_df.columns):
        new_currencies = new_currencies[list(existing_currencies_df.columns)]

    # Append new currencies to the currency dimension file in the data warehouse
    new_currencies.to_csv("Data_warehouse/unique_currencies.csv", mode='a', header=False, index=False)


## Update new Accounts_Name_Dimension

In [298]:


# Identify unique account names from new accounts
unique_account_names = pd.DataFrame(new_accounts_df['account_name'].unique(), columns=['account_name'])

# Find new account names that are not already in the existing account names table
# (anti-join: keep only the rows that have no match in the warehouse)
new_account_names = pd.merge(unique_account_names, existing_account_names_df[['account_name']],
                             on='account_name', how='left', indicator=True)
new_account_names = new_account_names[new_account_names['_merge'] == 'left_only'].drop(columns='_merge')

# Assign new surrogate keys to the new account names
if not new_account_names.empty:
    max_existing_key = int(existing_account_names_df['account_name_key'].max()) if not existing_account_names_df.empty else 0
    new_account_names['account_name_key'] = range(max_existing_key + 1, max_existing_key + 1 + len(new_account_names))

    new_account_names = add_scd2_columns(new_account_names)

    # The append below writes no header, so values land by position. When the
    # new rows carry exactly the warehouse file's columns, put them in the
    # file's column order so every value ends up under the right header.
    if set(new_account_names.columns) == set(existing_account_names_df.columns):
        new_account_names = new_account_names[list(existing_account_names_df.columns)]

    # Append new account names to the account-name dimension file in the data warehouse
    new_account_names.to_csv("Data_warehouse/unique_account_names.csv", mode='a', header=False, index=False)


## Update new Loan Type Dimension

In [299]:
import pandas as pd
import numpy as np


# Identify unique loan types from new loans
unique_new_loan_types = new_loans_df['loan_type'].unique()

# Create a DataFrame for unique new loan types
new_loan_types_df = pd.DataFrame(unique_new_loan_types, columns=['loan_type'])

# Find new loan types that are not already in the existing loan types table
# (anti-join: keep only the rows that have no match in the warehouse)
new_loan_types = pd.merge(new_loan_types_df, existing_loan_type_df[['loan_type']],
                          on='loan_type', how='left', indicator=True)
new_loan_types = new_loan_types[new_loan_types['_merge'] == 'left_only'].drop(columns='_merge')

# Assign new surrogate keys to the new loan types
if not new_loan_types.empty:
    max_existing_key = int(existing_loan_type_df['loan_type_key'].max()) if not existing_loan_type_df.empty else 0
    new_loan_types['loan_type_key'] = range(max_existing_key + 1, max_existing_key + 1 + len(new_loan_types))

    new_loan_types = add_scd2_columns(new_loan_types)

    # The append below writes no header, so values land by position. When the
    # new rows carry exactly the warehouse file's columns, put them in the
    # file's column order so every value ends up under the right header.
    if set(new_loan_types.columns) == set(existing_loan_type_df.columns):
        new_loan_types = new_loan_types[list(existing_loan_type_df.columns)]

    # Append new loan types to the loan-type dimension file in the data warehouse
    new_loan_types.to_csv("Data_warehouse/loan_type.csv", mode='a', header=False, index=False)


## Update new Deposit Type Dimension

In [300]:
import pandas as pd
import numpy as np


# Identify unique deposit types from new deposits
unique_new_deposit_types = new_deposits_df['deposit_type'].unique()

# Create a DataFrame for unique new deposit types
new_deposit_types_df = pd.DataFrame(unique_new_deposit_types, columns=['deposit_type'])

# Find new deposit types that are not already in the existing deposit types table
# (anti-join: keep only the rows that have no match in the warehouse)
new_deposit_types = pd.merge(new_deposit_types_df, existing_deposit_type_df[['deposit_type']],
                             on='deposit_type', how='left', indicator=True)
new_deposit_types = new_deposit_types[new_deposit_types['_merge'] == 'left_only'].drop(columns='_merge')

# Assign new surrogate keys to the new deposit types
if not new_deposit_types.empty:
    max_existing_key = int(existing_deposit_type_df['deposit_type_key'].max()) if not existing_deposit_type_df.empty else 0
    new_deposit_types['deposit_type_key'] = range(max_existing_key + 1, max_existing_key + 1 + len(new_deposit_types))

    new_deposit_types = add_scd2_columns(new_deposit_types)

    # The append below writes no header, so values land by position. When the
    # new rows carry exactly the warehouse file's columns, put them in the
    # file's column order so every value ends up under the right header.
    if set(new_deposit_types.columns) == set(existing_deposit_type_df.columns):
        new_deposit_types = new_deposit_types[list(existing_deposit_type_df.columns)]

    # Append new deposit types to the existing deposit types in data warehouse
    new_deposit_types.to_csv("Data_warehouse/deposit_type.csv", mode='a', header=False, index=False)

# Mapping Dimensions to Facts

In [301]:
# Build the fact frames on copies so the freshly loaded source frames stay intact
new_fact_accounts_df, new_fact_deposits_df, new_fact_loans_df = [
    source_df.copy()
    for source_df in (new_accounts_df, new_deposits_df, new_loans_df)
]

### Mapping date_dimension to fact tables
1. Map every date column to its date key
2. Drop the original date columns

In [302]:
def map_date_to_key(df, date_column):
    """Add an integer ``<date_column>_key`` column in YYYYMMDD form.

    ``df`` is modified in place: ``date_column`` is converted to datetime and
    the key column is added next to it. The same ``df`` object is returned.

    NOTE(review): rows whose date is missing or unparseable presumably make
    the integer cast fail - confirm the cleaned input never contains them.
    """
    parsed_dates = pd.to_datetime(df[date_column])
    df[date_column] = parsed_dates
    key_column = f"{date_column}_key"
    df[key_column] = parsed_dates.dt.strftime('%Y%m%d').astype(int)
    return df


# Accounts carry a single date column
fact_accounts = map_date_to_key(new_fact_accounts_df, 'reference_date')

# Deposits: key every date column. map_date_to_key works in place, so
# fact_deposits ends up referring to the same frame as new_fact_deposits_df.
for deposit_date_col in ('start_date', 'maturity_date', 'reference_date'):
    fact_deposits = map_date_to_key(new_fact_deposits_df, deposit_date_col)

# Loans: same three date columns, same in-place behaviour
for loan_date_col in ('start_date', 'maturity_date', 'reference_date'):
    fact_loans = map_date_to_key(new_fact_loans_df, loan_date_col)



In [303]:
# The *_key columns now carry the date information, so the raw date columns
# are removed. The drops are in place, so the fact_deposits / fact_loans /
# fact_accounts names returned by map_date_to_key see the same change.
new_fact_deposits_df.drop(columns = ['start_date','maturity_date','reference_date'], inplace=True)
new_fact_loans_df.drop(columns = ['start_date','maturity_date','reference_date'], inplace=True)
new_fact_accounts_df.drop(columns=['reference_date'], inplace = True)

 ## We have all the updated dimension tables in the data warehouse which now includes additional dimensions if included in the incremental data

In [304]:
# Re-read every dimension file so the frames include any rows appended above
(
    updated_customers_df,
    updated_countries_df,
    updated_currencies_df,
    updated_account_names_df,
    updated_date_dimension_df,
    updated_loan_type_df,
    updated_deposit_type_df,
) = [
    pd.read_csv(dimension_csv)
    for dimension_csv in (
        'Data_warehouse/unique_customers.csv',
        'Data_warehouse/unique_countries.csv',
        'Data_warehouse/unique_currencies.csv',
        'Data_warehouse/unique_account_names.csv',
        'Data_warehouse/date_dimension.csv',
        'Data_warehouse/loan_type.csv',
        'Data_warehouse/deposit_type.csv',
    )
]

### Mapping updated customer dimension to fact tables

In [305]:
def map_customer_to_key(df, unique_customers_df):
    """Left-join ``customer_key`` from the customer dimension onto ``df``.

    Args:
        df: fact rows containing a ``customer`` column (and usually
            ``customer_type``).
        unique_customers_df: customer dimension with ``customer`` and
            ``customer_key`` columns (and usually ``customer_type``).

    Returns:
        A new DataFrame with the rows of ``df`` plus a ``customer_key``
        column; rows without a matching dimension entry get a missing key.
    """
    # The dimension identifies a customer by (customer, customer_type), so
    # joining on 'customer' alone would duplicate fact rows whenever the same
    # customer exists under two types. Fall back to 'customer' only when one
    # side lacks the type column.
    join_columns = ['customer']
    if 'customer_type' in df.columns and 'customer_type' in unique_customers_df.columns:
        join_columns.append('customer_type')

    # Use the dimension that was passed in (not a module-level global) and
    # bring over nothing but the surrogate key.
    key_lookup = unique_customers_df[join_columns + ['customer_key']].drop_duplicates()
    return df.merge(key_lookup, on=join_columns, how='left')

# Add customer_key to the deposit facts (fact_deposits is the date-keyed
# deposits frame); the original customer columns are dropped in a later step
new_fact_deposits_df = map_customer_to_key(fact_deposits,updated_customers_df )

# Add customer_key to the loan facts (fact_loans is the date-keyed loans
# frame); the original customer columns are dropped in a later step
new_fact_loans_df = map_customer_to_key(fact_loans,updated_customers_df )





In [306]:
# customer_key now identifies the customer, so the natural-key columns are removed
new_fact_deposits_df.drop(columns=['customer','customer_type'], inplace=True)
new_fact_loans_df.drop(columns=['customer','customer_type'], inplace=True)


### Mapping updated account_name dimension to fact tables

In [307]:
def map_account_name_to_key(df, account_name_df):
    """Left-join ``account_name_key`` from the account-name dimension onto ``df``.

    Args:
        df: fact rows containing an ``account_name`` column.
        account_name_df: dimension with ``account_name`` and
            ``account_name_key`` columns.

    Returns:
        A new DataFrame with the rows of ``df`` plus ``account_name_key``;
        names missing from the dimension get a missing key.
    """
    # Use the dimension that was passed in (not a module-level global) and
    # bring over only the surrogate key, so the dimension's other columns
    # (e.g. SCD2 dates/flag) do not leak into the fact table.
    key_lookup = account_name_df[['account_name', 'account_name_key']]
    return df.merge(key_lookup, on='account_name', how='left')


# Attach the account-name surrogate key to the account facts
new_fact_accounts_df = map_account_name_to_key(new_fact_accounts_df, updated_account_names_df)

In [308]:
# The account name text is removed from the account facts (in place)
new_fact_accounts_df.drop(columns=['account_name'],inplace=True)

### Mapping updated currency dimension to fact tables

In [309]:
def map_currency_to_key(df, unique_currencies_df):
    """Left-join ``currency_key`` from the currency dimension onto ``df``.

    Args:
        df: fact rows containing a ``currency`` column.
        unique_currencies_df: dimension with ``currency`` and
            ``currency_key`` columns.

    Returns:
        A new DataFrame with the rows of ``df`` plus ``currency_key``;
        currencies missing from the dimension get a missing key.
    """
    # Use the dimension that was passed in (not a module-level global) and
    # bring over only the surrogate key, so the dimension's other columns
    # (e.g. SCD2 dates/flag) do not leak into or collide inside the fact table.
    key_lookup = unique_currencies_df[['currency', 'currency_key']]
    return df.merge(key_lookup, on='currency', how='left')


# Join the updated currency dimension onto the deposit facts
# (the original currency column is dropped in a later step)
new_fact_deposits_df = map_currency_to_key(new_fact_deposits_df, updated_currencies_df)

# Join the updated currency dimension onto the loan facts
# (the original currency column is dropped in a later step)
new_fact_loans_df = map_currency_to_key(new_fact_loans_df, updated_currencies_df)

In [310]:
# The currency text is removed from both fact frames (in place)
new_fact_deposits_df.drop(columns=['currency'], inplace= True)
new_fact_loans_df.drop(columns=['currency'], inplace=True)


### Mapping Updated Country dimension to Fact tables

In [311]:
def map_country_to_key(df, unique_countries_df):
    """Left-join ``country_key`` from the country dimension onto ``df``.

    Args:
        df: fact rows containing a ``country`` column.
        unique_countries_df: dimension with ``country`` and ``country_key``
            columns.

    Returns:
        A new DataFrame with the rows of ``df`` plus ``country_key``;
        countries missing from the dimension get a missing key.
    """
    # Use the dimension that was passed in (not a module-level global) and
    # bring over only the surrogate key, so the dimension's other columns
    # (e.g. SCD2 dates/flag) do not leak into or collide inside the fact table.
    key_lookup = unique_countries_df[['country', 'country_key']]
    return df.merge(key_lookup, on='country', how='left')



# Join the updated country dimension onto the deposit facts
# (the original country column is dropped in a later step)
new_fact_deposits_df = map_country_to_key(new_fact_deposits_df, updated_countries_df)

# Join the updated country dimension onto the loan facts
# (the original country column is dropped in a later step)
new_fact_loans_df = map_country_to_key(new_fact_loans_df, updated_countries_df)


In [312]:
# The country text is removed from both fact frames (in place)
new_fact_deposits_df.drop(columns=['country'], inplace= True)
new_fact_loans_df.drop(columns=['country'], inplace=True)

### Mapping Updated Deposit_type dimensions to  Fact tables

In [313]:
def map_deposit_type_to_key(df, deposit_type_df):
    """Left-join ``deposit_type_key`` from the deposit-type dimension onto ``df``.

    Args:
        df: fact rows containing a ``deposit_type`` column.
        deposit_type_df: dimension with ``deposit_type`` and
            ``deposit_type_key`` columns.

    Returns:
        A new DataFrame with the rows of ``df`` plus ``deposit_type_key``;
        types missing from the dimension get a missing key.
    """
    # Use the dimension that was passed in (not a module-level global) and
    # bring over only the surrogate key, so the dimension's other columns
    # (e.g. SCD2 dates/flag) do not leak into or collide inside the fact table.
    key_lookup = deposit_type_df[['deposit_type', 'deposit_type_key']]
    return df.merge(key_lookup, on='deposit_type', how='left')


# Join the updated deposit-type dimension onto the deposit facts
# (the original deposit_type column is dropped in a later step)
new_fact_deposits_df = map_deposit_type_to_key(new_fact_deposits_df, updated_deposit_type_df)

In [314]:
# The deposit type text is removed from the deposit facts (in place)
new_fact_deposits_df.drop(columns=['deposit_type'], inplace=True)

### Mapping Updated Loan_type dimensions to Fact tables

In [315]:
def map_loan_type_to_key(df, loan_type_df):
    """Left-join ``loan_type_key`` from the loan-type dimension onto ``df``.

    Args:
        df: fact rows containing a ``loan_type`` column.
        loan_type_df: dimension with ``loan_type`` and ``loan_type_key``
            columns.

    Returns:
        A new DataFrame with the rows of ``df`` plus ``loan_type_key``;
        types missing from the dimension get a missing key.
    """
    # Use the dimension that was passed in (not a module-level global) and
    # bring over only the surrogate key, so the dimension's other columns
    # (e.g. SCD2 dates/flag) do not leak into or collide inside the fact table.
    key_lookup = loan_type_df[['loan_type', 'loan_type_key']]
    return df.merge(key_lookup, on='loan_type', how='left')



# Join the updated loan-type dimension onto the loan facts
# (the original loan_type column is dropped in a later step)
new_fact_loans_df = map_loan_type_to_key(new_fact_loans_df, updated_loan_type_df)



In [316]:
# The loan type text is removed from the loan facts (in place)
new_fact_loans_df.drop(columns=['loan_type'], inplace = True)

# Creating Final new incremental data for Loading in data warehouse

### Getting the Persisted maximum surrogate keys to maintain primary key in fact tables

In [317]:
# Load the persisted per-table maximum surrogate keys; new fact rows are
# numbered starting right after these values
max_keys_df = pd.read_csv('Data_warehouse/max_surr_keys.csv')

# Bare expression: displays the frame when run as a notebook cell
max_keys_df

Unnamed: 0,table_name,max_deposits_surr_primarykey,max_account_surr_primarykey,max_surr_primarykey
0,fact_deposits,90,0,0
1,fact_accounts,0,16,0
2,fact_loans,0,0,300


## Final new Fact accounts data creation

In [318]:

# Last surrogate key already used by fact_accounts
accounts_key_values = max_keys_df.loc[
    max_keys_df['table_name'] == 'fact_accounts', 'max_account_surr_primarykey'
]
max_accounts_key = accounts_key_values.iloc[0]

# Number the new records consecutively, starting right after that key
first_new_accounts_key = int(max_accounts_key) + 1
new_fact_accounts_df['account_surr_primarykey'] = range(
    first_new_accounts_key, first_new_accounts_key + len(new_fact_accounts_df)
)

# Keep only the fact_accounts columns, in this order
fact_accounts_columns = [
    'account_number',
    'account_amount',
    'account_surr_primarykey',
    'account_type',
    'ingest_date_time',
    'reference_date_key',
    'account_name_key',
]
new_fact_accounts_df = new_fact_accounts_df[fact_accounts_columns]



## Final new Fact loans data creation

In [319]:
# Last surrogate key already used by fact_loans
loans_key_values = max_keys_df.loc[
    max_keys_df['table_name'] == 'fact_loans', 'max_surr_primarykey'
]
max_loans_key = loans_key_values.iloc[0]

# Number the new records consecutively, starting right after that key
first_new_loans_key = int(max_loans_key) + 1
new_fact_loans_df['loans_surr_primarykey'] = range(
    first_new_loans_key, first_new_loans_key + len(new_fact_loans_df)
)

# Keep only the fact_loans columns, in this order
fact_loans_columns = [
    'loan_amount',
    'exchange_rate',
    'ingest_date_time',
    'loans_surr_primarykey',
    'start_date_key',
    'maturity_date_key',
    'reference_date_key',
    'customer_key',
    'currency_key',
    'country_key',
    'loan_type_key',
]
new_fact_loans_df = new_fact_loans_df[fact_loans_columns]

## Final new Facts Deposits Data creation

In [320]:
# Last surrogate key already used by fact_deposits
deposits_key_values = max_keys_df.loc[
    max_keys_df['table_name'] == 'fact_deposits', 'max_deposits_surr_primarykey'
]
max_deposits_key = deposits_key_values.iloc[0]

# Number the new records consecutively, starting right after that key
first_new_deposits_key = int(max_deposits_key) + 1
new_fact_deposits_df['deposits_surr_primarykey'] = range(
    first_new_deposits_key, first_new_deposits_key + len(new_fact_deposits_df)
)

# Keep only the fact_deposits columns, in this order
fact_deposits_columns = [
    'deposit_amount',
    'exchange_rate',
    'ingest_date_time',
    'deposits_surr_primarykey',
    'start_date_key',
    'maturity_date_key',
    'reference_date_key',
    'customer_key',
    'currency_key',
    'country_key',
    'deposit_type_key',
]
new_fact_deposits_df = new_fact_deposits_df[fact_deposits_columns]

## Append fact tables to the data warehouse

In [321]:
# The fact files live in the Data_warehouse folder (that is where they are
# read from at the top of this file), so the appends must target the same
# paths rather than files in the current working directory.
#
# The appends write no header, so values land by position. When a new frame
# carries exactly the warehouse file's columns, it is first put in the file's
# column order so every value ends up under the right header.

# Append new_fact_deposits_df to Data_warehouse/fact_deposits.csv
if set(new_fact_deposits_df.columns) == set(existing_fact_deposits_df.columns):
    new_fact_deposits_df = new_fact_deposits_df[list(existing_fact_deposits_df.columns)]
new_fact_deposits_df.to_csv('Data_warehouse/fact_deposits.csv', mode='a', header=False, index=False)

# Append new_fact_loans_df to Data_warehouse/fact_loans.csv
if set(new_fact_loans_df.columns) == set(existing_fact_loans_df.columns):
    new_fact_loans_df = new_fact_loans_df[list(existing_fact_loans_df.columns)]
new_fact_loans_df.to_csv('Data_warehouse/fact_loans.csv', mode='a', header=False, index=False)

# Append new_fact_accounts_df to Data_warehouse/fact_accounts.csv
if set(new_fact_accounts_df.columns) == set(existing_fact_accounts_df.columns):
    new_fact_accounts_df = new_fact_accounts_df[list(existing_fact_accounts_df.columns)]
new_fact_accounts_df.to_csv('Data_warehouse/fact_accounts.csv', mode='a', header=False, index=False)


# Updating the max surrogate key and Persisting the same in the data warehouse

In [322]:
import pandas as pd
import os

# Create the Data_warehouse folder if it doesn't exist
os.makedirs('Data_warehouse', exist_ok=True)

# Advance each stored maximum only when this run produced rows for that table.
# max() of an empty column is NaN, which would otherwise be turned into 0 and
# make the next run reuse surrogate keys; in that case the previously loaded
# maximum (max_*_key from the cells above) is kept.
if not new_fact_deposits_df.empty:
    max_deposits_key = new_fact_deposits_df['deposits_surr_primarykey'].max()
if not new_fact_accounts_df.empty:
    max_accounts_key = new_fact_accounts_df['account_surr_primarykey'].max()
if not new_fact_loans_df.empty:
    max_loans_key = new_fact_loans_df['loans_surr_primarykey'].max()

# One row per fact table; each table only fills its own key column
max_keys_data = [
    {"table_name": "fact_deposits", "max_deposits_surr_primarykey": max_deposits_key},
    {"table_name": "fact_accounts", "max_account_surr_primarykey": max_accounts_key},
    {"table_name": "fact_loans", "max_surr_primarykey": max_loans_key}
]
max_keys_df = pd.DataFrame(max_keys_data)

# Columns that do not apply to a table are left empty by the constructor; store 0 there
max_keys_df['max_deposits_surr_primarykey'] = max_keys_df['max_deposits_surr_primarykey'].fillna(0).astype(int)
max_keys_df['max_account_surr_primarykey'] = max_keys_df['max_account_surr_primarykey'].fillna(0).astype(int)
max_keys_df['max_surr_primarykey'] = max_keys_df['max_surr_primarykey'].fillna(0).astype(int)

# Save to the same file the next run reads its starting keys from
# (Data_warehouse/max_surr_keys.csv); writing to any other name would make
# the next run start from stale keys and produce duplicate primary keys.
max_keys_df.to_csv('Data_warehouse/max_surr_keys.csv', index=False)

# Bare expression: displays the frame when run as a notebook cell
max_keys_df

Unnamed: 0,table_name,max_deposits_surr_primarykey,max_account_surr_primarykey,max_surr_primarykey
0,fact_deposits,180,0,0
1,fact_accounts,0,32,0
2,fact_loans,0,0,600


## Appending and Updating Data in the Data Warehouse

This section describes the process for appending new data to the existing fact tables and keeping the dimension tables in the data warehouse up to date.

### Steps:

1. **Append New Data to Fact Tables**:
   - Append the new records: new_fact_accounts_df, new_fact_loans_df, new_fact_deposits_df  with the updated surrogate primary keys to the existing fact tables.


2. **Changes to Dimension Tables**
-  For the dimension tables in the data warehouse, if any dimension member is deprecated, change its active_flag from 'Y' to 'N' and update its end_date according to the business rules.
 

3. **Persist the Max Surrogate Primary Keys to the Warehouse**
- Update the max surrogate keys and save them to the data warehouse every time the fact tables are loaded.
