In [23]:
import os

import pandas as pd

# Read the cleaned inventory and orders exports (same order as the targets on the left)
inventory_df, orders_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../processed_data/cleaned_data/cleaned_inventory.csv',
        '../processed_data/cleaned_data/cleaned_orders.csv',
    )
)

# Data Modelling and Schema preparation

## Inserting Data into Data Model

The Inventory dimension table is maintained as a Slowly Changing Dimension (Type 2), while the Orders fact table is loaded by appending rows. Always check that the data contains no duplicates before loading it into the data warehouse.

In [20]:
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd

# Path to the service account key file.
# Taken from the GOOGLE_APPLICATION_CREDENTIALS environment variable when it is set,
# so the notebook is not tied to one machine; the original local path is the fallback.
key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:/Users/epranei/Downloads/calm-cove-423918-t0-ce8d5f6922f1.json",
)

# Authenticate with Google Cloud
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# Dataset and table names
project_id = 'calm-cove-423918-t0'
dataset_id = 'Dema'
orders_table_id = f'{project_id}.{dataset_id}.Orders'
inventory_table_id = f'{project_id}.{dataset_id}.Inventory'

# Prepare the orders frame (read from CSV in the first cell) for loading

# Parse the dateTime strings into pandas datetimes
parsed_order_times = pd.to_datetime(orders_df['dateTime'])
orders_df['dateTime'] = parsed_order_times

# Map missing 'campaign' values (pd.NA / NaN) to None
missing_to_none = {pd.NA: None, float('nan'): None}
orders_df['campaign'] = orders_df['campaign'].replace(missing_to_none)

# Orders table schema, built from (column name, BigQuery type, mode) triples
orders_schema = [
    bigquery.SchemaField(column_name, bq_type, mode=column_mode)
    for column_name, bq_type, column_mode in (
        ("orderId", "STRING", "REQUIRED"),
        ("productId", "STRING", "REQUIRED"),
        ("currency", "STRING", "NULLABLE"),
        ("quantity", "INT64", "NULLABLE"),
        ("shippingCost", "FLOAT64", "NULLABLE"),
        ("amount", "FLOAT64", "NULLABLE"),
        ("channel", "STRING", "NULLABLE"),
        ("channelGroup", "STRING", "NULLABLE"),
        ("campaign", "STRING", "NULLABLE"),
        ("dateTime", "TIMESTAMP", "REQUIRED"),
    )
]

# Define schema for Inventory table (SCD Type 2)
# Includes the inventory_skey surrogate key so this definition matches the schema
# that initial_load_inventory reads; re-running this cell therefore cannot replace
# the global with a version that is missing the key column.
inventory_schema = [
    bigquery.SchemaField("productId", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("name", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("category", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("subCategory", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("quantity", "INT64", mode="NULLABLE"),
    bigquery.SchemaField("inventory_skey", "INT64", mode="REQUIRED"),
    bigquery.SchemaField("startDate", "TIMESTAMP", mode="REQUIRED"),
    bigquery.SchemaField("endDate", "TIMESTAMP", mode="NULLABLE"),
    bigquery.SchemaField("isCurrent", "BOOLEAN", mode="NULLABLE")
]

# Load data into Orders table
# The fact table is loaded by appending, so a row that appears twice in the frame
# would be stored twice; drop exact duplicate rows before loading.
# NOTE(review): re-running this cell appends the same rows again — this guard only
# covers duplicates inside the frame, not rows already present in the table.
orders_to_load = orders_df.drop_duplicates()
job_config = bigquery.LoadJobConfig(
    schema=orders_schema,
    # Stated explicitly so the append behaviour is visible to the reader.
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)
job = client.load_table_from_dataframe(orders_to_load, orders_table_id, job_config=job_config)
job.result()  # Wait for the job to complete.
print("Orders table loaded successfully.")



## Create additional columns for Dimension Tables and save max surrogate key

Add the columns `startDate`, `endDate`, `isCurrent` and the inventory surrogate key (`inventory_skey`) to the data warehouse table in order to implement Slowly Changing Dimensions Type 2, and apply the same schema change to the Terraform file.

Save max surrogate key for incremental and future loads



In [25]:
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account

# Path to the service account key file.
# Taken from the GOOGLE_APPLICATION_CREDENTIALS environment variable when it is set,
# so the notebook is not tied to one machine; the original local path is the fallback.
key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:/Users/epranei/Downloads/calm-cove-423918-t0-ce8d5f6922f1.json",
)

# Authenticate with Google Cloud
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# Dataset and table names
project_id = 'calm-cove-423918-t0'
dataset_id = 'Dema'
inventory_table_id = f'{project_id}.{dataset_id}.Inventory'

# Inventory table schema including the surrogate key, built from
# (column name, BigQuery type, mode) triples
inventory_schema = [
    bigquery.SchemaField(column_name, bq_type, mode=column_mode)
    for column_name, bq_type, column_mode in (
        ("productId", "STRING", "REQUIRED"),
        ("name", "STRING", "NULLABLE"),
        ("category", "STRING", "NULLABLE"),
        ("subCategory", "STRING", "NULLABLE"),
        ("quantity", "INT64", "NULLABLE"),
        ("inventory_skey", "INT64", "REQUIRED"),
        ("startDate", "TIMESTAMP", "REQUIRED"),
        ("endDate", "TIMESTAMP", "NULLABLE"),
        ("isCurrent", "BOOLEAN", "NULLABLE"),
    )
]

# Function to perform the initial load of inventory data into BigQuery
def initial_load_inventory(client, inventory_table_id, inventory_df, schema=None):
    """Perform the initial load of the inventory dimension into BigQuery.

    Adds the SCD Type 2 bookkeeping columns to ``inventory_df`` and loads the
    frame with ``WRITE_TRUNCATE``, i.e. existing rows in the destination table
    are overwritten.

    Args:
        client: BigQuery client used to run the load job.
        inventory_table_id: ID of the destination table passed to the load job.
        inventory_df: Inventory rows to load. Modified in place: the columns
            ``startDate``, ``endDate``, ``isCurrent`` and ``inventory_skey``
            are added (or overwritten if already present).
        schema: Optional list of schema fields for the load job. Defaults to
            the module-level ``inventory_schema``.

    Returns:
        None. Prints a confirmation message once the load job has finished.
    """
    if schema is None:
        # Fall back to the notebook-level schema, as the original code did.
        schema = inventory_schema

    # Add SCD Type 2 columns
    # Explicit UTC: a timezone-naive "now" is ambiguous once it is stored in a
    # TIMESTAMP column, which represents an absolute point in time.
    inventory_df['startDate'] = pd.Timestamp.now(tz='UTC')
    inventory_df['endDate'] = None  # open-ended: every row is the current version
    inventory_df['isCurrent'] = True
    # Surrogate keys are assigned positionally, starting at 1.
    inventory_df['inventory_skey'] = range(1, len(inventory_df) + 1)

    # Load the DataFrame into BigQuery
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    job = client.load_table_from_dataframe(inventory_df, inventory_table_id, job_config=job_config)
    job.result()  # Wait for the job to complete.

    print(f"Initial load of inventory table {inventory_table_id} completed successfully.")



# Run the initial inventory load against the configured table
initial_load_inventory(
    client=client,
    inventory_table_id=inventory_table_id,
    inventory_df=inventory_df,
)


Initial load of inventory table calm-cove-423918-t0.Dema.Inventory completed successfully.
