---Start by setting up PostgreSQL server and user, then running the ingestion script database_setup.py---


In [1]:
import os
from google.colab import userdata

# Get username and password from Colab secrets
db_user = userdata.get('DB_USER')
db_pass = userdata.get('DB_PASS')

# Install PostgreSQL
!apt-get -y -qq install postgresql-14 >/dev/null

# Start the PostgreSQL server
!sudo service postgresql start

# Create the new database user and set its password from the secret
!sudo -u postgres psql -c "CREATE USER {db_user} WITH PASSWORD '{db_pass}';"

# Create the database and grant permissions
!sudo -u postgres createdb financial_data

# Grant all privileges on the new database to the new user
!sudo -u postgres psql -c "GRANT ALL PRIVILEGES ON DATABASE financial_data TO {db_user};"

# Install dbt and its postgres adapter
!pip install dbt-postgres

 * Starting PostgreSQL 14 database server
   ...done.
ERROR:  role "root" already exists
createdb: error: database creation failed: ERROR:  database "financial_data" already exists
GRANT


In [2]:
# Change to project's root directory
%cd "/content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard"

/content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard


In [3]:
# check if raw_saas_data.json is in directory
!ls -F

financial_forecasting/	logs/	    raw_saas_data.json	scripts/
LICENSE			notebooks/  README.md


In [4]:
# Get the rest of the environment variables from Colab secrets.
db_host = userdata.get('DB_HOST')
db_port = userdata.get('DB_PORT')
db_name = userdata.get('DB_NAME')

!DB_HOST={db_host} DB_PORT={db_port} DB_NAME={db_name} DB_USER={db_user} DB_PASS={db_pass} python "scripts/database_setup.py"

Successfully connected to the database.
Table 'raw_saas_metrics' is ready.
Successfully ingested 5000 records into raw_saas_metrics.
Database connection closed.


--- Set up dbt ---

In [5]:
!pip install dbt-postgres



In [6]:
# Initialize a new dbt project without prompting for input
!dbt init financial_forecasting --profiles-dir /root/.dbt

[0m06:40:38  Running with dbt=1.10.11
[0m06:40:38  Creating dbt configuration folder at /root/.dbt
[0m06:40:38  A project called financial_forecasting already exists here.


In [7]:
from google.colab import userdata
import os

# Create the .dbt directory, if it doesn't exist.
dbt_dir = os.path.join(os.path.expanduser("~"), ".dbt")
if not os.path.exists(dbt_dir):
  os.makedirs(dbt_dir)

# Define the content of profiles.yml file.
profiles_content = f"""
financial_forecasting:
  target: dev
  outputs:
    dev:
      type: postgres
      host: {userdata.get('DB_HOST')}
      port: {userdata.get('DB_PORT')}
      user: {userdata.get('DB_USER')}
      password: {userdata.get('DB_PASS')}
      dbname: {userdata.get('DB_NAME')}
      schema: public
"""

# Write the content to the profiles.yml file
profiles_path = os.path.join(dbt_dir, "profiles.yml")
with open(profiles_path, "w") as f:
  f.write(profiles_content)
print("dbt profiles.yml file has been created successfully.")

dbt profiles.yml file has been created successfully.


In [8]:
# Change to dbt project's directory
%cd "/content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting"

/content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting


In [9]:
# Verify the set up, test connection to db
!dbt debug --profiles-dir ~/.dbt --project-dir .

[0m06:42:09  Running with dbt=1.10.11
[0m06:42:09  dbt version: 1.10.11
[0m06:42:09  python version: 3.12.11
[0m06:42:09  python path: /usr/bin/python3
[0m06:42:09  os info: Linux-6.6.97+-x86_64-with-glibc2.35
[0m06:42:09  Using profiles dir at /root/.dbt
[0m06:42:09  Using profiles.yml file at /root/.dbt/profiles.yml
[0m06:42:09  Using dbt_project.yml file at ./dbt_project.yml
[0m06:42:09  adapter type: postgres
[0m06:42:09  adapter version: 1.9.1
[0m06:42:10  Configuration:
[0m06:42:10    profiles.yml file [[32mOK found and valid[0m]
[0m06:42:10    dbt_project.yml file [[32mOK found and valid[0m]
[0m06:42:10  Required dependencies:
[0m06:42:10   - git [[32mOK found[0m]

[0m06:42:10  Connection:
[0m06:42:10    host: localhost
[0m06:42:10    port: 5432
[0m06:42:10    user: root
[0m06:42:10    database: financial_data
[0m06:42:10    schema: public
[0m06:42:10    connect_timeout: 10
[0m06:42:10    role: None
[0m06:42:10    search_path: None
[0m06:42:10    k

In [10]:
# Change to dbt project's directory
%cd "/content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting"

/content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting


In [12]:
# Create a folder for the raw_saas_data staging models
!mkdir -p models/staging

In [13]:
%%writefile "/content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting/models/staging/sources.yml"

# Write a sources.yml model for dbt to recognize and document raw data tables

version: 2

sources:
  - name: raw
    description: "Raw data ingested from external sources."
    schema: public
    tables:
      - name: raw_saas_metrics
        description: "Raw financial metrics data ingested from a JSON file."
        tests:
          - not_null:
              column_name: "customer_id"

Writing /content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting/models/staging/sources.yml


---For building the staging layer---

In [14]:
%%writefile "/content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting/models/staging/stg_raw_saas_metrics_data.sql"

-- This staging model performs initial cleaning on the raw saas data.
-- It selects data from the source table (from sources.yml), standardizes column names,
  --and casts the data into the correct types.

WITH source_data AS (
    SELECT
        customer_id,
        subscription_start_date,
        monthly_recurring_revenue,
        churn_date,
        plan_type
    FROM
        {{ source('raw', 'raw_saas_metrics') }}
)

SELECT
    CAST(customer_id AS INTEGER) AS customer_id,
    CAST(subscription_start_date AS DATE) AS subscription_start_date,
    CAST(monthly_recurring_revenue AS NUMERIC) AS monthly_recurring_revenue,
    CAST(churn_date AS DATE) AS churn_date,
    CAST(plan_type AS VARCHAR) AS plan_type
FROM
    source_data

Writing /content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting/models/staging/stg_raw_saas_metrics_data.sql


In [16]:
%%writefile "/content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting/models/staging/schema.yml"

# This file holds the tests for the new staging model.
# It performs basic data quality checks for the customer_id & subscription_start_date cols.


version: 2

models:
  - name: stg_raw_saas_metrics
    description: "Staging model for raw SaaS metrics data."
    columns:
      - name: customer_id
        description: "The unique identifier for a customer."
        tests:
          - unique
          - not_null
      - name: subscription_start_date
        description: "The date the customer's subscription began."
        tests:
          - not_null

Writing /content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting/models/staging/schema.yml


In [20]:
# This command will update dbt's internal project graph and make it aware of the new
  # staging model and its tests.
!dbt parse

[0m08:06:36  Running with dbt=1.10.11
[0m08:06:37  Registered adapter: postgres=1.9.1
[0m08:06:37  Performance info: /content/drive/MyDrive/Colab Notebooks/FinancialForecastingDashboard/financial_forecasting/target/perf_info.json


In [21]:
# Run the dbt model
!dbt run

[0m08:06:44  Running with dbt=1.10.11
[0m08:06:45  Registered adapter: postgres=1.9.1
[0m08:06:46  Found 3 models, 5 data tests, 1 source, 434 macros
[0m08:06:46  
[0m08:06:46  Concurrency: 1 threads (target='dev')
[0m08:06:46  
[0m08:06:46  1 of 3 START sql table model public.my_first_dbt_model ......................... [RUN]
[0m08:06:46  1 of 3 OK created sql table model public.my_first_dbt_model .................... [[32mSELECT 2[0m in 0.18s]
[0m08:06:46  2 of 3 START sql view model public.stg_raw_saas_metrics_data ................... [RUN]
[0m08:06:46  2 of 3 OK created sql view model public.stg_raw_saas_metrics_data .............. [[32mCREATE VIEW[0m in 0.10s]
[0m08:06:46  3 of 3 START sql view model public.my_second_dbt_model ......................... [RUN]
[0m08:06:46  3 of 3 OK created sql view model public.my_second_dbt_model .................... [[32mCREATE VIEW[0m in 0.09s]
[0m08:06:46  
[0m08:06:46  Finished running 1 table model, 2 view models in 0 hour