
# DriveSight Data Analysis with DuckDB

**Purpose:** Analyze FINN car subscription data to validate dbt transformations and explore business insights

In [158]:
# Installing required packages
# !pip install numpy
# !pip install pandas

In [1]:
# Import required libraries for data analysis
import duckdb
import numpy
import pandas

In [2]:
# Connect to the DuckDB database file
# NOTE: Path is relative to notebook location
dev_dbdir = '../../FINN.duckdb'

In [9]:
# Validating the count of records in each table after running dbt

schemas = ["main_bronze", "main_silver", "main_gold"]

with duckdb.connect(f'{dev_dbdir}') as conn:
    for schema in schemas:
        get_table_query = f"""
        SELECT table_name
        FROM information_schema.tables 
        WHERE table_schema = '{schema}'
        """
        tables = conn.sql(get_table_query).fetchall()

        for table_row in tables:
            table_name = table_row[0]
            count_query = f"""
            SELECT COUNT(*) as count FROM {schema}.{table_name}
            """
            count = conn.sql(count_query).fetchone()[0]
            print(f"Count of records in {schema}.{table_name}: {count}")


Count of records in main_bronze.brz_cars: 1200
Count of records in main_bronze.brz_customers: 550
Count of records in main_bronze.brz_subscriptions: 1200
Count of records in main_silver.slv_cars: 1200
Count of records in main_silver.slv_customers: 550
Count of records in main_silver.slv_subscriptions: 1200
Count of records in main_gold.gld_active_customer_type_share: 0
Count of records in main_gold.gld_cars_infleet_defleet_volume: 1295
Count of records in main_gold.gld_city_weekly_delivery_increase: 778
Count of records in main_gold.gld_customer_type_distribution_per_term: 4


In [4]:
# Query to explore available tables in the main schema
# This shows the source tables available for analysis
query = """
SELECT table_name
FROM information_schema.tables 
WHERE table_schema = 'main'
ORDER BY table_name
"""
with duckdb.connect(f'{dev_dbdir}') as conn:
    src_df = conn.sql(query).fetchdf()
print("Source tables available for analysis:")
src_df.head(100)

Source tables available for analysis:


Unnamed: 0,table_name
0,cars
1,customers
2,invoice_line_items
3,invoices
4,subscriptions


In [5]:
schemas_query = """
    SELECT distinct table_schema
    FROM information_schema.tables 
"""

with duckdb.connect(f'{dev_dbdir}') as conn:
    schema_df = conn.sql(schemas_query).fetchdf()
print("Schemas available for analysis:")
schema_df.head(100)

Schemas available for analysis:


Unnamed: 0,table_schema
0,main
1,main_main_gold
2,main_main_silver
3,main_dbt_test__audit
4,main_main_bronze
5,main_bronze
6,main_gold
7,main_silver
8,main_silver_snapshots


In [6]:
schemas_query = """
    SHOW DATABASES
"""

with duckdb.connect(f'{dev_dbdir}') as conn:
    schema_df = conn.sql(schemas_query).fetchdf()
schema_df.head(100)

Unnamed: 0,database_name
0,finn
