# Imports

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process
# environment. As the last expression of the cell, its return value is what
# the notebook displays underneath.
load_dotenv()

True

In [2]:
from vulcan.readers.csv import read_csv
import vulcan.generators.metadata as vgm
import vulcan.utils.llm_helpers as vuo
from vulcan.parsers.dependency import determine_table_creation_order
from vulcan.parsers.graph import create_query_dependent_graph, get_table_creation_order
from vulcan.database.core import initialize_database, execute_queries
from vulcan.database.load import push_data_in_db

# Parameters

In [3]:
# Run parameters: input CSV, target database, and generation mode.
file_name = "data/spotify.csv"
db_type = "postgres"
# Credentials must not live in the notebook: take the connection URI from the
# environment (set VULCAN_DB_URI in the shell or in the .env file read by
# load_dotenv()). The fallback deliberately carries no password; libpq-based
# drivers then resolve it from PGPASSWORD or ~/.pgpass.
# NOTE(review): the password that was previously committed here should be rotated.
db_uri = os.getenv("VULCAN_DB_URI", "postgresql://vulcan_user@localhost/vulcandb")
# Forwarded to the generation helpers as data_dict["single_table"].
single_table = True

# Read Data

In [4]:
# Load the raw CSV through vulcan's reader.
csv_file = file_name

dataframe = read_csv(csv_file)

# Preview the first rows, then the column summary.
print(dataframe.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appended a stray "None" line.
# (Assumes read_csv returns a pandas DataFrame, as the .head()/.info() calls imply.)
dataframe.info()

                            track_name       artist_name  artist_count  \
0  Seven (feat. Latto) (Explicit Ver.)  Latto, Jung Kook             2   
1                                 LALA       Myke Towers             1   
2                              vampire    Olivia Rodrigo             1   
3                         Cruel Summer      Taylor Swift             1   
4                       WHERE SHE GOES         Bad Bunny             1   

   released_year  released_month  released_day  in_spotify_playlists  \
0           2023               7            14                   553   
1           2023               3            23                  1474   
2           2023               6            30                  1397   
3           2019               8            23                  7858   
4           2023               5            18                  3133   

   in_spotify_charts    streams  in_apple_playlists  ...  key   mode  \
0                147  141381703                  4

# Generate Schema, Constraints, and Queries

In [5]:
# Build a plain-text structure summary of the frame; it is later passed to the
# generation helpers as data_dict["structure"]. The printed result lists each
# column with its non-null status and dtype.
info = vgm.get_dataframe_description(dataframe)  # columns, non-null counts, dtypes
print(info)

Column             Non-Null             Dtype
----------------------------------------
track_name           non-null        object
artist_name          non-null        object
artist_count         non-null        int64
released_year        non-null        int64
released_month       non-null        int64
released_day         non-null        int64
in_spotify_playlists non-null        int64
in_spotify_charts    non-null        int64
streams              non-null        object
in_apple_playlists   non-null        int64
in_apple_charts      non-null        int64
in_deezer_playlists  non-null        object
in_deezer_charts     non-null        int64
in_shazam_charts     non-null        object
bpm                  non-null        int64
key                  non-null        object
mode                 non-null        object
danceability_%       non-null        int64
valence_%            non-null        int64
energy_%             non-null        int64
acousticness_%       non-null        int64
ins

In [6]:
# Take a sample of the data; it is later passed to the generation helpers as
# data_dict["raw_data"]. How the rows are chosen is decided inside the helper
# (not visible here). The printed result is a wide plain-text table.
samples = vgm.get_dataframe_samples(dataframe, 30)  # 30-row sample
print(samples)

                                                                                   track_name                                       artist_name  artist_count  released_year  released_month  released_day  in_spotify_playlists  in_spotify_charts    streams  in_apple_playlists  in_apple_charts in_deezer_playlists  in_deezer_charts in_shazam_charts  bpm key  mode  danceability_%  valence_%  energy_%  acousticness_%  instrumentalness_%  liveness_%  speechiness_%                                                        cover_url
                                                                                   Yandel 150                                      Yandel, Feid             2           2022              12            20                  3618                 38  585695368                  47               74                  80                14              194  168  F# Minor              78         58        73               5                   0          10              7 https://i.s

In [7]:
# Bundle the inputs for the generation helpers into a single dict.
data_dict = {
    # Use the db_type parameter instead of repeating the "postgres" literal, so
    # the generated SQL dialect and the engine created later cannot drift apart.
    "database": db_type,
    "raw_data": samples,  # sampled rows (text)
    "structure": info,  # column / dtype summary (text)
    "single_table": single_table,
}

In [8]:
# Step 1: draft a relational schema from the inputs in data_dict. The helper
# prints its result (">> GENERATED SCHEMA ...") and returns the dict used by
# the following steps (presumably the same dict with the schema added -- TODO confirm).
# Note: data_dict is reassigned, so re-running this cell feeds it its own output.
data_dict = vuo.generate_schema(data_dict)

>> GENERATED SCHEMA  #### High Level Explanation ####
The raw data represents music tracks with various attributes such as track name, artist details, release information, streaming statistics, and musical characteristics. To efficiently organize this data, we will create a relational schema with two main tables: `tracks` and `artists`. The `tracks` table will store information specific to each track, while the `artists` table will consolidate unique artist names. This structure will allow us to manage the data efficiently, especially when dealing with multiple artists for a single track.

#### Table: tracks ####
## Traits:
- 1:1 with raw data
- depends on:
  - artists (which is 1:N with raw data)

## Columns:
- id: INTEGER PRIMARY KEY (1:1 row id)
- track_name: VARCHAR NOT NULL (direct mapping from raw$track_name)
- artist_id: INTEGER FOREIGN KEY REFERENCES artists(artist_id) (foreign key to 1:N table artists)
- released_year: INTEGER (direct mapping from raw$released_year)
- released

In [9]:
# Step 2: derive the list of table names; later read back as
# data_dict["table_list"]. The helper prints the list itself.
data_dict = vuo.generate_table_list(data_dict)

>> GENERATED TABLE LIST  ['tracks', 'artists']


In [10]:
# Step 3: generate per-table traits (relation to the raw CSV, column mapping,
# dependencies -- as seen in the printed output); later read back as
# data_dict["table_traits"].
data_dict = vuo.generate_table_traits(data_dict)

>> GENERATED TRAITS FOR TABLE: tracks
>> GENERATED TRAITS FOR TABLE: artists
>> ALL TABLE TRAITS GENERATED:  ['{\n  "relation_to_raw": "1:1",\n  "mapping": [\n    {\n      "raw_csv_col": "danceability_%",\n      "table_col": "danceability"\n    },\n    {\n      "raw_csv_col": "valence_%",\n      "table_col": "valence"\n    },\n    {\n      "raw_csv_col": "energy_%",\n      "table_col": "energy"\n    },\n    {\n      "raw_csv_col": "acousticness_%",\n      "table_col": "acousticness"\n    },\n    {\n      "raw_csv_col": "instrumentalness_%",\n      "table_col": "instrumentalness"\n    },\n    {\n      "raw_csv_col": "liveness_%",\n      "table_col": "liveness"\n    },\n    {\n      "raw_csv_col": "speechiness_%",\n      "table_col": "speechiness"\n    }\n  ],\n  "one_to_n": null,\n  "dependencies": [\n    {\n      "parent_table_name": "artists",\n      "local_fk_col": "artist_id"\n    }\n  ],\n  "name": "tracks"\n}', '{\n  "relation_to_raw": "1:n",\n  "mapping": [\n    {\n      "raw_csv

In [11]:
# Step 4: refine the schema with column constraints (NOT NULL, CHECK, foreign
# keys -- as seen in the printed output). The helper prints the result itself.
data_dict = vuo.generate_constraints(data_dict)

>> GENERATED CONSTRAINTS  ### Constrained Schema ###

#### Table: tracks ####
- **id**: INTEGER PRIMARY KEY
  - Auto-incrementing unique identifier for each track.
- **track_name**: VARCHAR NOT NULL
  - The name of the track, must be provided for each entry.
- **artist_id**: INTEGER NOT NULL
  - FOREIGN KEY REFERENCES `artists(artist_id)`
  - Ensures that each track is associated with a valid artist.
- **released_year**: INTEGER CHECK (released_year >= 1900 AND released_year <= EXTRACT(YEAR FROM CURRENT_DATE))
  - The year the track was released, must be a valid year.
- **released_month**: INTEGER CHECK (released_month >= 1 AND released_month <= 12)
  - The month the track was released, must be between 1 and 12.
- **released_day**: INTEGER CHECK (released_day >= 1 AND released_day <= 31)
  - The day the track was released, must be between 1 and 31.
- **in_spotify_playlists**: INTEGER CHECK (in_spotify_playlists >= 0)
  - Number of Spotify playlists the track is in, must be non-negative

In [12]:
# Step 5: produce the CREATE TABLE statements; later read back as
# data_dict["queries"]. The helper prints the generated list itself.
data_dict = vuo.generate_sql_queries(data_dict)

>> GENERATED QUERIES  ['CREATE TABLE "artists" (\n    "artist_id" SERIAL PRIMARY KEY,\n    "artist_name" VARCHAR NOT NULL UNIQUE\n);', 'CREATE TABLE "tracks" (\n    "id" SERIAL PRIMARY KEY,\n    "track_name" VARCHAR NOT NULL,\n    "artist_id" INTEGER NOT NULL REFERENCES "artists"("artist_id"),\n    "released_year" INTEGER CHECK ("released_year" >= 1900 AND "released_year" <= EXTRACT(YEAR FROM CURRENT_DATE)),\n    "released_month" INTEGER CHECK ("released_month" >= 1 AND "released_month" <= 12),\n    "released_day" INTEGER CHECK ("released_day" >= 1 AND "released_day" <= 31),\n    "in_spotify_playlists" INTEGER CHECK ("in_spotify_playlists" >= 0),\n    "in_spotify_charts" INTEGER CHECK ("in_spotify_charts" >= 0),\n    "streams" BIGINT CHECK ("streams" >= 0),\n    "in_apple_playlists" INTEGER CHECK ("in_apple_playlists" >= 0),\n    "in_apple_charts" INTEGER CHECK ("in_apple_charts" >= 0),\n    "in_deezer_playlists" INTEGER CHECK ("in_deezer_playlists" >= 0),\n    "in_deezer_charts" INTEG

In [13]:
# Work out the order in which tables must be created, based on the generated
# traits and table list. In the recorded run the referenced table (artists)
# comes before the table that depends on it (tracks).
table_order = determine_table_creation_order(data_dict["table_traits"], data_dict["table_list"])
print(table_order)

['artists', 'tracks']


In [14]:
# The CREATE TABLE statements produced by generate_sql_queries.
queries = data_dict["queries"]

# Create the dependent graph
# Per the printed output: dependent_graph maps each table to the tables that
# reference it, and tables maps each table name to its query text, name,
# column names and foreign keys.
dependent_graph, tables = create_query_dependent_graph(queries)

print("Dependent Graph:", dependent_graph)
print("Tables Dict:", tables)

Dependent Graph: {'artists': ['tracks'], 'tracks': []}
Tables Dict: {'artists': {'query': 'CREATE TABLE "artists" (\n    "artist_id" SERIAL PRIMARY KEY,\n    "artist_name" VARCHAR NOT NULL UNIQUE\n);', 'name': 'artists', 'columns': ['artist_id', 'artist_name'], 'foreign_keys': []}, 'tracks': {'query': 'CREATE TABLE "tracks" (\n    "id" SERIAL PRIMARY KEY,\n    "track_name" VARCHAR NOT NULL,\n    "artist_id" INTEGER NOT NULL REFERENCES "artists"("artist_id"),\n    "released_year" INTEGER CHECK ("released_year" >= 1900 AND "released_year" <= EXTRACT(YEAR FROM CURRENT_DATE)),\n    "released_month" INTEGER CHECK ("released_month" >= 1 AND "released_month" <= 12),\n    "released_day" INTEGER CHECK ("released_day" >= 1 AND "released_day" <= 31),\n    "in_spotify_playlists" INTEGER CHECK ("in_spotify_playlists" >= 0),\n    "in_spotify_charts" INTEGER CHECK ("in_spotify_charts" >= 0),\n    "streams" BIGINT CHECK ("streams" >= 0),\n    "in_apple_playlists" INTEGER CHECK ("in_apple_playlists" >=

In [15]:
# Hard stop: makes "Run All" halt here, before the cells below that connect to
# the database and (per their logged output) drop and recreate tables.
# NOTE(review): presumably intentional; a raising cell breaks Restart & Run All,
# so replace it with an explicit opt-in flag or remove it before sharing.
raise Exception("Stop here")

Exception: Stop here

# Create Tables in the Database

In [16]:
# 5.1: Database connection info -- db_uri and db_type come from the Parameters cell.

# 5.2: Initialize the engine
engine = initialize_database(db_uri=db_uri, db_type=db_type)

Initializing POSTGRESQL Database


In [17]:
# 5.3: Run the generated CREATE statements, following table_order.
# Heads-up: the engine log recorded below shows the existing "tracks" and
# "artists" tables being dropped (DROP TABLE IF EXISTS ... CASCADE) first.
success, error = execute_queries(engine, table_order, tables)
if success:
    print("Tables created successfully!")
else:
    print("Table creation error:", error)

2025-05-14 01:06:36,702 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-05-14 01:06:36,703 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-14 01:06:36,707 INFO sqlalchemy.engine.Engine select current_schema()
2025-05-14 01:06:36,707 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-14 01:06:36,709 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-05-14 01:06:36,710 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-14 01:06:36,711 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-05-14 01:06:36,712 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS "tracks" CASCADE
2025-05-14 01:06:36,712 INFO sqlalchemy.engine.Engine [generated in 0.00045s] {}
Table tracks dropped
2025-05-14 01:06:36,729 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS "artists" CASCADE
2025-05-14 01:06:36,729 INFO sqlalchemy.engine.Engine [generated in 0.00054s] {}
Table artists dropped
2025-05-14 01:06:36,731 INFO sqlalchemy.engine.Engine CREATE TABLE "artists" (
    "artist_

# Populate Tables with CSV Data

In [18]:
# Optional (disabled): lower the SQLAlchemy engine logger to WARNING to cut the
# statement log that floods this cell's output.
# NOTE(review): unverified whether this takes effect for this engine -- confirm.
# import logging

# # Suppress SQLAlchemy debug logs
# logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)

# Load the CSV rows into the created tables, following table_order.
# NOTE(review): the recorded run of this cell ends with
# "TypeError: 'TableTraitsWithName' object is not subscriptable", so the
# completion message below was never printed. The traits entries appear to be
# objects rather than dicts -- confirm what vulcan.database.load expects.
push_data_in_db(engine, dataframe, table_order, data_dict["table_traits"])
print("Data insertion complete!")

2025-05-14 01:06:44,695 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-05-14 01:06:44,696 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s]) AND pg_catalog.pg_class.relpersistence != %(relpersistence_1)s AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s
2025-05-14 01:06:44,696 INFO sqlalchemy.engine.Engine [generated in 0.00172s] {'param_1': 'r', 'param_2': 'p', 'relpersistence_1': 't', 'nspname_1': 'pg_catalog'}
2025-05-14 01:06:44,712 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_attribute.attname AS name, pg_catalog.format_type(pg_catalog.pg_attribute.atttypid, pg_catalog.pg_attribute.atttypmod) AS format_type, (SELECT pg_catalog.pg_get_expr(pg_catalog.pg_attrdef.adbin, pg_catalog.pg_attrdef.adrelid) AS p

TypeError: 'TableTraitsWithName' object is not subscriptable