Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: cascade experiment id fk deletion in datasets table #11966

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""add cascading delete on experiments fk in datasets table

Revision ID: 949ead8d4f49
Revises: 867495a8f9d4
Create Date: 2024-05-09 17:43:49.128493

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "949ead8d4f49"  # id of this migration
down_revision = "867495a8f9d4"  # parent revision this migration applies on top of
branch_labels = None  # no alembic branch labels
depends_on = None  # no cross-branch dependencies


def upgrade():
    """Recreate the ``datasets`` -> ``experiments`` foreign key with ON DELETE CASCADE.

    The existing constraint's name differs per backend, and SQLite cannot drop
    an unnamed constraint outside of batch mode, so each dialect is handled
    separately. Raises ``NotImplementedError`` for any other dialect (previously
    this fell through and crashed with a ``NameError`` on an unset
    ``fk_constraint_name``).
    """
    dialect_name = op.get_context().dialect.name
    if dialect_name == "sqlite":
        # Only way to drop an unnamed fk constraint in sqlite
        # See https://alembic.sqlalchemy.org/en/latest/batch.html#dropping-unnamed-or-named-foreign-key-constraints
        with op.batch_alter_table(
            "datasets",
            schema=None,
            naming_convention={
                "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s",
            },
        ) as batch_op:
            batch_op.drop_constraint("fk_datasets_experiment_id_experiments", type_="foreignkey")
            # Need to explicitly name the fk constraint with batch alter table
            batch_op.create_foreign_key(
                "fk_datasets_experiment_id_experiments",
                "experiments",
                ["experiment_id"],
                ["experiment_id"],
                ondelete="CASCADE",
            )
    else:
        if dialect_name == "postgresql":
            fk_constraint_name = "datasets_experiment_id_fkey"
        elif dialect_name == "mysql":
            fk_constraint_name = "datasets_ibfk_1"
        elif dialect_name == "mssql":
            # mssql fk constraint name at 867495a8f9d4
            fk_constraint_name = "FK__datasets__experi__6477ECF3"
        else:
            # Fail loudly instead of hitting a NameError below on an
            # unrecognized backend.
            raise NotImplementedError(
                f"Unsupported database dialect for this migration: {dialect_name}"
            )

        # don't use batch alter table here so `create_foreign_key()` can be
        # called with `None` as the `constraint_name` and have it set
        # automatically based on the conventions of the sql flavor
        op.drop_constraint(fk_constraint_name, "datasets", type_="foreignkey")
        op.create_foreign_key(
            None,
            "datasets",
            "experiments",
            ["experiment_id"],
            ["experiment_id"],
            ondelete="CASCADE",
        )


def downgrade():
    """Intentionally a no-op.

    The original (non-cascading) foreign key is not restored; leaving the
    CASCADE behavior in place on downgrade is harmless for older code paths.
    """
    pass
2 changes: 1 addition & 1 deletion mlflow/store/tracking/dbmodels/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ class SqlExperimentTag(Base):
"""
Value associated with tag: `String` (limit 5000 characters). Could be *null*.
"""
experiment_id = Column(Integer, ForeignKey("experiments.experiment_id"))
experiment_id = Column(Integer, ForeignKey("experiments.experiment_id", ondelete="CASCADE"))
"""
Experiment ID to which this tag belongs: *Foreign Key* into ``experiments`` table.
"""
Expand Down
4 changes: 2 additions & 2 deletions tests/db/schemas/mssql.sql
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ CREATE TABLE datasets (
dataset_schema VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS",
dataset_profile VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS",
CONSTRAINT dataset_pk PRIMARY KEY (experiment_id, name, digest),
CONSTRAINT "FK__datasets__experi__6477ECF3" FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id)
CONSTRAINT "FK__datasets__experi__71D1E811" FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id) ON DELETE CASCADE
)


Expand Down Expand Up @@ -198,7 +198,7 @@ CREATE TABLE trace_request_metadata (
CREATE TABLE trace_tags (
key VARCHAR(250) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL,
value VARCHAR(8000) COLLATE "SQL_Latin1_General_CP1_CI_AS",
request_id VARCHAR(50)COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL,
request_id VARCHAR(50) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL,
CONSTRAINT trace_tag_pk PRIMARY KEY (key, request_id),
CONSTRAINT fk_trace_tags_request_id FOREIGN KEY(request_id) REFERENCES trace_info (request_id)
)
8 changes: 4 additions & 4 deletions tests/db/schemas/mysql.sql
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ CREATE TABLE datasets (
dataset_schema TEXT,
dataset_profile MEDIUMTEXT,
PRIMARY KEY (experiment_id, name, digest),
CONSTRAINT datasets_ibfk_1 FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id)
CONSTRAINT datasets_ibfk_1 FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id) ON DELETE CASCADE
)


Expand Down Expand Up @@ -133,7 +133,7 @@ CREATE TABLE trace_info (
timestamp_ms BIGINT NOT NULL,
execution_time_ms BIGINT,
status VARCHAR(50) NOT NULL,
CONSTRAINT trace_info_pk PRIMARY KEY (request_id),
PRIMARY KEY (request_id),
Copy link
Contributor Author

@chilir chilir May 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@harupy can you double check if this is okay? Running tests/db/update_schemas.sh is automatically removing the primary key constraint name in the mysql schema for the trace tables despite this PR not touching the trace models at all. It looks like mysql doesn't allow for primary key constraints to be named?

EDIT: on second look, all named primary key constraints are already unnamed in the mysql schema, so this is expected

CONSTRAINT fk_trace_info_experiment_id FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id)
)

Expand Down Expand Up @@ -197,7 +197,7 @@ CREATE TABLE trace_request_metadata (
key VARCHAR(250) NOT NULL,
value VARCHAR(8000),
request_id VARCHAR(50) NOT NULL,
CONSTRAINT trace_request_metadata_pk PRIMARY KEY (key, request_id),
chilir marked this conversation as resolved.
Show resolved Hide resolved
PRIMARY KEY (key, request_id),
CONSTRAINT fk_trace_request_metadata_request_id FOREIGN KEY(request_id) REFERENCES trace_info (request_id)
)

Expand All @@ -206,6 +206,6 @@ CREATE TABLE trace_tags (
key VARCHAR(250) NOT NULL,
value VARCHAR(8000),
request_id VARCHAR(50) NOT NULL,
CONSTRAINT trace_tag_pk PRIMARY KEY (key, request_id),
chilir marked this conversation as resolved.
Show resolved Hide resolved
PRIMARY KEY (key, request_id),
CONSTRAINT fk_trace_tags_request_id FOREIGN KEY(request_id) REFERENCES trace_info (request_id)
)
2 changes: 1 addition & 1 deletion tests/db/schemas/postgresql.sql
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ CREATE TABLE datasets (
dataset_schema TEXT,
dataset_profile TEXT,
CONSTRAINT dataset_pk PRIMARY KEY (experiment_id, name, digest),
CONSTRAINT datasets_experiment_id_fkey FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id)
CONSTRAINT datasets_experiment_id_fkey FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id) ON DELETE CASCADE
)


Expand Down
2 changes: 1 addition & 1 deletion tests/db/schemas/sqlite.sql
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ CREATE TABLE datasets (
dataset_schema TEXT,
dataset_profile TEXT,
CONSTRAINT dataset_pk PRIMARY KEY (experiment_id, name, digest),
FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id)
CONSTRAINT fk_datasets_experiment_id_experiments FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id) ON DELETE CASCADE
)


Expand Down
2 changes: 1 addition & 1 deletion tests/resources/db/latest_schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ CREATE TABLE datasets (
dataset_schema TEXT,
dataset_profile TEXT,
CONSTRAINT dataset_pk PRIMARY KEY (experiment_id, name, digest),
FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id)
CONSTRAINT fk_datasets_experiment_id_experiments FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id) ON DELETE CASCADE
)


Expand Down
14 changes: 14 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
import mlflow
from mlflow import pyfunc
from mlflow.cli import doctor, gc, server
from mlflow.data import numpy_dataset
from mlflow.entities import ViewType
from mlflow.entities.dataset_input import DatasetInput
from mlflow.exceptions import MlflowException
from mlflow.server import handlers
from mlflow.store.tracking.file_store import FileStore
Expand Down Expand Up @@ -383,6 +385,18 @@ def invoke_gc(*args):
[exp_id_5, store.DEFAULT_EXPERIMENT_ID]
)

exp_id_6 = store.create_experiment("6")
run_id_2 = store.create_run(exp_id_6, user_id="user", start_time=1, tags=[], run_name="2")
run_id_2_datasets = [
DatasetInput(dataset=numpy_dataset.from_numpy(np.array([1, 2, 3]))._to_mlflow_entity())
]
store.log_inputs(run_id_2.info.run_id, datasets=run_id_2_datasets)
store.delete_experiment(exp_id_6)
invoke_gc("--backend-store-uri", uri, "--experiment-ids", exp_id_6)
assert sorted([e.experiment_id for e in experiments]) == sorted(
[exp_id_5, store.DEFAULT_EXPERIMENT_ID]
)


@pytest.mark.parametrize(
"enable_mlserver",
Expand Down