Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metabase.analyze API namespace #42469

Merged
merged 9 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions .clj-kondo/config.edn
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,7 @@
metabase.analytics #{metabase.analytics.prometheus
metabase.analytics.snowplow
metabase.analytics.stats} ; TODO -- consolidate these into a real API namespace.
metabase.analyze #{metabase.analyze.classifiers.core
metabase.analyze.classifiers.name
metabase.analyze.fingerprint.fingerprinters
metabase.analyze.fingerprint.schema
metabase.analyze.query-results} ; TODO -- consolidate these into a real API namespace.
metabase.analyze #{metabase.analyze}
metabase.api #{metabase.api.common
metabase.api.dataset
metabase.api.permission-graph
Expand Down
36 changes: 36 additions & 0 deletions src/metabase/analyze.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
(ns metabase.analyze
"API namespace for the `metabase.analyze` module."
(:require
[metabase.analyze.classifiers.category]
[metabase.analyze.classifiers.core]
[metabase.analyze.classifiers.name]
[metabase.analyze.fingerprint.fingerprinters]
[metabase.analyze.fingerprint.schema]
[metabase.analyze.query-results]
[potemkin :as p]))

(comment
metabase.analyze.classifiers.category/keep-me
metabase.analyze.classifiers.core/keep-me
metabase.analyze.classifiers.name/keep-me
metabase.analyze.fingerprint.fingerprinters/keep-me
metabase.analyze.fingerprint.schema/keep-me
metabase.analyze.query-results/keep-me)

(p/import-vars
[metabase.analyze.classifiers.category
auto-list-cardinality-threshold
category-cardinality-threshold]
[metabase.analyze.classifiers.core
run-classifiers]
[metabase.analyze.classifiers.name
infer-entity-type-by-name]
[metabase.analyze.fingerprint.fingerprinters
col-wise
constant-fingerprinter
fingerprint-fields]
[metabase.analyze.fingerprint.schema
Fingerprint]
[metabase.analyze.query-results
ResultsMetadata
insights-rf])
20 changes: 15 additions & 5 deletions src/metabase/analyze/classifiers/category.clj
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,21 @@
[metabase.analyze.fingerprint.schema :as fingerprint.schema]
[metabase.analyze.schema :as analyze.schema]
[metabase.lib.schema.metadata :as lib.schema.metadata]
[metabase.models.field-values :as field-values]
[metabase.sync.util :as sync-util]
[metabase.util.log :as log]
[metabase.util.malli :as mu]))

(def ^Long category-cardinality-threshold
"Fields with less than this many distinct values should automatically be given a semantic type of `:type/Category`.
This no longer has any meaning whatsoever as far as the backend code is concerned; it is used purely to inform
frontend behavior such as widget choices."
30)

(def ^Long auto-list-cardinality-threshold
"Fields with less than this many distincy values should be given a `has_field_values` value of `list`, which means
the Field should have FieldValues."
1000)

(defn- cannot-be-category-or-list?
[{base-type :base_type, semantic-type :semantic_type}]
(or (isa? base-type :type/Temporal)
Expand All @@ -38,11 +48,11 @@
(when (and (nil? (:semantic_type field))
(or (some-> nil% (< 1))
(isa? (:base_type field) :type/Boolean))
(some-> distinct-count (<= field-values/category-cardinality-threshold)))
(some-> distinct-count (<= category-cardinality-threshold)))
(log/debugf "%s has %d distinct values. Since that is less than %d, we're marking it as a category."
(sync-util/name-for-logging field)
distinct-count
field-values/category-cardinality-threshold)
category-cardinality-threshold)
true)))

(mu/defn ^:private field-should-be-auto-list? :- [:maybe :boolean]
Expand All @@ -53,11 +63,11 @@
;; manually by an admin, and we don't want to stomp over their choices.
(let [distinct-count (get-in fingerprint [:global :distinct-count])]
(when (and (nil? (:has-field-values field))
(some-> distinct-count (<= field-values/auto-list-cardinality-threshold)))
(some-> distinct-count (<= auto-list-cardinality-threshold)))
(log/debugf "%s has %d distinct values. Since that is less than %d, it should have cached FieldValues."
(sync-util/name-for-logging field)
distinct-count
field-values/auto-list-cardinality-threshold)
auto-list-cardinality-threshold)
true)))

(mu/defn infer-is-category-or-list :- [:maybe analyze.schema/Field]
Expand Down
2 changes: 1 addition & 1 deletion src/metabase/analyze/classifiers/name.clj
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@
[(prefix-or-postfix "companies") :entity/CompanyTable]
[(prefix-or-postfix "vendor") :entity/CompanyTable]])

(mu/defn infer-entity-type :- analyze.schema/Table
(mu/defn infer-entity-type-by-name :- analyze.schema/Table
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I renamed this because classifiers.name/infer-entity-type clearly implied this was based on column name, however analyze/infer-entity-type lost that clarity

"Classifer that infers the semantic type of a `table` based on its name."
[table :- analyze.schema/Table]
(let [table-name (-> table :name u/lower-case-en)]
Expand Down
6 changes: 3 additions & 3 deletions src/metabase/api/card.clj
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
[clojure.java.io :as io]
[compojure.core :refer [DELETE GET POST PUT]]
[medley.core :as m]
[metabase.analyze.query-results :as qr]
[metabase.analyze :as analyze]
[metabase.api.common :as api]
[metabase.api.common.validation :as validation]
[metabase.api.dataset :as api.dataset]
Expand Down Expand Up @@ -463,7 +463,7 @@
visualization_settings ms/Map
collection_id [:maybe ms/PositiveInt]
collection_position [:maybe ms/PositiveInt]
result_metadata [:maybe qr/ResultsMetadata]
result_metadata [:maybe analyze/ResultsMetadata]
cache_ttl [:maybe ms/PositiveInt]}
(check-if-card-can-be-saved dataset_query type)
;; check that we have permissions to run the query that we're trying to save
Expand Down Expand Up @@ -524,7 +524,7 @@
embedding_params [:maybe ms/EmbeddingParams]
collection_id [:maybe ms/PositiveInt]
collection_position [:maybe ms/PositiveInt]
result_metadata [:maybe qr/ResultsMetadata]
result_metadata [:maybe analyze/ResultsMetadata]
cache_ttl [:maybe ms/PositiveInt]
collection_preview [:maybe :boolean]}
(check-if-card-can-be-saved dataset_query type)
Expand Down
4 changes: 2 additions & 2 deletions src/metabase/models/card.clj
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
[clojure.walk :as walk]
[malli.core :as mc]
[medley.core :as m]
[metabase.analyze.query-results :as qr]
[metabase.analyze :as analyze]
[metabase.api.common :as api]
[metabase.compatibility :as compatibility]
[metabase.config :as config]
Expand Down Expand Up @@ -587,7 +587,7 @@
This is also complicated because everything is optional, so we cannot assume the client will provide metadata and
might need to save a metadata edit, or might need to use db-saved metadata on a modified dataset."
[{:keys [original-query query metadata original-metadata dataset?]}]
(let [valid-metadata? (and metadata (mc/validate qr/ResultsMetadata metadata))]
(let [valid-metadata? (and metadata (mc/validate analyze/ResultsMetadata metadata))]
(cond
(or
;; query didn't change, preserve existing metadata
Expand Down
18 changes: 4 additions & 14 deletions src/metabase/models/field_values.clj
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
[java-time.api :as t]
[malli.core :as mc]
[medley.core :as m]
[metabase.analyze :as analyze]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This dependency bothers me a bit, in my mind models shouldn't depend on higher level modules, if possible. We could "fix" this by moving the constants out to a general config namespace at some point.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed, but that's a step 2 thing and step 1 is just getting things in modules. It's going to take a while and several iterations to clean everything up

[metabase.db.metadata-queries :as metadata-queries]
[metabase.db.query :as mdb.query]
[metabase.models.interface :as mi]
Expand All @@ -40,24 +41,13 @@
[methodical.core :as methodical]
[toucan2.core :as t2]))

(def ^Long category-cardinality-threshold
"Fields with less than this many distinct values should automatically be given a semantic type of `:type/Category`.
This no longer has any meaning whatsoever as far as the backend code is concerned; it is used purely to inform
frontend behavior such as widget choices."
30)

(def ^Long auto-list-cardinality-threshold
"Fields with less than this many distincy values should be given a `has_field_values` value of `list`, which means
the Field should have FieldValues."
1000)

(def ^:private ^Long entry-max-length
"The maximum character length for a stored FieldValues entry."
100)

(def ^:dynamic ^Long *total-max-length*
"Maximum total length for a FieldValues entry (combined length of all values for the field)."
(long (* auto-list-cardinality-threshold entry-max-length)))
(long (* analyze/auto-list-cardinality-threshold entry-max-length)))

(def ^java.time.Period advanced-field-values-max-age
"Age of an advanced FieldValues in days.
Expand Down Expand Up @@ -403,7 +393,7 @@
field-name (or (:name field) (:id field))]
(cond
;; If this Field is marked `auto-list`, and the number of values in now over
;; the [[auto-list-cardinality-threshold]] or the accumulated length of all values exceeded
;; the [[analyze/auto-list-cardinality-threshold]] or the accumulated length of all values exceeded
;; the [[*total-max-length*]] threshold we need to unmark it as `auto-list`. Switch it to `has_field_values` =
;; `nil` and delete the FieldValues; this will result in it getting a Search Widget in the UI when
;; `has_field_values` is automatically inferred by the [[metabase.models.field/infer-has-field-values]] hydration
Expand All @@ -414,7 +404,7 @@
;; way that could make this work. Thus, we are stuck doing it here :(
(and (= :auto-list (keyword (:has_field_values field)))
(or has_more_values
(> (count values) auto-list-cardinality-threshold)))
(> (count values) analyze/auto-list-cardinality-threshold)))
(do
(log/infof
(str "Field %s was previously automatically set to show a list widget, but now has %s values."
Expand Down
6 changes: 3 additions & 3 deletions src/metabase/query_processor/middleware/annotate.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[clojure.set :as set]
[clojure.string :as str]
[medley.core :as m]
[metabase.analyze.fingerprint.fingerprinters :as fingerprinters]
[metabase.analyze :as analyze]
[metabase.driver.common :as driver.common]
[metabase.legacy-mbql.normalize :as mbql.normalize]
[metabase.legacy-mbql.schema :as mbql.s]
Expand Down Expand Up @@ -592,11 +592,11 @@
If the driver returned a base type more specific than :type/*, use that; otherwise look at the sample
of rows and infer the base type based on the classes of the values"
[{:keys [cols]}]
(apply fingerprinters/col-wise
(apply analyze/col-wise
(for [{driver-base-type :base_type} cols]
(if (contains? #{nil :type/*} driver-base-type)
(driver.common/values->base-type)
(fingerprinters/constant-fingerprinter driver-base-type)))))
(analyze/constant-fingerprinter driver-base-type)))))

(defn- add-column-info-xform
[query metadata rf]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
and returns that metadata (which can be passed *back* to the backend when saving a Card) as well
as a checksum in the API response."
(:require
[metabase.analyze.query-results :as qr]
[metabase.analyze :as analyze]
[metabase.driver :as driver]
[metabase.lib.metadata :as lib.metadata]
[metabase.query-processor.reducible :as qp.reducible]
Expand Down Expand Up @@ -68,7 +68,7 @@
rf :- ifn?]
(qp.reducible/combine-additional-reducing-fns
rf
[(qr/insights-rf orig-metadata)]
[(analyze/insights-rf orig-metadata)]
(fn combine [result {:keys [metadata insights]}]
(let [metadata (merge-final-column-metadata (-> result :data :cols) metadata)]
(record! metadata)
Expand Down
10 changes: 4 additions & 6 deletions src/metabase/sync/analyze/classify.clj
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@
In the future, we plan to add more classifiers, including ML ones that run offline."
(:require
[clojure.data :as data]
[metabase.analyze.classifiers.core :as classifiers]
[metabase.analyze.classifiers.name :as classifiers.name]
[metabase.analyze.fingerprint.schema :as fingerprint.schema]
[metabase.analyze :as analyze]
[metabase.lib.metadata :as lib.metadata]
[metabase.models.interface :as mi]
[metabase.query-processor.store :as qp.store]
Expand Down Expand Up @@ -76,9 +74,9 @@
(t2/select-one-fn :fingerprint :model/Field :id (u/the-id field)))))

([field :- i/FieldInstance
fingerprint :- [:maybe fingerprint.schema/Fingerprint]]
fingerprint :- [:maybe analyze/Fingerprint]]
(sync-util/with-error-handling (format "Error classifying %s" (sync-util/name-for-logging field))
(let [updated-field (classifiers/run-classifiers field fingerprint)]
(let [updated-field (analyze/run-classifiers field fingerprint)]
(when-not (= field updated-field)
(save-model-updates! field updated-field))))))

Expand Down Expand Up @@ -112,7 +110,7 @@
[table :- i/TableInstance]
(let [updated-table (sync-util/with-error-handling (format "Error running classifier on %s"
(sync-util/name-for-logging table))
(classifiers.name/infer-entity-type table))]
(analyze/infer-entity-type-by-name table))]
(if (instance? Exception updated-table)
table
(save-model-updates! table updated-table))))
Expand Down
7 changes: 3 additions & 4 deletions src/metabase/sync/analyze/fingerprint.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
(:require
[clojure.set :as set]
[honey.sql.helpers :as sql.helpers]
[metabase.analyze.fingerprint.fingerprinters :as fingerprinters]
[metabase.analyze.fingerprint.schema :as fingerprint.schema]
[metabase.analyze :as analyze]
[metabase.db.metadata-queries :as metadata-queries]
[metabase.db.query :as mdb.query]
[metabase.driver :as driver]
Expand All @@ -28,7 +27,7 @@

(mu/defn ^:private save-fingerprint!
[field :- i/FieldInstance
fingerprint :- [:maybe fingerprint.schema/Fingerprint]]
fingerprint :- [:maybe analyze/Fingerprint]]
(log/debugf "Saving fingerprint for %s" (sync-util/name-for-logging field))
;; All Fields who get new fingerprints should get marked as having the latest fingerprint version, but we'll
;; clear their values for `last_analyzed`. This way we know these fields haven't "completed" analysis for the
Expand Down Expand Up @@ -64,7 +63,7 @@
fields :- [:maybe [:sequential i/FieldInstance]]]
(let [rff (fn [_metadata]
(redux/post-complete
(fingerprinters/fingerprint-fields fields)
(analyze/fingerprint-fields fields)
(fn [fingerprints]
(reduce (fn [count-info [field fingerprint]]
(cond
Expand Down
8 changes: 4 additions & 4 deletions src/metabase/xrays/automagic_dashboards/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@

_Most_ tables and _all_ models (as of this writing) will bottom out at `:entity/GenericTable` and thus, use the
`resources/automagic_dashboards/table/GenericTable.yaml` template. `:entity_type` for a given table type is made in
the `metabase.sync.analyze.classifiers.name/infer-entity-type` function, where the primary logic is table naming
based on the `prefix-or-postfix` var in that ns.
the [[metabase.analyze/infer-entity-type-by-name]] function, where the primary logic is table naming based on the
`prefix-or-postfix` var in that ns.

ProTip: If you want to introduce a new template type, do the following:

Expand Down Expand Up @@ -149,7 +149,7 @@
[kixi.stats.core :as stats]
[kixi.stats.math :as math]
[medley.core :as m]
[metabase.analyze.classifiers.core :as classifiers]
[metabase.analyze :as analyze]
[metabase.db.query :as mdb.query]
[metabase.legacy-mbql.normalize :as mbql.normalize]
[metabase.models.card :refer [Card]]
Expand Down Expand Up @@ -441,7 +441,7 @@
(update field :base_type keyword)
(update field :semantic_type keyword)
(mi/instance Field field)
(classifiers/run-classifiers field {})
(analyze/run-classifiers field {})
(assoc field :db db)))))]
(constantly source-fields)))))

Expand Down
4 changes: 2 additions & 2 deletions src/metabase/xrays/automagic_dashboards/util.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[cheshire.core :as json]
[clojure.string :as str]
[medley.core :as m]
[metabase.analyze.classifiers.core :as classifiers]
[metabase.analyze :as analyze]
[metabase.legacy-mbql.predicates :as mbql.preds]
[metabase.legacy-mbql.schema :as mbql.s]
[metabase.legacy-mbql.util :as mbql.u]
Expand Down Expand Up @@ -89,6 +89,6 @@
(update field :base_type keyword)
(update field :semantic_type keyword)
(mi/instance Field field)
(classifiers/run-classifiers field {}))))
(analyze/run-classifiers field {}))))
;; otherwise this isn't returning something, and that's probably an error. Log it.
(log/warnf "Cannot resolve Field %s in automagic analysis context\n%s" field-id-or-name-or-clause (u/pprint-to-str root)))))
2 changes: 1 addition & 1 deletion test/metabase/analyze/classifiers/name_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
(deftest ^:parallel infer-entity-type-test
(testing "name matches"
(let [classify (fn [table-name] (-> (mi/instance Table {:name table-name})
classifiers.name/infer-entity-type
classifiers.name/infer-entity-type-by-name
:entity_type))]
(testing "matches simple"
(is (= :entity/TransactionTable
Expand Down
4 changes: 2 additions & 2 deletions test/metabase/analyze/classify_test.clj
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
(ns metabase.analyze.classify-test
(:require
[clojure.test :refer :all]
[metabase.analyze :as analyze]
[metabase.analyze.classifiers.core :as classifiers]
[metabase.models.field :as field :refer [Field]]
[metabase.models.field-values :as field-values]
[metabase.models.interface :as mi]))

(defn- ->field [field]
Expand All @@ -22,7 +22,7 @@
(let [field (->field {:name "state", :base_type :type/Text})
classified (classifiers/run-classifiers field {:global
{:distinct-count
(dec field-values/category-cardinality-threshold)
(dec analyze/category-cardinality-threshold)
:nil% 0.3}})]
(is (= {:has_field_values :auto-list, :semantic_type :type/Category}
(select-keys classified [:has_field_values :semantic_type]))))))
Expand Down
2 changes: 1 addition & 1 deletion test/metabase/sync/analyze_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@

(deftest survive-classify-table-errors
(testing "Make sure we survive table classification failing"
(sync-survives-crash? classifiers.name/infer-entity-type)))
(sync-survives-crash? classifiers.name/infer-entity-type-by-name)))

(defn- classified-semantic-type [values]
(let [field (mi/instance Field {:base_type :type/Text})]
Expand Down
Loading
Loading