Skip to content

Commit

Permalink
Use sqlite Full Text Search for series title search
Browse files Browse the repository at this point in the history
  • Loading branch information
mizaki committed Dec 24, 2023
1 parent f28214c commit b625275
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 18 deletions.
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand All @@ -9,7 +9,7 @@ repos:
- id: name-tests-test
- id: requirements-txt-fixer
- repo: https://github.com/asottile/setup-cfg-fmt
rev: v2.4.0
rev: v2.5.0
hooks:
- id: setup-cfg-fmt
- repo: https://github.com/PyCQA/autoflake
Expand All @@ -18,17 +18,17 @@ repos:
- id: autoflake
args: [-i, --remove-all-unused-imports, --ignore-init-module-imports]
- repo: https://github.com/asottile/pyupgrade
rev: v3.13.0
rev: v3.15.0
hooks:
- id: pyupgrade
args: [--py39-plus]
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
args: [--af,--add-import, 'from __future__ import annotations']
- repo: https://github.com/psf/black
rev: 23.9.1
rev: 23.12.0
hooks:
- id: black
- repo: https://github.com/PyCQA/flake8
Expand All @@ -38,7 +38,7 @@ repos:
args: [--max-line-length=120, '--ignore=E203, E501, A003']
additional_dependencies: [toml, flake8-encodings, flake8-warnings, flake8-builtins, flake8-length, flake8-print]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.5.1
rev: v1.7.1
hooks:
- id: mypy
additional_dependencies: [types-setuptools, types-requests]
Expand Down
94 changes: 84 additions & 10 deletions gcd_talker/gcd.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ def __init__(self, version: str, cache_folder: pathlib.Path):
self.download_tag_covers: bool = False

self.has_issue_id_type_id_index: bool = False
self.has_fts5: bool = False
self.has_fts5_checked: bool = False

def register_settings(self, parser: settngs.Manager) -> None:
parser.add_setting(
Expand Down Expand Up @@ -244,6 +246,48 @@ def check_db_filename_not_empty(self):
if not pathlib.Path(self.db_file).is_file():
raise TalkerDataError(self.name, 3, "Database path or filename is invalid!")

def check_db_fts5(self) -> None:
    """Detect FTS5 support and make sure the series full-text index exists.

    Sets ``self.has_fts5_checked`` once a definitive answer is known, and
    ``self.has_fts5`` when the ``fts`` virtual table is usable. If SQLite
    supports FTS5 but the table is missing, it is created over
    ``gcd_series.name`` and populated with a rebuild.

    Raises:
        TalkerDataError: If SQLite reports an error while probing for FTS5
            or while checking for / building the ``fts`` table.
    """
    # FTS5 availability is a compile-time option of the SQLite library, so a
    # throwaway in-memory connection is enough to probe for it.
    try:
        probe = sqlite3.connect(":memory:")
        try:
            compile_options = probe.execute("pragma compile_options;").fetchall()
        finally:
            probe.close()
    except sqlite3.Error as e:
        logger.debug(f"DB error: {e}")
        raise TalkerDataError(self.name, 0, str(e)) from e

    if ("ENABLE_FTS5",) not in compile_options:
        logger.debug("SQLite has no FTS5 support!")
        self.has_fts5_checked = True
        return

    try:
        con = sqlite3.connect(self.db_file)
        try:
            # The connection context manager only commits (or rolls back on
            # error); closing the connection is done explicitly below.
            with con:
                cur = con.cursor()
                cur.execute("SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'fts';")

                if cur.fetchone() is None:
                    # Create the FTS5 table
                    cur.execute(
                        "CREATE VIRTUAL TABLE fts USING fts5(name, content='gcd_series', content_rowid='id', "
                        "tokenize = 'porter unicode61 remove_diacritics 1');"
                    )
                    cur.execute("INSERT INTO fts(fts) VALUES('rebuild');")
        finally:
            con.close()

    except sqlite3.DataError as e:
        logger.debug(f"DB data error: {e}")
        raise TalkerDataError(self.name, 1, str(e)) from e
    except sqlite3.Error as e:
        logger.debug(f"DB error: {e}")
        raise TalkerDataError(self.name, 0, str(e)) from e

    # Reached both when the table already existed and when it was just built,
    # so the search that triggered the build can use it immediately.
    self.has_fts5 = True
    self.has_fts5_checked = True

def search_for_series(
self,
series_name: str,
Expand All @@ -252,30 +296,60 @@ def search_for_series(
literal: bool = False,
series_match_thresh: int = 90,
) -> list[ComicSeries]:
sql_search: str = ""
sql_search_fields: str = """SELECT gcd_series.id AS 'id', gcd_series.name AS 'series_name',
gcd_series.sort_name AS 'sort_name', gcd_series.notes AS 'notes',
gcd_series.year_began AS 'year_began', gcd_series.year_ended AS 'year_ended',
gcd_series.issue_count AS 'issue_count', gcd_publisher.name AS 'publisher_name' """

sql_literal_search: str = """FROM gcd_publisher
LEFT JOIN gcd_series ON gcd_series.publisher_id=gcd_publisher.id
WHERE gcd_series.name = ?"""

sql_like_search: str = """FROM gcd_publisher
LEFT JOIN gcd_series ON gcd_series.publisher_id=gcd_publisher.id
WHERE gcd_series.name LIKE ?"""

sql_ft_search: str = """FROM fts
LEFT JOIN gcd_series on fts.rowid=gcd_series.id
LEFT JOIN gcd_publisher ON gcd_series.publisher_id=gcd_publisher.id
WHERE fts MATCH ?;"""

self.check_db_filename_not_empty()
if not self.has_fts5_checked:
self.check_db_fts5()

search_series_name = series_name
if not literal:
if literal:
# This will be literally literal: "the" will not match "The" etc.
sql_search = sql_search_fields + sql_literal_search
elif not self.has_fts5:
# Make the search fuzzier
search_series_name = search_series_name.replace(" ", "%") + "%"
sql_search = sql_search_fields + sql_like_search
else:
# Order is important
# Escape any single and double quotes
search_series_name = search_series_name.replace("'", "''")
search_series_name = search_series_name.replace('"', '""')
# Now format for full-text search by tokenizing each word with surrounding double quotes
search_series_name = '"' + search_series_name + '"'
search_series_name = search_series_name.replace(" ", '" "')

# Use FTS5 for search
sql_search = sql_search_fields + sql_ft_search

results = []

logger.info(f"{self.name} searching: {search_series_name}")

self.check_db_filename_not_empty()

try:
with sqlite3.connect(self.db_file) as con:
con.row_factory = sqlite3.Row
con.text_factory = str
cur = con.cursor()
cur.execute(
"SELECT gcd_series.id AS 'id', gcd_series.name AS 'series_name', "
"gcd_series.sort_name AS 'sort_name', gcd_series.notes AS 'notes', "
"gcd_series.year_began AS 'year_began', gcd_series.year_ended AS 'year_ended', "
"gcd_series.issue_count AS 'issue_count', gcd_publisher.name AS 'publisher_name' "
"FROM gcd_publisher "
"LEFT JOIN gcd_series ON gcd_series.publisher_id=gcd_publisher.id "
"WHERE gcd_series.name LIKE ?",
sql_search,
[search_series_name],
)
rows = cur.fetchall()
Expand Down
3 changes: 1 addition & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ url = https://github.com/comictagger/comictagger-gcd-talker
author = ComicTagger team
author_email = comictagger@gmail.com
license = Apache-2.0
license_file = LICENSE
license_files = LICENSE
classifiers =
Development Status :: 3 - Alpha
Environment :: Console
Expand Down Expand Up @@ -35,7 +35,6 @@ keywords =

[options]
packages = find:
install_requires =
python_requires = >=3.9

[options.packages.find]
Expand Down

0 comments on commit b625275

Please sign in to comment.