In [11]:
from common import *

cursor = connect()

# Full Text Search

PostgreSQL 的全文搜索包括：
- Indexing
- Text analysis
- Querying
- Ranking
- Highlighting

提供全文搜索特有的两种数据类型：`tsvector` 和 `tsquery`

## tsvector

`tsvector` 是一种数据类型，可以以优化的格式存储预处理文档，以便高效搜索和检索文本。`tsvector` 值包含一个词目（单词）排序列表及其在文档中的位置和权重。
> 请注意，词目是没有后缀变化的单词，例如，watch、watched 和 watched 单词的词素是 watch。

In [2]:
sql = """
SELECT to_tsvector('waches'), 
       to_tsvector('wached'), 
       to_tsvector('waching');
"""
run_sql(cursor, sql)

  to_tsvector to_tsvector  to_tsvector
0  'waches':1  'wached':1  'waching':1


In [3]:
sql = """
SELECT to_tsvector('The quick brown fox jumps over the lazy dog.');
"""
run_sql(cursor, sql)

                                         to_tsvector
0  'brown':3 'dog':9 'fox':4 'jumps':5 'lazy':8 '...


- tsvector 值中的每个条目代表一个单词（词素）及其在字符串（或文档）中的位置。例如，单词 quick 出现在位置 2，单词 brown 出现在位置 3，以此类推。
- 单词按字母顺序排序。
- 冠词和停顿词会被省略，如 The 和 over。

## tsquery

`tsquery` 是一种数据类型，用于表示全文搜索中的搜索查询。它允许指定包含索引文档单词或短语的搜索条件。

`to_tsquery()` 将字符串转换为 `tsquery`。例如，下面的语句使用 `to_tsquery()` 将单词 "`jumping`" 转换为 `tsquery`：

In [4]:
sql = """
SELECT to_tsquery('jumping');
"""
run_sql(cursor, sql)

  to_tsquery
0  'jumping'


In [5]:
sql = """
SELECT 
  to_tsvector(
    'The quick brown fox jumps over the lazy dog.'
  ) @@ to_tsquery('jumping') result;
"""
run_sql(cursor, sql)

   result
0   False


In [6]:
sql = """
SELECT 
  to_tsvector(
    'The quick brown fox jumps over the lazy dog.'
  ) @@ to_tsquery('cat') result;
"""
run_sql(cursor, sql)

   result
0   False


# 在表格数据中使用全文搜索

In [12]:
# 创建表格
sql = """
-- 创建一个不可变函数，用于生成虚拟列的表达式
CREATE FUNCTION immutable_to_tsvector(body TEXT) RETURNS TSVECTOR AS $$
BEGIN
    RETURN to_tsvector(body);
END;
$$ LANGUAGE plpgsql IMMUTABLE;

-- 创建表，并使用自定义的不可变函数来生成虚拟列的表达式
CREATE TABLE posts(
   id SERIAL PRIMARY KEY,
   title TEXT NOT NULL,
   body TEXT,
   body_search TSVECTOR GENERATED ALWAYS AS (immutable_to_tsvector(body)) STORED
);

INSERT INTO posts (title, body)
VALUES
    ('Introduction to PostgreSQL', 'This is an introductory post about PostgreSQL. It covers basic concepts and features.'),
    ('Advanced PostgresSQL Techniques', 'In this post, we delve into advanced PostgreSQL techniques for efficient querying and data manipulation.'),
    ('PostgreSQL Optimization Strategies', 'This post explores various strategies for optimizing PostgreSQL database performance and efficiency.');
"""
cursor.execute(sql)

<psycopg.Cursor [COMMAND_OK] [INTRANS] (host=localhost user=postgres database=dvdrental) at 0x1863b939fd0>

In [13]:
sql = """
SELECT 
  id, 
  body_search 
FROM 
  posts;
"""
run_sql(cursor, sql)

   id                                        body_search
0   1  'about':6 'an':3 'and':12 'basic':10 'concepts...
1   2  'advanced':7 'and':13 'data':14 'delve':5 'eff...
2   3  'and':11 'database':9 'efficiency':12 'explore...


In [14]:
# 全文搜索
sql = """
SELECT 
  id, 
  body 
FROM 
  posts 
WHERE 
  body_search @@ to_tsquery('PostgreSQL');
"""
run_sql(cursor, sql)

   id                                               body
0   1  This is an introductory post about PostgreSQL....
1   2  In this post, we delve into advanced PostgreSQ...
2   3  This post explores various strategies for opti...


In [15]:
# 全文搜索 AND
sql = """
SELECT 
  id, 
  body 
FROM 
  posts 
WHERE 
  body_search @@ to_tsquery('PostgreSQL & techniques');
"""
run_sql(cursor, sql)

   id                                               body
0   2  In this post, we delve into advanced PostgreSQ...


In [16]:
# 全文搜索 OR
sql = """
SELECT 
  id, 
  body 
FROM 
  posts 
WHERE 
  body_search @@ to_tsquery('efficient | optimization');
"""
run_sql(cursor, sql)

   id                                               body
0   2  In this post, we delve into advanced PostgreSQ...


In [17]:
# 全文搜索 phrase
sql = """
SELECT 
  id, 
  body 
FROM 
  posts 
WHERE 
  body_search @@ to_tsquery('''PostgreSQL technique''');
"""
run_sql(cursor, sql)

Empty DataFrame
Columns: [id, body]
Index: []


In [18]:
# 全文搜索 phrase
sql = """
SELECT id, body
FROM posts 
WHERE NOT body_search @@ to_tsquery('efficient');
"""
run_sql(cursor, sql)

   id                                               body
0   1  This is an introductory post about PostgreSQL....
1   3  This post explores various strategies for opti...


# 使用 GIN 索引进行全文搜索

In [19]:
sql = """
DROP TABLE IF EXISTS posts;

CREATE TABLE posts(
   id SERIAL PRIMARY KEY,
   title TEXT NOT NULL,
   body TEXT
);

INSERT INTO posts (title, body)
VALUES
    ('Introduction to PostgreSQL', 'This is an introductory post about PostgreSQL. It covers basic concepts and features.'),
    ('Advanced PostgresSQL Techniques', 'In this post, we delve into advanced PostgreSQL techniques for efficient querying and data manipulation.'),
    ('PostgreSQL Optimization Strategies', 'This post explores various strategies for optimizing PostgreSQL database performance and efficiency.');
"""
cursor.execute(sql)

<psycopg.Cursor [COMMAND_OK] [INTRANS] (host=localhost user=postgres database=dvdrental) at 0x1863b939fd0>

In [20]:
sql = """
CREATE INDEX body_fts
ON posts
USING GIN ((to_tsvector('english',body)));
"""
cursor.execute(sql)

<psycopg.Cursor [COMMAND_OK] [INTRANS] (host=localhost user=postgres database=dvdrental) at 0x1863b939fd0>

In [21]:
sql = """
SELECT 
  id, 
  body 
FROM 
  posts 
WHERE 
  body @@ to_tsquery('basic | advanced');
"""
run_sql(cursor, sql)

   id                                               body
0   1  This is an introductory post about PostgreSQL....
1   2  In this post, we delve into advanced PostgreSQ...
