In [2]:
# Assumes you have Python 3.10+ installed
# !pip install bs4 python-dotenv langchain p_tqdm tqdm tiktoken weaviate-client
import dataclasses
from dataclasses import dataclass
import json
from multiprocessing import Pool
from pathlib import Path
from pprint import pprint
import os

from bs4 import BeautifulSoup as bs
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from p_tqdm import p_map
from tqdm.notebook import tqdm
import tiktoken
import weaviate
from weaviate.util import generate_uuid5

# Pull API keys and service URLs from the repo-level .env file into the process environment.
load_dotenv(dotenv_path='../.env')  # assumes you have a .env file in the root of the repo
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ.get('OPENAI_API_KEY'),
    client=None,
)
# Token-based splitter: texts longer than 8000 tokens are cut into non-overlapping chunks.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=8000,
    chunk_overlap=0,
)
# Tokenizer used further down to count tokens for the cost estimate.
encoder = tiktoken.get_encoding('cl100k_base')

# Set path to files

In [3]:
# Recursively collect every XML file under the PTT data directory.
ptt_files = [*Path('../other_data/ptt/').rglob('*.xml')]
print(f'Number of files: {len(ptt_files)}')

Number of files: 69921


In [8]:
# https://realpython.com/python-data-classes/
# https://realpython.com/python-type-checking/
@dataclass
class ContentItem:
    """
    One chunk of a post or of a comment, together with the metadata of its post.

    Attributes:
        media: media source of the post or comment
        content_type: "post" or "comment"
        author: author of the post or comment
        post_id: id of the post
        year: year of the post
        board: board of the post
        title: title of the post
        text: text of this chunk of the post or comment
        rating: rating of the comment (empty string for posts)
        order: 0 for the post, 1, 2, 3, ... for its comments
        chunk: 1-based index of this chunk within the split text
        total_chunks: number of chunks the text was split into
    """

    media: str
    content_type: str
    author: str
    post_id: str
    year: str
    board: str
    title: str
    text: str
    rating: str
    order: int
    chunk: int
    total_chunks: int


def get_comments(parent: ContentItem, soup: bs) -> list[ContentItem]:
    """
    Build ContentItem objects for every <comment> element of a post.

    Post-level fields (media, post_id, year, board, title) are copied from
    ``parent``; author and rating are read from the comment's ``author`` and
    ``c_type`` attributes.

    Args:
        parent: ContentItem of the post the comments belong to
        soup: BeautifulSoup object of the post

    Returns:
        List of ContentItem objects, one per chunk of each comment, in
        document order. ``order`` is the 1-based comment position and
        ``chunk`` the 1-based chunk position within that comment.
    """
    items = []
    for position, node in enumerate(soup.find_all("comment"), start=1):
        commenter = node["author"]
        c_type = node["c_type"]
        flat_text = node.get_text().replace("\n", "")
        # An empty comment still yields a single item with empty text.
        pieces = splitter.split_text(flat_text) or [""]
        items.extend(
            ContentItem(
                media=parent.media,
                content_type="comment",
                post_id=parent.post_id,
                author=commenter,
                rating=c_type,
                text=piece,
                year=parent.year,
                board=parent.board,
                title=parent.title,
                order=position,  # 0 is reserved for the post itself
                chunk=piece_no,
                total_chunks=len(pieces),
            )
            for piece_no, piece in enumerate(pieces, start=1)
        )
    return items


def get_post_info(path: Path) -> list[ContentItem]:
    """
    Parse one post XML file into ContentItem objects.

    The post body is split into chunks with the module-level ``splitter``;
    each chunk becomes one item with ``order=0``. The post's comments are
    appended after the post chunks.

    Args:
        path: path to the post XML file

    Returns:
        List of ContentItem objects: the post chunks first, then the comment
        chunks returned by ``get_comments``.

    Raises:
        AttributeError: if a <metadata> field or the <body> element is
            missing (``soup.find`` returns None in that case).
    """
    # Decode explicitly instead of relying on the platform's default encoding.
    # NOTE(review): assumes the corpus files are UTF-8 — confirm.
    with path.open(encoding="utf-8") as f:
        soup = bs(f.read(), "xml")

    def metadata(name: str) -> str:
        """Return the text of <metadata name=...> with newlines removed."""
        return soup.find("metadata", attrs={"name": name}).get_text().replace("\n", "")

    media = metadata("media")
    author = metadata("author")
    post_id = metadata("post_id")
    year = metadata("year")
    board = metadata("board")
    title = metadata("title")
    text = soup.find("body").get_text().replace("\n", "")

    # An empty body still yields one item with empty text, so posts[0] below
    # always exists.
    chunks = splitter.split_text(text) or [""]
    posts = [
        ContentItem(
            media=media,
            author=author,
            post_id=post_id,
            year=year,
            board=board,
            title=title,
            text=chunk,
            rating="",
            content_type="post",
            order=0,  # 0 for post, 1, 2, 3, ... for comments
            chunk=idx,
            total_chunks=len(chunks),
        )
        for idx, chunk in enumerate(chunks, 1)
    ]
    # get_comments only reads post-level metadata from its parent argument,
    # which is identical on every post chunk.
    comments = get_comments(posts[0], soup)

    return posts + comments


def dedupe(items: list[ContentItem]) -> list[ContentItem]:
    """
    Drop repeated items, keeping the first occurrence of each in input order.

    Two items are duplicates when their fields serialize to the same JSON
    string.

    Args:
        items: list of ContentItem objects

    Returns:
        List of ContentItem objects without duplicates
    """
    unique = []
    fingerprints = set()
    for candidate in items:
        fingerprint = json.dumps(dataclasses.asdict(candidate))
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        unique.append(candidate)
    return unique

# Inspect a post

In [58]:
# Pretty-print one arbitrary file to see the XML structure the parser relies on.
# Decode explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the corpus files are UTF-8 — confirm.
with ptt_files[5000].open(encoding='utf-8') as f:
    soup = bs(f.read(), 'xml')
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<TEI.2>
 <teiHeader>
  <metadata name="media">
   ptt
  </metadata>
  <metadata name="author">
   kero2377 (愛gina福利熊)
  </metadata>
  <metadata name="post_id">
   M.1599958779.A.D7E
  </metadata>
  <metadata name="year">
   2020
  </metadata>
  <metadata name="board">
   HatePolitics-ptt
  </metadata>
  <metadata name="title">
   [公告] ellis5566 2-10
  </metadata>
 </teiHeader>
 <text>
  <body author="kero2377 (愛gina福利熊)">
   <s>
    <w type="Na">
     當事人
    </w>
    <w type="COLONCATEGORY">
     :
    </w>
    <w type="FW">
     ellis
    </w>
    <w type="Neu">
     5566
    </w>
   </s>
   <s>
    <w type="VE">
     判決
    </w>
    <w type="Na">
     依據
    </w>
    <w type="COLONCATEGORY">
     ︰
    </w>
   </s>
   <s>
    <w type="Neu">
     1
    </w>
    <w type="DASHCATEGORY">
     -3
    </w>
    <w type="PERIODCATEGORY">
     .
    </w>
    <w type="Na">
     文章
    </w>
    <w type="Na">
     標題
    </w>
    <w type="PAUSECATEGORY">
 

# Use multiprocessing to speed up the process
Use multiprocessing when you are CPU-bound

More info: https://realpython.com/python-concurrency/#multiprocessing-version

In [71]:
# Parse every file in parallel, then flatten the per-file lists into one
# list of items (post chunks and comment chunks).
res = [
    item
    for file_items in p_map(get_post_info, ptt_files)
    for item in file_items
]

print(f'Number of posts: {len(res)}')

  0%|          | 0/69921 [00:00<?, ?it/s]

  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Number of posts: 3141828


# Save

In [6]:
class EnhancedJSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes dataclass objects as plain dicts."""

    def default(self, o):
        """Return a dict for dataclasses; defer to the base encoder otherwise."""
        if not dataclasses.is_dataclass(o):
            return super().default(o)
        return dataclasses.asdict(o)


# Write one JSON object per line. Mode "x" fails if the file already exists.
# ensure_ascii=False writes non-ASCII text as-is, so the file encoding is set
# explicitly instead of relying on the platform default.
with open("../other_data/ptt/ptt-joined.jsonl", "x", encoding="utf-8") as f:
    for r in res:
        f.write(json.dumps(r, ensure_ascii=False, cls=EnhancedJSONEncoder) + "\n")


# Load

In [2]:
# Reload the saved items; each line is one JSON object, so `res` now holds
# plain dicts rather than ContentItem instances.
# Decode explicitly as UTF-8 instead of relying on the platform default.
with open('../other_data/ptt/ptt-joined.jsonl', encoding='utf-8') as f:
    res = [json.loads(line) for line in f]
print(f'Number of posts: {len(res)}')

Number of posts: 3141828


# Vector DB Weaviate

In [4]:
# Connect to Weaviate. The URL, admin API key and OpenAI key are read from the
# environment; a missing variable raises KeyError here.
client = weaviate.Client(
    url=os.environ["WEAVIATE_URL"],
    auth_client_secret=weaviate.AuthApiKey(api_key=os.environ["WEAVIATE_ADMIN_PASS"]),
    timeout_config=(5, 30), # (connect timeout, read timeout) # type: ignore  
    additional_headers={'X-OpenAI-Api-Key': os.environ["OPENAI_API_KEY"]}
)

# Creating a schema
Each Weaviate class requires a schema. Weaviate's schema defines its data structure in a formal language. In other words, it is a blueprint of how the data is to be organized and stored.

The schema defines data classes (i.e. collections of objects), the properties within each class (name, type, description, settings), possible graph links between data objects (cross-references), and the vectorizer module (if any) to be used for the class, as well as settings such as the vectorizer module, and index configurations.

More info: https://weaviate.io/developers/weaviate/tutorials/schema

In [11]:
def _content_item_property(name: str, description: str, data_type: str, vectorize: bool) -> dict:
    """
    Build one property definition for the ContentItem class.

    Args:
        name: property name
        description: human-readable description of the property
        data_type: Weaviate data type name, e.g. "text" or "int"
        vectorize: True to include the property (and its name) in the
            text2vec-openai input, False to skip it

    Returns:
        Property definition dict in the layout used by the schema below
    """
    return {
        "name": name,
        "description": description,
        "dataType": [data_type],
        "moduleConfig": {
            "text2vec-openai": {
                "skip": not vectorize,
                "vectorizePropertyName": vectorize,
            }
        },
    }


# Class definition mirroring the ContentItem dataclass. Only board, title,
# text and rating are marked for vectorization.
# NOTE(review): no visible cell registers this schema with the client
# (e.g. client.schema.create_class(schema)) — confirm it is created elsewhere.
schema = {
    "class": "ContentItem",
    "description": "General content item",
    "moduleConfig": {"text2vec-openai": {"vectorizeClassName": False}},
    "vectorizer": "text2vec-openai",  # This could be any vectorizer
    "properties": [
        _content_item_property("media", "Source of the content", "text", vectorize=False),
        _content_item_property("content_type", "Type of the content", "text", vectorize=False),
        _content_item_property("author", "Author of the content", "text", vectorize=False),
        _content_item_property("post_id", "Post id of the content", "text", vectorize=False),
        _content_item_property("year", "Year of the content", "text", vectorize=False),
        _content_item_property("board", "Board of the content", "text", vectorize=True),
        _content_item_property("title", "Title of the content", "text", vectorize=True),
        _content_item_property("text", "Text of the content", "text", vectorize=True),
        _content_item_property("rating", "Rating of the content", "text", vectorize=True),
        _content_item_property("order", "0 for post, 1, 2, 3, ... for comments", "int", vectorize=False),
        _content_item_property("chunk", "Chunk of the current content", "int", vectorize=False),
        _content_item_property("total_chunks", "Total chunks of the content", "int", vectorize=False),
    ],
}


# Prepare for vectorization
Letting Weaviate do vectorization didn't work well for me, so I'm doing it myself. I'm using the same method Weaviate uses to vectorize text.

More info: https://weaviate.io/developers/weaviate/tutorials/schema

In [None]:
# Build the text to embed for every item and count its tokens for a cost estimate.
counts = []  # token count per item
to_vectorize = []  # text to embed, aligned index-for-index with `res`
# Wrap `res` itself rather than enumerate(res): tqdm can then read len(res)
# and show a total and ETA.
for r in tqdm(res):
    if dataclasses.is_dataclass(r):
        r = dataclasses.asdict(r)
    # Posts are stored with an empty rating, so rating is only embedded for comments.
    if r['content_type'] == 'post':
        keep = ['board', 'text', 'title']
    else:
        keep = ['board', 'rating', 'text', 'title']
    # "<property name> <lower-cased value> " for each kept property, in order.
    out = "".join(f"{k} {r[k].lower()} " for k in keep)
    counts.append(len(encoder.encode(out)))
    to_vectorize.append(out)

# https://openai.com/pricing
print(f"Calculated cost of vectorizing using OpenAI: {sum(counts) / 1000 * 0.0004}")

## Save to format that vectorizing script expects
`row_id` is the index of the item (a post chunk or a comment chunk) in `res`. It is used to link each returned vector back to that item.

In [9]:
# One request per line, in the format the vectorizing script expects:
# {"model": "text-embedding-ada-002", "input": "embed me", "metadata": {"row_id": 1}}
# ensure_ascii belongs to json.dumps (file.write accepts no keyword arguments).
# Because non-ASCII text is written as-is, the file encoding is set explicitly.
with open('../other_data/ptt/ptt-joined-to-vectorize.jsonl', 'w', encoding='utf-8') as f:
    for idx, r in enumerate(to_vectorize):
        request = {"model": "text-embedding-ada-002", "input": r, "metadata": {"row_id": idx}}
        f.write(json.dumps(request, ensure_ascii=False) + '\n')

* Run this script to vectorize the posts: https://github.com/openai/openai-cookbook/blob/main/examples/api_request_parallel_processor.py

# How to upload to Weaviate

Just a sample of how to upload to Weaviate.

In [42]:
def get_vec(output: list[dict]) -> tuple[list[float], int]:
    """
    Extract vector and row_id from one parsed line of the vectorization results.

    Args:
        output: parsed result line; element 1 holds the embedding under
            ["data"][0]["embedding"] and element 2 holds the "row_id"

    Returns:
        Tuple of (embedding vector, row_id)
    """
    response = output[1]
    vector = response["data"][0]["embedding"]
    metadata = output[2]
    return vector, metadata["row_id"]


limit = 10000  # maximum number of objects to upload in this sample run

with client.batch(
    num_workers=16,  # parallel upload workers; tune to the available CPU
    batch_size=100,
    dynamic=True,
) as batch:
    # Read the results line by line so the whole file (many GBs) never has to
    # fit in memory.
    with open("../other_data/ptt/ptt-joined-to-vectorize_results.jsonl", encoding="utf-8") as f:
        for idx, line in enumerate(f):
            # Check before adding so that exactly `limit` objects are uploaded.
            if idx >= limit:
                break
            vector, content_idx = get_vec(json.loads(line))
            r = res[content_idx]
            # `res` holds ContentItem instances right after parsing and plain
            # dicts after reloading from JSONL; normalize to a dict.
            if dataclasses.is_dataclass(r):
                r = dataclasses.asdict(r)
            batch.add_data_object(
                data_object=r,
                class_name="ContentItem",
                uuid=generate_uuid5(r),
                vector=vector,
            )

# Check if it worked

In [5]:
# Ask Weaviate how many ContentItem objects are stored.
(
    client.query
    .aggregate('ContentItem')
    .with_meta_count()
    .do()
)

{'data': {'Aggregate': {'ContentItem': [{'meta': {'count': 3141828}}]}}}

# Query using Langchain
* https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/weaviate-hybrid.html

In [9]:
import dataclasses
from dataclasses import dataclass
from pprint import pprint
import os
import weaviate
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever


# NOTE(review): duplicates the ContentItem definition from the parsing section —
# presumably so the query cells can run on a fresh kernel; keep the two in sync.
@dataclass
class ContentItem:
    """
    One chunk of a post or of a comment, together with the metadata of its post.

    Attributes:
        media: media source of the post or comment
        content_type: "post" or "comment"
        author: author of the post or comment
        post_id: id of the post
        year: year of the post
        board: board of the post
        title: title of the post
        text: text of this chunk of the post or comment
        rating: rating of the comment
        order: 0 for the post, 1, 2, 3, ... for its comments
        chunk: index of this chunk when the text was too long and got split
        total_chunks: number of chunks the text was split into
    """

    media: str
    content_type: str
    author: str
    post_id: str
    year: str
    board: str
    title: str
    text: str
    rating: str
    order: int
    chunk: int
    total_chunks: int


In [10]:
# Connect to Weaviate. The URL, admin API key and OpenAI key are read from the
# environment; a missing variable raises KeyError here.
# NOTE(review): the imports of this section do not include load_dotenv, so on a
# fresh kernel the variables are presumably already exported — confirm.
client = weaviate.Client(
    url=os.environ["WEAVIATE_URL"],
    auth_client_secret=weaviate.AuthApiKey(api_key=os.environ["WEAVIATE_ADMIN_PASS"]),
    timeout_config=(5, 30), # (connect timeout, read timeout) # type: ignore
    additional_headers={'X-OpenAI-Api-Key': os.environ["OPENAI_API_KEY"]}
)

In [11]:
# https://weaviate.io/blog/hybrid-search-explained
# Every ContentItem field is requested so it appears in each result's metadata.
attributes = [spec.name for spec in dataclasses.fields(ContentItem)]
print(attributes)

# alpha weights the two search algorithms:
#   alpha = 0   -> sparse only (BM25)
#   alpha = 1   -> dense only
#   alpha = 0.5 -> equal weight for sparse and dense
retriever = WeaviateHybridSearchRetriever(
    client=client,
    k=10,
    alpha=0.5,
    index_name="ContentItem",
    text_key="text",
    attributes=attributes,  # include these attributes in the 'metadata' field of the search results
)


['media', 'content_type', 'author', 'post_id', 'year', 'board', 'title', 'text', 'rating', 'order', 'chunk', 'total_chunks']


In [12]:
# Unfiltered hybrid search; the retriever above is configured with k=10 and alpha=0.5.
r = retriever.get_relevant_documents("水桶")
pprint(r)

[Document(page_content='水桶', metadata={'author': 'jeffy84123', 'board': 'movie-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 19, 'post_id': 'M.1607107323.A.9E9', 'rating': 'neg', 'title': '[優雷] 魔物獵人根本沒有輸！', 'total_chunks': 1, 'year': '2020'}),
 Document(page_content='ID', metadata={'author': 'ericisfish', 'board': 'movie-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 3, 'post_id': 'M.1616658750.A.C26', 'rating': 'neg', 'title': '[公告] 水桶', 'total_chunks': 1, 'year': '2021'}),
 Document(page_content='水桶', metadata={'author': 'saimeitetsu', 'board': 'HatePolitics-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 30, 'post_id': 'M.1624681733.A.D21', 'rating': 'neu', 'title': '[討論] 蔡英文的手...常用來自慰嗎?', 'total_chunks': 1, 'year': '2021'}),
 Document(page_content='缺水還能浸水桶，水喔', metadata={'author': 'icexyz', 'board': 'movie-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 5, 'post_id': 'M.1620851870.A.047', 'ra

## Use filters
* https://weaviate.io/developers/weaviate/api/graphql/filters

### One filter

In [15]:
# Restrict the search to a single author.
# NOTE(review): the stored author values in the results are lower-case
# ('peterw') while the filter value is 'peterW'; presumably the match is
# case-insensitive because of how the text property is tokenized — confirm.
where_filter = dict(
    path=["author"],
    operator="Equal",
    valueString="peterW",
)
r = retriever.get_relevant_documents("怒", where_filter=where_filter)
pprint(r)

[Document(page_content='老闆？', metadata={'author': 'peterw', 'board': 'movie-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 32, 'post_id': 'M.1650945380.A.25A', 'rating': 'neu', 'title': '[新聞] 《媽的》字幕翻譯觀眾怒！譯者嗆「回你', 'total_chunks': 1, 'year': '2022'}),
 Document(page_content='推樓上', metadata={'author': 'peterw', 'board': 'movie-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 27, 'post_id': 'M.1635682754.A.BA6', 'rating': 'pos', 'title': '[討論] 漫威浩克的角色形象有很憤怒嗎', 'total_chunks': 1, 'year': '2021'}),
 Document(page_content='本來要大聲斥責過氣的，但是實在太大了', metadata={'author': 'peterw', 'board': 'Gossiping-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 46, 'post_id': 'M.1589453622.A.A69', 'rating': 'pos', 'title': '[問卦] 何庭歡(歡歡)是不是無法無天?', 'total_chunks': 1, 'year': '2020'}),
 Document(page_content='股東也怒了吧', metadata={'author': 'peterw', 'board': 'movie-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 32, 'post_id': 'M.16

### Multiple filters

In [17]:
# Combine several conditions: positive-rated comments not written by peterW.
where_filter = {
    "operator": "And",  # And or Or
    # use operands for multiple filters
    "operands": [
        {"path": [prop], "operator": op, "valueString": value}
        for prop, op, value in (
            ("content_type", "Equal", "comment"),
            ("rating", "Equal", "pos"),
            ("author", "NotEqual", "peterW"),
        )
    ],
}
r = retriever.get_relevant_documents("水桶", where_filter=where_filter)
pprint(r)


[Document(page_content='水桶', metadata={'author': 'bigbang923', 'board': 'movie-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 3, 'post_id': 'M.1649674930.A.0C5', 'rating': 'pos', 'title': '[討論] 雷，三十歲魔法師電影沒x戲真不爽', 'total_chunks': 1, 'year': '2022'}),
 Document(page_content='缺水還能浸水桶，水喔', metadata={'author': 'icexyz', 'board': 'movie-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 5, 'post_id': 'M.1620851870.A.047', 'rating': 'pos', 'title': '[公告] 水桶 sonans', 'total_chunks': 1, 'year': '2021'}),
 Document(page_content='水桶', metadata={'author': 'tompi', 'board': 'movie-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 292, 'post_id': 'M.1641292294.A.E00', 'rating': 'pos', 'title': '[討論] 千萬別抬頭片名有沒有更好的翻譯？', 'total_chunks': 1, 'year': '2022'}),
 Document(page_content='XD', metadata={'author': 'leemt', 'board': 'movie-ptt', 'chunk': 1, 'content_type': 'comment', 'media': 'ptt', 'order': 2, 'post_id': 'M.1668317049.A.732', 'rating