In [1]:
import mailbox
import bs4
from pathlib import Path
from tqdm import tqdm

from datetime import datetime
import dateparser

import sqlite3
import pickle


from datasets import Dataset, load_dataset

from transformers import AutoTokenizer

import quopri
import base64
import re

In [2]:
# PostgreSQL mailing lists to process. Each name doubles as the file-name
# prefix of that list's mbox archives and of its cache pickles.
MAILING_LISTS = ["pgsql-hackers", "pgsql-performance"]

In [3]:
# One-off sanitising pass over freshly downloaded archives. It is disabled by
# default; flip the guard to True to regenerate the cleaned mbox files.
if False:
    # Raw archives are expected in the current user's Downloads folder.
    raw_mbox_dir = Path.home() / "Downloads"
    # The mbox-loading cell globs "./data", so the cleaned copies are written
    # there (created on demand) rather than into the current directory.
    clean_mbox_dir = Path("./data")
    clean_mbox_dir.mkdir(parents=True, exist_ok=True)
    for mbox_path in sorted(raw_mbox_dir.glob("pgsql-*")):
        # Undecodable bytes are replaced instead of aborting the whole archive.
        with open(mbox_path, "r", encoding="utf-8", errors="replace") as f_read:
            with open(clean_mbox_dir / mbox_path.name, "w", encoding="utf-8") as f_write:
                for line in f_read:
                    starts_like_separator = line.startswith("From ")
                    if starts_like_separator and "@" not in line and not line.startswith("From bouncefilter"):
                        # The pgsql mailing lists are not mboxo-compliant: a "From "
                        # line without an address (and not from bouncefilter) is
                        # presumably body text, so indent it by one space to stop
                        # it from being parsed as a message separator.
                        line = f" {line}"
                    f_write.write(line)

In [4]:
def _decode_text_part(m):
    """Decode the body of a single (non-multipart) text part.

    Args:
        m: an ``email.message.Message`` whose content type is text/plain or text/html.

    Returns:
        The decoded text, or "" when the part looks like an attached file
        (its Content-Type mentions "patch" or carries a ``name=`` parameter)
        or has no payload.
    """
    content_type_str = str(m["Content-Type"])
    # Parts that advertise a patch type or a file name are attachments, not prose.
    if "patch" in content_type_str or "name=" in content_type_str:
        return ""
    raw = m.get_payload(decode=True)
    if raw is None:
        return ""
    # Let the email package parse the charset parameter: it copes with quoting,
    # case differences and additional parameters after the charset, all of
    # which the previous manual string slicing got wrong.
    charset = m.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except (LookupError, ValueError):
        # e.g., UNKNOWN-8BIT or another label Python has no codec for.
        return raw.decode("utf-8", errors="replace")


def stringify(m, depth=0):
    """Flatten an email message into plain text.

    Multipart messages are walked recursively and the text of their parts is
    joined with newlines. text/plain parts are decoded unless they are marked
    as attachments; text/html parts are reduced to their text with
    BeautifulSoup, but only at depth 0 or 1. Everything else yields "".

    Args:
        m: an ``email.message.Message`` (or subclass such as an mbox message).
        depth: nesting level of ``m`` inside the top-level message.

    Returns:
        The extracted text (possibly the empty string).
    """
    if m.is_multipart():
        pieces = [stringify(part, depth=depth + 1) for part in m.get_payload()]
        return "\n".join(pieces).strip()

    content_type = m.get_content_type()
    if content_type == "text/plain":
        if "Content-Disposition" in m and "attachment" in str(m["Content-Disposition"]).lower():
            return ""
        return _decode_text_part(m)
    if content_type == "text/html":
        if depth > 1:
            # Skip - USUALLY duplicate of text/plain
            return ""
        content = _decode_text_part(m)
        if not content:
            return ""
        return bs4.BeautifulSoup(content, 'lxml').get_text()
    # Skip - images etc
    return ""
    
class UnionFind:
    """Disjoint-set forest over arbitrary hashable items.

    ``parent`` maps every known item to its parent (a root maps to itself).
    ``size`` maps items to a component size; the value is only kept up to
    date for the current root of each component.
    """

    def __init__(self):
        self.parent = {}
        self.size = {}

    def __contains__(self, x):
        """Return True when ``x`` has already been added."""
        return x in self.parent

    def add(self, x):
        """Register ``x`` as its own singleton set.

        Returns True if ``x`` was new, False if it was already known.
        """
        if x in self.parent:
            return False
        self.parent[x] = x
        self.size[x] = 1
        return True

    def find(self, x):
        """Return the representative of ``x``'s set, compressing the path.

        Raises KeyError if ``x`` was never added.
        """
        # First pass: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        # Second pass: point every node on the walked path straight at the root.
        while self.parent[x] != root:
            self.parent[x], x = root, self.parent[x]
        return root

    def union(self, a, b):
        """Merge the sets containing ``a`` and ``b`` (smaller one under larger)."""
        root_a, root_b = self.find(a), self.find(b)
        if root_a == root_b:
            return
        if self.size[root_a] < self.size[root_b]:
            root_a, root_b = root_b, root_a
        self.parent[root_b] = root_a
        self.size[root_a] += self.size[root_b]

In [5]:
# Open every mbox archive under ./data, grouped by mailing list and keyed by
# file name. Sorting the paths fixes the order in which later cells iterate
# over the archives.
MBOXES = {
    mailing_list: {
        mbox_path.name: mailbox.mbox(mbox_path)
        for mbox_path in sorted(Path("./data").glob(f"{mailing_list}*"))
    }
    for mailing_list in MAILING_LISTS
}

In [6]:
# for msg_i, msg in enumerate(MBOXES["pgsql-hackers"]["pgsql-hackers.202401"]):
#     if msg["Subject"] == "pg_stat_advisor extension":
#         print(msg["Subject"], msg_i)
#         break


# msg = MBOXES["pgsql-hackers"]["pgsql-hackers.202401"][2711]
# # for x in msg.get_payload():
# #     print(x["Content-Type"])
# #     print(x.get_payload()[:500])

# msg_contents = stringify(msg)
# print(msg_contents[:50000])
    
# # msg_ts_str = msg["Date"].strip()
# # # There are a bazillion ways that dates appear on the mailing list.
# # if msg_ts_str.endswith(")"):
# #     # (%Z) doesn't work for "AST" in "Sun, 4 Jan 1998 03:26:34 -0400 (AST)"
# #     # fmt = "%a, %d %b %Y %H:%M:%S %z (%Z)"
# #     msg_ts_str = msg_ts_str[:msg_ts_str.rfind("(")].strip()
# # if "," in msg_ts_str:
# #     # And at some point they dropped the "%a,"
# #     msg_ts_str = msg_ts_str[msg_ts_str.find(",")+1:].strip()
# # # And there are some completely wack dates like "28 Feb 1998 22:25:42 -7700"
# # msg_ts = dateparser.parse(msg_ts_str)
# # if msg_ts is None:
# #     print(msg_ts_str, msg)
# # msg_ts = msg_ts.isoformat(" ")
# # # This is stupid. We'll use an offset instead.
# # cur.execute("INSERT INTO email VALUES (?, ?, ?, ?)", (msg_id, msg_contents, mbox_name, msg_i))

# threads = {}
# for m in mbox:
#     key = m["Message-ID"].strip()
#     rep = uf.find(key)
#     thread_rep = mbox[mbox_map[key]]
#     msg = mbox[mbox_map[key]]
#     if rep not in threads:
#         threads[rep] = []
#     threads[rep].append(msg["Subject"])

# threads

In [7]:
def get_refs(s):
    """Extract every ``<...>`` token from a References / In-Reply-To value.

    The value may contain MIME encoded-words (``=?charset?B|Q?text?=``),
    possibly folded over several lines with a message id split across two
    words. Encoded-words are decoded first (upper- or lower-case encoding
    letter, anywhere in the value); text that is not an encoded-word is kept
    as-is instead of raising.

    Args:
        s: the raw header value as a string.

    Returns:
        A list of the bracketed tokens in order of appearance, each including
        its angle brackets. A trailing ``<`` with no closing ``>`` is ignored.
    """
    def decode_word(match):
        charset, encoding, encoded_text = match.groups()
        try:
            if encoding.upper() == "B":
                byte_string = base64.b64decode(encoded_text)
            else:
                # header=True is deliberately not passed: "_" stays a literal
                # underscore, exactly as this function has always treated it.
                byte_string = quopri.decodestring(encoded_text)
        except ValueError:
            # Undecodable payload (binascii.Error is a ValueError): keep the raw word.
            return match.group(0)
        try:
            return byte_string.decode(charset, errors="replace")
        except LookupError:
            # Unknown charset label.
            return byte_string.decode("utf-8", errors="replace")

    s = s.strip()
    if "=?" in s:
        # Whitespace (including folding newlines/tabs) between two adjacent
        # encoded-words is not part of the text, so the pieces of a split
        # message id must be glued back together.
        s = re.sub(r"\?=\s+=\?", "?==?", s)
        s = re.sub(r"=\?([^?\s]+)\?([BbQq])\?([^?]*)\?=", decode_word, s)

    # Some emails will just have two @'s, so you can't check that.
    # e.g., <20000717201117.AAA24984@mailserver.sixdegrees.com@sorrow>
    # And you're not guaranteed complete references either, so only save the
    # good ones: from a "<" up to the next ">".
    return re.findall(r"<[^>]*>", s)



# Sample header values for exercising get_refs by hand.

# Folded value made of "=?utf-8?Q?...?=" encoded words; several of the <...>
# ids are split across two consecutive words.
blah = """=?utf-8?Q?_<AANLkTikaQj-YKgYRTWg5UKOZ4uIO13q8gOZRevOnR6Iz@mail.gma?=
	=?utf-8?Q?il.com>_<AANLkTikFqI88TQDI359=5FpWgOa98RAApHaC3rj9ZNB-4V@mail.gmai?=
	=?utf-8?Q?l.com>_<20100524160843.5158c3d5.wmoran@potentialtech.com>_<AAN?=
	=?utf-8?Q?LkTilvwQXHVkeAMmAlAnYYNEMRsTZG7pmzZxoyMRBz@mail.gmail.com>?="""
# A single plain, unencoded id.
blah2 = """<80c80b4a-b87b-456f-bd46-1ae326601d79.xiyuan.zr@alibaba-inc.com>"""

# Same as `blah` but with a leading space before the first encoded word.
blah3 = """ =?utf-8?Q?_<AANLkTikaQj-YKgYRTWg5UKOZ4uIO13q8gOZRevOnR6Iz@mail.gma?=
	=?utf-8?Q?il.com>_<AANLkTikFqI88TQDI359=5FpWgOa98RAApHaC3rj9ZNB-4V@mail.gmai?=
	=?utf-8?Q?l.com>_<20100524160843.5158c3d5.wmoran@potentialtech.com>_<AAN?=
	=?utf-8?Q?LkTilvwQXHVkeAMmAlAnYYNEMRsTZG7pmzZxoyMRBz@mail.gmail.com>?="""
# Two bracketed tokens back to back, the second one without an "@".
blah4 ="""<AANLkTilvwQXHVkeAMmAlAnYYNEMRsTZG7pmzZxoyMRBz@mail.gmail.com><asdasd>"""

# Display the parsed references for the last sample as the cell output.
get_refs(blah4)

['<AANLkTilvwQXHVkeAMmAlAnYYNEMRsTZG7pmzZxoyMRBz@mail.gmail.com>', '<asdasd>']

In [8]:
# Build (or load from cache) one union-find per mailing list in which every
# message id is merged with the ids it references, so that each connected
# component corresponds to one thread.
UF = {}

for mailing_list, mboxes in MBOXES.items():
    uf_path = Path(f"uf_{mailing_list}.pickle")
    if not uf_path.exists():
        uf = UnionFind()
        with tqdm(total=len(mboxes)) as pbar:
            for mbox_name, mbox in mboxes.items():
                pbar.set_description(f"Processing UF {mbox_name}")
                for msg in mbox:
                    raw_id = msg["Message-ID"]
                    if raw_id is None:
                        # Without an id the message cannot be linked to anything.
                        continue
                    msg_id = raw_id.strip()
                    uf.add(msg_id)
                    # Collect every id mentioned in either reference header.
                    linked_ids = [
                        ref_id
                        for ref_key in ("References", "In-Reply-To")
                        if ref_key in msg
                        for ref_id in get_refs(str(msg[ref_key]))
                    ]
                    for ref_id in linked_ids:
                        uf.add(ref_id)
                        uf.union(msg_id, ref_id)
                pbar.update(1)
        with open(uf_path, "wb") as f:
            pickle.dump(uf, f)
    # Always go through the pickle so cached and fresh runs behave the same.
    with open(uf_path, "rb") as f:
        UF[mailing_list] = pickle.load(f)

print("Note that num_known_messages may not equal num_available_messages because some emails are not available on the mailing list.")
for key, known in UF.items():
    num_known_messages = len(known.parent)
    print(f"{key}: {num_known_messages=}")

Note that num_known_messages may not equal num_available_messages because some emails are not available on the mailing list.
pgsql-hackers: num_known_messages=588636
pgsql-performance: num_known_messages=65043


In [9]:
# Build (or load from cache), per mailing list:
#   EMAILS  - message id -> message object
#   THREADS - union-find representative -> list of message ids in that thread,
#             in the order the messages are encountered in the mbox files
EMAILS = {}
THREADS = {}

for mailing_list, mboxes in MBOXES.items():
    emails_path = Path(f"emails_{mailing_list}.pickle")
    threads_path = Path(f"threads_{mailing_list}.pickle")
    # Both pickles are loaded below, so rebuild when either one is missing
    # (checking only the threads file fails later if the emails file is gone).
    if not (emails_path.exists() and threads_path.exists()):
        emails = {}
        threads = {}
        uf = UF[mailing_list]
        # The progress total is the number of ids known to the union-find,
        # which also counts referenced-but-unavailable messages: an estimate.
        with tqdm(total=len(uf.parent)) as pbar:
            for mbox_name, mbox in mboxes.items():
                pbar.set_description(f"Building threads out of messages from: {mbox_name}")
                for msg in mbox:
                    pbar.update(1)
                    raw_id = msg["Message-ID"]
                    if raw_id is None:
                        continue
                    msg_id = raw_id.strip()
                    assert msg_id not in emails, f"duplicate Message-ID: {msg_id}"
                    emails[msg_id] = msg
                    # Group the message under its thread's representative id.
                    threads.setdefault(uf.find(msg_id), []).append(msg_id)
        with open(emails_path, "wb") as f:
            pickle.dump(emails, f)
        with open(threads_path, "wb") as f:
            pickle.dump(threads, f)
    with open(emails_path, "rb") as f:
        EMAILS[mailing_list] = pickle.load(f)
    with open(threads_path, "rb") as f:
        THREADS[mailing_list] = pickle.load(f)

for mailing_list in MAILING_LISTS:
    num_emails = len(EMAILS[mailing_list])
    num_threads = len(THREADS[mailing_list])
    print(f"{mailing_list}: {num_emails=} {num_threads=}")

pgsql-hackers: num_emails=579909 num_threads=58504
pgsql-performance: num_emails=63689 num_threads=9718


In [10]:
# Build (or load from cache) the per-list dataset: one row per thread, each row
# a list of dicts with the date, subject, sender, extracted text and an
# "is this from the thread's first sender" flag for every message.
DATASET = {}

# Threads containing any of these message ids are dropped entirely.
excessively_long_emails = frozenset([
    # pgsql-hackers
    # https://www.postgresql.org/message-id/Pine.UW2.4.21.0110281928170.2268-100000%40server.pyrenet.fr
    "<Pine.UW2.4.21.0110281928170.2268-100000@server.pyrenet.fr>",
    # https://www.postgresql.org/message-id/AANLkTimU421rHhBhurUMHN9UhJbh38k657vWMsXNs5i-%40mail.gmail.com
    "<AANLkTimU421rHhBhurUMHN9UhJbh38k657vWMsXNs5i-@mail.gmail.com>",
    # https://www.postgresql.org/message-id/20191105024819.GB12780%40momjian.us
    "<20191105024819.GB12780@momjian.us>",
    # pgsql-performance
    # https://www.postgresql.org/message-id/CAN0SRDFgPhyyP2yBphfVEYO0-K2uFXUk%2BC6euTCdx9jt5QkYwQ%40mail.gmail.com
    "<CAN0SRDFgPhyyP2yBphfVEYO0-K2uFXUk+C6euTCdx9jt5QkYwQ@mail.gmail.com>",
    # https://www.postgresql.org/message-id/0C3B1B2B-6571-477D-A89C-6FA7B6259840%40spectralogic.com
    "<0C3B1B2B-6571-477D-A89C-6FA7B6259840@spectralogic.com>",
    # https://www.postgresql.org/message-id/CAAUL%3DcFcvUo%3D7b4T-K5PqiqrF6etp59qcgv77DyK2Swa4VhYuQ%40mail.gmail.com
    "<CAAUL=cFcvUo=7b4T-K5PqiqrF6etp59qcgv77DyK2Swa4VhYuQ@mail.gmail.com>",
])

for mailing_list in MAILING_LISTS:
    dataset = []
    dataset_path = Path(f"dataset_{mailing_list}.pickle")
    if not dataset_path.exists():
        emails = EMAILS[mailing_list]
        threads = THREADS[mailing_list]
        with tqdm(total=len(threads)) as pbar:
            pbar.set_description(f"Processing threads for {mailing_list}")
            for thread_rep, msg_ids in threads.items():
                pbar.update(1)
                if not msg_ids:
                    continue
                if not excessively_long_emails.isdisjoint(msg_ids):
                    continue
                if any(str(emails[msg_id]["Subject"]).lower() == "unsubscribe" for msg_id in msg_ids):
                    # Signatures.
                    continue

                # Sender of the first message in the thread (raw header value).
                msg_op = emails[msg_ids[0]]["From"]
                data_row = []
                for msg_id in msg_ids:
                    msg = emails[msg_id]
                    try:
                        msg_contents = stringify(msg)
                    except Exception as e:
                        # Show the offending message before propagating the error.
                        print(msg)
                        raise e
                    sender = str(msg["From"])
                    data_row.append({
                        "msg_date": str(msg["Date"]),
                        "msg_subject": str(msg["Subject"]),
                        "msg_from": sender,
                        "msg_contents": msg_contents,
                        "msg_from_op": sender == msg_op,
                    })
                # Keep the thread only if at least one message produced text.
                if any(len(entry["msg_contents"]) > 0 for entry in data_row):
                    dataset.append(data_row)
        with open(dataset_path, "wb") as f:
            pickle.dump(dataset, f)
    with open(dataset_path, "rb") as f:
        DATASET[mailing_list] = {"threads": pickle.load(f)}

In [11]:
# Model id of the tokenizer that is loaded with AutoTokenizer further down.
TOKENIZER_MODEL = "unsloth/Llama-3.2-3B-bnb-4bit"
# Maximum sequence length in tokens - presumably the training context budget; TODO confirm.
MAX_SEQ_LEN = 2048

# System message template. Placeholders, in order: mailing list name, then the
# date, subject and contents of the first email of the thread.
SYSTEM_PROMPT = """You are an expert PostgreSQL developer helping people on the '{}' mailing list.
You are composing a reply to resolve the email chain below.

### Original Email:
Date: {}
Subject: {}
Contents: {}

### Some emails may be omitted for brevity."""

# User message template. Placeholders: date, subject and contents of the email
# immediately preceding the reply. Note the trailing newline.
USER_PROMPT = """### Latest Email:
Date: {}
Subject: {}
Contents: {}
"""

# Assistant message template. Placeholders: date, subject and contents of the
# reply itself.
GPT_PROMPT = """### Your Response:
Date: {}
Subject: {}
Contents: {}"""

# unsloth
# {'conversations': [
#   {'from': 'human', 'value': 'Give three tips for staying healthy.'},
#   {'from': 'gpt', 'value': '1. Eat a balanced and nutritious ...'},
#   {'from': 'human', 'value': 'Describe what a monotheistic religion is.'},
#   {'from': 'gpt', 'value': 'A monotheistic religion is a type of ...'}
# ]}
# The same list-of-{'from', 'value'} layout is produced here, with the roles
# "system", "user" and "assistant".
CONVERSATIONS = {}

for mailing_list in MAILING_LISTS:
    conversations = []
    conversations_path = Path(f"conversations_{mailing_list}.pickle")
    if not conversations_path.exists():
        threads = DATASET[mailing_list]["threads"]
        with tqdm(total=len(threads)) as pbar:
            pbar.set_description(f"Processing threads for {mailing_list}")
            for thread in threads:
                pbar.update(1)
                if len(thread) <= 1:
                    # A thread with a single email has no reply to learn from.
                    continue

                first_email = thread[0]
                # The system prompt depends only on the thread's first email, so
                # it is formatted once per thread instead of once per reply.
                system_value = SYSTEM_PROMPT.format(
                    mailing_list,
                    first_email["msg_date"],
                    first_email["msg_subject"],
                    first_email["msg_contents"],
                )

                # One three-message conversation per reply: (system, the email
                # right before the reply, the reply). Keeping each conversation
                # this short is what makes tokenizing possible.
                for prev_email, latest_email in zip(thread, thread[1:]):
                    conversations.append([
                        {
                            "from": "system",
                            "value": system_value,
                        },
                        {
                            "from": "user",
                            "value": USER_PROMPT.format(
                                prev_email["msg_date"],
                                prev_email["msg_subject"],
                                prev_email["msg_contents"],
                            ),
                        },
                        {
                            "from": "assistant",
                            "value": GPT_PROMPT.format(
                                latest_email["msg_date"],
                                latest_email["msg_subject"],
                                latest_email["msg_contents"],
                            ),
                        },
                    ])
        with open(conversations_path, "wb") as f:
            pickle.dump(conversations, f)
    with open(conversations_path, "rb") as f:
        CONVERSATIONS[mailing_list] = {"conversations": pickle.load(f)}

In [12]:
# Load the tokenizer whose BOS/EOS tokens frame each prompt and whose
# tokenize() output length is used as the prompt's token count below.
TOKENIZER_MODEL = "unsloth/Llama-3.2-3B-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)

In [14]:
# Render every conversation into one prompt string and record its token count.
# TEXTS[list] is a list of (text, token_count) tuples; prompts longer than
# 7500 characters are not tokenized and are stored separately instead.
TEXTS = {}
for mailing_list in MAILING_LISTS:
    conversations = CONVERSATIONS[mailing_list]["conversations"]
    texts_path = Path(f"texts_{mailing_list}.pickle")
    untokenized_texts_path = Path(f"untokenized_texts_{mailing_list}.pickle")
    if not texts_path.exists():
        texts = []
        untokenized_texts = []
        with tqdm(total=len(conversations)) as pbar:
            pbar.set_description(f"Forming prompts for {mailing_list}")
            for conversation in conversations:
                pbar.update(1)
                # Every non-empty conversation must open with the system message.
                assert not conversation or conversation[0]["from"] == "system"
                pieces = [tokenizer.bos_token]
                for msg in conversation:
                    pieces += [
                        "<|start_header_id|>",
                        msg["from"],
                        "<|end_header_id|>\n\n",
                        msg["value"],
                        "<|eot_id|>",
                    ]
                pieces.append(tokenizer.eos_token)
                text = "".join(pieces)
                if len(text) > 7500:
                    untokenized_texts.append(text)
                else:
                    texts.append((text, len(tokenizer.tokenize(text))))
        with open(untokenized_texts_path, "wb") as f:
            pickle.dump(untokenized_texts, f)
        with open(texts_path, "wb") as f:
            pickle.dump(texts, f)
    with open(texts_path, "rb") as f:
        TEXTS[mailing_list] = pickle.load(f)

Forming prompts for pgsql-hackers:   0%|          | 1978/520899 [00:08<30:11, 286.49it/s]

In [17]:
# Keep only the prompts that fit in MAX_SEQ_LEN tokens.
# FILTERED_TEXTS[list] is a dict with a single "text" column (list of prompts).
FILTERED_TEXTS = {}
for mailing_list in MAILING_LISTS:
    # Total number of messages across all conversations (not the number of
    # conversations), printed next to the list name.
    num_emails = sum(
        len(conversation)
        for conversation in CONVERSATIONS[mailing_list]["conversations"]
    )
    print(mailing_list, num_emails)
    filtered_texts_path = Path(f"filteredtexts_{mailing_list}.pickle")
    if not filtered_texts_path.exists():
        # Use the shared MAX_SEQ_LEN constant rather than repeating 2048 here.
        filtered_texts = {
            "text": [text for (text, toklen) in TEXTS[mailing_list] if toklen <= MAX_SEQ_LEN]
        }
        with open(filtered_texts_path, "wb") as f:
            pickle.dump(filtered_texts, f)
    with open(filtered_texts_path, "rb") as f:
        FILTERED_TEXTS[mailing_list] = pickle.load(f)

pgsql-hackers 1562697
pgsql-performance 161640


In [5]:
# Sanity check: load the "train" split of each list's dataset and of its
# "-processed" variant, and print what was loaded.
for mailing_list in MAILING_LISTS:
    for suffix in ("", "-processed"):
        dsname = mailing_list + suffix
        ds = load_dataset(f"wanshenl/{dsname}", split="train")
        print(dsname, ds)

pgsql-hackers Dataset({
    features: ['threads'],
    num_rows: 58371
})
pgsql-hackers-processed Dataset({
    features: ['text'],
    num_rows: 384601
})
pgsql-performance Dataset({
    features: ['threads'],
    num_rows: 9686
})
pgsql-performance-processed Dataset({
    features: ['text'],
    num_rows: 33722
})


In [41]:
import pandas as pd

# NOTE(review): this cell reads `ds` and `ds2` and expects rows with a
# "conversations" column; no visible cell assigns `ds2` or loads such a column,
# so it presumably relies on leftover kernel state - TODO confirm.


def _print_length_stats(label, dataset):
    """Print summary statistics of per-row text size for ``dataset``.

    For each row the ``"value"`` strings of its ``"conversations"`` entries are
    measured twice - by character count ("#char") and by number of spaces
    ("#space") - and summed; the ``describe()`` of those sums is printed with
    extra upper percentiles. Returns the last Series computed ("#space").
    """
    measures = (
        ("#char", len),
        ("#space", lambda text: text.count(" ")),
    )
    for suffix, measure in measures:
        print(f"{label} {suffix}")
        totals = pd.Series([
            sum([measure(x["value"]) for x in dataset[i]["conversations"]])
            for i in range(len(dataset))
        ])
        print(totals.describe([.75, .8, .85, .9, .95, .99]).apply(lambda x: format(x, 'g')))
    return totals


s = _print_length_stats("pgsql-performance", ds)
s = _print_length_stats("pgsql-hackers", ds2)

pgsql-performance #char
count           9718
mean         13717.8
std          38778.5
min               90
50%           6600.5
75%            14935
80%          18213.8
85%          22528.1
90%          29552.5
95%          44991.1
99%           102610
max      2.34708e+06
dtype: object
pgsql-performance #space
count       9718
mean      2507.9
std      7237.06
min           11
50%         1101
75%       2631.5
80%       3207.2
85%       4023.9
90%       5383.9
95%      8526.05
99%      20354.7
max       469102
dtype: object
pgsql-hackers #char
count          57080
mean         21945.5
std           120435
min               88
50%           5244.5
75%          15143.2
80%          19823.8
85%          27065.3
90%          40414.6
95%          74222.4
99%           260732
max      9.91286e+06
dtype: object
pgsql-hackers #space
count          57080
mean         3479.23
std          20244.1
min               11
50%              820
75%             2403
80%             3158
85%          

In [1]:
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B-bnb-4bit")
# tokenizer.tokenize("select cat from foo as xxx where xxx.value = 'joe'")

In [52]:
# for mailing_list in MAILING_LISTS:
#     toklen_path = Path(f"toklen_{mailing_list}.pickle")
#     if not toklen_path.exists():
#         s = []
#         ds = load_dataset(f"wanshenl/{mailing_list}", split="train")
#         for i in tqdm(range(len(ds))):
#             messages = ds[i]["conversations"]
#             toklens = [len(tokenizer.tokenize(msg["value"])) for msg in messages]
#             toklen = sum(toklens)
#             s.append(toklen)
#         s = pd.Series(s)
#         with open(toklen_path, "wb") as f:
#             pickle.dump(s, f)
#     with open(toklen_path, "rb") as f:
#         s = pickle.load(f)
#     print(f"{mailing_list} #tokens")
#     print(s.describe([.75, .8, .85, .9, .95, .99]).apply(lambda x: format(x, 'g')))

pgsql-hackers #tokens
count          57080
mean          6109.7
std          36969.3
min               34
50%             1418
75%          4079.25
80%           5378.2
85%          7300.15
90%            10907
95%          20001.9
99%          71109.7
max      2.92211e+06
dtype: object
pgsql-performance #tokens
count       9718
mean     3847.36
std      14574.2
min           33
50%       1780.5
75%      4079.25
80%       4990.2
85%      6201.45
90%       8167.9
95%      12338.1
99%        28180
max       960927
dtype: object


In [72]:
# ds = load_dataset(f"wanshenl/pgsql-hackers", split="train")
# with open(Path(f"toklen_pgsql-hackers.pickle"), "rb") as f:
#     s = pickle.load(f)
# ds["conversations"][s.argmax()]
# 16 Ki: the value used as the threshold when filtering `s` in the next cell.
16*1024

16384

In [73]:
# Select the entries of `s` that are at most 16384 (= 16 * 1024).
# NOTE(review): `s` is presumably the token-count Series produced by the
# commented-out toklen cell; it is not assigned by this cell - TODO confirm.
s[s <= 16384]

0         369
1          48
2         134
3        3408
4         588
         ... 
57075    1721
57076     508
57077     490
57078     365
57079     347
Length: 53454, dtype: int64