In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from transformers.generation import GenerationConfig
from tqdm.notebook import tqdm


In [2]:
!pwd

/mystuff/notebooks/to_be_uploaded


In [4]:
!ls /mystuff/wikipedia2

Pipfile
Pipfile.lock
chroma_vectors.db
deotte-data
embedding_vectors_256_head_bge_small_en.db
embedding_vectors_512_head_bge_small_en.db
enwiki-20230801-pages-articles-multistream.xml.bz2
enwiki-20230801.json
extraction
faiss-gpu-173-python310.zip
faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
faiss_index_256_v1_bge_small_en.index
first10k.json
wikipedia-2023-07-faiss-index.zip
wikipedia_202307.index
wikipedia_chunks_128.db
wikipedia_chunks_192.db
wikipedia_chunks_256.db
wikipedia_chunks_384.db
wikipedia_chunks_512.db
wikipedia_chunks_64.db


In [5]:
#!head /mystuff/wikipedia2/enwiki-20230801.json

# Read Articles + Collect Main Structure of Data

In [6]:
import json
from tqdm.notebook import tqdm
tqdm.pandas()

# recursive key extraction from dict
def extract_keys(data, keys_set=None):
    if keys_set is None:
        keys_set = set()

    if isinstance(data, dict):
        for key, value in data.items():
            keys_set.add(key)
            extract_keys(value, keys_set)
    elif isinstance(data, list):
        for item in data:
            extract_keys(item, keys_set)

    return keys_set

# Load JSON dump of Wikipedia
file_path = "/mystuff/wikipedia2/enwiki-20230801.json"
all_keys = set()
rows = []

# Each line in the JSON as an article
with open(file_path, "r") as file:
    for line in tqdm(file):
        try:
            row = json.loads(line)
            rows.append(row)
            #keys seem to likes of "section_texts" and "section_titles"
            keys = extract_keys(row)
            all_keys.update(keys)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

# Convert set to list and print
all_keys_list = list(all_keys)
print(all_keys_list)


0it [00:00, ?it/s]

['interlinks', 'section_texts', 'title', 'section_titles']


## Collect All Article Titles

In [10]:
head_titles = [d["title"] for d in rows]

## Collect All Section Titles

In [9]:
titles_list = [title for d in rows for title in d["section_titles"]]


In [11]:
type(rows[0])

dict

## Count Section Titles

to remove the most common but less relevant section titles

In [23]:
from collections import Counter

counter = Counter(titles_list)

#print(counter)

In [24]:
counter.most_common(30)

[('Introduction', 6085645),
 ('References', 5285231),
 ('External links', 2815284),
 ('See also', 1456666),
 ('History', 764821),
 ('Notes', 340065),
 ('Career', 339617),
 ('Further reading', 232363),
 ('Biography', 230440),
 ('Personal life', 216187),
 ('Early life', 186275),
 ('Track listing', 161421),
 ('Geography', 159191),
 ('Reception', 156430),
 ('Bibliography', 156010),
 ('Cast', 152554),
 ('Background', 149296),
 ('Plot', 134641),
 ('Description', 133764),
 ('Sources', 124457),
 ('Demographics', 107919),
 ('Personnel', 100709),
 ('Awards', 99570),
 ('Discography', 98291),
 ('Honours', 93032),
 ('Education', 92365),
 ('Filmography', 89360),
 ('Results', 87131),
 ('Early life and education', 84268),
 ('Production', 81372)]

## List common section titles to remove from all articles. The point in this use case is to search for answers to queries about the article content. Cited sources etc are not very relevant in most cases.

In [15]:
sections_to_remove = ["References", "External links", "See also", "Further reading", "Bibliography", "Sources", "Cited sources", \
                      "General and cited references", "General sources", "Copyright note"]

In [16]:
for title in rows[0]["section_titles"]:
    print(title)

Introduction
Etymology, terminology, and definition
History
Thought
Tactics
Key issues
Criticism
See also
References
Further reading
External links


Quick check to see what gets removed using the above section to remove list:

In [17]:
row_text = ""
for idx, title in enumerate(rows[0]["section_titles"]):
    print(idx, title)
    if title in sections_to_remove:
        print("found: "+str(rows[0]["section_titles"][:idx]))
        continue
    row_text += f"### {title} ### "
    row_text += rows[0]["section_texts"][idx]
    print(len(row_text))


0 Introduction
2073
1 Etymology, terminology, and definition
5184
2 History
15869
3 Thought
24278
4 Tactics
31818
5 Key issues
42130
6 Criticism
47226
7 See also
found: ['Introduction', 'Etymology, terminology, and definition', 'History', 'Thought', 'Tactics', 'Key issues', 'Criticism']
8 References
found: ['Introduction', 'Etymology, terminology, and definition', 'History', 'Thought', 'Tactics', 'Key issues', 'Criticism', 'See also']
9 Further reading
found: ['Introduction', 'Etymology, terminology, and definition', 'History', 'Thought', 'Tactics', 'Key issues', 'Criticism', 'See also', 'References']
10 External links
found: ['Introduction', 'Etymology, terminology, and definition', 'History', 'Thought', 'Tactics', 'Key issues', 'Criticism', 'See also', 'References', 'Further reading']


In [18]:
rows[0].keys()

dict_keys(['title', 'section_titles', 'section_texts', 'interlinks'])

In [19]:
len(rows[0]["section_texts"])

11

## Try Some Embeddings

In [21]:
embedding_llm = "/mystuff/llm/gte-base"

In [22]:
%%time
from sentence_transformers.util import cos_sim

sentences = ['That is a happy person', 'That is a very happy person']

embedding_model = SentenceTransformer(embedding_llm)
embeddings = embedding_model.encode(sentences)
print(cos_sim(embeddings[0], embeddings[1]))


tensor([[0.9751]])
CPU times: user 4.96 s, sys: 648 ms, total: 5.61 s
Wall time: 2.2 s


## Load the Model and Associated Tokenizer

In [22]:
llm = "/mystuff/llm/flan-t5-large/flan-t5-large"
#llm = "gpt2"

In [23]:
tokenizer = AutoTokenizer.from_pretrained(llm, local_files_only=True)

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [24]:
print(rows[0]["section_titles"][0])

Introduction


## Try a Few Data Cleaning Approaches

This was never used in the end by me, but sometimes useful to document the trials

In [25]:
"    test   ".lstrip()

'test   '

In [26]:
import re

string = "dasdə\n==== Post-WWII ====\nContent1\n==== Another Section ====\nContent2"

def split_into_subsections(row):
    row = "\n"+row
    # Using re.split to split the string
    sections = re.split(r'\n==+\s*(.*?)\s*==+\n', row)
    
    titles = []
    content = []
    for i, section in enumerate(sections):
        if i % 2 == 1: # Titles are at odd indices
            titles.append(section)
        elif i % 2 == 0 and section.strip(): # Content at even indices, ignoring empty strings
            content.append(section.lstrip())
    return titles, content

titles,content = split_into_subsections(string)

print("Titles:")
for title in titles:
    print(title)

print("\nContent:")
for section in content:
    print(section)


Titles:
Post-WWII
Another Section

Content:
dasdə
Content1
Content2


## Remove Sections to Remove from all Docs

In [27]:
def process_row(row):
    title = row["title"]
    print(title)
    for section_title, section_text in zip(row["section_titles"], row["section_texts"]):
        if section_title in sections_to_remove:
            continue
        #print("------------")
        print(section_title)
        #print("------------")
        #print(section_text.lstrip())

process_row(rows[0])

Anarchism
Introduction
Etymology, terminology, and definition
History
Thought
Tactics
Key issues
Criticism


## Set up Langchain for Chunking


In [28]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
import warnings
import logging

#chunk_size = 512 
#chunk_overlap = 100
chunk_size = 64 
chunk_overlap = 0

# separators: https://github.com/langchain-ai/langchain/discussions/3786
#text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
#    tokenizer, chunk_size=512, chunk_overlap=100
#)
# recursive splitter tries the given separators recursively until best match for given text is found to fit in chunk_size
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n\n", "\n", ". "]
)

#___text_splitter = SentenceTransformersTokenTextSplitter.from_huggingface_tokenizer(
#    model_name=llm, chunk_size=512, chunk_overlap=100
#)
#__text_splitter = RecursiveCharacterTextSplitter(
#    # Set a really small chunk size, just to show.
#    chunk_size = 100,
#    chunk_overlap  = 20,
#    length_function = len,
#    add_start_index = True,
#)

def chunk_section(section_text, title, section_title):
    #https://stackoverflow.com/questions/14463277/how-to-disable-python-warnings
#    with warnings.catch_warnings():
    #langchain actually uses logging.warning and not warning.warn, so have to do this:
    #https://stackoverflow.com/questions/27647077/fully-disable-python-logging
    #https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/text_splitter.py
    logging.disable(logging.WARNING)
    texts = text_splitter.split_text(section_text)
    texts_filtered = []
    for text in texts:
        token_count = len(tokenizer(text)["input_ids"])
        if token_count > 800:
#        if token_count < 5 or token_count > 800:
            continue
#        if token_count > 800:
#            print("**"+title)    
#            print("**"+section_title)
#            print(text)
        texts_filtered.append(text)
    logging.disable(logging.NOTSET)
    return texts_filtered


## Define SQLite Helper Functions to save chunks

In [40]:
import sqlite3

connection = sqlite3.connect("wikipedia_chunks_256.db")
cursor = connection.cursor()

create_table_sql = """
CREATE TABLE documents (
    document_id INTEGER PRIMARY KEY AUTOINCREMENT,
    document_title TEXT
);

CREATE TABLE sections (
    section_id INTEGER PRIMARY KEY AUTOINCREMENT,
    document_id INTEGER,
    section_title TEXT,
    FOREIGN KEY (document_id) REFERENCES documents (document_id)
);

CREATE TABLE text_chunks (
    chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,
    section_id INTEGER,
    document_id INTEGER,
    content TEXT,
    FOREIGN KEY (section_id) REFERENCES sections (section_id)
    FOREIGN KEY (document_id) REFERENCES documents (section_id)
);

CREATE INDEX idx_section_id ON text_chunks (section_id);
CREATE INDEX idx_document_id ON text_chunks (document_id);
CREATE INDEX idx_section_document_id ON sections (document_id);

"""

cursor.executescript(create_table_sql)
connection.commit()
connection.close()


OperationalError: table documents already exists

In [41]:
import sqlite3

connection = sqlite3.connect("wikipedia_chunks_256.db")
cursor = connection.cursor()

def insert_sqlite_document(doc_title, sections, chunks):
    try:
        # Insert a new document and retrieve its ID
        cursor.execute("INSERT INTO documents (document_title) VALUES (?)", (doc_title,))
        document_id = cursor.lastrowid

        for idx, section_title in enumerate(sections):
            # Insert a new section using the document ID and retrieve its ID
            cursor.execute("INSERT INTO sections (document_id, section_title) VALUES (?, ?)", (document_id, section_title))
            section_id = cursor.lastrowid

            for chunk_text in chunks[idx]:
                # Insert a new text chunk using the section ID and document ID
                cursor.execute("INSERT INTO text_chunks (section_id, document_id, content) VALUES (?, ?, ?)", (section_id, document_id, chunk_text))
        connection.commit()

    except sqlite3.Error as e:
        print(f"Error with document: {doc_title}")
        print("Error:", e)
        connection.rollback()



In [42]:
#from transformers.utils import logging

#https://github.com/langchain-ai/langchain/discussions/3786

def process_row(row):
    #logging.set_verbosity(40)
    title = row["title"]
    #print(title)
    all_chunks = []
    all_sections = []
    section_idx = 0
    for section_title, section_text in zip(row["section_titles"], row["section_texts"]):
        if section_title in sections_to_remove:
            continue
        section_idx += 1
        #subsection_titles, subsection_texts = split_into_subsections(section_text)
        section_text = section_text.lstrip()
        if section_idx == 1:
            section_text = section_title + " - " + section_text
        section_chunks = chunk_section(section_text, title, section_title)
#        if title == "April 6" and False:
#            print("------------")
#            print(section_title)
#            print("------------")
#            print("\n****".join(section_chunks))
        all_chunks.append(section_chunks)
        all_sections.append(section_title)
        #print("\n****".join(section_chunks))
        #print(len(section_chunks))
    #logging.set_verbosity(30)
    insert_sqlite_document(title, all_sections, all_chunks)
    return all_chunks, title

chunk_sizes = []

for row in tqdm(rows):
    all_chunks,title = process_row(row)
    #if title == "April 6":
    #    break
    for section_chunks in all_chunks:
        chunk_sizes.extend([len(tokenizer(chunk)["input_ids"]) for chunk in section_chunks])

connection.close()
#all_chunks = process_row(rows[2])
#all_chunks = chunk_section(state_of_the_union)


  0%|          | 0/6082528 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [43]:
import pandas as pd
df_chunk_sizes = pd.DataFrame()
df_chunk_sizes["size"] = chunk_sizes
df_chunk_sizes.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,size
count,12272.0
mean,43.13918
std,17.50598
min,2.0
25%,32.0
50%,43.0
75%,53.0
max,266.0


In [44]:
df_chunk_sizes.sort_values(by="size").tail(10)

Unnamed: 0,size
9815,156
5433,158
12240,163
1146,173
1222,185
1038,201
9465,211
2251,212
734,223
620,266


In [51]:
missed_docs = ["2019–20 Coupe de France preliminary rounds, Grand Est", "Garbh-bheinn (Skye)", \
               "Electoral district of Pastoral District of Murrumbidgee", "National Reform Association (chartered 1864)", \
               "Florence Moog", "Rafi Bohic", "Adventures in Silverado", "Albert Ganado", \
               "The Watchmaker of Everton", "Henria railway station", "Marcus O'Lone"]

In [52]:
#from transformers.utils import logging

#https://github.com/langchain-ai/langchain/discussions/3786

for row in tqdm(rows):
    if row["title"] not in missed_docs:
        continue
    print(f"found missing title: {row['title']}")
    all_chunks = process_row(row)
    for section_chunks in all_chunks:
        chunk_sizes.extend([len(tokenizer(chunk)["input_ids"]) for chunk in section_chunks])

connection.close()
#all_chunks = process_row(rows[2])
#all_chunks = chunk_section(state_of_the_union)

  0%|          | 0/6082528 [00:00<?, ?it/s]

found missing title: 2019–20 Coupe de France preliminary rounds, Grand Est
found missing title: Garbh-bheinn (Skye)
found missing title: Electoral district of Pastoral District of Murrumbidgee
found missing title: National Reform Association (chartered 1864)
found missing title: Florence Moog
found missing title: Rafi Bohic
found missing title: Adventures in Silverado
found missing title: Albert Ganado
found missing title: The Watchmaker of Everton
found missing title: Henria railway station
found missing title: Marcus O'Lone


# Example Splitter Trials

In [47]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=512, chunk_overlap=20
)
texts = text_splitter.split_text("dsss")
print(texts[0])
print(len(texts))

dsss
1


In [48]:
with open("state_of_the_union.txt") as f:
    state_of_the_union = f.read()

In [50]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=100, chunk_overlap=20
)
texts = text_splitter.split_text(state_of_the_union)
print(texts[0])

Here is the full text of President Joe Biden’s State of the Union address, as prepared for delivery and released by the White House on Tuesday.

    Mr. Speaker. Madam Vice President. Our First Lady and Second Gentleman.

    Members of Congress and the Cabinet. Leaders of our military.

    Mr. Chief Justice, Associate Justices, and retired Justices of the Supreme Court.
