In [1]:
from datasets import load_dataset

ds = load_dataset("wikimedia/wikipedia", "20231101.en")

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

In [2]:
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from qdrant_client import QdrantClient
from qdrant_client import models
import numpy as np


client = QdrantClient(url="http://10.11.12.134:6333", timeout=50)
#client = QdrantClient("http://localhost", port=6333, grpc_port=6334)

COLLECTION_NAME = 'wikipedia_20231101_en_text512_embeddings'


In [11]:
print(client)

<qdrant_client.qdrant_client.QdrantClient object at 0x7f07eda5cc40>


In [12]:
from qdrant_client.models import ScalarQuantization, ScalarQuantizationConfig
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
'''
# Получение списка всех коллекций
collections = client.get_collections().collections

# Удаление всех коллекций
for collection in collections:
    client.delete_collection(collection.name)
    print(f"Коллекция '{collection.name}' удалена")
'''
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(
                            size=1024,
                            distance=Distance.COSINE,
                            on_disk=True,
                            quantization_config=models.ScalarQuantization(
                            scalar=models.ScalarQuantizationConfig(
                                type=models.ScalarType.INT8,
                            ),
                            )
                        ),
    optimizers_config=models.OptimizersConfigDiff(memmap_threshold=20000),
    hnsw_config=models.HnswConfigDiff(on_disk=True),
    on_disk_payload=True,

)


True

In [18]:
import json
import tqdm
from datasets import load_dataset

# Параметры
BATCH_SIZE = 256

# Загрузка датасета
#ds = load_dataset("wikimedia/wikipedia", "20231101.en")

# Инициализация временных списков для батчей
batch_ids = []
batch_payloads = []
batch_vectors = []

# Чтение большого файла построчно и загрузка в базу данных
with open('/data/wikipedia_embeddings/embeddings_part1_embed_by_text_512.json', 'r') as file:
    for index, line in tqdm.tqdm(enumerate(file)):
        #if index < 2650000:
        #    continue
        # Парсинг JSON-строки
        string = line.strip().rstrip(",").rstrip(']').lstrip('[')
        if not string:
            break
        entry = json.loads(string)
        data = ds['train'][index]

        # Добавляем данные в текущий батч
        batch_ids.append(int(10e9) + int(entry['id']))
        batch_payloads.append({
            "text": data['text'],
            "my_metadata": {
                "source": data['url'],
                "title": data['title']
            }
        })
        batch_vectors.append(entry["vector"])

        # Если батч заполнен, делаем загрузку в базу данных
        if (index + 1) % BATCH_SIZE == 0:
            client.upsert(
                collection_name=COLLECTION_NAME,
                points=models.Batch(
                    ids=batch_ids,
                    payloads=batch_payloads,
                    vectors=batch_vectors,
                ),
                #timeout=30
            )
            # Очищаем временные списки для следующего батча
            batch_ids.clear()
            batch_payloads.clear()
            batch_vectors.clear()

    # Загрузка оставшихся данных (если общее количество записей не кратно BATCH_SIZE)
    if batch_ids:
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=models.Batch(
                ids=batch_ids,
                payloads=batch_payloads,
                vectors=batch_vectors,

            ),
        )


3203072it [1:42:33, 520.49it/s]


In [3]:
import json
import tqdm
from datasets import load_dataset

# Параметры
BATCH_SIZE = 32

# Инициализация временных списков для батчей
batch_ids = []
batch_payloads = []
batch_vectors = []

# Чтение большого файла построчно и загрузка в базу данных
with open('/data/wikipedia_embeddings/embeddings_part2_embed_by_text_512.json', 'r') as file:
    for index, line in tqdm.tqdm(enumerate(file)):
        if index < 1000000:
            continue
        # Парсинг JSON-строки
        string = line.strip().rstrip(",").rstrip(']').lstrip('[')
        if not string:
            break
        entry = json.loads(string)
        data = ds['train'][index + 3203072]

        # Добавляем данные в текущий батч
        batch_ids.append(int(10e9) + int(entry['id']))
        batch_payloads.append({
            "text": data['text'],
            "my_metadata": {
                "source": data['url'],
                "title": data['title']
            }
        })
        batch_vectors.append(entry["vector"])

        # Если батч заполнен, делаем загрузку в базу данных
        if (index + 1) % BATCH_SIZE == 0:
            client.upsert(
                collection_name=COLLECTION_NAME,
                points=models.Batch(
                    ids=batch_ids,
                    payloads=batch_payloads,
                    vectors=batch_vectors,
                ),
                #params={"timeout": 30}
            )
            # Очищаем временные списки для следующего батча
            batch_ids.clear()
            batch_payloads.clear()
            batch_vectors.clear()

    # Загрузка оставшихся данных (если общее количество записей не кратно BATCH_SIZE)
    if batch_ids:
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=models.Batch(
                ids=batch_ids,
                payloads=batch_payloads,
                vectors=batch_vectors,

            ),
        )


3204742it [1:47:52, 495.15it/s]


In [4]:
# Attempting to read the first 10 lines of the specified JSON file and display them.
file_path = '/data/wikipedia_embeddings/embeddings_part2_embed_by_text_512.json'

with open(file_path, 'r', encoding='utf-8') as file:
    for i in range(10):
        print(file.readline().strip()[:50])


[{"id": "38333481", "vector": [0.02525411173701286
{"id": "38333486", "vector": [0.009253010153770447
{"id": "38333493", "vector": [0.030847351998090744
{"id": "38333522", "vector": [0.03813692182302475,
{"id": "38333538", "vector": [0.003134671365842223
{"id": "38333541", "vector": [0.04139922931790352,
{"id": "38333546", "vector": [0.014245477505028248
{"id": "38333547", "vector": [-0.00340053322724998
{"id": "38333550", "vector": [0.009827806614339352
{"id": "38333572", "vector": [0.003196465549990534


In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name='intfloat/multilingual-e5-large',
    model_kwargs={"device": "cpu"},
    cache_folder='cache'
)

  embeddings = HuggingFaceEmbeddings(


In [5]:
client.get_collection(COLLECTION_NAME)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=3203072, points_count=3203072, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1024, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=ScalarQuantization(scalar=ScalarQuantizationConfig(type=<ScalarType.INT8: 'int8'>, quantile=None, always_ram=None)), on_disk=True, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=True, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=20000, indexing_threshold=20000,

In [14]:
import numpy as np

client.query_points(
        collection_name=COLLECTION_NAME,
        #search_params=models.SearchParams(hnsw_ef=128, exact=False),
        query=embeddings.embed_query('query: ' + 'What is Witcher 2?'),
        limit=5,
        with_vectors=False,
        timeout=300
    )

QueryResponse(points=[ScoredPoint(id=10024385318, version=7662, score=0.8935852, payload={'text': 'The Witcher 2: Assassins of Kings () is a 2011 action role-playing video game developed by CD Projekt Red, based on The Witcher series of fantasy novels by Andrzej Sapkowski. It is the sequel to the 2007 game The Witcher and the second main installment in [[The Witcher (video game series)|The Witcher\'\'\'s video game series]]. It was released for Microsoft Windows, Xbox 360, OS X, and Linux.\n\nThe player directs the actions of Geralt of Rivia, a monster hunter known as a Witcher. The fantasy world in which his adventures take place owes much to Polish history and Slavic mythology.\n\nThe game was both a critical and commercial success, selling over 8 million copies by September 2014. The third installment in the series, The Witcher 3: Wild Hunt, was released in May 2015.\n\nGameplay\n\nThe gameplay of The Witcher 2 is a marked departure from that of its predecessor. Combat, for instance

In [6]:
from datasets import load_dataset

ds = load_dataset("wikimedia/wikipedia", "20231101.en")

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

In [7]:
ds['train']['id'].index('38333481')

3203072

In [10]:
ds['train'][3203072]

{'id': '38333493',
 'url': 'https://en.wikipedia.org/wiki/On%20Top%20of%20the%20World%20%28Imagine%20Dragons%20song%29',
 'title': 'On Top of the World (Imagine Dragons song)',
 'text': '"On Top of the World" is a song by the American rock band Imagine Dragons first appearing on their major-label debut extended play Continued Silence (2012). The song also appears on their first full-length album Night Visions (2012). "On Top of the World" was released digitally as a single on March 18, 2013.\n\nComposition\n\n"On Top of the World" primarily features Magne guitar and piano instrumentation, with vocals performed by lead singer Dan Reynolds. Originally published in the key of C major, the song itself expresses a celebration of accomplishment for the band after striving for years to become successful. The song incorporates the chord progression of C-F-C-Dm in the verses, and F-C-G-Dm in the chorus and bridge, with the pre-chorus using Am and G to create a different break in the song. The s

In [None]:
import json
import tqdm
from datasets import load_dataset

# Параметры
BATCH_SIZE = 1024

# Загрузка датасета
#ds = load_dataset("wikimedia/wikipedia", "20231101.en")

# Инициализация временных списков для батчей
batch_ids = []
batch_payloads = []
batch_vectors = []

# Чтение большого файла построчно и загрузка в базу данных
with open('D:\embeddings_part2_text512.json', 'r') as file:
    for index, line in tqdm.tqdm(enumerate(file)):
        # Парсинг JSON-строки
        string = line.strip().rstrip(",").lstrip('[')
        if not string:
            break
        entry = json.loads(string)
        #data = ds['train'][index]

        # Добавляем данные в текущий батч
        batch_ids.append(int(10e9) + int(entry['id']))
        '''
        batch_payloads.append({
            "text": data['text'],
            "my_metadata": {
                "source": data['url'],
                "title": data['title']
            }
        })
        '''
        batch_vectors.append(entry["vector"])

        # Если батч заполнен, делаем загрузку в базу данных
        if (index + 1) % BATCH_SIZE == 0:
            client.upsert(
                collection_name=COLLECTION_NAME,
                points=[
                    models.PointStruct(
                        id=batch_ids[i],
                        vector=batch_vectors[i]
                    )
                    for i in range(BATCH_SIZE)
                ]
            )
            # Очищаем временные списки для следующего батча
            batch_ids.clear()
            batch_payloads.clear()
            batch_vectors.clear()

    # Загрузка оставшихся данных (если общее количество записей не кратно BATCH_SIZE)
    if batch_ids:
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=models.Batch(
                ids=batch_ids,
                #payloads=batch_payloads,
                vectors=batch_vectors,

            ),
        )


In [36]:
import json
import tqdm
from datasets import load_dataset

# Параметры
BATCH_SIZE = 1000

# Загрузка датасета
ds = load_dataset("wikimedia/wikipedia", "20231101.en")

# Инициализация временных списков для батчей
batch_ids = []
batch_payloads = []
batch_vectors = []

# Чтение большого файла построчно и загрузка в базу данных
with open('C:/Users/Profe/OneDrive/Документы/embeddings2.json', 'r') as file:
    for index, line in tqdm.tqdm(enumerate(file)):
        # Парсинг JSON-строки
        print(line)
        continue
        entry = json.loads(line.strip().rstrip(",").lstrip('['))
        data = ds['train'][index]

        # Добавляем данные в текущий батч
        batch_ids.append(int(10e9) + index)
        batch_payloads.append({
            "text": data['text'],
            "my_metadata": {
                "source": data['url'],
                "title": data['title']
            }
        })
        batch_vectors.append(entry["vector"])

        # Если батч заполнен, делаем загрузку в базу данных
        if (index + 1) % BATCH_SIZE == 0:
            client.upsert(
                collection_name=COLLECTION_NAME,
                points=models.Batch(
                    ids=batch_ids,
                    payloads=batch_payloads,
                    vectors=batch_vectors,
                ),
            )
            # Очищаем временные списки для следующего батча
            batch_ids.clear()
            batch_payloads.clear()
            batch_vectors.clear()

    # Загрузка оставшихся данных (если общее количество записей не кратно BATCH_SIZE)
    if batch_ids:
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=models.Batch(
                ids=batch_ids,
                payloads=batch_payloads,
                vectors=batch_vectors,

            ),
        )


In [39]:
from qdrant_client import QdrantClient

# Получение списка всех коллекций
collections = client.get_collections().collections

# Удаление всех коллекций
for collection in collections:
    client.delete_collection(collection.name)
    print(f"Коллекция '{collection.name}' удалена")


In [31]:
len(ds['train']['id'])

6407814

In [36]:
import json
import tqdm
from datasets import load_dataset

# Параметры
BATCH_SIZE = 1000
LOWER_BOUND = 0
UPPER_BOUND = 2_000_000

# Загрузка датасета
ds = load_dataset("wikimedia/wikipedia", "20231101.en")

# Инициализация временных списков для батчей
batch_ids = []
batch_payloads = []
batch_vectors = []


# Чтение большого файла построчно и загрузка в базу данных
with open('C:/Users/Profe/OneDrive/Документы/embeddings.json.filepart', 'r') as file:
    for index, line in tqdm.tqdm(enumerate(file)):
        if index > UPPER_BOUND:
            break
        elif index < LOWER_BOUND:
            continue
        # Парсинг JSON-строки
        entry = json.loads(line.strip().rstrip(",").lstrip('['))
        data = ds['train'][index]

        # Добавляем данные в текущий батч
        batch_ids.append(int(10e9) + int(entry['id']))
        batch_payloads.append({
            "text": data['text'],
            "my_metadata": {
                "source": data['url'],
                "title": data['title']
            }
        })
        print(len(entry["vector"]))
        break
        if len(entry["vector"]) != 1024:
            break
        batch_vectors.append(entry["vector"])

        # Если батч заполнен, делаем загрузку в базу данных
        if (index + 1) % BATCH_SIZE == 0:
            client.upsert(
                collection_name=COLLECTION_NAME,
                points=models.Batch(
                    ids=batch_ids,
                    payloads=batch_payloads,
                    vectors=batch_vectors,
                ),
            )
            # Очищаем временные списки для следующего батча
            batch_ids.clear()
            batch_payloads.clear()
            batch_vectors.clear()

    # Загрузка оставшихся данных (если общее количество записей не кратно BATCH_SIZE)
    if batch_ids:
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=models.Batch(
                ids=batch_ids,
                payloads=batch_payloads,
                vectors=batch_vectors,

            ),
        )

0it [00:00, ?it/s]

1024





IndexError: list index out of range

In [12]:
from fastembed import SparseTextEmbedding

documents = [
    "You should stay, study and sprint.",
    "History can only prepare us to be surprised yet again.",
]

model = SparseTextEmbedding(model_name="Qdrant/bm25")
embeddings = list(model.embed(documents))

# [
#     SparseEmbedding(
#         values=array([1.67419738, 1.67419738, 1.67419738, 1.67419738]),
#         indices=array([171321964, 1881538586, 150760872, 1932363795])),
#     SparseEmbedding(values=array(
#         [1.66973021, 1.66973021, 1.66973021, 1.66973021, 1.66973021]),
#                     indices=array([
#                         578407224, 1849833631, 1008800696, 2090661150,
#                         1117393019
#                     ]))
# ]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 29 files: 100%|██████████| 29/29 [00:01<00:00, 16.36it/s]


In [13]:
embeddings

[SparseEmbedding(values=array([1.67868852, 1.67868852, 1.67868852]), indices=array([1881538586,  150760872, 1932363795])),
 SparseEmbedding(values=array([1.66973021, 1.66973021, 1.66973021, 1.66973021, 1.66973021]), indices=array([ 733618285, 1849833631, 1008800696, 2090661150, 1117393019]))]

In [1]:
from langchain_community.embeddings import HuggingFaceEmbeddings