In [66]:
%pip install -U qdrant-client

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas  as pd

# Load the dictionary dataset and keep only the rows whose title and text
# have a length greater than zero.
dataset = pd.read_csv('../ozhegov_dataset.csv')
dataset = dataset[dataset['title'].str.len() > 0]
dataset = dataset[dataset['text'].str.len() > 0]
# For the lenta dataset: use the dates as titles.
#dataset['title'] = dataset['date']
# Fixed seed so the same 1000 rows are drawn on every run.
dataset = dataset.sample(n=1000, random_state=42)

# astype returns a new frame; without the assignment the cast is silently lost.
dataset = dataset.astype({"text": str, "title": str})
dataset.info(show_counts=True)

dataset.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 28568 to 16697
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   title       1000 non-null   object
 2   text        1000 non-null   object
dtypes: int64(1), object(2)
memory usage: 31.2+ KB


Unnamed: 0.1,Unnamed: 0,title,text
28568,28568,УМНЕТЬ,"Становиться умным (в 1 знач.), умнее. Дети умн..."
21283,21283,ПРОЗНАТЬ,"То же, что проведать (во 2 знач.)."
15554,15554,ОДИНАРНЫЙ,"Состоящий из одного, не двойной. В одинарном р..."
9409,9409,КАТАПУЛЬТИРОВАТЬ,Выбросить (-расывать) из летательного аппарата...
11714,11714,ЛЮТОВАТЬ,"Зверствовать, проявлять лютость. Лютует враг. ..."


In [2]:
from qdrant_client import QdrantClient, models as qdrant_models

# Candidate embedding models. Each entry pairs a model identifier with its
# `size` value (presumably the embedding dimension - confirm against the model cards).
models = [
    {"model_name": identifier, "size": dimension}
    for identifier, dimension in (
        ("intfloat/multilingual-e5-large", 1024),
        ("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", 768),
        ("symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli", 768),
        ("cointegrated/LaBSE-en-ru", 768),
        ("sentence-transformers/LaBSE", 768),
    )
]

# Distance metrics of interest, looked up by name on the qdrant Distance enum.
distances = [
    getattr(qdrant_models.Distance, metric_name)
    for metric_name in ("EUCLID", "DOT", "COSINE")
]

In [3]:
import json

# Serialize every title as a one-field JSON document. json.dumps escapes quotes,
# backslashes and control characters, which the hand-built f-string emitted
# verbatim (yielding invalid JSON); ensure_ascii=False keeps Cyrillic readable.
documents = dataset["title"].map(
    lambda title: json.dumps({"title": str(title)}, ensure_ascii=False)
)
print(documents.tolist())

['{"title": "УМНЕТЬ"}', '{"title": "ПРОЗНАТЬ"}', '{"title": "ОДИНАРНЫЙ"}', '{"title": "КАТАПУЛЬТИРОВАТЬ"}', '{"title": "ЛЮТОВАТЬ"}', '{"title": "СКРЫТЬ"}', '{"title": "ВЫЧИСЛИТЕЛЬ"}', '{"title": "ХРИПУЧИЙ"}', '{"title": "БОМБА"}', '{"title": "ПОЛУКРОВНЫЙ"}', '{"title": "ЛИЦЕДЕЙ"}', '{"title": "ПЛЕМЯННИК"}', '{"title": "РАЗДОЛЬЕ"}', '{"title": "МЕЧТА"}', '{"title": "ГЕЛЬ"}', '{"title": "ВПУТЫВАТЬ"}', '{"title": "ИЗОБИЛЬНЫЙ"}', '{"title": "БУТОН"}', '{"title": "БОСОЙ"}', '{"title": "ОБМУСЛИТЬ"}', '{"title": "КОЛОДКА"}', '{"title": "ДИРИЖАБЛЕСТРОЕНИЕ"}', '{"title": "УЙГУРСКИЙ"}', '{"title": "ПЕРЕСУШИТЬ"}', '{"title": "ГЛАСНОСТЬ"}', '{"title": "ЛИХО"}', '{"title": "ЗЛИТЬ"}', '{"title": "СГИБАТЬСЯ"}', '{"title": "СВОЯК"}', '{"title": "ЦУКАТ"}', '{"title": "ПОТАСКУХА"}', '{"title": "РЕШИМОСТЬ"}', '{"title": "ПОКУПАТЬСЯ"}', '{"title": "ГЛАЗУРОВАТЬ"}', '{"title": "ДЕЛЕЦ"}', '{"title": "УЖИМКА"}', '{"title": "ПЛАНОВИК"}', '{"title": "ОТРАЗИТЬ"}', '{"title": "СТРОНЦИЙ"}', '{"title": "ЖИДКОСТЬ"}'

In [17]:
from qdrant_client import QdrantClient, models as qdrant_models

# Collection name shared by the helper functions defined in this notebook.
COLLECTION_NAME = "termins"

# Client pointed at a Qdrant instance on localhost, port 6333.
client = QdrantClient(url="http://localhost:6333")

def create_collection(client, model_name):
    """Create the `COLLECTION_NAME` collection for `model_name` and fill it.

    Selects `model_name` on `client`, creates the collection with the vector
    parameters the client reports for that model, then calls `add_data()`.

    Note: `add_data()` takes no arguments and works on the module-level
    `client`, not on the `client` passed in here.
    """
    client.set_model(model_name)
    vector_params = client.get_fastembed_vector_params()
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=vector_params,
    )
    add_data()

def add_data():
    """Upload the titles of the global `dataset` into `COLLECTION_NAME`.

    The index labels of `dataset`, converted to `int`, are used as point ids,
    so a hit's id can be matched back to its row in `dataset`.
    Uses the module-level `client`.
    """
    point_ids = [int(label) for label in dataset.index.values.tolist()]
    titles = dataset["title"].tolist()

    client.add(
        collection_name=COLLECTION_NAME,
        documents=titles,
        ids=point_ids,
        batch_size=2,
        parallel=0,
    )


def delete_collection():
    """Delete the `COLLECTION_NAME` collection through the module-level `client`."""
    target = COLLECTION_NAME
    client.delete_collection(collection_name=target)

In [18]:
# Rebuild the collection from scratch using the multilingual E5 model.
delete_collection()
create_collection(client=client, model_name="intfloat/multilingual-e5-large")

In [71]:
def query_collection(client, query, max_results, dataframe, model_name):
    """Search the collection for `query` and return its top-scoring hit(s).

    Args:
        client: object exposing ``query(collection_name=..., query_text=...,
            query_filter=..., limit=...)`` whose hits carry ``id`` and ``score``.
        query: text to search for.
        max_results: maximum number of hits requested from the client.
        dataframe: frame with ``title`` and ``text`` columns whose index labels
            are compared with the hit ids.
        model_name: label written to the ``model_name`` column of the result.

    Returns:
        A DataFrame with columns ``id``, ``score``, ``query``, ``title``,
        ``content``, ``model_name`` and ``is_found``, holding only the rows
        whose score equals the maximum score (ties are all kept).
        ``is_found`` is True when the hit's title equals ``query``.
        When nothing matches, an empty frame with the same columns is returned.
    """
    columns = ['id', 'score', 'query', 'title', 'content', 'model_name', 'is_found']

    results = client.query(
        collection_name=COLLECTION_NAME,
        query_text=query,
        query_filter=None,  # no filtering of hits
        limit=max_results,  # upper bound on the number of hits
    )

    # Collect one small frame per hit and concatenate once, instead of growing
    # a DataFrame inside the loop through the private `_append` method.
    frames = []
    for res in results:
        matched = dataframe.loc[dataframe.index == res.id]
        if matched.empty:
            # The hit id has no row in `dataframe`; it contributes nothing.
            continue
        frames.append(pd.DataFrame({
            'id': res.id,
            'score': res.score,
            'query': query,
            'title': matched['title'],
            'content': matched['text'],
            'model_name': model_name,
        }))

    # No usable hits: return a well-formed empty frame rather than failing on
    # a missing `score` column.
    if not frames:
        return pd.DataFrame(columns=columns)

    hits = pd.concat(frames)

    # Keep only the hits with the highest score; copy so that adding a column
    # does not write into a view of `hits`.
    best = hits[hits['score'] == hits['score'].max()].copy()
    # Column-wise comparison; attribute access (`row.query`) could collide
    # with pandas method names.
    best['is_found'] = best['query'] == best['title']

    return best

In [72]:
# Fixed seed so the same 100 evaluation queries are drawn on every run.
test_dataset = dataset.sample(n=100, random_state=42)
test_results = pd.DataFrame()
# Kept as the last expression of the cell: only then is the preview rendered.
test_dataset.head()

In [73]:
# Smoke test: run the first test title against the currently loaded collection.
for title in test_dataset["title"].tolist()[:1]:
    hit = query_collection(
        client,
        query=title,
        max_results=5,
        dataframe=dataset,
        model_name="intfloat/multilingual-e5-large",
    )
    # pd.concat is the public API; DataFrame._append is a private pandas method.
    test_results = pd.concat([test_results, hit])
    print(f"{len(test_results)}")

1


In [None]:

# Evaluate every candidate model: rebuild the collection with that model,
# then query it with each test title and collect the top-scoring hits.
# The results are gathered from scratch here, so re-running this cell (or
# having run the smoke-test cell before it) does not duplicate rows in the CSV.
result_frames = []
for model in models:
    # `models` holds dicts; the helpers expect the model identifier string.
    model_name = model["model_name"]
    print(f"{model}")
    try:
        delete_collection()
    except Exception as ex:
        # Best effort: report the failure and continue with collection creation.
        print(f"delete_collection error: {ex}")
    # create_collection returns nothing, so there is no value to keep.
    create_collection(client, model_name)
    for title in test_dataset["title"].tolist():
        result_frames.append(query_collection(
            client,
            query=title,
            max_results=5,
            dataframe=dataset,
            model_name=model_name,
        ))
    # Running total of collected result rows after each model.
    print(f"{sum(len(frame) for frame in result_frames)}")

# Concatenate once instead of growing the frame inside the loop.
test_results = pd.concat(result_frames) if result_frames else pd.DataFrame()
test_results.to_csv("results_ozhegov.csv")