Здесь Qdrant используется как хранилище и средство поиска по заранее вычисленным векторам, которые подаются на вход (см. функцию text2vec), в отличие от research_hybrid, где в Qdrant загружается текст в чистом виде и указывается модель для работы с ним.

In [1]:
%pip install -U qdrant-client

Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant-client)
  Using cached protobuf-5.26.1-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Using cached protobuf-5.26.1-cp310-abi3-win_amd64.whl (420 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.3
    Uninstalling protobuf-4.25.3:
      Successfully uninstalled protobuf-4.25.3
Successfully installed protobuf-5.26.1
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
googleapis-common-protos 1.63.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0,>=3.19.5, but you have protobuf 5.26.1 which is incompatible.
opentelemetry-proto 1.23.0 requires protobuf<5.0,>=3.19, but you have protobuf 5.26.1 which is incompatible.


In [2]:
import pandas  as pd

# Load the dictionary dataset and keep only the rows that have both a
# non-empty title and a non-empty text.
dataset = pd.read_csv('../ozhegov_dataset.csv')
dataset = dataset[dataset['title'].str.len() > 0]
dataset = dataset[dataset['text'].str.len() > 0]
# For the lenta dataset, substitute the dates for the titles:
#dataset['title'] = dataset['date']

# Work on a random subset of at most 1000 rows; capping by the dataset size
# keeps `sample` from raising when fewer rows survive the filters above.
dataset = dataset.sample(n=min(1000, len(dataset)))

# `astype` returns a new frame, so the result has to be assigned back for the
# cast to take effect.
dataset = dataset.astype({"text": str, "title": str})
dataset.info(show_counts=True)

dataset.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 11773 to 12476
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   title       1000 non-null   object
 2   text        1000 non-null   object
dtypes: int64(1), object(2)
memory usage: 31.2+ KB


Unnamed: 0.1,Unnamed: 0,title,text
11773,11773,МАЗУРКА,"Польский народный, сценический и бальный танец..."
4996,4996,ГРОШ,"Старинная медная монета в две копейки, позднее..."
30960,30960,ЭКВИВАЛЕНТНЫЙ,Вполне равноценный чему-н. в каком-н. отношени...
1775,1775,БОЯЗНО,"О чувстве страха, испуга: страшно. Б. идти одн..."
31391,31391,ЯСНОВИДЕЦ,"Человек, к-рый обладает даром ясновидения. Н ж..."


In [3]:
from qdrant_client import QdrantClient, models as qdrant_models

# Embedding models under evaluation. "size" is the value later passed as the
# vector size of the Qdrant collection created for that model.
models = [
    {"model_name": name, "size": dim}
    for name, dim in (
        ("intfloat/multilingual-e5-large", 1024),
        ("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", 768),
        ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 384),
        # ("symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli", 768),  # not supported by FastEmbed
        ("sentence-transformers/all-MiniLM-L6-v2", 384),
        # ("cointegrated/LaBSE-en-ru", 768),  # not supported by FastEmbed
        # ("sentence-transformers/LaBSE", 768),  # not supported by FastEmbed
    )
]

# Distance metrics offered by Qdrant that this notebook lists for comparison.
# NOTE(review): this list is not referenced by any other cell visible in this
# notebook — confirm whether a sweep over distances was intended.
distances = [
    qdrant_models.Distance.EUCLID,
    qdrant_models.Distance.DOT,
    qdrant_models.Distance.COSINE
]

In [4]:
from sentence_transformers import SentenceTransformer

from functools import lru_cache


# Bounded cache: the benchmark works through the models one after another, so
# a couple of slots is enough and avoids keeping every model in memory.
@lru_cache(maxsize=2)
def _load_model(model_name):
    """Instantiate the SentenceTransformer for *model_name*, reusing it on repeat calls."""
    return SentenceTransformer(model_name)


def text2vec(model_name, text):
    """Encode *text* with the sentence-transformers model *model_name*.

    The model object is cached per name instead of being rebuilt on every
    call; previously each encoded row and each query constructed a new
    ``SentenceTransformer``.

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.
        text: Input forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode(text)`` returns (callers in this notebook
        call ``.tolist()`` on it).
    """
    return _load_model(model_name).encode(text)

In [19]:
from qdrant_client import QdrantClient, models as qdrant_models

# Client for the Qdrant server addressed at localhost:6333.
client = QdrantClient(url="http://localhost:6333")
# Name of the single collection that is deleted and re-created for each model.
COLLECTION_NAME="termins"

def create_collection(client, model_name, model_size, distance=None):
    """Create the benchmark collection and fill it with the dataset vectors.

    Args:
        client: Qdrant client used to create the collection.
        model_name: Embedding model name, forwarded to ``add_data``.
        model_size: Vector size configured for the collection.
        distance: Optional ``qdrant_models.Distance`` metric. Defaults to
            ``Distance.COSINE``, which is what this function always used
            before the parameter existed.

    Returns:
        None.
    """
    if distance is None:
        distance = qdrant_models.Distance.COSINE

    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=qdrant_models.VectorParams(size=model_size, distance=distance),
    )
    # NOTE: add_data takes only the model name and uploads through the
    # module-level `client`, not the `client` argument received here.
    add_data(model_name)

def add_data(model_name):
    """Embed every title of the module-level ``dataset`` and upload the points.

    All titles are encoded in a single ``text2vec`` call (the underlying
    ``encode`` accepts a list of sentences) instead of one call per row.
    Each point uses the row's index label as its id and stores the title
    and text as payload.

    Args:
        model_name: Embedding model name passed to ``text2vec``.

    Returns:
        None.
    """
    point_ids = dataset.index.tolist()
    if not point_ids:
        # Nothing to embed or upload.
        return

    titles = dataset["title"].tolist()
    texts = dataset["text"].tolist()
    # One batched encode; yields one vector per title, in the same order.
    vectors = text2vec(model_name, titles)

    client.upload_points(
        collection_name=COLLECTION_NAME,
        points=[
            qdrant_models.PointStruct(
                id=point_id,
                vector=vector.tolist(),
                payload={"title": title, "text": text},
            )
            for point_id, title, text, vector in zip(point_ids, titles, texts, vectors)
        ],
    )


def delete_collection():
    """Delete the collection named ``COLLECTION_NAME`` via the module-level client."""
    target = COLLECTION_NAME
    client.delete_collection(collection_name=target)

In [20]:
def query_collection(client, query, max_results, dataframe, model_name):
    """Search the collection for *query* and report the best-scoring hit.

    Args:
        client: Qdrant client used to run the search.
        query: Text to embed and search for.
        max_results: Maximum number of hits requested from Qdrant.
        dataframe: Frame whose index labels match the point ids; its
            ``title`` and ``text`` columns supply the reported values.
        model_name: Embedding model name passed to ``text2vec``.

    Returns:
        A DataFrame with at most one row (the hit with the highest score)
        and the columns ``id``, ``score``, ``query``, ``title``,
        ``content``, ``model_name`` and ``is_found``; ``is_found`` is True
        when the hit's title equals the query. An empty frame with these
        columns is returned when the search yields no usable hit.
    """
    columns = ['id', 'score', 'query', 'title', 'content', 'model_name', 'is_found']

    hits = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=text2vec(model_name, query).tolist(),
        query_filter=None,  # no payload filtering
        limit=max_results,
    )

    frames = []
    for hit in hits:
        source_rows = dataframe[dataframe.index == hit.id]
        if source_rows.empty:
            # The hit has no matching row in `dataframe`; it contributes nothing.
            continue
        frames.append(pd.DataFrame({
            'id': hit.id,
            'score': hit.score,
            'query': query,
            'title': source_rows['title'],
            'content': source_rows['text'],
            'model_name': model_name,
        }))

    if not frames:
        return pd.DataFrame(columns=columns)

    # Single concat instead of repeatedly calling the private DataFrame._append.
    candidates = pd.concat(frames)

    # Keep only the top-scoring hit; copy so the new column is added to an
    # independent frame rather than to a slice of `candidates`.
    best = candidates[candidates['score'] == candidates['score'].max()].head(1).copy()
    best['is_found'] = best['query'] == best['title']

    return best

In [21]:
# Titles used as search queries: a random subset of at most 100 dataset rows
# (capped so that `sample` does not raise on a smaller dataset).
test_dataset = dataset.sample(n=min(100, len(dataset)))
# Accumulator for the per-query results produced by the benchmark loop.
test_results = pd.DataFrame()
# Kept as the last expression of the cell so the preview is actually rendered.
test_dataset.head()

In [22]:

# For every model: rebuild the collection, run each test title as a query and
# collect the best hit. Frames are gathered in a list and concatenated once,
# instead of growing `test_results` through the private DataFrame._append.
collected = []
for model in models:
    print(f"{model}")
    try:
        delete_collection()
        create_collection(client, model["model_name"], model["size"])
    except Exception as ex:
        # Best effort: report the failure and move on to the next model.
        print(f"recreate collection error: {ex}")
        continue
    for title in test_dataset["title"].tolist():
        try:
            best_hit = query_collection(
                client,
                query=title,
                max_results=5,
                dataframe=dataset,
                model_name=model["model_name"],
            )
        except Exception as ex:
            print(f"Test error: {ex}")
            continue
        if not best_hit.empty:
            collected.append(best_hit)

if collected:
    # Preserve anything already accumulated in `test_results`.
    earlier = [] if test_results.empty else [test_results]
    test_results = pd.concat(earlier + collected)

test_results.to_csv("results_ozhegov.csv")

{'model_name': 'intfloat/multilingual-e5-large', 'size': 1024}
{'model_name': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', 'size': 768}
{'model_name': 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', 'size': 384}
{'model_name': 'sentence-transformers/all-MiniLM-L6-v2', 'size': 384}


In [26]:
# Count, per model, how many test queries returned their own title as the
# top hit.
summary_rows = []
for model in models:
    name = model["model_name"]
    # Exact equality: `str.contains` would treat the name as a regular
    # expression and also match any longer name that merely contains it.
    per_model = test_results[test_results['model_name'] == name]
    summary_rows.append({
        'found': int(per_model['is_found'].eq(True).sum()),
        'model_name': name,
    })

# Built in one step, so the rows get a 0..n-1 index rather than the repeated
# 0 labels produced by appending one-row frames.
finally_result = pd.DataFrame(summary_rows, columns=['found', 'model_name'])
finally_result.to_csv("finally_result_ozhegov2.csv")
# Last expression of the cell, so the preview is rendered.
finally_result.head(15)

In [27]:
# Preview the first rows of the per-query results.
test_results.head()

Unnamed: 0,id,score,query,title,content,model_name,is_found
1844,1844,1.0,БРЕНЧАТЬ,БРЕНЧАТЬ,"Тихо позванивать, звякать. Бренчат шпоры. Б. к...",intfloat/multilingual-e5-large,True
3810,3810,1.0,ВЫКРЕСТ,ВЫКРЕСТ,"Человек, перешедший в христианство из другой р...",intfloat/multilingual-e5-large,True
12144,12144,1.0,МЕЛОДЕКЛАМАТОР,МЕЛОДЕКЛАМАТОР,"Артист, занимающийся мелодекламацией. II прил....",intfloat/multilingual-e5-large,True
29191,29191,1.0,ФИЖМЫ,ФИЖМЫ,"В 18 - нач. 19 в.: каркас в виде обруча, встав...",intfloat/multilingual-e5-large,True
11102,11102,1.0,ЛАКОМИТЬСЯ,ЛАКОМИТЬСЯ,"Есть что-н. вкусное, лакомое. Л. вареньем. II ...",intfloat/multilingual-e5-large,True


In [3]:
# Show the per-model summary (up to 15 rows).
finally_result.head(15)


Unnamed: 0,found,model_name
0,100,intfloat/multilingual-e5-large
1,100,sentence-transformers/paraphrase-multilingual-...
2,100,sentence-transformers/paraphrase-multilingual-...
3,100,sentence-transformers/all-MiniLM-L6-v2
