In [12]:
import pandas  as pd

dataset = pd.read_csv('../news_with_numbers.csv')
dataset = dataset[dataset['title'].str.len() > 0]
dataset = dataset[dataset['text'].str.len() > 0]
dataset['title'] = dataset['date']
dataset = dataset.sample(n=1000)

dataset.astype({"text": str, "title": str})
dataset.info(show_counts=True)

dataset.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 391 to 82
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   url         1000 non-null   object
 2   title       1000 non-null   object
 3   text        1000 non-null   object
 4   topic       953 non-null    object
 5   tags        929 non-null    object
 6   date        1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 62.5+ KB


Unnamed: 0.1,Unnamed: 0,url,title,text,topic,tags,date
391,125033,https://lenta.ru/news/2005/07/14/osprey/,2005/07/14,Многопрофильный военно-транспортный конвертопл...,Наука и техника,Все,2005/07/14
264,163057,https://lenta.ru/news/2006/08/23/reason/,2006/08/23,"По мнению некоторых специалистов, сами по себе...",Мир,Все,2006/08/23
947,283801,https://lenta.ru/news/2009/05/22/lowry/,2009/05/22,Картина британского художника-любителя Лоуренс...,Культура,Все,2009/05/22
107,484122,https://lenta.ru/news/2013/10/22/sledvuz/,2013/10/22,"Комитет Госдумы по противодействию коррупции, ...",Россия,Общество,2013/10/22
141,180210,https://lenta.ru/news/2007/02/15/save/,2007/02/15,В Хасанском районе Приморья 15 февраля военные...,Россия,Все,2007/02/15


In [14]:
models = [
    "intfloat/multilingual-e5-large",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    "symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli",
    "cointegrated/LaBSE-en-ru",
    "sentence-transformers/LaBSE"
]

distances = [
    "l2",
    "ip",
    "cosine"
]

In [16]:
%pip install -U sentence-transformers ipywidgets chromadb chardet charset-normalizer


Collecting charset-normalizer
  Using cached charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl.metadata (34 kB)
Using cached charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl (99 kB)
Installing collected packages: charset-normalizer
  Attempting uninstall: charset-normalizer
    Found existing installation: charset-normalizer 3.1.0
    Uninstalling charset-normalizer-3.1.0:
      Successfully uninstalled charset-normalizer-3.1.0
Successfully installed charset-normalizer-3.3.2


Бывает ошибка с установкой, в самой ошибке есть решение

HINT: This error might have occurred since this system does not have Windows Long Path support enabled. You can find information on how to enable this at https://pip.pypa.io/warnings/enable-long-paths

https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=powershell#enable-long-paths-in-windows-10-version-1607-and-later

In [15]:
from chromadb.utils import embedding_functions
import chromadb
chroma_client = chromadb.HttpClient(host="localhost", port=8000)

def create_collection(model_name, distance):
    
    chroma_client = chromadb.HttpClient(host="localhost", port=8000)

    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)

    #text_collection = chroma_client.create_collection(name='text', embedding_function=sentence_transformer_ef)
    title_collection = chroma_client.create_collection(name="title", embedding_function=sentence_transformer_ef, metadata={"hnsw:space": distance})

    ids = list(map(str, dataset.index.values.tolist()))
    #text_collection.add(ids = ids, documents=dataset["text"].tolist())
    title_collection.add(ids = ids, documents=dataset["title"].tolist())

    return title_collection

def delete_collection():
    chroma_client.delete_collection("title")


def query_collection(collection, query, max_results, dataframe, model_name, distance):
    results = collection.query(query_texts=query, n_results=max_results, include=['distances']) 
    #print(results)
    df = pd.DataFrame({
                'id':results['ids'][0], 
                'score':list(map(float,results['distances'][0])),
                'query': query,
                'title': dataframe[dataframe.index.isin(list(map(int,results['ids'][0])))]['title'],
                'content': dataframe[dataframe.index.isin(list(map(int,results['ids'][0])))]['text'],
                'model_name': model_name,
                'distance': distance
                })
    
    # Забираем с минимальной дистанцией, значит он ближе и больше похож
    df = df[df.score == df.score.min()]
    df['is_found'] = df.apply(lambda row: row.query == row.title, axis=1)
    
    return df


In [16]:
test_dataset = dataset.sample(n=100)
test_dataset.head()
test_results = pd.DataFrame()

In [17]:

for model in models:
    for distance in distances:
        print(f"{model} - {distance}")
        try:
            delete_collection()
        except Exception as ex:
            print(f"delete_collection error: {ex}")

        collection = create_collection(model, distance)

        for title in test_dataset["title"].tolist():
            test_results = test_results._append(query_collection(
            collection=collection,
            query=title,
            max_results=5,
            dataframe=dataset,
            model_name=model,
            distance=distance))

            print(f"{len(test_results)}")

        

test_results.to_csv("results.csv")

intfloat/multilingual-e5-large - l2
1
2
3
4
5
6
8
9
10
11
12
13
14
15
17
18
19
20
22
23
24
25
26
27
28
29
30
32
33
34
35
36
37
38
39
40
41
43
44
45
46
47
48
49
50
51
53
54
55
56
58
59
60
61
62
64
65
66
67
68
69
70
71
72
73
75
76
77
79
80
81
82
83
84
85
86
87
88
89
91
92
93
94
95
96
99
100
101
102
104
105
106
108
109
110
111
112
113
114
115
intfloat/multilingual-e5-large - ip
116
117
118
119
120
121
123
124
125
126
127
128
129
130
132
133
134
135
137
138
139
140
141
142
143
144
145
147
148
149
150
151
152
153
154
155
156
158
159
160
161
162
163
164
165
166
168
169
170
171
173
174
175
176
177
179
180
181
182
183
184
185
186
187
188
190
191
192
194
195
196
197
198
199
200
201
202
203
204
206
207
208
209
210
211
214
215
216
217
219
220
221
223
224
226
227
228
229
230
231
intfloat/multilingual-e5-large - cosine
232
233
234
235
236
237
239
240
241
242
243
244
245
246
248
249
250
251
253
254
255
256
257
258
260
261
262
264
265
266
267
269
270
271
272
273
274
276
277
278
279
280
281
282
283
28

In [18]:
finally_result = pd.DataFrame()
for model in models:
    for distance in distances:
        df = test_results.loc[test_results['model_name'].str.contains(model) == True]
        df = df.loc[df['distance'].str.contains(distance) == True]

        finally_result = finally_result._append(pd.DataFrame({
                'founded': [len(df[df['is_found'] == True])],
                'model_name': [model],
                'distance': [distance]
                }))
        
finally_result.head(15)


Unnamed: 0,founded,model_name,distance
0,31,intfloat/multilingual-e5-large,l2
0,30,intfloat/multilingual-e5-large,ip
0,31,intfloat/multilingual-e5-large,cosine
0,27,sentence-transformers/paraphrase-multilingual-...,l2
0,5,sentence-transformers/paraphrase-multilingual-...,ip
0,25,sentence-transformers/paraphrase-multilingual-...,cosine
0,31,symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli,l2
0,30,symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli,ip
0,35,symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli,cosine
0,30,cointegrated/LaBSE-en-ru,l2


In [11]:
finally_result.to_csv("finally_result3.csv")