In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/embeddings-of-news/embeddings.joblib
/kaggle/input/embeddings/embeddings (1).joblib


In [3]:
from sentence_transformers import SentenceTransformer


**Dataset**

In [4]:
from sklearn.datasets import fetch_20newsgroups

In [5]:
# Fetch the training split of 20 Newsgroups, shuffled with a fixed seed.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
)

In [6]:
# Put each raw post next to its integer newsgroup label in one DataFrame.
df = pd.DataFrame(
    dict(
        text=newsgroups_train.data,
        category=newsgroups_train.target,
    )
)

# Preview the first rows.
df.head()

Unnamed: 0,text,category
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14


In [7]:
# Show the full raw text of the first document.
print(df['text'].iloc[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [8]:
# Print the basic corpus statistics: frame shape, label count and label names.
for label, value in (
    ("\nDataset Size:", df.shape),
    ("\nNumber of Categories:", len(newsgroups_train.target_names)),
    ("\nCategories:", newsgroups_train.target_names),
):
    print(label, value)


Dataset Size: (11314, 2)

Number of Categories: 20

Categories: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


**Preprocessing and Vectorizing Data**

In [9]:
from sentence_transformers import SentenceTransformer
import joblib

In [10]:
# Name of the pretrained checkpoint handed to SentenceTransformer.
model_name = "BAAI/bge-base-en-v1.5"
# Embedding model; later cells call model.encode(...) on the query text.
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
# emb_vecs = model.encode(
#     df['text'].tolist(),
#     batch_size=64,
#     show_progress_bar=True,
#     convert_to_numpy=True
# )

In [12]:
# output_path = '/kaggle/working/gen_embeddings.joblib'
# joblib.dump(emb_vecs, output_path)
# print(f"Saved embeddings to: {output_path}")

In [13]:
# Location of the cached document embeddings.
EMB_CACHE_PATH = "/kaggle/working/gen_embeddings.joblib"

if os.path.exists(EMB_CACHE_PATH):
    # NOTE: joblib.load unpickles the file, so only load caches written by this notebook.
    emb_vecs = joblib.load(EMB_CACHE_PATH)
else:
    # No cache yet (e.g. a fresh session): embed the corpus once and persist it,
    # so the notebook survives "Restart & Run All".
    emb_vecs = model.encode(
        df['text'].tolist(),
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    joblib.dump(emb_vecs, EMB_CACHE_PATH)

# Row i of emb_vecs is used as the embedding of df.iloc[i]; a stale cache would
# silently misalign retrieved indices and documents.
assert len(emb_vecs) == len(df), "cached embeddings do not match the number of documents"

In [14]:
# Number of loaded embedding vectors (compare with the dataset row count printed earlier).
len(emb_vecs)

11314

In [15]:
def prepro_text(text):
    """Return ``text`` with leading and trailing whitespace removed."""
    return text.strip()

In [16]:
def cosine_sim(v1, array_of_vecs):
    """Cosine similarity between one vector and one or many candidate vectors.

    Args:
        v1: 1-D sequence/array, the reference vector.
        array_of_vecs: either a single 1-D vector or a 2-D collection of
            vectors (one per row), each with the same length as ``v1``.

    Returns:
        A list with one similarity per candidate vector, in input order.
        A single 1-D candidate yields a one-element list; an empty
        candidate collection yields ``[]``. Whenever either vector of a
        pair has zero norm the similarity is reported as 0.0 (instead of
        nan plus a RuntimeWarning).
    """
    query = np.asarray(v1)
    candidates = np.asarray(array_of_vecs)

    # Nothing to compare against.
    if candidates.size == 0:
        return []
    # Promote a lone vector to a one-row matrix so both cases share one code path.
    if candidates.ndim == 1:
        candidates = candidates.reshape(1, -1)

    dots = candidates @ query
    # The query norm is the same for every candidate, so compute it once.
    denom = np.linalg.norm(candidates, axis=1) * np.linalg.norm(query)

    # Suppress the divide warnings; the zero-norm entries are overwritten below.
    with np.errstate(divide='ignore', invalid='ignore'):
        sims = dots / denom
    sims = np.where(denom == 0, 0.0, sims)

    return list(sims)

In [17]:
def top_k_greatest_inds(lst,k):
    """Return the indices of the ``k`` largest values of ``lst``, best first.

    Ties keep their original relative order because the sort is stable.
    """
    values = list(lst)
    ranked = sorted(range(len(values)), key=values.__getitem__, reverse=True)
    return ranked[:k]

In [18]:
def retrieve_docs(query,embeds,model,top_k=5):
    """Print the ``top_k`` documents most similar to ``query``.

    The query is stripped, embedded with ``model.encode`` and scored against
    every vector in ``embeds`` by cosine similarity. For each hit the
    document index, the first 200 characters of its text and its category
    name are printed; documents are looked up in the notebook-level ``df``
    and ``newsgroups_train``. Nothing is returned.
    """
    query_emb = model.encode(prepro_text(query))

    # One score entry per document embedding.
    cosine_scores = [cosine_sim(query_emb, doc_vec) for doc_vec in embeds]
    best_inds = top_k_greatest_inds(cosine_scores, top_k)

    print(f"Query: {query}")
    for doc_ind in best_inds:
        doc_row = df.iloc[doc_ind]
        print(doc_ind)
        print(f"Document: {doc_row['text'][:200]}...")
        # Translate the integer label into its newsgroup name.
        print(f"Category: {newsgroups_train.target_names[doc_row['category']]}...")
        print("\n\n")

In [19]:
# Quick check of the retriever: show the two best matches for a short query.
eg_query = "space exploration"
retrieve_docs(eg_query, emb_vecs, model, top_k=2)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query: space exploration
6707
Document: From: u1452@penelope.sdsc.edu (Jeff Bytof - SIO)
Subject: End of the Space Age?
Organization: San Diego Supercomputer Center @ UCSD
Lines: 16
Distribution: world
NNTP-Posting-Host: penelope.sdsc.edu

...
Category: sci.space...



2950
Document: From: dennisn@ecs.comm.mot.com (Dennis Newkirk)
Subject: Space class for teachers near Chicago
Organization: Motorola
Distribution: usa
Nntp-Posting-Host: 145.1.146.43
Lines: 59

I am posting this for...
Category: sci.space...





**Retrieval Metrics**

**Precision**

In [21]:
def precision(tp, tn, fp, fn):
    """Return tp / (tp + fp), or 0.0 when nothing was predicted positive.

    ``tn`` and ``fn`` are accepted but not used in the computation.
    """
    predicted_positive = tp + fp
    return tp / predicted_positive if predicted_positive != 0 else 0.0

**Recall**

In [22]:
def recall(tp, tn, fp, fn):
    """Return tp / (tp + fn), or 0.0 when there are no actual positives.

    ``tn`` and ``fp`` are accepted but not used in the computation.
    """
    actual_positive = tp + fn
    return tp / actual_positive if actual_positive != 0 else 0.0

In [23]:
# Evaluation set: each entry couples a free-text query with the newsgroup
# category its results are expected to come from.
test_queries = [
    {"query": query_text, "desired_category": category_name}
    for query_text, category_name in (
        ("advancements in space exploration technology", "sci.space"),
        ("real-time rendering techniques in computer graphics", "comp.graphics"),
        ("latest findings in cardiovascular medical research", "sci.med"),
        ("NHL playoffs and team performance statistics", "rec.sport.hockey"),
        ("impacts of cryptography in online security", "sci.crypt"),
        ("the role of electronics in modern computing devices", "sci.electronics"),
        ("motorcycles maintenance tips for enthusiasts", "rec.motorcycles"),
        ("high-performance baseball tactics for championships", "rec.sport.baseball"),
        ("historical influence of politics on society", "talk.politics.misc"),
        ("latest technology trends in the Windows operating system", "comp.os.ms-windows.misc"),
    )
]

In [31]:
def compute_metrics(queries,embeds,model,top_k=5):
    """Compute precision and recall of top-k retrieval for labelled queries.

    Args:
        queries: iterable of dicts with the keys ``"query"`` (text) and
            ``"desired_category"`` (newsgroup name counted as relevant).
        embeds: document embeddings; row ``i`` is treated as the embedding
            of ``df.iloc[i]``.
        model: object whose ``encode`` method embeds the query text.
        top_k: number of highest-scoring documents to retrieve per query.

    Returns:
        A list of dicts, one per query, with the keys ``"query"``,
        ``"precision"`` and ``"recall"``.

    A retrieved document is a true positive when its category name equals
    the desired category. Document labels come from the notebook-level
    ``df`` and ``newsgroups_train``.
    """
    target_names = newsgroups_train.target_names
    # Resolve every document's category name once, instead of one df.iloc
    # lookup per document for every query.
    doc_cats = [target_names[cat_id] for cat_id in df['category']]

    res = []
    for item in queries:
        query = item["query"]
        des_cat = item["desired_category"]

        query_embeds = model.encode(prepro_text(query))

        # Score all documents in one call; yields a flat list of similarities.
        cos_sims = cosine_sim(query_embeds, embeds) if len(embeds) else []
        top_res = top_k_greatest_inds(cos_sims, top_k)

        tp = sum(1 for ind in top_res if doc_cats[ind] == des_cat)
        # Count against what was actually returned: fewer than top_k documents
        # come back when the corpus is smaller than top_k.
        fp = len(top_res) - tp
        # Relevant documents that were not retrieved.
        fn = doc_cats.count(des_cat) - tp
        # Everything that is neither relevant nor retrieved.
        tn = len(doc_cats) - tp - fp - fn

        res.append({
            "query": query,
            "precision": precision(tp, tn, fp, fn),
            "recall": recall(tp, tn, fp, fn),
        })

    return res

In [33]:
# Score every labelled test query using compute_metrics' default top_k.
res = compute_metrics(test_queries, emb_vecs,model)

# Recall is tp / (tp + fn) and tp can never exceed the number of retrieved
# documents, so retrieving only a handful of documents per query keeps recall
# small even when precision is perfect.
print("Results:")
for r in res:
    print(f"Query: {r['query']}, Precision: {r['precision']:.2f}, Recall: {r['recall']:.2f}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Results:
Query: advancements in space exploration technology, Precision: 1.00, Recall: 0.01
Query: real-time rendering techniques in computer graphics, Precision: 1.00, Recall: 0.01
Query: latest findings in cardiovascular medical research, Precision: 1.00, Recall: 0.01
Query: NHL playoffs and team performance statistics, Precision: 1.00, Recall: 0.01
Query: impacts of cryptography in online security, Precision: 1.00, Recall: 0.01
Query: the role of electronics in modern computing devices, Precision: 1.00, Recall: 0.01
Query: motorcycles maintenance tips for enthusiasts, Precision: 1.00, Recall: 0.01
Query: high-performance baseball tactics for championships, Precision: 1.00, Recall: 0.01
Query: historical influence of politics on society, Precision: 0.40, Recall: 0.00
Query: latest technology trends in the Windows operating system, Precision: 0.80, Recall: 0.01
