<a href="https://colab.research.google.com/github/mikecinnamon/MLearning/blob/main/Notebooks/covid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [ML-26] Example - Semantic search of COVID-19 articles

## Importing the data

In [None]:
# numpy must be bound to `np`: later cells call `np.array` and `np.dot`.
import numpy as np
import pandas as pd

# Base URL of the data repository; the CSV is read directly over HTTP.
path = 'https://raw.githubusercontent.com/mikecinnamon/Data/main/'
df = pd.read_csv(path + 'covid.csv')

In [None]:
# Overview of the loaded frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Peek at the first rows of the two text columns.
preview_columns = ['title', 'abstract']
df[preview_columns].head()

In [None]:
# Summary statistics of the character length of each text column, side by side.
length_stats = [df[column].str.len().describe() for column in ('title', 'abstract')]
pd.concat(length_stats, axis=1)

## Q1. Embedding model

In [None]:
pip install cohere

In [None]:
import os

import cohere

# Prefer a key supplied through the CO_API_KEY environment variable so the secret
# does not have to be saved inside the notebook. When the variable is not set,
# fall back to the placeholder, which must then be replaced by a real key.
co = cohere.ClientV2(api_key=os.environ.get('CO_API_KEY', 'YOUR_API_KEY'))

In [None]:
# Name of the embedding model passed to `co.embed` in the cells below.
model_name = 'embed-english-v3.0'

## Q2. Encoding the query

In [None]:
# The search query, kept as a one-element list because it is passed as `texts=`
# to the embed call; the rerank step later uses `query[0]`.
query = ['False positive rate in COVID test']

In [None]:
# Embed the query text with the 'search_query' input type.
# NOTE(review): later cells index `.embeddings` like a list -- confirm that this
# matches the response type returned by `ClientV2.embed` in the installed SDK.
query_embed = co.embed(
    texts=query,
    model=model_name,
    input_type='search_query',
)

In [None]:
# Replace the API response with the query vector itself, as a numpy array.
# NOTE: `query_embed` is rebound from the response to an array here, so this
# cell cannot be re-run on its own (an array has no `.embeddings` attribute).
first_vector = query_embed.embeddings[0]
query_embed = np.array(first_vector)
query_embed.shape

## Q3. Encoding the abstracts

In [None]:
import time

In [None]:
# Plain Python list of the abstracts, in row order, to pass as `texts=` to the embed calls.
abstract = df['abstract'].tolist()

In [None]:
# Embed the first 2500 abstracts with the 'search_document' input type.
# NOTE(review): this sends 2500 texts in a single call -- confirm the client/API
# accepts a request of this size.
first_chunk = abstract[:2500]
abstract_embed = co.embed(
    texts=first_chunk,
    model=model_name,
    input_type='search_document',
).embeddings

In [None]:
# Embed the remaining abstracts chunk by chunk and append them to `abstract_embed`.
chunk_size = 2500  # same chunk length as the first request above
# Keep only the first chunk's vectors, so re-running this cell does not append duplicates.
abstract_embed = abstract_embed[:chunk_size]
# Derive the chunk starts from the data length instead of assuming exactly four chunks;
# this covers every abstract and never sends an empty chunk.
for start in range(chunk_size, len(abstract), chunk_size):
    time.sleep(60)  # pause between requests -- presumably to stay within the API rate limit
    new_embed = co.embed(texts=abstract[start:start + chunk_size], model=model_name, input_type='search_document').embeddings
    abstract_embed = abstract_embed + new_embed

In [None]:
# Store one numpy vector per row; the list must have exactly one entry per row of df.
embedding_vectors = list(map(np.array, abstract_embed))
df['abstract_embed'] = embedding_vectors

In [None]:
# Inspect the frame again, now including the `abstract_embed` column.
df.info()

In [None]:
# Show the embedding stored for the row labelled 0.
df.loc[0, 'abstract_embed']

## Q4. Vector search

In [None]:
# Score every abstract by the dot product of its embedding with the query embedding.
similarity_scores = df['abstract_embed'].map(lambda vector: np.dot(vector, query_embed))
df['similarity'] = similarity_scores
df.head()

In [None]:
# Keep the 20 rows with the highest similarity as candidates for the next step.
search_output = df.sort_values('similarity', ascending=False).iloc[:20]

## Q5. Reranking

In [None]:
# Name of the model passed to `co.rerank` below.
# NOTE: this rebinds `model_name`, which previously held the embedding model, so
# re-running an earlier embedding cell after this one would use the rerank model name.
model_name = 'rerank-english-v3.0'

In [None]:
# Abstracts of the candidate rows as a plain list, in the same order as `search_output`.
docs = search_output['abstract'].tolist()

In [None]:
# Ask the rerank model for the three candidate abstracts that best match the query text.
top3 = co.rerank(
    model=model_name,
    query=query[0],
    documents=docs,
    top_n=3,
)
top3.results

In [None]:
# Each result's `index` is used as a row position within `search_output`
# (whose abstracts were sent as `docs`), hence the positional `.iloc` lookup.
selection = [hit.index for hit in top3.results]
search_output.iloc[selection]['url']