# NER Powered Semantic Search Using Pinecone v5.0.0

### Setup Environment

In [25]:
# init pinecone

import os

from pinecone import Pinecone, ServerlessSpec

# Do not paste the key into the notebook: export PINECONE_API_KEY in the
# environment that launches Jupyter and it is read here. Failing early with a
# clear message replaces the NameError a fresh kernel would otherwise hit.
API_KEY = os.environ.get("PINECONE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this notebook."
    )
pc = Pinecone(api_key=API_KEY)

# Handle to the existing "medium-data" index (presumably left over from an
# earlier run); the following cells empty it and delete it.
index = pc.Index("medium-data")

In [26]:
# Clean up the Pinecone index by deleting every vector stored in it.
# Note: once all vectors are gone, running this cell again raises an error.

index.delete(delete_all=True)

In [5]:
# Delete the index: its old dimension is no longer useful, and it is recreated
# further down. The existence check keeps the cell re-runnable, because
# delete_index on a missing index raises.
if "medium-data" in pc.list_indexes().names():
    pc.delete_index("medium-data")

In [12]:
# load libraries for NER 

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch


### NER Engine

In [13]:
# NER engine: load the token-classification checkpoint and its tokenizer,
# then wrap both in a transformers pipeline bound to the name `nlp`.

model_id = 'dslim/bert-base-NER'

model = AutoModelForTokenClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# `aggregation_strategy` controls how per-token predictions are grouped into
# entities (see the transformers pipeline docs); inference runs on the CPU.
nlp = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy='max',
    device='cpu',
)
# quick sanity check:
# nlp("Bill Gates is the founder of Microsoft")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Retriever

In [14]:
# load libraries for retriever

from sentence_transformers import SentenceTransformer


# Sentence-embedding model used to encode both the article texts and the queries.
# Model card: https://huggingface.co/flax-sentence-embeddings/all_datasets_v3_mpnet-base
# Its pooling layer reports word_embedding_dimension 768 (see the repr printed
# below), which is the dimension the Pinecone index is created with.
retriever = SentenceTransformer(
    "flax-sentence-embeddings/all_datasets_v3_mpnet-base")

In [15]:
retriever

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
# Create Pinecone Index (only when it does not exist yet, so the cell can be
# re-run without a conflict error).
# dimension=768 must match the retriever's embedding size shown in its repr.
if "medium-data" not in pc.list_indexes().names():
    pc.create_index("medium-data", dimension=768, metric="cosine",
                    spec=ServerlessSpec(cloud="aws", region="us-east-1"))


In [17]:
# Re-acquire a handle to "medium-data": the handle from the setup cell was
# obtained before the index was deleted and recreated.
index= pc.Index("medium-data")

### Data Prep

In [22]:
from datasets import load_dataset

In [204]:
# Obtain Raw Data
#
# Pull the Medium articles CSV from the Hugging Face hub, drop incomplete rows,
# keep a reproducible 10k-row sample and add the text that gets embedded:
# "<title>.<first 1000 characters of the article>".
# Author's note on the 10k sample: might take 30mins to 1hr.

df = (
    load_dataset(
        "fabiochiu/medium-articles",
        data_files="medium_articles.csv",
        split="train",
    )
    .to_pandas()
    .dropna()
    .sample(10000, random_state=45)
    .assign(text_extended=lambda frame: frame['title'] + '.' + frame['text'].str[:1000])
)


Found cached dataset csv (/home/mohsin/.cache/huggingface/datasets/fabiochiu___csv/fabiochiu--medium-articles-96791ff68926910d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [18]:
# Fallback for slow connections: if `load_dataset` cannot fetch the data, use
# the uploaded file "medium_articles_10k.csv" instead.
# Source of data: https://www.kaggle.com/code/fabiochiusano/medium-articles-simple-data-analysis?select=medium_articles.csv
# It is the same underlying data.

import pandas as pd

# Drop incomplete rows, then keep a reproducible 1000-row sample.
df = pd.read_csv("medium_articles_10k.csv").dropna().sample(1000, random_state=45)

# Text that gets embedded: "<title>.<first 1000 characters of the article>".
df = df.assign(text_extended=df['title'] + '.' + df['text'].str.slice(stop=1000))

In [19]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,url,authors,timestamp,tags,text_extended
3577,3577,Why You’re Struggling with Innovation. And How...,The modern typewriter had a problem. When Chri...,https://jswilder16.medium.com/why-youre-strugg...,['Jake Wilder'],2019-04-22 01:46:42.469000+00:00,"['Management', 'Leadership', 'Innovation', 'Cr...",Why You’re Struggling with Innovation. And How...
7003,7003,#085 | Debugging Vue Sites Displaying Blank Pages,So I had this bright idea to consolidate all m...,https://medium.com/footprints-on-the-sand/085-...,['Kelvin Zhao'],2020-03-24 23:11:00.960000+00:00,"['Vuejs', 'Website Development', 'Debugging', ...",#085 | Debugging Vue Sites Displaying Blank Pa...
925,925,You won’t believe what I’m going to tell you a...,You won’t believe what I’m going to tell you a...,https://medium.com/pnewton84/he-discovered-a-n...,['Paul Newton'],2017-09-19 19:26:41.376000+00:00,"['Marketing', 'Digital Marketing', 'Facebook',...",You won’t believe what I’m going to tell you a...
3233,3233,How Emotionally Intelligent People Deal With T...,This reminds me of the time I was on the way h...,https://medium.com/curious/how-emotionally-int...,['Ayodeji Awosika'],2020-11-09 20:50:40.841000+00:00,"['Emotional Intelligence', 'Mental Health', 'I...",How Emotionally Intelligent People Deal With T...
3515,3515,What’s New in React 16 and Fiber Explanation,"Previously, React would block the entire threa...",https://medium.com/edge-coders/react-16-featur...,['Trey Huffine'],2018-10-30 13:57:05.603000+00:00,"['Code', 'Tech', 'React', 'JavaScript', 'Start...",What’s New in React 16 and Fiber Explanation.P...


In [20]:
# len(nlp(df_batch)) # list of lst

### NER Helper Function

In [21]:
# helper function for extracting entities of a batch of texts

def extract_entities(list_of_text):
    """Run the NER pipeline over each text and collect the entity strings.

    Parameters
    ----------
    list_of_text : iterable of str
        Documents to tag; each one is passed on its own to the notebook-level
        ``nlp`` pipeline.

    Returns
    -------
    list of list
        One inner list per document, holding the ``'word'`` field of every
        item the pipeline returned for it, in pipeline order (duplicates are
        kept). An empty input yields an empty list.
    """
    return [[hit['word'] for hit in nlp(text)] for text in list_of_text]

In [22]:
# embedding

# len(retriever.encode(df_batch))
# len(retriever.encode(df_batch[0])) # try for one doc
# embedding for batch
# emb = retriever.encode(df_batch).tolist() # array to python list

### Batch Upsert

In [23]:
# upsert data

from tqdm.auto import tqdm

batch_size = 64

# Embed, tag and upload the articles one batch at a time. The loop is wrapped
# in tqdm because encoding + NER over the whole frame is slow and otherwise
# gives no feedback.
for i in tqdm(range(0, len(df), batch_size), desc="upserting"):
    i_end = min(i + batch_size, len(df))  # exclusive end position of this batch

    # get a batch of data (a copy, so adding a column below does not touch df)
    df_batch = df.iloc[i:i_end].copy()
    texts = df_batch['text_extended'].tolist()

    # embedding
    emb = retriever.encode(texts).tolist()  # array to python list

    # ner extraction: one list of entity strings per document
    entities = extract_entities(texts)

    # remove duplicate entities; dict.fromkeys keeps first-seen order, so the
    # stored metadata is deterministic (list(set(...)) is not)
    df_batch['named_entity'] = [list(dict.fromkeys(entity)) for entity in entities]

    # create meta data: every remaining column except the full article text
    df_batch = df_batch.drop('text', axis=1)
    meta_data = df_batch.to_dict(orient='records')  # one dict per row

    # create ids from the row positions within df
    ids = [f"{idx}" for idx in range(i, i_end)]

    # upsert (id, vector, metadata) tuples
    vectors_to_upsert = list(zip(ids, emb, meta_data))

    _ = index.upsert(vectors=vectors_to_upsert)
    
    



In [207]:
# index.describe_index_stats()

### Query data

In [27]:
query = "How to make a Wordpress website?"  # natural-language question to search for

emb_qx = retriever.encode(query).tolist() # query embedding as a plain python list

ne = extract_entities([query])[0] # named entities found in the query, used as the search filter

In [28]:
# Restrict matches to articles whose `named_entity` metadata shares at least
# one entity with the query. When the query contains no entities, the filter is
# omitted: an empty "$in" list has nothing to match against, so the search
# falls back to plain semantic similarity instead of returning nothing.
entity_filter = {"named_entity": {"$in": ne}} if ne else None
xc = index.query(vector=emb_qx, top_k=5, include_metadata=True,
                 filter=entity_filter)

In [29]:
# With only ~1k upserted articles there may be no match at all for this query,
# simply because the sample is too small. Load more data, or adapt the query to
# what is actually indexed (browse the `text_extended` field of your vectors in
# the Pinecone console).
for match in xc['matches']:
    print(match['score'], " ", match['metadata']['named_entity'])

In [48]:
query = "How to learn NLP?"  # natural-language question to search for

emb_qx = retriever.encode(query).tolist() # query embedding as a plain python list

ne = extract_entities([query])[0] # named entities found in the query, used as the search filter

# Keep only articles sharing at least one named entity with the query. When the
# query contains no entities, the filter is omitted: an empty "$in" list has
# nothing to match against, so the search falls back to plain semantic similarity.
entity_filter = {"named_entity": {"$in": ne}} if ne else None
xc = index.query(vector=emb_qx, top_k=5, include_metadata=True,
                 filter=entity_filter)

In [49]:
# One line per match: similarity score, then the entities stored with the article.
for match in xc['matches']:
    print(match['score'], " ", match['metadata']['named_entity'])

0.28366977   ['Allen', 'Allen NLP', 'NLP', 'PyTorch']
0.263163775   ['Python', 'Harnham', 'NLP', 'LDA', 'Science', 'London', 'Datatech Analytics', 'Data']
0.241720855   ['NLP', 'Lambda Layer']
0.236823067   ['The Dark Tower', 'It', 'NLP', 'Text Mining', 'Carrie', 'Stephen King', 'King', 'Under the Dome', 'The Shining']
0.211078629   ['AI', 'ML', 'NLP', 'Your Weekly AI', 'Machine Learning and Data Science']
