In [1]:
from dotenv import load_dotenv
import pandas as pd
from datasets import load_dataset
import os
from tqdm.auto import tqdm
import pinecone

In [2]:
# load_dotenv() (python-dotenv) is expected to populate the process
# environment from a local .env file; the two credentials are then read
# from the environment and are None when the variable is not set.
load_dotenv()
PINECONE_KEY = os.environ.get("PINECONE_KEY")
hf_token = os.environ.get("HF_TOKEN")

In [3]:
# Identifier of the sentence-embedding model requested from the Hugging Face
# Inference API. `hf_token` is already defined by the credentials cell, so it
# is not re-assigned here.
model_id = "sentence-transformers/all-MiniLM-L6-v2"

In [4]:
import requests

# Feature-extraction endpoint for the selected model, and the bearer-token
# header that accompanies every request to it.
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = dict(Authorization=f"Bearer {hf_token}")

In [5]:
def query(texts, timeout=120):
    """Request embeddings for ``texts`` from the hosted feature-extraction API.

    Args:
        texts: The inputs to embed; this notebook always passes a list of
            strings.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up. Generous by default because the request asks the
            service to wait for the model to load.

    Returns:
        The decoded JSON body of the response. For a list of strings the
        sample run in this notebook shows a list of float vectors, one per
        input.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    payload = {"inputs": texts, "options": {"wait_for_model": True}}
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP failures here instead of returning an error payload that
    # downstream cells would mistake for embeddings.
    response.raise_for_status()
    return response.json()

In [6]:
# Small sample batch sent before the real workload; per the second sample
# string, its purpose is to get the hosted model loaded ahead of the main
# embedding run.
sample_texts = [
    "This is the first sample text",
    "These texts are required to load the model without errors during the main embedding calculations"
]
sample_output = query(sample_texts)

In [7]:
# Display the raw response for the sample texts (nested lists of floats).
sample_output

[[-0.02273164875805378,
  0.0477682463824749,
  -0.00668375426903367,
  0.03634946048259735,
  0.06079265847802162,
  0.0396263562142849,
  0.060343846678733826,
  0.03269622102379799,
  0.032260194420814514,
  -0.017252592369914055,
  0.054471466690301895,
  0.009407194331288338,
  -0.013869247399270535,
  -0.050972700119018555,
  0.009013917297124863,
  0.04846423491835594,
  0.06679240614175797,
  -0.058826882392168045,
  0.004202724434435368,
  -8.080745465122163e-05,
  0.048516009002923965,
  0.08671743422746658,
  0.06877390295267105,
  0.015224380418658257,
  0.019572166725993156,
  0.05134625732898712,
  -0.03643522784113884,
  0.07847463339567184,
  0.11318924278020859,
  -0.04334423691034317,
  -0.024414412677288055,
  0.05804324150085449,
  0.17109976708889008,
  0.03223296254873276,
  0.08333372324705124,
  0.010822674259543419,
  -0.019283119589090347,
  0.03212469443678856,
  0.027633683755993843,
  0.09738506376743317,
  0.012617363594472408,
  -0.0943077951669693,
  0.0

In [8]:
# Length of the first embedding vector in the sample response; used further
# down as the `dimension` of the Pinecone index.
EMBEDDING_LENGTH = len(sample_output[0])

In [9]:
import zipfile

# Unpack the archive into the current working directory. The context manager
# closes the file handle even when extraction fails part-way through.
with zipfile.ZipFile('data_zip.zip') as zipref:
    zipref.extractall()

In [10]:
# Load the first 15,000 records of the arXiv metadata JSON file
# (presumably the file extracted from data_zip.zip — confirm).
data_files = "arxiv-metadata-oai-snapshot.json"
arxiv_dataset = load_dataset("json", data_files=data_files, split="train[:15000]")
arxiv_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed'],
    num_rows: 15000
})

In [11]:
# Character count of the first abstract, as a rough idea of input size.
print("Length of Abstract:", len(arxiv_dataset[0]['abstract']))

Length of Abstract: 983


In [12]:
# Keep only the fields needed for search and display; drop everything else.
columns = arxiv_dataset.column_names
columns_to_keep = ["title", "doi", "abstract", 'id']
# Plain difference: columns that exist but are not wanted. A symmetric
# difference would also contain any wanted column that is missing from the
# dataset, asking remove_columns to drop a column that does not exist.
# Building a list keeps the dataset's own column order.
columns_to_remove = [name for name in columns if name not in columns_to_keep]
arxiv_dataset = arxiv_dataset.remove_columns(columns_to_remove)
arxiv_dataset

Dataset({
    features: ['id', 'title', 'doi', 'abstract'],
    num_rows: 15000
})

In [15]:
from pinecone import Pinecone, ServerlessSpec

index_name = 'semantic-search'

# Open a client session with Pinecone (API keys are issued at app.pinecone.io).
pc = Pinecone(api_key=PINECONE_KEY)

# Create the index only when it is not already listed; otherwise reuse it.
if index_name not in pc.list_indexes().names():
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(index_name, dimension=EMBEDDING_LENGTH, spec=serverless_spec)

# Handle used by the later cells for upserts and queries.
index = pc.Index(index_name)

In [None]:
batch_size = 32  # process everything in batches of 32
num_rows = len(arxiv_dataset)

for i in tqdm(range(0, num_rows, batch_size)):
    # end position of this batch (the final batch may be shorter)
    i_end = min(i + batch_size, num_rows)
    # Slice the rows once: a Dataset slice is a dict of column name -> list,
    # which avoids materialising three whole columns on every iteration.
    batch = arxiv_dataset[i:i_end]
    abstracts_batch = batch['abstract']
    titles_batch = batch['title']
    ids_batch = batch['id']

    # create embeddings
    embeds = query(abstracts_batch)
    # An error payload or a short response would otherwise be silently zipped
    # against the ids below, so stop before anything is written to the index.
    if not isinstance(embeds, list) or len(embeds) != len(ids_batch):
        raise RuntimeError(
            f"Embedding request for rows {i}-{i_end - 1} returned an "
            f"unexpected payload: {repr(embeds)[:200]}"
        )

    # prep metadata and upsert batch
    meta = [{'abstract': abstract, 'title': title} for abstract, title in zip(abstracts_batch, titles_batch)]

    # one (id, vector, metadata) tuple per paper
    to_upsert = list(zip(ids_batch, embeds, meta))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

In [16]:
# Embed the search query through the same query() helper used for the
# abstracts, so it is comparable with the indexed vectors.
queries = ["I-V characteristics of MgB2"]
xq = query(queries)
xq

[[-0.09614157676696777,
  -0.003704544622451067,
  0.06417614221572876,
  0.00023344335204456002,
  0.023335356265306473,
  -0.01745571941137314,
  0.028246592730283737,
  0.07575574517250061,
  -0.11157209426164627,
  -0.04028768464922905,
  -0.019049065187573433,
  0.016606466844677925,
  0.08244995772838593,
  -0.061306316405534744,
  0.026029514148831367,
  -0.027534207329154015,
  0.12642726302146912,
  -0.003221851075068116,
  0.031423356384038925,
  0.10328345745801926,
  0.04525872319936752,
  0.024488545954227448,
  0.005053304135799408,
  -0.036212027072906494,
  -0.028623659163713455,
  0.07176884263753891,
  0.04333584010601044,
  -0.0030004940927028656,
  0.032715559005737305,
  -0.08088089525699615,
  0.06336891651153564,
  0.02597382850944996,
  -0.022972779348492622,
  -0.03029542602598667,
  0.03192710131406784,
  -0.029109779745340347,
  0.014475992880761623,
  0.028983894735574722,
  -0.06640126556158066,
  -0.08498173207044601,
  -0.004938329569995403,
  -0.00992825

In [17]:
from pinecone import Pinecone

# Rebuild the client and index handle so the search cells below can run
# without executing the index-creation cell first.
index_name = 'semantic-search'
pc = Pinecone(api_key=PINECONE_KEY)
index = pc.Index(index_name)

In [18]:
# `xq` holds one embedding per query string, while `vector` takes a single
# flat list of floats, so pass the embedding of the first (only) query.
res = index.query(vector=xq[0], top_k=5, include_metadata=True)
res

{'matches': [{'id': '0705.4229',
              'metadata': {'abstract': '  Precursor MgB2 thin films were '
                                       'prepared on sapphire substrates by '
                                       'magnetron\n'
                                       'sputtering. Influence of ex-situ '
                                       'annealing process on superconducting '
                                       'MgB2 thin\n'
                                       'films roughness is discussed. '
                                       'Optimized annealing process of MgB '
                                       'precursor thin\n'
                                       'films in vacuum results in smooth '
                                       'superconducting MgB2 thin films with\n'
                                       'roughness below 10 nm, critical '
                                       'temperature Tcon = 31 K and transition '
                                     

In [30]:
# List the fields present on the top match, skipping the raw vector values.
for field in res['matches'][0].to_dict():
    if field == 'values':
        continue
    print(field)

id
score
metadata
