In [1]:
# Install the notebook's dependencies listed in requirements.txt into the
# kernel's environment (-q keeps pip's output short).
%pip install -r requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
import pandas as pd
import json
import xml.etree.ElementTree as ET

# Namespace for ArXiv's Atom-based XML format.
# ElementTree addresses namespaced tags as '{namespace-uri}tag', so this prefix
# is prepended to every tag name passed to find()/findall().
ARXIV_NAMESPACE = '{http://www.w3.org/2005/Atom}'

def extract_from_arxiv(search_query='cat:cs.AI', max_results=100, json_file_path='files/arxiv_dataset.json',
                       timeout=30):
    """
    Fetches papers from the ArXiv API based on a search query, saves them as JSON,
    and returns a pandas DataFrame.

    Args:
        search_query (str): The search query for ArXiv (default is 'cat:cs.AI').
        max_results (int): The maximum number of results to retrieve (default is 100).
        json_file_path (str): File path where JSON data will be saved. Missing parent
                              directories are created.
        timeout (float): Seconds to wait for the ArXiv API before giving up (default is 30).

    Returns:
        pd.DataFrame: DataFrame with the columns 'title', 'summary', 'authors', 'arxiv_id',
                      'url' and 'pdf_link' (one row per paper; the columns are present even
                      when the query returns no entries).

    Raises:
        requests.exceptions.RequestException: If the request fails, times out, or returns
                                              an HTTP error status.
        xml.etree.ElementTree.ParseError: If the response body is not well-formed XML.
    """
    import os  # Local import: only needed here to create the output directory.

    columns = ['title', 'summary', 'authors', 'arxiv_id', 'url', 'pdf_link']

    def _child_text(parent, tag):
        """Return the stripped text of the Atom child `tag`, or '' when absent or empty."""
        elem = parent.find(f'{ARXIV_NAMESPACE}{tag}')
        return elem.text.strip() if elem is not None and elem.text else ''

    # Construct the URL for the API request.
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&max_results={max_results}'

    # Send a GET request to the ArXiv API; the timeout prevents the cell from hanging forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the XML response.
    root = ET.fromstring(response.content)

    papers = []

    # Loop through each "entry" in the XML, representing a single paper.
    for entry in root.findall(f'{ARXIV_NAMESPACE}entry'):
        # Author names are stripped like the title and summary; empty names are dropped.
        author_names = (_child_text(author, 'name') for author in entry.findall(f'{ARXIV_NAMESPACE}author'))
        authors = [name for name in author_names if name]

        # The entry id is the abstract URL, e.g. http://arxiv.org/abs/2012.12104v1.
        paper_url = _child_text(entry, 'id')

        # Old-style identifiers contain a slash (e.g. .../abs/cs/9901001v1), so take
        # everything after '/abs/' instead of only the last path segment.
        if '/abs/' in paper_url:
            arxiv_id = paper_url.split('/abs/', 1)[1]
        else:
            arxiv_id = paper_url.rsplit('/', 1)[-1] if paper_url else ''

        # Check for the PDF link (None when the entry has no link titled 'pdf').
        pdf_link = next((link.attrib.get('href') for link in entry.findall(f'{ARXIV_NAMESPACE}link')
                         if link.attrib.get('title') == 'pdf'), None)

        papers.append({
            'title': _child_text(entry, 'title'),
            'summary': _child_text(entry, 'summary'),
            'authors': authors,
            'arxiv_id': arxiv_id,
            'url': paper_url,
            'pdf_link': pdf_link
        })

    # Convert list into a pandas DataFrame with a stable column set.
    df = pd.DataFrame(papers, columns=columns)

    # Make sure the target directory exists before writing (a fresh checkout may lack it).
    output_dir = os.path.dirname(json_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the records to a JSON file.
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=4)
    print(f'Data saved to {json_file_path} ...')

    return df

In [3]:
# Fetch 20 papers for the default query ('cat:cs.AI') and save them to the default JSON path.
df = extract_from_arxiv(max_results=20)

Data saved to files/arxiv_dataset.json ...


In [4]:
import json
# Reload the saved records from disk. The file was written as UTF-8 with
# ensure_ascii=False, so it must be read back as UTF-8 explicitly rather than
# with the platform's default encoding.
file_name = 'files/arxiv_dataset.json'
with open(file_name, 'r', encoding='utf-8') as file:
    data = json.load(file)

print(data)

[{'title': 'A Deep Reinforcement Learning Approach for Ramp Metering Based on Traffic Video Data', 'summary': 'Ramp metering that uses traffic signals to regulate vehicle flows from the on-ramps has been widely implemented to improve vehicle mobility of the freeway. Previous studies generally update signal timings in real-time based on predefined traffic measures collected by point detectors, such as traffic volumes and occupancies. Comparing with point detectors, traffic cameras-which have been increasingly deployed on road networks-could cover larger areas and provide more detailed traffic information. In this work, we propose a deep reinforcement learning (DRL) method to explore the potential of traffic video data in improving the efficiency of ramp metering. The proposed method uses traffic video frames as inputs and learns the optimal control strategies directly from the high-dimensional visual inputs. A real-world case study demonstrates that, in comparison with a state-of-the-pr

In [5]:
import pandas as pd
# Rebuild the DataFrame from the records reloaded from JSON.
df = pd.DataFrame(data)
# A fixed random_state keeps the preview identical across Restart & Run All.
df.sample(n=5, random_state=42)

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link
14,Neural document expansion for ad-hoc informati...,"Recently, Nogueira et al. [2019] proposed a ne...","[Cheng Tang, Andrew Arnold]",2012.14005v1,http://arxiv.org/abs/2012.14005v1,https://arxiv.org/pdf/2012.14005v1
19,AI-Powered Text Generation for Harmonious Huma...,"In the last two decades, the landscape of text...","[Qiuyun Zhang, Bin Guo, Hao Wang, Yunji Liang,...",1905.01984v1,http://arxiv.org/abs/1905.01984v1,https://arxiv.org/pdf/1905.01984v1
11,Toward Compact Data from Big Data,Bigdata is a dataset of which size is beyond t...,"[ Song-Kyoo, Kim]",2012.13677v1,http://arxiv.org/abs/2012.13677v1,https://arxiv.org/pdf/2012.13677v1
16,How to define co-occurrence in different domai...,This position paper presents a comparative stu...,[Mathieu Roche],1904.08010v1,http://arxiv.org/abs/1904.08010v1,https://arxiv.org/pdf/1904.08010v1
8,Dynamic-K Recommendation with Personalized Dec...,"In this paper, we investigate the recommendati...","[Yan Gao, Jiafeng Guo, Yanyan Lan, Huaming Liao]",2012.13569v1,http://arxiv.org/abs/2012.13569v1,https://arxiv.org/pdf/2012.13569v1


In [6]:
import pandas as pd
import requests
import os

def download_pdfs(df, download_folder='files', timeout=30):
    """
    Downloads PDFs from URLs listed in the DataFrame and saves them to a specified folder.
    The file names are stored in a new column 'pdf_file_name' in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing a 'pdf_link' column with URLs to download.
        download_folder (str): Path to the folder where PDFs will be saved (default is 'files').
        timeout (float): Seconds to wait for each download before giving up (default is 30).

    Returns:
        pd.DataFrame: The same DataFrame object (modified in place) with an additional
                      'pdf_file_name' column containing the paths of the downloaded PDF
                      files, or None where the link was missing or the download failed.
    """

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(download_folder, exist_ok=True)

    pdf_file_names = []

    # Loop through each row to download PDFs
    for _, row in df.iterrows():
        pdf_link = row['pdf_link']

        # Entries without a PDF link carry None/NaN; skip them explicitly instead of
        # relying on requests to reject a non-URL.
        if not isinstance(pdf_link, str) or not pdf_link:
            print('Failed to download the PDF: no PDF link available')
            pdf_file_names.append(None)
            continue

        # Name the file after the last URL segment; only add '.pdf' when it is missing.
        base_name = pdf_link.rstrip('/').split('/')[-1]
        if not base_name.lower().endswith('.pdf'):
            base_name += '.pdf'
        file_name = os.path.join(download_folder, base_name)

        try:
            response = requests.get(pdf_link, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f'Failed to download the PDF: {e}')
            pdf_file_names.append(None)
            continue

        # Save the downloaded PDF, and record its path only once it is on disk.
        with open(file_name, 'wb') as f:
            f.write(response.content)
        pdf_file_names.append(file_name)

        print(f'PDF downloaded successfully and saved as {file_name}')

    df['pdf_file_name'] = pdf_file_names

    return df

In [7]:
# Download every paper's PDF into the default 'files' folder; the returned frame
# carries the local paths in the new 'pdf_file_name' column.
df = download_pdfs(df)

PDF downloaded successfully and saved as files/2012.12104v1.pdf
PDF downloaded successfully and saved as files/2012.13026v1.pdf
PDF downloaded successfully and saved as files/2012.13293v1.pdf
PDF downloaded successfully and saved as files/2012.13315v1.pdf
PDF downloaded successfully and saved as files/2012.13391v2.pdf
PDF downloaded successfully and saved as files/2012.12447v1.pdf
PDF downloaded successfully and saved as files/2012.12634v1.pdf
PDF downloaded successfully and saved as files/2012.11903v1.pdf
PDF downloaded successfully and saved as files/2012.13569v1.pdf
PDF downloaded successfully and saved as files/2012.12718v1.pdf
PDF downloaded successfully and saved as files/2012.13666v1.pdf
PDF downloaded successfully and saved as files/2012.13677v1.pdf
PDF downloaded successfully and saved as files/2012.13779v1.pdf
PDF downloaded successfully and saved as files/2012.13872v1.pdf
PDF downloaded successfully and saved as files/2012.14005v1.pdf
PDF downloaded successfully and saved as

In [8]:
# Inspect the frame, which now includes the 'pdf_file_name' column.
df

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link,pdf_file_name
0,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,https://arxiv.org/pdf/2012.12104v1,files/2012.12104v1.pdf
1,Rethink AI-based Power Grid Control: Diving In...,"Recently, deep reinforcement learning (DRL)-ba...","[Xiren Zhou, Siqi Wang, Ruisheng Diao, Desong ...",2012.13026v1,http://arxiv.org/abs/2012.13026v1,https://arxiv.org/pdf/2012.13026v1,files/2012.13026v1.pdf
2,Fuzzy Commitments Offer Insufficient Protectio...,"In this work, we study the protection that fuz...","[Danny Keller, Margarita Osadchy, Orr Dunkelman]",2012.13293v1,http://arxiv.org/abs/2012.13293v1,https://arxiv.org/pdf/2012.13293v1,files/2012.13293v1.pdf
3,Generalization in portfolio-based algorithm se...,Portfolio-based algorithm selection has seen t...,"[Maria-Florina Balcan, Tuomas Sandholm, Ellen ...",2012.13315v1,http://arxiv.org/abs/2012.13315v1,https://arxiv.org/pdf/2012.13315v1,files/2012.13315v1.pdf
4,"I like fish, especially dolphins: Addressing C...",To quantify how well natural language understa...,"[Yixin Nie, Mary Williamson, Mohit Bansal, Dou...",2012.13391v2,http://arxiv.org/abs/2012.13391v2,https://arxiv.org/pdf/2012.13391v2,files/2012.13391v2.pdf
5,Skeleton-based Approaches based on Machine Vis...,"Recently, skeleton-based approaches have achie...","[Jie Li, Binglin Li, Min Gao]",2012.12447v1,http://arxiv.org/abs/2012.12447v1,https://arxiv.org/pdf/2012.12447v1,files/2012.12447v1.pdf
6,Overview of FPGA deep learning acceleration ba...,"In recent years, deep learning has become more...",[Simin Liu],2012.12634v1,http://arxiv.org/abs/2012.12634v1,https://arxiv.org/pdf/2012.12634v1,files/2012.12634v1.pdf
7,Modelling Human Routines: Conceptualising Soci...,Our routines play an important role in a wide ...,"[Rijk Mercuur, Virginia Dignum, Catholijn M. J...",2012.11903v1,http://arxiv.org/abs/2012.11903v1,https://arxiv.org/pdf/2012.11903v1,files/2012.11903v1.pdf
8,Dynamic-K Recommendation with Personalized Dec...,"In this paper, we investigate the recommendati...","[Yan Gao, Jiafeng Guo, Yanyan Lan, Huaming Liao]",2012.13569v1,http://arxiv.org/abs/2012.13569v1,https://arxiv.org/pdf/2012.13569v1,files/2012.13569v1.pdf
9,Compliance Generation for Privacy Documents un...,Most prominent research today addresses compli...,"[David Restrepo Amariles, Aurore Clément Trous...",2012.12718v1,http://arxiv.org/abs/2012.12718v1,https://arxiv.org/pdf/2012.12718v1,files/2012.12718v1.pdf


In [9]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def load_and_chunk_pdf(pdf_file_name, chunk_size=512, chunk_overlap=64):
    """
    Loads a PDF file and splits its content into overlapping chunks of a specified size.

    Args:
        pdf_file_name (str): Path to the PDF file to be loaded.
        chunk_size (int): The maximum size of each chunk in characters (default is 512).
        chunk_overlap (int): Number of characters shared by consecutive chunks so that
                             context is preserved across chunk boundaries (default is 64).

    Returns:
        List[Document]: A list of document chunks.
    """

    print(f'Loading and splitting into chunks: {pdf_file_name}')

    # Load the content of the PDF
    loader = PyPDFLoader(pdf_file_name)
    data = loader.load()

    # Split the content into chunks with slight overlap to preserve context
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(data)

    return chunks


In [10]:
def expand_df(df):
    """
    Expands each row in the DataFrame by splitting PDF documents into chunks.

    Rows without a downloaded PDF (missing 'pdf_file_name') and rows whose PDF cannot be
    loaded are skipped with a message, so one bad file does not abort the whole run.

    Args:
        df (pd.DataFrame): DataFrame containing 'pdf_file_name', 'arxiv_id', 'title', 'summary',
                           'authors', and 'url' columns.

    Returns:
        pd.DataFrame: A new DataFrame where each row represents a chunk of the original document,
                      with the columns 'id', 'title', 'summary', 'authors', 'arxiv_id', 'url',
                      'chunk', 'prechunk_id' and 'postchunk_id'. The columns are present even
                      when no chunk was produced.
    """

    columns = ['id', 'title', 'summary', 'authors', 'arxiv_id', 'url',
               'chunk', 'prechunk_id', 'postchunk_id']
    expanded_rows = []  # List to store expanded rows with chunk information

    # Loop through each row in the DataFrame
    for _, row in df.iterrows():
        pdf_file_name = row['pdf_file_name']
        arxiv_id = row['arxiv_id']

        # A failed download leaves None/NaN here; there is nothing to load for such rows.
        if not isinstance(pdf_file_name, str) or not pdf_file_name:
            print(f"Skipping {arxiv_id}: no downloaded PDF available")
            continue

        # Deliberately broad: any loader/parser failure skips this file only.
        try:
            chunks = load_and_chunk_pdf(pdf_file_name)
        except Exception as e:
            print(f"Error processing file {pdf_file_name}: {e}")
            continue

        last_index = len(chunks) - 1

        # Loop over the chunks and construct a new DataFrame row for each
        for i, chunk in enumerate(chunks):
            expanded_rows.append({
                'id': f"{arxiv_id}#{i}",  # Unique chunk identifier
                'title': row['title'],
                'summary': row['summary'],
                'authors': row['authors'],
                'arxiv_id': arxiv_id,
                'url': row['url'],
                'chunk': chunk.page_content,  # Text content of the chunk
                # Neighbour links are empty strings at the document boundaries.
                'prechunk_id': f"{arxiv_id}#{i - 1}" if i > 0 else '',
                'postchunk_id': f"{arxiv_id}#{i + 1}" if i < last_index else '',
            })

    # Return a new expanded DataFrame with a stable column set
    return pd.DataFrame(expanded_rows, columns=columns)


In [11]:
# Split every downloaded PDF into chunks; the result has one row per chunk.
expanded_df = expand_df(df)

Loading and splitting into chunks: files/2012.12104v1.pdf
Loading and splitting into chunks: files/2012.13026v1.pdf
Loading and splitting into chunks: files/2012.13293v1.pdf
Loading and splitting into chunks: files/2012.13315v1.pdf
Loading and splitting into chunks: files/2012.13391v2.pdf
Loading and splitting into chunks: files/2012.12447v1.pdf
Loading and splitting into chunks: files/2012.12634v1.pdf
Loading and splitting into chunks: files/2012.11903v1.pdf
Loading and splitting into chunks: files/2012.13569v1.pdf
Loading and splitting into chunks: files/2012.12718v1.pdf
Loading and splitting into chunks: files/2012.13666v1.pdf
Loading and splitting into chunks: files/2012.13677v1.pdf
Loading and splitting into chunks: files/2012.13779v1.pdf
Loading and splitting into chunks: files/2012.13872v1.pdf
Loading and splitting into chunks: files/2012.14005v1.pdf
Loading and splitting into chunks: files/1904.07934v2.pdf
Loading and splitting into chunks: files/1904.08010v1.pdf
Loading and sp

In [12]:
# Inspect the chunk-level frame (one row per chunk, with links to neighbouring chunks).
expanded_df

Unnamed: 0,id,title,summary,authors,arxiv_id,url,chunk,prechunk_id,postchunk_id
0,2012.12104v1#0,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,1 \nA Deep Reinforcement Learning Approach for...,,2012.12104v1#1
1,2012.12104v1#1,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,Abstract \nRamp metering that uses traffic sig...,2012.12104v1#0,2012.12104v1#2
2,2012.12104v1#2,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,and provide more detailed traffic information....,2012.12104v1#1,2012.12104v1#3
3,2012.12104v1#3,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,method results in 1) lower travel times in the...,2012.12104v1#2,2012.12104v1#4
4,2012.12104v1#4,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,2 \nIntroduction \nRamp metering uses traffic ...,2012.12104v1#3,2012.12104v1#5
...,...,...,...,...,...,...,...,...,...
2195,1905.01984v1#123,AI-Powered Text Generation for Harmonious Huma...,"In the last two decades, the landscape of text...","[Qiuyun Zhang, Bin Guo, Hao Wang, Yunji Liang,...",1905.01984v1,http://arxiv.org/abs/1905.01984v1,Topic aware neural response generation . In Th...,1905.01984v1#122,1905.01984v1#124
2196,1905.01984v1#124,AI-Powered Text Generation for Harmonious Huma...,"In the last two decades, the landscape of text...","[Qiuyun Zhang, Bin Guo, Hao Wang, Yunji Liang,...",1905.01984v1,http://arxiv.org/abs/1905.01984v1,Personalized response generation via domain ad...,1905.01984v1#123,1905.01984v1#125
2197,1905.01984v1#125,AI-Powered Text Generation for Harmonious Huma...,"In the last two decades, the landscape of text...","[Qiuyun Zhang, Bin Guo, Hao Wang, Yunji Liang,...",1905.01984v1,http://arxiv.org/abs/1905.01984v1,generative adversarial nets with policy gradie...,1905.01984v1#124,1905.01984v1#126
2198,1905.01984v1#126,AI-Powered Text Generation for Harmonious Huma...,"In the last two decades, the landscape of text...","[Qiuyun Zhang, Bin Guo, Hao Wang, Yunji Liang,...",1905.01984v1,http://arxiv.org/abs/1905.01984v1,pets too? arXiv preprint arXiv:1801.07243. \n...,1905.01984v1#125,1905.01984v1#127


In [13]:
from dotenv import load_dotenv, find_dotenv

# Load the API keys from .env
# NOTE(review): override=True presumably lets values from .env replace variables
# that are already set in the environment — confirm against python-dotenv docs.
load_dotenv(find_dotenv(), override=True)

True

In [14]:
import os
from getpass import getpass

from semantic_router.encoders import OpenAIEncoder

# Prompt for the OpenAI key only when the environment does not already provide one
if not os.getenv('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

# Embedding encoder backed by the 'text-embedding-3-small' model
encoder = OpenAIEncoder(name='text-embedding-3-small')

In [15]:
# Smoke test: embed a single string; the result is a list holding one embedding vector.
encoder(['hello hallo hola salut'])

[[0.018583279103040695,
  -0.025987325236201286,
  0.01025006826967001,
  -0.031957387924194336,
  -0.004148314706981182,
  -0.04852138087153435,
  -0.01638840325176716,
  0.04334147274494171,
  -0.03148914501070976,
  -0.018451586365699768,
  0.0052201454527676105,
  -0.08088847994804382,
  0.005318915005773306,
  -0.012496157549321651,
  -0.010967060923576355,
  0.09593068808317184,
  -0.0387176014482975,
  0.025460554286837578,
  -0.0030234409496188164,
  0.04275617375969887,
  0.04527296498417854,
  0.011076805181801319,
  -0.0395662896335125,
  0.03997599706053734,
  0.03169400244951248,
  0.029118681326508522,
  0.024977682158350945,
  0.015583615750074387,
  0.014076467603445053,
  -0.013761868700385094,
  0.04544855281710625,
  -0.05346716567873955,
  -0.01498368289321661,
  0.025050844997167587,
  -0.0031313556246459484,
  -0.020851315930485725,
  -0.024348484352231026,
  0.009284323081374168,
  0.007041892036795616,
  -0.016915174201130867,
  -0.0011724292999133468,
  -0.0330

In [16]:
# Embed one probe string and measure the vector length to learn the embedding dimension
probe_embedding = encoder(['hello hallo hola salut'])[0]
dims = len(probe_embedding)
dims

1536

In [17]:
from pinecone import Pinecone, ServerlessSpec

# Take the Pinecone key from the environment; prompt only when it is unset or empty
api_key = os.getenv('PINECONE_API_KEY')
if not api_key:
    api_key = getpass('Pinecone API key: ')

# Pinecone client used for all index operations
pc = Pinecone(api_key=api_key)

# Serverless deployment target: AWS, region 'us-east-1'
spec = ServerlessSpec(cloud='aws', region='us-east-1')



In [18]:
import time

# Name of the Pinecone index used by this notebook
index_name = 'langgraph-research-agent'

# Create the index only when it is not already present
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    pc.create_index(
        index_name,
        dimension=dims,  # Embedding dimension (1536)
        metric='cosine',
        spec=spec,  # Cloud provider and region specification
    )

    # Poll once per second until the index reports that it is ready
    index_ready = False
    while not index_ready:
        description = pc.describe_index(index_name)
        index_ready = bool(description and description.status and description.status.get('ready'))
        if not index_ready:
            time.sleep(1)

# Handle to the index for upserts and queries
index = pc.Index(index_name)

# Brief pause before asking for statistics
time.sleep(1)

# Show the current index statistics
index.describe_index_stats()


{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '151',
                                    'content-type': 'application/json',
                                    'date': 'Thu, 05 Feb 2026 10:22:14 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '40',
                                    'x-pinecone-request-latency-ms': '47',
                                    'x-pinecone-response-duration-ms': '49'}},
 'dimension': 1536,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'storageFullness': 0.0,
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [19]:
# Preview the first five chunk rows
expanded_df.head(5)

Unnamed: 0,id,title,summary,authors,arxiv_id,url,chunk,prechunk_id,postchunk_id
0,2012.12104v1#0,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,1 \nA Deep Reinforcement Learning Approach for...,,2012.12104v1#1
1,2012.12104v1#1,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,Abstract \nRamp metering that uses traffic sig...,2012.12104v1#0,2012.12104v1#2
2,2012.12104v1#2,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,and provide more detailed traffic information....,2012.12104v1#1,2012.12104v1#3
3,2012.12104v1#3,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,method results in 1) lower travel times in the...,2012.12104v1#2,2012.12104v1#4
4,2012.12104v1#4,A Deep Reinforcement Learning Approach for Ram...,Ramp metering that uses traffic signals to reg...,"[Bing Liu, Yu Tang, Yuxiong Ji, Yu Shen, Yuchu...",2012.12104v1,http://arxiv.org/abs/2012.12104v1,2 \nIntroduction \nRamp metering uses traffic ...,2012.12104v1#3,2012.12104v1#5


In [21]:
from tqdm.auto import tqdm

data = expanded_df
batch_size = 64  # Number of chunks embedded and uploaded per request

# Walk over the frame one batch at a time; tqdm renders the progress bar
for start in tqdm(range(0, len(data), batch_size)):
    # Positional slice of at most batch_size rows, as a list of row dicts
    records = data.iloc[start:start + batch_size].to_dict(orient='records')

    # Embed the chunk texts of this batch
    texts = [record['chunk'] for record in records]
    vectors = encoder(texts)

    # Pair each chunk ID with its embedding and the metadata stored alongside it
    payload = [
        (
            record['id'],
            vector,
            {
                'arxiv_id': record['arxiv_id'],
                'title': record['title'],
                'chunk': record['chunk'],
            },
        )
        for record, vector in zip(records, vectors)
    ]

    # Upload embeddings, IDs, and metadata to Pinecone
    index.upsert(vectors=payload)


  0%|          | 0/35 [00:00<?, ?it/s]



In [22]:
# Display the index statistics.
# NOTE(review): after the upsert, total_vector_count is expected to equal
# len(expanded_df) if the index was empty beforehand — confirm.
index.describe_index_stats()

{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '188',
                                    'content-type': 'application/json',
                                    'date': 'Thu, 05 Feb 2026 10:28:32 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '49',
                                    'x-pinecone-request-latency-ms': '48',
                                    'x-pinecone-response-duration-ms': '50'}},
 'dimension': 1536,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 2200}},
 'storageFullness': 0.0,
 'total_vector_count': 2200,
 'vector_type': 'dense'}