In [None]:
import os

# Set environment variables
os.environ['PINECONE_API_KEY'] = 'your_pinecone_api_key_here'
os.environ['PINECONE_BASE_URL'] = 'your_pinecone_base_url_here'
os.environ['PINECONE_INDEX_NAME'] = 'your_pinecone_index_name_here'

In [24]:
import pandas as pd

# Load the CSV data
df = pd.read_csv('yc_startups_aggregated_by_company.csv')

# Function to prepare text for embedding
def prepare_text(row):
    # Prepare all fields
    all_fields = [
        'web-scraper-order', 'web-scraper-start-url', 'one-liner', 'batch', 'location', 'type', 'categories',
        'profile-href', 'url-href', 'status', 'founded', 'description', 'launches', 'company-linkedin-href',
        'company-twitter-href', 'company-crunchbase-href', 'company-github', 'number-jobs', 'team-size',
        'latest_news', 'latest_news-href', 'youtube_link', 'email', 'calendar_link', 'name', 'founder_count'
    ]

    # Add founder information
    for i in range(1, 9):
        all_fields.extend([f'founder-{i}', f'founder-{i}-linkedin-href', f'founder-{i}-twitter-href'])

    # Prepare all fields
    prepared_fields = [str(row.get(field, '')) for field in all_fields]
    
    # Filter out None values and empty strings, then join
    return ' '.join(field for field in prepared_fields if field and field.lower() != 'nan')

df['prepared_text'] = df.apply(prepare_text, axis=1)


In [25]:
import tiktoken
from tqdm.auto import tqdm

# Function to count tokens
def count_tokens(text):
    encoding = tiktoken.encoding_for_model("text-embedding-3-small")
    return len(encoding.encode(text))

# Process each row: count tokens and keep track of max
max_tokens = 0
tqdm.pandas(desc="Processing rows")
df['token_count'] = df['prepared_text'].progress_apply(count_tokens)
max_tokens = df['token_count'].max()

print(f"Maximum token count: {max_tokens}")


Processing rows:   0%|          | 0/4516 [00:00<?, ?it/s]

Maximum token count: 2397


In [26]:
from openai import OpenAI
import os

# Initialize OpenAI client (make sure to set your API key in the environment)
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Function to get embedding
def get_embedding(text):
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding

# Test the embedding function with a sample text
sample_text = df.iloc[0]['prepared_text']  # Use the prepared text from the 'prepared_text' column
sample_embedding = get_embedding(sample_text)

print("Sample embedding:")
print(sample_embedding)
print(f"Embedding dimension: {len(sample_embedding)}")


Sample embedding:
[-0.0051547931507229805, 0.020816121250391006, 0.008340083993971348, 0.020876722410321236, -0.0031057533342391253, -0.05063135176897049, -0.04760134965181351, 0.057842761278152466, -0.018786020576953888, -0.01224878802895546, 0.03190593421459198, -0.08277969062328339, 0.005404768045991659, -0.04069294407963753, -0.01515001617372036, 0.018770869821310043, -0.015483316034078598, -0.028300229460000992, -0.020073771476745605, 0.007256857585161924, 0.013445639051496983, 0.05026775225996971, -0.005912293680012226, 0.013127489015460014, 0.014900040812790394, -0.014900040812790394, 0.011627636849880219, 1.333023828919977e-05, 0.006408456712961197, -0.01046108640730381, 0.05363105610013008, -0.0366024374961853, -0.06490267068147659, 0.00581760611385107, -0.01214273739606142, 0.05163125321269035, -0.0033159598242491484, 0.02878502942621708, -0.01669531688094139, -0.0024391524493694305, -0.01257451344281435, -0.009264234453439713, 0.07478047907352448, 0.009233934804797173, -0.03

In [27]:
from openai import OpenAI
import os
from tqdm.auto import tqdm

# Initialize OpenAI client (make sure to set your API key in the environment)
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Function to get embedding
def get_embedding(text):
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding

# Function to process a row: create embedding and add to the row
def process_row(row):
    embedding = get_embedding(row['prepared_text']) 
    row['embedding'] = embedding
    return row

# Process each row: create embedding and add to the DataFrame
tqdm.pandas(desc="Processing rows")
df['embedding'] = df.progress_apply(lambda row: get_embedding(row['prepared_text']), axis=1)

# Save the DataFrame with embeddings
df.to_pickle('yc_startups_with_embeddings.pkl')

print(f"Processed {len(df)} items. Data with embeddings saved to 'yc_startups_with_embeddings.pkl'")

Processing rows:   0%|          | 0/4516 [00:00<?, ?it/s]

Processed 4516 items. Data with embeddings saved to 'yc_startups_with_embeddings.pkl'


In [28]:
# Count how many rows have embeddings
rows_with_embeddings = df['embedding'].notna().sum()

print(f"Number of rows with embeddings: {rows_with_embeddings}")
print(f"Total number of rows: {len(df)}")
print(f"Percentage of rows with embeddings: {(rows_with_embeddings / len(df)) * 100:.2f}%")


Number of rows with embeddings: 4516
Total number of rows: 4516
Percentage of rows with embeddings: 100.00%


In [33]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

#create index if it doesn't exist
if 'yc-startups' not in pc.list_indexes().names():
    # Create a new index with the specified configuration
    pc.create_index(
        name='yc-startups',
        dimension=1536,  # Dimension of the embeddings
        metric='cosine',  # Metric for similarity
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        )
    )


In [42]:
index = pc.Index('yc-startups')

import numpy as np

def replace_nan_with_empty_string(value):
    if isinstance(value, np.ndarray):
        return '' if pd.isna(value).any() else value.tolist()
    elif isinstance(value, (list, pd.Series)):
        return '' if pd.isna(value).any() else value
    elif pd.isna(value):
        return ''
    else:
        return value

# Prepare the data for upsert
vectors_with_metadata = []
for _, row in df.iterrows():
    # Replace NaN values with empty strings or handle arrays
    metadata = {k: replace_nan_with_empty_string(v) for k, v in row.items()}
    vector_data = {
        'id': str(row['web-scraper-order']),  # Using web-scraper-order as a unique identifier
        'values': row['embedding'].tolist() if isinstance(row['embedding'], np.ndarray) else row['embedding'],
        'metadata': {
            'web-scraper-order': metadata['web-scraper-order'],
            'web-scraper-start-url': metadata['web-scraper-start-url'],
            'one-liner': metadata['one-liner'],
            'batch': metadata['batch'],
            'location': metadata['location'],
            'type': metadata['type'],
            'categories': metadata['categories'],
            'profile-href': metadata['profile-href'],
            'url-href': metadata['url-href'],
            'status': metadata['status'],
            'founded': metadata['founded'],
            'description': metadata['description'],
            'launches': metadata['launches'],
            'company-linkedin-href': metadata['company-linkedin-href'],
            'company-twitter-href': metadata['company-twitter-href'],
            'company-crunchbase-href': metadata['company-crunchbase-href'],
            'company-github': metadata['company-github'],
            'number-jobs': metadata['number-jobs'],
            'team-size': metadata['team-size'],
            'latest_news': metadata['latest_news'],
            'latest_news-href': metadata['latest_news-href'],
            'youtube_link': metadata['youtube_link'],
            'email': metadata['email'],
            'calendar_link': metadata['calendar_link'],
            'name': metadata['name'],
            'founder_count': metadata['founder_count'],
            'founder-1': metadata['founder-1'],
            'founder-1-linkedin-href': metadata['founder-1-linkedin-href'],
            'founder-1-twitter-href': metadata['founder-1-twitter-href'],
            'founder-2': metadata['founder-2'],
            'founder-2-linkedin-href': metadata['founder-2-linkedin-href'],
            'founder-2-twitter-href': metadata['founder-2-twitter-href'],
            'founder-3': metadata['founder-3'],
            'founder-3-linkedin-href': metadata['founder-3-linkedin-href'],
            'founder-3-twitter-href': metadata['founder-3-twitter-href'],
            'founder-4': metadata['founder-4'],
            'founder-4-linkedin-href': metadata['founder-4-linkedin-href'],
            'founder-4-twitter-href': metadata['founder-4-twitter-href'],
            'founder-5': metadata['founder-5'],
            'founder-5-linkedin-href': metadata['founder-5-linkedin-href'],
            'founder-5-twitter-href': metadata['founder-5-twitter-href'],
            'founder-6': metadata['founder-6'],
            'founder-6-linkedin-href': metadata['founder-6-linkedin-href'],
            'founder-6-twitter-href': metadata['founder-6-twitter-href'],
            'founder-7': metadata['founder-7'],
            'founder-7-linkedin-href': metadata['founder-7-linkedin-href'],
            'founder-7-twitter-href': metadata['founder-7-twitter-href'],
            'founder-8': metadata['founder-8'],
            'founder-8-linkedin-href': metadata['founder-8-linkedin-href'],
            'founder-8-twitter-href': metadata['founder-8-twitter-href']
        }
    }
    vectors_with_metadata.append(vector_data)

# Upsert the data into the index with error handling and retries
from tenacity import retry, stop_after_attempt, wait_exponential
import time

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
def upsert_with_retry(vectors):
    try:
        index.upsert(vectors=vectors, namespace='yc-startups')
    except Exception as e:
        print(f"Error during upsert: {str(e)}")
        time.sleep(5)  # Add a delay before retrying
        raise  # Re-raise the exception to trigger a retry

# Split the data into smaller batches
batch_size = 100
for i in range(0, len(vectors_with_metadata), batch_size):
    batch = vectors_with_metadata[i:i+batch_size]
    try:
        upsert_with_retry(batch)
        print(f"Upserted batch {i//batch_size + 1}/{len(vectors_with_metadata)//batch_size + 1}")
    except Exception as e:
        print(f"Failed to upsert batch {i//batch_size + 1} after multiple retries: {str(e)}")

print(f"Upsert process completed for {len(vectors_with_metadata)} items")

Upserted batch 1/46
Upserted batch 2/46
Upserted batch 3/46
Upserted batch 4/46
Upserted batch 5/46
Upserted batch 6/46
Upserted batch 7/46
Upserted batch 8/46
Upserted batch 9/46
Upserted batch 10/46
Upserted batch 11/46
Upserted batch 12/46
Upserted batch 13/46
Upserted batch 14/46
Upserted batch 15/46
Upserted batch 16/46
Upserted batch 17/46
Upserted batch 18/46
Upserted batch 19/46
Upserted batch 20/46
Upserted batch 21/46
Upserted batch 22/46
Upserted batch 23/46
Upserted batch 24/46
Upserted batch 25/46
Upserted batch 26/46
Upserted batch 27/46
Upserted batch 28/46
Upserted batch 29/46
Upserted batch 30/46
Upserted batch 31/46
Upserted batch 32/46
Upserted batch 33/46
Upserted batch 34/46
Upserted batch 35/46
Upserted batch 36/46
Upserted batch 37/46
Upserted batch 38/46
Upserted batch 39/46
Upserted batch 40/46
Upserted batch 41/46
Upserted batch 42/46
Upserted batch 43/46
Upserted batch 44/46
Upserted batch 45/46
Upserted batch 46/46
Upsert process completed for 4516 items
{'m

In [45]:
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'yc-startups': {'vector_count': 4516}},
 'total_vector_count': 4516}


In [50]:
# Test query function
def test_query(query_text, top_k=10, filter_dict=None):
    # Create the query vector
    query_vector = get_embedding(query_text)
    
    # Prepare the query object
    query_obj = {
        'namespace': 'yc-startups',
        'top_k': top_k,
        'vector': query_vector,
        'include_metadata': True
    }
    
    # Add filter if provided
    if filter_dict:
        query_obj['filter'] = filter_dict
    
    # Perform the query
    results = index.query(**query_obj)
    
    # Print the results
    print(f"Query: {query_text}")
    print(f"Top {top_k} results:")
    
    if not results['matches']:
        print("No matches found.")
    else:
        for match in results['matches']:
            print(f"Score: {match.score:.4f}")
            if match.metadata:
                print(f"Name: {match.metadata.get('name', 'N/A')}")
                print(f"Description: {match.metadata.get('description', 'N/A')}")
                print(f"Status: {match.metadata.get('status', 'N/A')}")
            else:
                print("Metadata not available for this match.")
            print("---")

# Example usage
test_query("AI startups in healthcare", top_k=5)

# If the above query returns no results, try without the filter
if not index.query(vector=get_embedding("AI startups in healthcare"), top_k=1, namespace='yc-startups')['matches']:
    print("\nNo results found with filter. Trying without filter:")
    test_query("AI startups in healthcare", top_k=5)


Query: AI startups in healthcare
Top 5 results:
Score: 0.6296
Name: Bunkerhill Health
Description: Bunkerhill is a Sequoia/Felicis/YC-backed company whose goal is to increase the adoption of AI in healthcare, in order to help doctors catch deadly diseases earlier.


Bunkerhill Health has built a consortium of research institutions (Stanford, Johns Hopkins, Harvard, etc.), who develop and validate AI algorithms seamlessly under our legal umbrella (less than one week of setup time for a new project when using the Bunkerhill consortium, versus a year or more for projects outside the consortium). Bunkerhill then manages the FDA clearance and commercial sales process for the algorithms that these institutions develop and validate with us.


We’ve signed commercial contracts and deployed algorithms with the world’s leading health systems—UCSF, Mayo Clinic, Georgetown/MedStar, and several others. These algorithms help doctors identify patients earlier than has otherwise been possible, leading