In [13]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from pathlib import Path

In [14]:
# Lift the column cap so wide frames are displayed in full
pd.options.display.max_columns = None

In [15]:
# Read the "22.10.24" sheet of the expo application workbook and preview the first rows
df = pd.read_excel(
    io="../data/Huddle Global 2024 - expo application .xlsx",
    sheet_name="22.10.24",
)
df.head()



Unnamed: 0,Ticket Number,Name,Email,Phone,Designation,Organisation,Choose Category,Is the product aligned with the UN Sustainable Development Goals,Select the Sustainable Development Goals your organization primarily focuses on,Product Name,Product Detail,About the Company,Website,Stage,Do you want investment/ investor connect?,Amount looking for,Have you raised investment?,Amount you have raised,Do you want to connect with Mentor?,Type of mentorship required,Technology,Sector,Stage of funding,Would you like to matchmake with the startups,How many hours you would like to spend,Please select the dates preferable for matchmaking,Stage of the startup looking for,Choose your interested sector,Do you wish to avail accommodation facility?,Type of Accomodation you are looking for?,Person 2 - Name,Person 2 - Designation,Person 2 - Email,Person 3 - Name,Person 3 - Designation,Person 3 - Email,Person 4 - Name,Person 4 - Designation,Person 4 - Email,Person 5 - Name,Person 5 - Designation,Person 5 - Email,Person 6 - Name,Person 6 - Designation,Person 6 - Email,Person 7 - Name,Person 7 - Designation,Person 7 - Email,Person 8 - Name,Person 8 - Designation,Person 8 - Email,Person 9 - Name,Person 9 - Designation,Person 9 - Email,Person 10 - Name,Person 10 - Designation,Person 10 - Email
0,VKXg8Iow3z,Tony Francis,aswinunkn@gmail.com,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,B31O6sLOmZ,Fincy M,fincy@equipohealth.com,917736400000.0,VICE PRESIDENT - GROWTH,EQUIPO HEALTH,Interested Individual,,,,,,,,,,,,,,,,,,,,,,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,vmQMQiAgmR,Smitha Prabhakaran,smitha.ap@gmail.com,919895500000.0,Solution Architect,UST,Corporate,,,,,,,,,,,,,,,,,No,,,,Analytics,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,qKyNqTVzme,Sidharth Test,support@startupmission.in,914802800000.0,Test,KSUM,Interested Individual,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,B31NdF6kKZ,Test Person,sidharth+123@startupmission.in,911944700000.0,Test,KSUM,Interested Individual,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [16]:
# Sentence-embedding model shared by the indexing and query cells below
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [5]:
# Open an on-disk ChromaDB store, creating its directory tree on first use
persist_dir = Path("../data/chroma/about_company")
persist_dir.mkdir(exist_ok=True, parents=True)

chroma_client = chromadb.PersistentClient(
    path=str(persist_dir),
)


In [18]:
# Uncomment to drop the "about_company" collection before rebuilding it from scratch:
# chroma_client.delete_collection("about_company")

In [19]:
# Fetch the "about_company" collection, creating it with a cosine-distance index if absent
collection = chroma_client.get_or_create_collection(
    name="about_company",
    metadata={"hnsw:space": "cosine"},
)

In [20]:
# Spreadsheet header -> snake_case column name.
# Only the headers listed here are carried into the cleaned frame.
column_mapping = {
    # Attendee identity and contact details
    "Ticket Number": "ticket_num",
    "Name": "name",
    "Email": "email",
    "Phone": "phone",
    "Designation": "designation",
    "Organisation": "organization",
    "Choose Category": "category",
    # Sustainable Development Goals alignment
    "Is the product aligned with the UN Sustainable Development Goals": "aligned_with_sdg",
    "Select the Sustainable Development Goals your organization primarily focuses on": "primary_sdgs",
    # Product and company profile
    "Product Name": "product_name",
    "Product Detail": "product_detail",
    "About the Company": "company_description",
    "Website": "website",
    "Stage": "stage",
    # Investment
    "Do you want investment/ investor connect?": "seeking_investment",
    "Amount looking for": "investment_amount",
    "Have you raised investment?": "raised_investment",
    "Amount you have raised": "raised_amount",
    # Mentorship
    "Do you want to connect with Mentor?": "seeking_mentor",
    "Type of mentorship required": "mentorship_type",
    # Classification
    "Technology": "technology",
    "Sector": "sector",
}

In [21]:
# Keep only the mapped columns (select first, then copy, so the unused columns of the
# wide raw frame are never duplicated) and switch to the snake_case names.
df_cleaned = df[list(column_mapping)].copy().rename(columns=column_mapping)

# Normalise the text that will be embedded: trim surrounding whitespace and turn
# missing descriptions into empty strings. `.str.strip()` yields NaN for any
# non-string cell, so the fill has to come after it.
df_cleaned['company_description'] = df_cleaned['company_description'].str.strip().fillna('')
df_cleaned.head()

Unnamed: 0,ticket_num,name,email,phone,designation,organization,category,aligned_with_sdg,primary_sdgs,product_name,product_detail,company_description,website,stage,seeking_investment,investment_amount,raised_investment,raised_amount,seeking_mentor,mentorship_type,technology,sector
0,VKXg8Iow3z,Tony Francis,aswinunkn@gmail.com,,,,,,,,,,,,,,,,,,,
1,B31O6sLOmZ,Fincy M,fincy@equipohealth.com,917736400000.0,VICE PRESIDENT - GROWTH,EQUIPO HEALTH,Interested Individual,,,,,,,,,,,,,,,
2,vmQMQiAgmR,Smitha Prabhakaran,smitha.ap@gmail.com,919895500000.0,Solution Architect,UST,Corporate,,,,,,,,,,,,,,,
3,qKyNqTVzme,Sidharth Test,support@startupmission.in,914802800000.0,Test,KSUM,Interested Individual,,,,,,,,,,,,,,,
4,B31NdF6kKZ,Test Person,sidharth+123@startupmission.in,911944700000.0,Test,KSUM,Interested Individual,,,,,,,,,,,,,,,


In [22]:
# Uncomment to wipe the "about_company" collection before re-running the indexing cell below:
# chroma_client.delete_collection("about_company")

In [23]:
# Build the parallel id / document / metadata lists for a single batch write to ChromaDB
doc_ids = []
documents = []
metadatas = []

for _, row in df_cleaned.iterrows():
    # The ticket number serves as the record's ID inside the collection
    doc_ids.append(str(row['ticket_num']))

    # The company description is the text that gets embedded and searched.
    # NOTE(review): rows with an empty description are indexed as well and can
    # surface as '' hits in search results.
    documents.append(str(row['company_description']))

    # Keep every populated field (the description included) as metadata. Missing
    # values are dropped and the rest are stringified for ChromaDB compatibility.
    metadatas.append({k: str(v) for k, v in row.to_dict().items() if pd.notna(v)})

# Embed all descriptions in one call, then convert to plain nested lists for ChromaDB
embeddings = model.encode(documents, convert_to_tensor=True)
embeddings_list = embeddings.tolist()

# Batch upload to ChromaDB. `upsert` (rather than `add`) keeps this cell re-runnable
# against the persistent store: records whose IDs already exist are updated instead
# of being rejected or skipped, so the collection does not have to be deleted first.
collection.upsert(
    ids=doc_ids,
    documents=documents,
    metadatas=metadatas,
    embeddings=embeddings_list,
)

print(f"Stored {len(df_cleaned)} documents with embeddings in ChromaDB")


Stored 851 documents with embeddings in ChromaDB


In [24]:
def search_chroma(query, n_results=5):
    """Query the collection for the records closest to a free-text query.

    Args:
        query: Text to search for. It is embedded with the notebook-level ``model``.
        n_results: Maximum number of matches to request from the collection.
            Defaults to 5.

    Returns:
        Whatever ``collection.query`` returns for the embedded query.
    """
    # Encode as a one-item batch and hand ChromaDB plain nested lists
    query_embeddings = model.encode([query]).tolist()
    return collection.query(query_embeddings=query_embeddings, n_results=n_results)

search_chroma("AI")

{'ids': [['xBN7MiZzOx',
   'mDMrksr8bx',
   'xgQq5soa4K',
   'qKyNqTVzme',
   '3V7dXfWWyK']],
 'embeddings': None,
 'documents': [['blokchain ai vision',
   "We're redefining investing and making it effortlessly 'cool' for 437 million Gen Zers/beginners with a personalized fintech platform driven by Gen AI. Similar to how Spotify curates music playlists, we craft diversified investment playlists tailored to individual goals/risk appetite, featuring Digi Gold, ETFs, stocks and Mutual Funds. Our platform simplifies decision-making with timely insights, making investing accessible without daunting jargons, expensive advisors, or unreliable influencer advice.",
   'Expertmentoring & Industry interface to youth',
   '',
   '']],
 'uris': None,
 'data': None,
 'metadatas': [[{'category': 'Startup',
    'company_description': 'blokchain ai vision',
    'designation': 'Founder',
    'email': 'shehinfano@gmail.com',
    'investment_amount': '1.0',
    'name': 'SHEHIN FN',
    'organization': 's

In [25]:
def find_similar_companies(organization):
    """Search the collection using the description of a named organization.

    The first row of ``df_cleaned`` whose ``organization`` equals the argument
    supplies the query text. That organization's own record is part of the
    collection, so it can appear among the results.

    Args:
        organization: Exact organization name to look up in ``df_cleaned``.

    Returns:
        The ``search_chroma`` result for the organization's description, or an
        empty list when no row matches or the matched row has no description.
    """
    descriptions = df_cleaned.loc[
        df_cleaned['organization'] == organization, 'company_description'
    ]
    if descriptions.empty:
        return []

    company_description = descriptions.iloc[0]
    # A blank description carries no signal: embedding it would only return
    # other records that also have blank descriptions.
    if not isinstance(company_description, str) or not company_description.strip():
        return []

    return search_chroma(company_description)

In [26]:
# Example lookup by exact organization name
find_similar_companies(organization="Cliperact")

{'ids': [['zKMVQuRrmp',
   'jK64JiLlxY',
   'LK8JbtadxX',
   'mGlrqcZ08x',
   'o3VXeC2Zem']],
 'embeddings': None,
 'documents': [['Cliperact is poised to revolutionize the way businesses engage with their audience through interactive video content. With a proven solution, scalable business model, and exciting roadmap, we invite you to join us on this journey to reshape the future of digital engagement. Invest in Cliperact today and be part of the interactive video revolution!',
   "Urav Advanced Learning Systems Pvt Ltd\n\nOverview:\nUrav Advanced Learning Systems Pvt Ltd is a pioneering technology company dedicated to developing cutting-edge AI solutions. Our focus is on creating innovative products that enhance user experience through advanced AI, computer vision, and natural language processing. Our portfolio includes a range of applications designed to cater to diverse industries such as healthcare, retail, finance, and transportation.\n\nMission:\nTo leverage advanced AI technolo

In [13]:
# Save the cleaned, renamed table alongside the raw workbook
df_cleaned.to_excel(excel_writer="../data/huddle-expo-cleaned.xlsx")

# Load from ChromaDB

In [2]:
from pathlib import Path
import chromadb

In [4]:
# Re-open the on-disk ChromaDB store that holds the company-description index
persist_dir = Path("../data/chroma/about_company")
persist_dir.mkdir(parents=True, exist_ok=True)

chroma_client = chromadb.PersistentClient(path=str(persist_dir))

# Create or get the collection. The cosine-distance setting matches the one used when
# the collection is built earlier in this notebook, so a missing collection is not
# silently created with ChromaDB's default distance metric.
collection = chroma_client.get_or_create_collection(
    "about_company", metadata={"hnsw:space": "cosine"}
)


In [5]:
# Spot-check the first few persisted records
preview_limit = 10
collection.get(limit=preview_limit)

{'ids': ['VKXg8Iow3z',
  'B31O6sLOmZ',
  'vmQMQiAgmR',
  'qKyNqTVzme',
  'B31NdF6kKZ',
  'wmkjOs7JKl',
  'V32NacQW3O',
  'V32NkfQE3O',
  'XmvGMUe5xL',
  'ZxZ4ESJ1Kp'],
 'embeddings': None,
 'documents': ['',
  '',
  '',
  '',
  '',
  '',
  'Consultancy Service and IT Services',
  '',
  '',
  ''],
 'uris': None,
 'data': None,
 'metadatas': [{'email': 'aswinunkn@gmail.com',
   'name': 'Tony Francis',
   'ticket_num': 'VKXg8Iow3z'},
  {'category': 'Interested Individual',
   'designation': 'VICE PRESIDENT - GROWTH',
   'email': 'fincy@equipohealth.com',
   'name': 'Fincy M',
   'organization': 'EQUIPO HEALTH',
   'phone': '917736411147.0',
   'ticket_num': 'B31O6sLOmZ'},
  {'category': 'Corporate',
   'designation': 'Solution Architect',
   'email': 'smitha.ap@gmail.com',
   'name': 'Smitha Prabhakaran',
   'organization': 'UST',
   'phone': '919895542015.0',
   'ticket_num': 'vmQMQiAgmR'},
  {'category': 'Interested Individual',
   'designation': 'Test',
   'email': 'support@startupmiss