In [None]:
!pip install --pre -U "weaviate-client==4.*"

In [2]:
import weaviate
from weaviate.auth import AuthApiKey
import weaviate.classes.config as wc
from weaviate.classes.config import Configure
from weaviate.util import generate_uuid5
import weaviate.classes.data as wd
import weaviate.classes.query as wq

import json
import os
import pandas as pd
import csv

In [3]:
# Connecting to WCD, Change the config to connect to your instance
WEAVIATE_URL = "YOUR_WEAVIATE_URL"
API_KEY = "YOUR_WEAVIATE_API_KEY"

client = weaviate.connect_to_wcs(
    cluster_url=WEAVIATE_URL,
    auth_credentials=AuthApiKey(API_KEY),
    headers={
        # You also need an OpenAI key for vectorizing and generation models
        "X-OpenAI-Api-Key": "YOUR_OPENAI-API-KEY",
    }
)

In [18]:
# Download the dataset from https://www.kaggle.com/datasets/arshkon/linkedin-job-postings
postings = pd.read_csv('./postings.csv')

In [44]:
# let's delete any instance with similar name - just in case.
client.collections.delete("Postings")

# Create the schema for database - only these features are enough for our demo
client.collections.create(
    name="Postings",
    properties=[
        wc.Property(name="job_id", data_type=wc.DataType.NUMBER),
        wc.Property(name="title", data_type=wc.DataType.TEXT),
        wc.Property(name="description", data_type=wc.DataType.TEXT),
        wc.Property(name="company_name", data_type=wc.DataType.TEXT)
    ],
    # Define the vectorizer module
    vectorizer_config=wc.Configure.Vectorizer.text2vec_openai(),
    # Define the generative module
    generative_config=wc.Configure.Generative.openai()
)

# If you don't want to test it on a large dataset, uncomment the line below:
# postings = postings.head(1000)

<weaviate.collections.collection.Collection at 0x787c6fd13370>

In [50]:
# Adding objects to the dataset
postings_collection = client.collections.get("Postings")

object_list = list()

for index, row in postings.iterrows():
    posting_obj = {
        "job_id": row["job_id"],
        "title": row["title"],
        "description": row["description"],
        "company_name": row["company_name"],
    }

    wv_obj = wd.DataObject(
        properties=posting_obj,
        uuid=generate_uuid5(posting_obj)
    )
    object_list.append(wv_obj)

response = postings_collection.data.insert_many(object_list)

# Checking if everything was successfully done
print(response.has_errors)

In [153]:
# There are different sorts of search we can use

# Simple Filtering
response_naive = postings_collection.query.fetch_objects(
    filters=wq.Filter.by_property("title").like("*machine learning*"),
    limit=5)
for o in response.objects:
    print(json.dumps(o.properties, indent=2))

{
  "description": "GigaGen, a subsidiary of Grifols, discovers and develops next-generation recombinant antibody therapeutics. Our core technology utilizes a microfluidic droplet system to capture and immortalize diverse immune repertoires, which can then be mined for exceptional monoclonals or enriched and used as high-potency polyclonal therapeutics. Our preclinical pipeline includes several recombinant polyclonal products for infectious diseases and a monoclonal antibody with a unique mechanism for oncology, which is entering the clinic. We are seeking a talented, highly motivated Research Associate to join the Technology Development team. This team is responsible for inventing, developing, and implementing novel capabilities to further expand our core technology suite. This unique position will focus on early-stage research projects but will also offer exposure to the full drug development process, with ample opportunity to learn new techniques, design experiments, and present you

In [154]:
# BM25 Search
response_bm25 = postings_collection.query.bm25(
    query="machine learning",
    limit=5)
for o in response.objects:
    print(json.dumps(o.properties, indent=2))

{
  "description": "GigaGen, a subsidiary of Grifols, discovers and develops next-generation recombinant antibody therapeutics. Our core technology utilizes a microfluidic droplet system to capture and immortalize diverse immune repertoires, which can then be mined for exceptional monoclonals or enriched and used as high-potency polyclonal therapeutics. Our preclinical pipeline includes several recombinant polyclonal products for infectious diseases and a monoclonal antibody with a unique mechanism for oncology, which is entering the clinic. We are seeking a talented, highly motivated Research Associate to join the Technology Development team. This team is responsible for inventing, developing, and implementing novel capabilities to further expand our core technology suite. This unique position will focus on early-stage research projects but will also offer exposure to the full drug development process, with ample opportunity to learn new techniques, design experiments, and present you

In [155]:
# Semantic search
response_semantic = postings_collection.query.near_text(
    query="machine learning",
    limit=5)
for o in response.objects:
    print(json.dumps(o.properties, indent=2))

{
  "description": "GigaGen, a subsidiary of Grifols, discovers and develops next-generation recombinant antibody therapeutics. Our core technology utilizes a microfluidic droplet system to capture and immortalize diverse immune repertoires, which can then be mined for exceptional monoclonals or enriched and used as high-potency polyclonal therapeutics. Our preclinical pipeline includes several recombinant polyclonal products for infectious diseases and a monoclonal antibody with a unique mechanism for oncology, which is entering the clinic. We are seeking a talented, highly motivated Research Associate to join the Technology Development team. This team is responsible for inventing, developing, and implementing novel capabilities to further expand our core technology suite. This unique position will focus on early-stage research projects but will also offer exposure to the full drug development process, with ample opportunity to learn new techniques, design experiments, and present you

In [156]:
# Generative Search, the prompts will generate skills that are needed for each posting and for the whole search
# This is the model that I used in the visualized project
# See: https://weaviate.netlify.app
response = postings_collection.generate.near_text(
    query="machine learning",
    limit=5,
    single_prompt="Write Three skills (separated by comma, maximum 4 words) that are needed in the job {description}.",
    grouped_task="write most common needed skills, separated by comma, maximum 5 skills."
)

print(response.generated)
for o in response.objects:
    print(o.generated)
    print(json.dumps(o.properties, indent=2))

Python, AI/ML, Neural Networks, NLP, Cloud Environment
Python programming, AI/ML algorithms, NLP techniques
{
  "title": "Machine Learning Engineer",
  "description": "Job Title: Python AI/ MLType: FulltimeLocation: Dallas, TX Python/AI-ML:Hands on experience with Python, Streamlit, Fastapi (minimum 2+ max 6 years)Hands on experience in developing neural networks using Tensorflow or Pytorch frameworkHands on experience with NLP (NLTK, Spacy, BERT, SBERT models)Hands on experience with vector database (Milvus, FAISS, Pinecone, Vespa, Chroma etc.,)Good understanding on LLMs, Gen AI, Langchain, transformersExperience working and deploying in cloud environment will be added advantage (Google Cloud, AWS, Azure)Willing to learn and adapt to the advancements in the Gen AI technology and work on POCs",
  "company_name": "NLB Services",
  "job_id": 3871631334.0
}
Programming skills, Machine learning expertise, Natural language processing
{
  "title": "Artificial Intelligence Engineer Intern - C

In [None]:
client.close()