In [44]:
import pandas as pd
import weaviate
import weaviate.classes as wvc
import os
import requests
import json
from pprint import pprint
import tqdm

#load in the individual data files and merge them together
from keys import open_ai_key, weaviate_url, weaviate_key

In [9]:
import os
os.environ["OPENAI_APIKEY"] = open_ai_key
os.environ["WCD_URL"] = weaviate_url
os.environ["WCD_API_KEY"] = weaviate_key

In [11]:

client = weaviate.connect_to_wcs(
    cluster_url=os.getenv("WCD_URL"),
    auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WCD_API_KEY")),
    headers={
        "X-OpenAI-Api-Key": os.environ["OPENAI_APIKEY"]  # Replace with your inference API key
    }
)


In [None]:
listings = client.collections.create(
    name="JobListings",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    generative_config=wvc.config.Configure.Generative.openai()  # Ensure the `generative-openai` module is used for generative queries
)

In [20]:
data = pd.read_csv("data/postings.csv")

In [34]:
data['len'] = data['description'].apply(lambda x: len(x) if isinstance(x, str) else 0)
data['len'].describe()

count    123849.000000
mean       3766.251322
std        2146.618664
min           0.000000
25%        2176.000000
50%        3435.000000
75%        4986.000000
max       23201.000000
Name: len, dtype: float64

In [55]:
listings_objs = data[["company_name","title","description","skills_desc"]].head(1000).to_dict(orient="records")
pprint(listings_objs[0])

listings = client.collections.get("JobListings")
listings.data.insert_many(listings_objs)

[{'company_name': 'Corcoran Sawyer Smith',
  'description': 'Job descriptionA leading real estate firm in New Jersey is '
                 'seeking an administrative Marketing Coordinator with some '
                 'experience in graphic design. You will be working closely '
                 'with our fun, kind, ambitious members of the sales team and '
                 'our dynamic executive team on a daily basis. This is an '
                 'opportunity to be part of a fast-growing, highly respected '
                 'real estate brokerage with a reputation for exceptional '
                 'marketing and extraordinary culture of cooperation and '
                 'inclusion.Who you are:You must be a well-organized, '
                 'creative, proactive, positive, and most importantly, '
                 'kind-hearted person. Please, be responsible, respectful, and '
                 'cool-under-pressure. Please, be proficient in Adobe Creative '
                 'Cloud (Inde

BatchObjectReturn(all_responses=[UUID('10f49585-cdbb-4f20-8361-6b96075eabf1'), UUID('f92e4395-e7fa-4b3d-ad6e-b5a536474700'), UUID('28d4498a-6634-4c17-a86c-afc139276f68'), UUID('e7d04b91-af33-46ed-bb98-801ad8ff4fb1'), UUID('4a884096-1668-4876-875c-61b3f6fd4b08'), UUID('b402d8df-51e1-44cc-97b2-c9625d20f3a2'), UUID('baad23e8-e8b3-4f4d-ad6c-682ee9aeb3b3'), UUID('afb09d4e-00b8-494d-8532-c0f91f5aee15'), UUID('2c9fa83d-d6f8-4ad4-a18a-4cb9d908d976'), UUID('bf7ac302-c3da-4117-a906-f047c306ae67'), UUID('d88eb541-e4ac-403e-b28a-79d29c3f7fc2'), UUID('153ba343-5ea0-48d6-8c8e-e9e39c7bbd23'), UUID('a8c2d8ec-8e82-4f17-bf19-e341c9897bfd'), UUID('312d7a74-611c-424c-9f96-a3357c376e31'), UUID('cadd387a-e7aa-474c-b3c0-345c288a3969'), UUID('ea7e41f0-414e-47ff-8e6c-cae6a14af527'), UUID('5c8e25bc-ec9d-4ba6-9cb4-0ac024e6ba06'), UUID('cbae66f8-5031-4e47-9aa7-b25420dc4f63'), UUID('61809253-2279-459a-8e8d-a3468965ceed'), UUID('1fc9832b-706b-4eef-bb16-a275a6413842'), UUID('01cb651a-6ebd-4263-a34f-1911de04b2ac'), U

In [61]:
questions = client.collections.get("JobListings")

response = questions.query.near_text(
    query="Accountant",
    include_vector="True",
    limit=2
)

pprint(response)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('40d0805d-3ce6-443f-a394-acbdcf3518fa'),
                            metadata=MetadataReturn(creation_time=None,
                                                    last_update_time=None,
                                                    distance=None,
                                                    certainty=None,
                                                    score=None,
                                                    explain_score=None,
                                                    is_consistent=None,
                                                    rerank_score=None),
                            properties={'company_name': 'Robert Half',
                                        'description': 'The ideal candidate '
                                                       'will be well organized '
                                                       'and comfortable '
                                           