In [15]:
from llama_index import PromptTemplate
from llama_index.llms import Bedrock, ChatMessage
from llama_index.program import LLMTextCompletionProgram
from llama_index.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List
import pandas as pd

In [3]:
# AWS credentials must already be available in the environment
# (for example, configured beforehand in ~/.aws/credentials).
bedrock = Bedrock(
    max_tokens=8000,
    model="anthropic.claude-v2",
)

In [10]:
# Read the raw job-posting export and keep the description column as a list
raw_jobs_csv = "../data/bronze/apify/indeed/dataset_indeed-scraper_2023-10-23_07-07-15.csv"
job_df = pd.read_csv(raw_jobs_csv)
descriptions = job_df["description"].tolist()

In [16]:
# Create the pydantic class to output into

class Skill(BaseModel):
    """A single skill extracted from a job description."""

    # The field descriptions below become part of the model's schema, so they
    # double as instructions to the LLM when the schema is used to format the
    # prompt (presumably via PydanticOutputParser — confirm).
    name: str = Field(
        ...,
        description="Short name of the skill, for example 'ETL' or 'Google Cloud Platform'.",
    )
    # The notebook later tests `reference_text in description`, so the excerpt
    # has to be an exact copy of the source text, not a paraphrase.
    reference_text: str = Field(
        ...,
        description=(
            "Exact excerpt copied character-for-character from the job "
            "description that shows this skill. Do not paraphrase, trim, "
            "or reformat it."
        ),
    )

# Top-level output model for the extraction program: this is the class handed
# to PydanticOutputParser, and the program result is read through `.skills`.
class Job(BaseModel):
    # Text of the job description — presumably echoed back by the LLM; verify.
    original_description: str
    # Skills found in the description, one Skill entry per skill.
    skills: List[Skill]

# Build the extraction program: prompt template + Bedrock LLM + Job parser
job_output_parser = PydanticOutputParser(Job)
skill_extraction_prompt = "Extract the Job skills, with the reference text to that skill, from the following description: {description}"

job_program = LLMTextCompletionProgram.from_defaults(
    prompt_template_str=skill_extraction_prompt,
    llm=bedrock,
    output_parser=job_output_parser,
    verbose=True,
)

In [18]:
# Try the program on the first job description only
first_description = descriptions[0]
test_output = job_program(description=first_description)

In [34]:
# Check each extracted skill's quoted text against the first description:
# print the skill when the quote is found verbatim, otherwise flag it by name.
for skill in test_output.skills:
    if skill.reference_text not in descriptions[0]:
        print(skill.name, "is not in the description")
        continue
    print(skill)

name='ETL' reference_text='We are seeking an experienced ETL (Extract, Transform, and Load) Data Engineer with expertise in Google Cloud Platform (GCP) to join our client data engineering team.'
name='Google Cloud Platform' reference_text='We are seeking an experienced ETL (Extract, Transform, and Load) Data Engineer with expertise in Google Cloud Platform (GCP) to join our client data engineering team.'
name='Data Pipeline Design' reference_text='· Design, develop, and maintain ETL pipelines on Google Cloud Platform (GCP) to ensure efficient data extraction, transformation, and loading processes.'
name='Data Extraction' reference_text='· Extract data from various sources, including databases, APIs, and cloud storage, and ensure data quality and consistency.'
name='Data Transformation' reference_text='· Implement data transformations, including cleaning, aggregation, and enrichment, to prepare data for analysis and reporting.'
name='GCP Services' reference_text='· Design, build, and ma