In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage

In [2]:
# Path to the dotenv file that is read for OPENAI_API_KEY and LLM.
env_file = "../ws.env"

In [3]:
# Load settings from the dotenv file when it exists; values in the file take
# precedence over variables already present in the process environment.
if os.path.exists(env_file):
    load_dotenv(env_file, override=True)
else:
    # Not fatal on its own: the settings may already be exported in the environment.
    print(f"File {env_file} not found.")

# Always bind both names so that later cells never hit a NameError.
# load_dotenv has already populated os.environ, so the key does not need to be
# written back (doing so with a missing key would raise a TypeError).
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
LLM = os.getenv('LLM')

# Fail fast, with a message naming exactly what is missing, instead of letting
# a later cell fail with an unrelated-looking error.
missing_settings = [
    name
    for name, value in (('OPENAI_API_KEY', OPENAI_API_KEY), ('LLM', LLM))
    if not value
]
if missing_settings:
    raise RuntimeError(
        f"Missing required setting(s): {', '.join(missing_settings)}. "
        f"Define them in {env_file} or in the environment."
    )

In [4]:
# Read the raw CV export; the rows carry email, name and a comma-separated skills string.
df = pd.read_csv(filepath_or_buffer='expanded_skills.csv')

In [5]:
# Preview the first five rows to check that the expected columns were loaded.
df.head(n=5)

Unnamed: 0,email,name,skills
0,thomas.nelson@test.org,Thomas Nelson,"Security, Pandas, Go"
1,lucy.clark@test.org,Lucy Clark,"WordPress, Scrum, Go, SQL, Linux"
2,richard.jackson@test.org,Richard Jackson,"System Design, PyTorch, Express.js, DevOps"
3,amelia.hall@test.org,Amelia Hall,"Agile, CSS3, R, Azure"
4,david.hill@test.org,David Hill,"Java, Scrum, Angular"


In [6]:
# Keep only the skills column. The explicit copy makes this an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df = df[['skills']].copy()

In [7]:
# Turn the comma-separated skills strings into one row per distinct skill.
# Working on the Series (instead of assigning a new column onto the frame)
# avoids SettingWithCopyWarning on a column-subset frame.
skill_names = (
    df['skills']
    .dropna()          # rows without any skills would otherwise yield a NaN "skill"
    .str.split(',')
    .explode()
    .str.strip()       # accept "a,b" and "a ,  b" as well as "a, b"
)
skill_names = skill_names[skill_names != '']  # drop empty fragments, e.g. from a trailing comma

# Deduplicate, sort alphabetically and renumber so the index runs 0..n-1.
df = (
    skill_names
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
    .to_frame(name='Skill')
)

In [8]:
# (rows, columns) of the deduplicated skill table: one row per distinct skill.
df.shape

(54, 1)

In [9]:
# Preview the first five distinct skills after sorting.
df.head(n=5)

Unnamed: 0,Skill
0,API Design
1,AWS
2,Agile
3,Angular
4,Azure


In [10]:
# Chat model that writes the skill descriptions; the model name comes from the
# LLM setting and temperature=0 asks for the least random output.
llm = ChatOpenAI(
    model=LLM,
    temperature=0,
)

In [11]:
# System instruction sent with every description request.
system_prompt = (
    "You are a helpful assistant that provides more information on IT skills. "
    "You will get a skill coming from a CV. "
    "Your goal is to provide a short description based on this skill. "
    "What does it entail, where are these skills used for etc."
)

In [12]:
# Ask the chat model for one description per skill; the results keep the same
# order as df['Skill'] so they can be attached as a column afterwards.
system_message = SystemMessage(content=system_prompt)  # identical for every request

generated_texts = [
    llm.invoke(
        [
            system_message,
            HumanMessage(content=f"Write a brief and engaging description of the IT skill: {skill}."),
        ]
    ).content
    for skill in df['Skill']
]

In [13]:
# Attach the generated texts to their skills (list order matches row order).
df = df.assign(Description=generated_texts)

In [14]:
# Embedding client used to vectorise the generated descriptions.
embeddings = OpenAIEmbeddings(
    model='text-embedding-ada-002',
)

In [15]:
# Embed all descriptions with a single batched call instead of one request per
# row. embed_documents takes a list of texts and returns one vector per text
# in the same order, so the result lines up with the rows of df.
df['Embedding'] = embeddings.embed_documents(df['Description'].tolist())

In [16]:
# Preview the first five rows to confirm Description and Embedding are filled in.
df.head(n=5)

Unnamed: 0,Skill,Description,Embedding
0,API Design,API Design is the process of creating applicat...,"[0.007902550511062145, -0.006266295909881592, ..."
1,AWS,Amazon Web Services (AWS) is a comprehensive a...,"[-0.0029347320087254047, -0.015877487137913704..."
2,Agile,Agile is a dynamic and flexible project manage...,"[-0.023693757131695747, -0.012672649696469307,..."
3,Angular,"Angular is a powerful, open-source web applica...","[0.013188531622290611, 0.029791485518217087, -..."
4,Azure,Azure is Microsoft's cloud computing platform ...,"[-0.004470727406442165, -0.024176467210054398,..."


In [18]:
# Persist skills, descriptions and embeddings without the index column.
# NOTE: CSV stores each embedding list as its text representation, so readers
# must parse that column back into a list (e.g. with ast.literal_eval).
df.to_csv(path_or_buf='skills_embeddings.csv', index=False)