In [19]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [3]:
# Create a Pinecone index
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="careerplannrai-rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [27]:
# Load the Data
import json
data = json.load(open("profiles.json"))
data['profiles']

[{'name': 'Alice Johnson',
  'experience': [{'title': 'Software Engineer',
    'company': 'TechCorp Inc.',
    'duration': '3 years'},
   {'title': 'Junior Developer',
    'company': 'WebWorks Solutions',
    'duration': '2 years'}],
  'skills': ['JavaScript', 'React', 'Node.js', 'CSS']},
 {'name': 'Michael Smith',
  'experience': [{'title': 'Marketing Manager',
    'company': 'Creative Media Co.',
    'duration': '5 years'},
   {'title': 'Marketing Specialist',
    'company': 'AdVenture Ltd.',
    'duration': '3 years'}],
  'skills': ['Digital Marketing',
   'SEO',
   'Content Strategy',
   'Google Analytics']},
 {'name': 'Laura Brown',
  'experience': [{'title': 'Project Manager',
    'company': 'BuildRight Construction',
    'duration': '4 years'},
   {'title': 'Assistant Project Manager',
    'company': 'Urban Developers',
    'duration': '2 years'}],
  'skills': ['Project Management',
   'Budgeting',
   'Team Leadership',
   'Risk Management']},
 {'name': 'James Williams',
  'expe

In [32]:
# Process the data
processed_data = []
client = OpenAI()


for profile in data['profiles']:
    # Combine experience and skills into a single string for embedding
    experience_text = ', '.join([f"{exp['title']} at {exp['company']} for {exp['duration']}" for exp in profile['experience']])
    skills_text = ', '.join(profile['skills'])
    profile_text = f"Experience: {experience_text}. Skills: {skills_text}."

    # Generate embedding
    response = client.embeddings.create(
        input=profile_text,
        model="text-embedding-3-small",  # Ensure to use a valid model name
    )
    embedding = response.data[0].embedding
    
    # Store processed data
    processed_data.append({
        "id": profile["name"],
        "values": embedding,
        "metadata": {
            "experience": experience_text,  # Convert complex dict to string
            "skills": ', '.join(profile["skills"])  # Ensure skills are in a string format
        }
    })

In [33]:
# Display the processed data
processed_data[0]

{'id': 'Alice Johnson',
 'values': [-0.034015525,
  -0.029715749,
  -2.0049689e-05,
  -0.004651528,
  0.03599658,
  -0.00061942986,
  -0.003573769,
  0.026496543,
  0.0053240717,
  0.029197974,
  0.036761984,
  -0.0006954076,
  -0.012212723,
  -0.007203818,
  0.047004912,
  0.057540495,
  0.0069843265,
  -0.041309394,
  -0.020722233,
  0.004167521,
  -0.034128085,
  0.0008821863,
  0.011109638,
  0.01369851,
  0.06456421,
  0.003269858,
  0.005630797,
  0.060241926,
  -0.009967158,
  -0.03926081,
  0.02017069,
  -0.024065254,
  0.020789769,
  0.012730497,
  0.045721732,
  0.0018713042,
  0.045879316,
  0.009111704,
  0.031359117,
  0.011548621,
  -0.013901117,
  -0.025078291,
  0.01381107,
  0.060602117,
  -0.027329484,
  -0.0060050576,
  -0.07298368,
  0.026068816,
  0.010090973,
  0.035793968,
  -0.010805727,
  -0.018324712,
  0.025280898,
  0.04407836,
  -0.05119213,
  -0.023795111,
  0.011509225,
  0.031156512,
  -0.026496543,
  -0.008278762,
  0.08847189,
  -0.016895203,
  0.04101

In [34]:
# Insert the data into the Pinecone index
index = pc.Index('careerplannrai-rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 5}

In [35]:
#Display your Pinecone Index Stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 5}},
 'total_vector_count': 5}