In [46]:
import numpy as np
import pandas as pd


In [63]:
import ast

# Read the raw course catalogue and keep only the courses that list at least
# one skill (an empty list is stored in the CSV as the literal text '[]').
coursera = pd.read_csv('../data/coursera_courses.csv')
coursera = coursera.loc[coursera['course_skills'] != '[]'].reset_index(drop=True)

# Each skills cell is the text form of a Python list: parse it back into a
# list, then flatten that list into a single comma-separated string.
coursera['course_skills'] = (
    coursera['course_skills']
    .apply(ast.literal_eval)
    .str.join(', ')
)

# Discard the columns this notebook does not use further on.
coursera = coursera.drop(columns=['course_students_enrolled', 'course_summary'])
coursera.head()

Unnamed: 0,course_title,course_organization,course_certificate_type,course_time,course_rating,course_reviews_num,course_difficulty,course_url,course_skills,course_description
0,(ISC)² Systems Security Certified Practitioner...,ISC2,Specialization,3 - 6 Months,4.7,492.0,Beginner,https://www.coursera.org/specializations/sscp-...,"Risk Management, Access Control, Asset, Incide...",Pursue better IT security job opportunities an...
1,.NET FullStack Developer,Board Infinity,Specialization,1 - 3 Months,4.3,51.0,Intermediate,https://www.coursera.org/specializations/dot-n...,"Web API, Web Development, Cascading Style Shee...",Develop the proficiency required to design and...
2,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,Course,1 - 3 Months,4.7,517.0,Intermediate,https://www.coursera.org/learn/crash-course-in...,"Instrumental Variable, Propensity Score Matchi...",We have all heard the phrase “correlation does...
3,A life with ADHD,University of Geneva,Course,1 - 3 Months,,,Beginner,https://www.coursera.org/learn/life-with-adhd,"differential diagnosis and comorbidities, symp...",What is ADHD and what are the challenges that ...
4,AI For Business,University of Pennsylvania,Specialization,3 - 6 Months,4.7,381.0,Beginner,https://www.coursera.org/specializations/ai-fo...,"Artificial Intelligence (AI), Machine Learning...",This specialization will provide learners with...


In [66]:
from dotenv import load_dotenv

from openai import OpenAI
import os

# Load environment variables via python-dotenv; override=True asks it to let
# the loaded values replace variables that are already set in the process.
load_dotenv(override=True)

# os.getenv returns None when OPENAI_KEY is not defined.
openai_key = os.getenv("OPENAI_KEY")

# Client object passed to make_embeddings() by the cells below.
client = OpenAI(api_key=openai_key)

def make_embeddings(client, text, model="text-embedding-3-small"):
    """Return the embedding of a single piece of text.

    Args:
        client: Object exposing ``embeddings.create(input=..., model=...)``.
        text: The text to embed; it is sent as a one-element input list.
        model: Name of the embedding model passed through to the request.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Embed every course's skills string; this issues one embeddings request per row.
coursera['course_skills_embeddings'] = coursera['course_skills'].map(
    lambda skills_text: make_embeddings(client, skills_text)
)

In [67]:
# Checkpoint the frame (embeddings included) to disk, then continue from the
# copy read back out of that file.
coursera.to_csv('../data/coursera_courses_embeddings.csv', index=False)
coursera = pd.read_csv('../data/coursera_courses_embeddings.csv')
coursera.head()  # not the cell's last expression, so nothing is rendered

# After the CSV round trip each embedding is a string such as "[0.01, ...]";
# parse it back into a Python list of floats.
coursera = coursera.assign(
    course_skills_embeddings=coursera['course_skills_embeddings'].map(ast.literal_eval)
)

In [79]:
# Remove every row that has a missing value in any column, renumber the rows
# from zero, and store the rating as text.
# NOTE(review): dropna() considers all columns, so a course missing only its
# rating or review count is discarded as well - confirm that this is intended.
coursera = (
    coursera
    .dropna()
    .reset_index(drop=True)
    .astype({'course_rating': str})
)
coursera.head()

Unnamed: 0,course_title,course_organization,course_certificate_type,course_time,course_rating,course_reviews_num,course_difficulty,course_url,course_skills,course_description,course_skills_embeddings
0,(ISC)² Systems Security Certified Practitioner...,ISC2,Specialization,3 - 6 Months,4.7,492,Beginner,https://www.coursera.org/specializations/sscp-...,"Risk Management, Access Control, Asset, Incide...",Pursue better IT security job opportunities an...,"[0.009270939975976944, 0.028746716678142548, 0..."
1,.NET FullStack Developer,Board Infinity,Specialization,1 - 3 Months,4.3,51,Intermediate,https://www.coursera.org/specializations/dot-n...,"Web API, Web Development, Cascading Style Shee...",Develop the proficiency required to design and...,"[-0.05880910903215408, -0.0014066009316593409,..."
2,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,Course,1 - 3 Months,4.7,517,Intermediate,https://www.coursera.org/learn/crash-course-in...,"Instrumental Variable, Propensity Score Matchi...",We have all heard the phrase “correlation does...,"[-0.006142226047813892, 0.024427974596619606, ..."
3,AI For Business,University of Pennsylvania,Specialization,3 - 6 Months,4.7,381,Beginner,https://www.coursera.org/specializations/ai-fo...,"Artificial Intelligence (AI), Machine Learning...",This specialization will provide learners with...,"[-0.030125897377729416, 0.009498830884695053, ..."
4,AI For Everyone,DeepLearning.AI,Course,1 - 4 Weeks,4.8,39.3k,Beginner,https://www.coursera.org/learn/ai-for-everyone,"Workflow of Machine Learning projects, AI term...",AI is not only for engineers. If you want your...,"[0.0029956104699522257, 0.029252810403704643, ..."


In [94]:
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm

def upload_to_db(data, index_name="coursera", batch_size=50):
    """Upsert every row of ``data`` into a Pinecone index, one batch at a time.

    Each row becomes one vector: its id is the row's position in ``data`` (as a
    string), its values are the row's ``course_skills_embeddings`` entry, and
    its metadata carries the course's descriptive columns.

    The API key is read from the ``PINECONE_API_KEY`` environment variable so
    that no credential is stored in the notebook.

    Args:
        data: DataFrame with the columns ``course_skills_embeddings``,
            ``course_title``, ``course_organization``,
            ``course_certificate_type``, ``course_rating``,
            ``course_difficulty``, ``course_url``, ``course_skills`` and
            ``course_time``.
        index_name: Name of the Pinecone index to write to.
        batch_size: Maximum number of vectors sent per ``upsert`` call.

    Returns:
        list: The ``course_title`` of every uploaded row, in upload order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If ``PINECONE_API_KEY`` is not set.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "PINECONE_API_KEY is not set; define it in the environment or the .env file"
        )

    pc = Pinecone(api_key=api_key)
    index = pc.Index(index_name)

    # Pinecone metadata key -> DataFrame column the value is copied from.
    metadata_columns = {
        "course_name": "course_title",
        "course_organization": "course_organization",
        "course_Certificate_type": "course_certificate_type",
        "course_rating": "course_rating",
        "course_difficulty": "course_difficulty",
        "course_URL": "course_url",
        "course_skills": "course_skills",
        "course_time": "course_time",
    }

    total_rows = len(data)
    uploaded_titles = []

    for start in tqdm(range(0, total_rows, batch_size)):
        batch = data.iloc[start:start + batch_size]
        vectors = []
        for offset in range(len(batch)):
            # Look the row up once instead of once per field.
            row = batch.iloc[offset]
            uploaded_titles.append(row['course_title'])
            vectors.append({
                # Ids are positions within `data`, so re-uploading the same
                # frame overwrites the same vectors.
                "id": str(start + offset),
                "values": row['course_skills_embeddings'],
                "metadata": {key: row[column] for key, column in metadata_columns.items()},
            })

        index.upsert(vectors=vectors)
        # Clamp the upper bound so the final, possibly partial, batch is
        # reported with the real last row number.
        print(f"Uploaded batch {start+1} to {min(start + batch_size, total_rows)} to Pinecone")

    return uploaded_titles

# Upload every course row; `test` receives the list that upload_to_db returns
# (the course titles it collected while building the vectors).
test = upload_to_db(coursera)

  6%|▌         | 1/17 [00:02<00:36,  2.26s/it]

Uploaded batch 1 to 50 to Pinecone


 12%|█▏        | 2/17 [00:04<00:29,  1.98s/it]

Uploaded batch 51 to 100 to Pinecone


 18%|█▊        | 3/17 [00:05<00:25,  1.80s/it]

Uploaded batch 101 to 150 to Pinecone


 24%|██▎       | 4/17 [00:07<00:23,  1.79s/it]

Uploaded batch 151 to 200 to Pinecone


 29%|██▉       | 5/17 [00:09<00:20,  1.73s/it]

Uploaded batch 201 to 250 to Pinecone


 35%|███▌      | 6/17 [00:10<00:18,  1.69s/it]

Uploaded batch 251 to 300 to Pinecone


 41%|████      | 7/17 [00:12<00:16,  1.70s/it]

Uploaded batch 301 to 350 to Pinecone


 47%|████▋     | 8/17 [00:13<00:14,  1.66s/it]

Uploaded batch 351 to 400 to Pinecone


 53%|█████▎    | 9/17 [00:15<00:13,  1.64s/it]

Uploaded batch 401 to 450 to Pinecone


 59%|█████▉    | 10/17 [00:17<00:11,  1.68s/it]

Uploaded batch 451 to 500 to Pinecone


 65%|██████▍   | 11/17 [00:18<00:09,  1.62s/it]

Uploaded batch 501 to 550 to Pinecone


 71%|███████   | 12/17 [00:20<00:08,  1.64s/it]

Uploaded batch 551 to 600 to Pinecone


 76%|███████▋  | 13/17 [00:21<00:06,  1.60s/it]

Uploaded batch 601 to 650 to Pinecone


 82%|████████▏ | 14/17 [00:23<00:04,  1.59s/it]

Uploaded batch 651 to 700 to Pinecone


 88%|████████▊ | 15/17 [00:25<00:03,  1.57s/it]

Uploaded batch 701 to 750 to Pinecone


 94%|█████████▍| 16/17 [00:26<00:01,  1.57s/it]

Uploaded batch 751 to 800 to Pinecone


100%|██████████| 17/17 [00:27<00:00,  1.60s/it]

Uploaded batch 801 to 850 to Pinecone





In [98]:
# Example skill lists used as query input further down.
list_r = [
    'painting',
    'drawing',
    'singing',
]
# Duplicates are kept exactly as they were collected.
list_j = [
    'data',
    'ai',
    'data',
    'data science',
    'data scraping',
    'data',
    'database',
    'data scraping',
    'software',
    'data analytics',
    'python',
    'database querying',
    'mysql',
]


In [99]:
# Embed the comma-joined skills in list_r as a single query vector.
missing_skill_embeddings = make_embeddings(client, ', '.join(list_r))

# Take the Pinecone key from the environment instead of embedding it in the
# notebook; a missing variable fails immediately with a KeyError naming it.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("coursera")

# Fetch the five closest course vectors together with their metadata; the
# stored vector values themselves are not requested.
result = index.query(
    vector=missing_skill_embeddings,
    top_k=5,
    include_values=False,
    include_metadata=True
)

In [105]:
# Display the matches returned by the query: each entry holds the stored vector
# id, its score, and the course metadata ('values' is empty because the query
# was issued with include_values=False).
result['matches']

[{'id': '378',
  'metadata': {'course_Certificate_type': 'Course',
               'course_URL': 'https://www.coursera.org/learn/healing-with-the-arts',
               'course_difficulty': 'Mixed',
               'course_name': 'Healing with the Arts',
               'course_organization': 'University of Florida',
               'course_rating': '4.5',
               'course_skills': 'Music, Art, Meditation, Visual Arts',
               'course_time': '1 - 3 Months'},
  'score': 0.55827,
  'values': []},
 {'id': '775',
  'metadata': {'course_Certificate_type': 'Specialization',
               'course_URL': 'https://www.coursera.org/specializations/singer-songwriter',
               'course_difficulty': 'Beginner',
               'course_name': 'The Singer Songwriter',
               'course_organization': 'Berklee',
               'course_rating': '4.8',
               'course_skills': 'Music production, Ableton, Singing, '
                                'Songwriting, Guitar',
        