In [4]:
import os

from openai import OpenAI

# Read the API key from the environment rather than embedding it in the
# notebook, so the credential never lands in version control or shared copies.
OPENAI_API_TOKEN = os.environ.get("OPENAI_API_KEY", "").strip()
if not OPENAI_API_TOKEN:
    # Fail here with a clear message instead of at the first API request.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )

# Shared client used by the embedding helper defined below.
client = OpenAI(api_key=OPENAI_API_TOKEN)

In [5]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` using the module-level ``client``.

    Newline characters in ``text`` are replaced by single spaces before the
    request is made, and the text is sent as a one-element input list.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [6]:
import json

# Load the course catalogue. The encoding is pinned to UTF-8 so the read does
# not depend on the platform's default locale encoding (JSON text is expected
# to be UTF-8).
with open('courses.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [7]:
import pandas as pd
# Tabulate the 'courses' section of the loaded JSON.
df = pd.DataFrame(data=data['courses'])

# Preview the first rows.
df.head()

Unnamed: 0,title,description
0,Introduction to Management,This course introduces the main functional are...
1,High-Performance Teams,"Creating, leading or contributing to a high pe..."
2,Mathematics for Artificial Intelligence,This course introduces important mathematical ...
3,Machine Learning and AI Technology,This course covers machine learning (ML) and A...
4,Analytical Decision Making,This course focuses on fundamental concepts of...


In [8]:
# Embed every course description: cast each value to str, then call
# get_embedding once per row and store the returned vector in a new column.
df['embedding'] = (
    df['description']
    .astype(str)
    .apply(get_embedding)
)

# Preview the frame with the new column.
df.head()

Unnamed: 0,title,description,embedding
0,Introduction to Management,This course introduces the main functional are...,"[-0.033125270158052444, 0.0445437990128994, 0...."
1,High-Performance Teams,"Creating, leading or contributing to a high pe...","[0.007104985415935516, 0.03567705675959587, 0...."
2,Mathematics for Artificial Intelligence,This course introduces important mathematical ...,"[-0.03330518305301666, 0.0030541757587343454, ..."
3,Machine Learning and AI Technology,This course covers machine learning (ML) and A...,"[-0.031079674139618874, 0.024304011836647987, ..."
4,Analytical Decision Making,This course focuses on fundamental concepts of...,"[-0.025203440338373184, 0.039639662951231, 0.0..."


In [9]:
import numpy as np
# Stack the per-row embedding lists into a single NumPy array.
embeddings_array = np.array(df['embedding'].to_list())

In [10]:
# Sanity check on the stacked embeddings: expect (rows in df, embedding length).
embeddings_array.shape

(15, 1536)

In [11]:
from sklearn.decomposition import PCA

# Initialize PCA to reduce to 2 dimensions.
# random_state is fixed because, for an input this wide with so few components,
# scikit-learn's default 'auto' solver can select the randomized SVD, which
# would otherwise make the projection vary between runs. The seed matches the
# one used for t-SNE further down in this notebook.
pca = PCA(n_components=2, random_state=42)

# Fit on the embeddings and project them onto the two principal components
embeddings_2d = pca.fit_transform(embeddings_array)

# Convert to DataFrame for easier visualization
embeddings_df = pd.DataFrame(embeddings_2d, columns=['PC1', 'PC2'])

# Attach the course titles (aligned on the DataFrame index) for plot labels
embeddings_df['title'] = df['title']

# Display first few rows
embeddings_df.head()

Unnamed: 0,PC1,PC2,title
0,0.104371,0.242496,Introduction to Management
1,0.166695,0.053127,High-Performance Teams
2,-0.167667,-0.060566,Mathematics for Artificial Intelligence
3,-0.285622,-0.090547,Machine Learning and AI Technology
4,-0.198408,0.517237,Analytical Decision Making


In [12]:
# Variance captured along each of the two fitted components (absolute values;
# `pca.explained_variance_ratio_` gives the fraction of total variance instead).
pca.explained_variance_

array([0.0745054 , 0.05529317])

In [13]:
import plotly.express as px

# Scatter the two principal components, labelling each point with its course title.
fig = px.scatter(
    embeddings_df,
    x='PC1',
    y='PC2',
    text='title',
    title='Course Embeddings Visualization using PCA',
    template='plotly_white',
)

# Label placement, label font size and marker size.
fig.update_traces(textposition='top center', textfont_size=12, marker_size=10)

# Figure-level styling: centred 20pt title, white plot area, no legend, margins.
fig.update_layout(
    title=dict(x=0.5, font=dict(size=20)),
    plot_bgcolor='white',
    showlegend=False,
    margin=dict(t=100, l=50, r=50, b=50),
)

# Axis titles and light-gray grid lines.
fig.update_xaxes(
    title_text="First Principal Component",
    showgrid=True,
    gridwidth=1,
    gridcolor='lightgray',
)
fig.update_yaxes(
    title_text="Second Principal Component",
    showgrid=True,
    gridwidth=1,
    gridcolor='lightgray',
)

# Render the figure.
fig.show()

In [14]:
# Example t-SNE implementation
from sklearn.manifold import TSNE

# Project the embeddings to 2-D with t-SNE; the fixed random_state keeps runs repeatable.
tsne = TSNE(
    n_components=2,
    perplexity=5,  # perplexity might need tuning for 15 samples
    random_state=42,
)
embeddings_tsne = tsne.fit_transform(embeddings_array)

# Tabulate the two t-SNE coordinates and attach each course title (aligned on index).
tsne_df = pd.DataFrame(embeddings_tsne, columns=['TSNE1', 'TSNE2']).assign(
    title=df['title']
)
tsne_df.head()

Unnamed: 0,TSNE1,TSNE2,title
0,45.623543,-20.26193,Introduction to Management
1,25.571184,-118.678772,High-Performance Teams
2,-35.863514,50.734604,Mathematics for Artificial Intelligence
3,-40.075325,83.369728,Machine Learning and AI Technology
4,62.090321,30.805304,Analytical Decision Making


In [15]:
import plotly.express as px

# Scatter plot of the 2-D t-SNE projection, one labelled point per course.
fig = px.scatter(
    tsne_df,
    x='TSNE1',
    y='TSNE2',
    text='title',
    title='Course Embeddings Visualization using TSNE',
    template='plotly_white'  # Use a cleaner template
)

# Customize text labels
fig.update_traces(
    textposition='top center',
    textfont=dict(size=12),
    marker=dict(size=10)
)

# Enhance layout and styling
fig.update_layout(
    title_x=0.5,  # Center the title
    title_font_size=20,
    plot_bgcolor='white',
    showlegend=False,
    # These axes are t-SNE coordinates, not principal components, so they are
    # labelled as such.
    xaxis_title="t-SNE Dimension 1",
    yaxis_title="t-SNE Dimension 2",
    xaxis=dict(
        showgrid=True,
        gridwidth=1,
        gridcolor='lightgray'
    ),
    yaxis=dict(
        showgrid=True,
        gridwidth=1,
        gridcolor='lightgray'
    ),
    margin=dict(t=100, l=50, r=50, b=50)  # Adjust margins
)

# Display the plot
fig.show()