In [2]:
from openai import OpenAI
import os

# Read the key from the environment so it is never hard-coded in the notebook.
api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)

# Despite its name, `request` holds the API *response* object; the name is kept
# so that anything referring to it keeps working.
request = client.embeddings.create(
    model="text-embedding-ada-002",
    input="The food was delicious and the waiter...",
)

# The v1 client returns a typed response object rather than a dict, so the
# embedding must be read through attributes (subscripting raises TypeError).
response = request.data[0].embedding
print(response)

# Embedding multiple headlines in a single batched request
articles = [
    {
        'headline': 'Economic growth in the US',
        'topic': 'Economy',
    },
    {
        'headline': 'New advancements in AI technology',
        'topic': 'Technology',
    },
    {
        'headline': 'The impact of climate change on agriculture',
        'topic': 'Environment',
    },
    {
        'headline': 'The future of electric vehicles',
        'topic': 'Transportation',
    },
]

headline_text = [article['headline'] for article in articles]

response = client.embeddings.create(
    model="text-embedding-ada-002",
    input=headline_text,
)

# The loop variable must not be called `articles`: that would rebind the list
# to its last element and break every later cell that iterates over it.
# The response is a typed object, so use attribute access, not subscripting.
for i, article in enumerate(articles):
    article['embedding'] = response.data[i].embedding

print(articles)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Dimensionality reduction using t-SNE (t-distributed Stochastic Neighbor Embedding)
from sklearn.manifold import TSNE

import numpy as np
import matplotlib.pyplot as plt


# Collect the embedding vectors in the same order as `articles`
embeddings = [article['embedding'] for article in articles]

# t-SNE requires perplexity to be strictly smaller than the number of samples,
# so cap it for small datasets (needs at least two articles). A fixed
# random_state keeps the projection reproducible across Restart & Run All.
perplexity = min(5, len(embeddings) - 1)
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
embeddings_2d = tsne.fit_transform(np.array(embeddings))

plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c='blue', marker='o')

topics = [article['topic'] for article in articles]

# Label every projected point with its article topic
for i, topic in enumerate(topics):
    plt.annotate(topic, (embeddings_2d[i, 0], embeddings_2d[i, 1]), fontsize=12)

plt.title('t-SNE Visualization of Article Embeddings')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.grid()
plt.show()

### Text similarity
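Cosine distance measures how far apart two vectors point: 0 means the same direction, 1 means orthogonal, and 2 means opposite directions.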

In [3]:
from scipy.spatial import distance

# Two perpendicular unit vectors: their cosine distance is 1.
vec_a, vec_b = [0, 1], [1, 0]
distance.cosine(vec_a, vec_b)

1.0

Comparing headline similarity

In [None]:
def create_embeddings(texts, model="text-embedding-ada-002"):
    """Embed one or more texts with the OpenAI embeddings endpoint.

    Args:
        texts: A string or a sequence of strings to embed.
        model: Name of the embedding model to use. Defaults to the model the
            rest of this notebook was written against.

    Returns:
        A list with one embedding (a list of floats) per input text, in input
        order. An empty input yields an empty list without calling the API.
    """
    # The endpoint rejects empty input, so short-circuit instead of spending a
    # request on a guaranteed error.
    if len(texts) == 0:
        return []

    response = client.embeddings.create(
        model=model,
        input=texts,
    )
    # Read the vectors straight off the typed response; no need to dump the
    # whole response into a dict first.
    return [item.embedding for item in response.data]

In [None]:
from scipy.spatial import distance
import numpy as np

# Embed the search term, then measure its cosine distance to every headline
search_text = "computer"
search_embedding = create_embeddings([search_text])[0]
distances = [
    distance.cosine(search_embedding, article['embedding'])
    for article in articles
]

# The smallest distance marks the most similar headline
min_dist_index = np.argmin(distances)
print(articles[min_dist_index]['headline'])

# Semantic Search
Use embeddings to return the most similar results to a search query.

In [None]:
# Each row is (headline, topic, keywords); the comprehension expands the rows
# into the dict records used by the rest of the notebook.
news_articles = [
    {'headline': headline, 'topic': topic, 'keywords': keywords}
    for headline, topic, keywords in (
        ('Economic growth in the US', 'Economy',
         ['economy', 'growth', 'US']),
        ('New advancements in AI technology', 'Technology',
         ['AI', 'technology', 'advancements']),
        ('The impact of climate change on agriculture', 'Environment',
         ['climate change', 'agriculture', 'impact']),
        ('The future of electric vehicles', 'Transportation',
         ['electric vehicles', 'future']),
    )
]

def create_article_text(article):
    """Render an article record as a three-line text block.

    Args:
        article: Mapping with 'headline', 'topic' and 'keywords' entries,
            where 'keywords' is an iterable of strings.

    Returns:
        A string with one "Headline", "Topic" and "Keywords" line each,
        terminated by a trailing newline; keywords are comma-separated.
    """
    lines = [
        f"Headline: {article['headline']}",
        f"Topic: {article['topic']}",
        f"Keywords: {', '.join(article['keywords'])}",
    ]
    return '\n'.join(lines) + '\n'

# Format every news article as text, then embed them all in one batched call
article_texts = list(map(create_article_text, news_articles))
article_embeddings = create_embeddings(article_texts)
print(article_embeddings)

# Computing distances
from scipy.spatial import distance

def find_n_closest(query_vector, embeddings, n=3):
    """Return the n embeddings closest to a query vector by cosine distance.

    Args:
        query_vector: The vector to compare against.
        embeddings: Iterable of candidate vectors.
        n: Maximum number of results to return.

    Returns:
        A list of at most n dicts, each with a 'distance' (cosine distance to
        the query) and an 'index' (position of the candidate in `embeddings`),
        ordered from closest to farthest.
    """
    # Build the results in a local list so that repeated calls never share or
    # accumulate state.
    distances = [
        {'distance': distance.cosine(query_vector, embedding), 'index': index}
        for index, embedding in enumerate(embeddings)
    ]

    distances_sorted = sorted(distances, key=lambda x: x['distance'])
    return distances_sorted[:n]

In [None]:
# Embed the query, then rank the news articles by closeness to it
query_text = 'AI'
query_vector = create_embeddings([query_text])[0]

hits = find_n_closest(query_vector, article_embeddings, n=3)

# Each hit carries the position of its article in `news_articles`
for hit in hits:
    article = news_articles[hit['index']]
    headline = article['headline']
    print(f"Headline: {headline}")

# Recommendation Systems

In [None]:
# The article the user is reading right now
current_article = {
    'headline': 'How NVIDIA GPUs Could Decide Who Wins the AI Race',
    'topic': 'Tech',
    'keywords': ['ai', 'business', 'computers'],
}

# Articles the user has already read, given as (headline, topic, keywords) rows
user_history = [
    dict(zip(('headline', 'topic', 'keywords'), row))
    for row in (
        ('Economic growth in the US', 'Economy',
         ['economy', 'growth', 'US']),
        ('New advancements in AI technology', 'Technology',
         ['AI', 'technology', 'advancements']),
        ('The impact of climate change on agriculture', 'Environment',
         ['climate change', 'agriculture', 'impact']),
    )
]

# Embed the user's reading history and average it into a single vector
history_texts = [create_article_text(article) for article in user_history]
history_embeddings = create_embeddings(history_texts)
mean_history_embedding = np.mean(history_embeddings, axis=0)

# Embed the article currently being read
current_article_text = create_article_text(current_article)
current_article_embedding = create_embeddings([current_article_text])[0]

# Only articles the user has not read yet are candidates for recommendation
articles_filtered = [article for article in news_articles if article not in user_history]
article_texts = [create_article_text(article) for article in articles_filtered]
article_embeddings = create_embeddings(article_texts)

# NOTE(review): mean_history_embedding is computed above but not used here.
# Pass it instead of current_article_embedding if recommendations should be
# based on the reading history rather than the current article - confirm intent.
hits = find_n_closest(current_article_embedding, article_embeddings, n=3)

# Hit indices refer to positions in `articles_filtered`, not `news_articles`
for hit in hits:
    article = articles_filtered[hit['index']]
    print(f"Headline: {article['headline']}")
    


# Classification 
Zero-shot classification:
* Assigns each text to the closest class label without using any labelled training data

In [None]:
# Candidate class labels, each wrapped in a {'label': ...} record
topics = [
    {'label': label}
    for label in (
        'Tech',
        'Economy',
        'Environment',
        'Transportation',
        'Health',
        'Sports',
        'Entertainment',
        'Politics',
    )
]

# The label text itself serves as the class description; embed all labels in
# one batched request.
class_descriptions = [entry['label'] for entry in topics]
class_embeddings = create_embeddings(class_descriptions)

# Compute the cosine distance between a query vector and each class embedding, and return the closest match
def find_closest(query_vector, embeddings):
    """Find the single embedding nearest to a query vector.

    Args:
        query_vector: The vector to compare against.
        embeddings: Iterable of candidate vectors; must not be empty.

    Returns:
        A dict with the smallest cosine 'distance' and the 'index' of the
        matching candidate. Ties go to the earliest candidate.

    Raises:
        ValueError: If `embeddings` is empty.
    """
    scored = (
        {'distance': distance.cosine(query_vector, candidate), 'index': position}
        for position, candidate in enumerate(embeddings)
    )
    return min(scored, key=lambda entry: entry['distance'])

# Pick the class whose embedding lies nearest to the current article
closest_class = find_closest(current_article_embedding, class_embeddings)

# Map the winning position back to its human-readable label
predicted_topic = topics[closest_class['index']]
label = predicted_topic['label']
print(f"Predicted label: {label}")
