### OpenAI Authentication

In [1]:
import ast
import os

import openai

In [2]:
# Read the API key from the environment so it never appears in the notebook;
# the value is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [11]:
import pandas as pd
import numpy as np

### 1. Loading the Dataset into a Pandas DataFrame

In [7]:
# Read the books CSV, discard every row that has a missing value, then keep
# the 2000 best-rated books (ordered by average rating, highest first).
df = (
    pd.read_csv('./data/books_dataset.csv')
    .dropna()
    .sort_values('average_rating', ascending=False)
    .head(2000)
)
df

Unnamed: 0,isbn13,title,authors,categories,description,published_year,average_rating
6738,9781932206081,Insights,Frederick Lenz,Spiritual life,"In 1983, when Rama - Dr. Frederick P. Lenz rec...",2003.0,5.00
4284,9780738539560,Lake Orion,James E. Ingram;Lori Grove,History,"Orion Township, established in 1835, became a ...",2006.0,5.00
3580,9780567044716,Colossians and Philemon,Robert McL Wilson,Religion,For over one hundred years International Criti...,2005.0,5.00
4306,9780739844328,Bill Gates,Sara Barton-Wood,Juvenile Nonfiction,"Presents the life of Bill Gates, from his chil...",2001.0,5.00
5398,9780851621814,The Complete Theory Fun Factory,Katie Elliott;Ian Martin,Juvenile Nonfiction,(Boosey & Hawkes Scores/Books). Contains the m...,1996.0,5.00
...,...,...,...,...,...,...,...
4871,9780786809943,The Final Battle,Mary Pope Osborne,Juvenile Fiction,After struggling against the gods and his fate...,2005.0,4.08
4720,9780765309969,Blade of Fortriu,Juliet Marillier,Fiction,As King Bridei prepares to expel the Gaelic in...,2006.0,4.08
1701,9780330340199,In Pharaoh's Army,Tobias Wolff,"Authors, American",Having survived the extraordinary childhood re...,1995.0,4.08
1066,9780143039853,The Outsiders,S. E. Hinton;Jodi Picoult,Fiction,The struggle of three brothers to stay togethe...,1967.0,4.08


### 2. Embedding Cost Calculation

In [8]:
import tiktoken

# Tokenizer that corresponds to the embedding model
enc = tiktoken.encoding_for_model('text-embedding-ada-002')
descriptions = list(df['description'])

# Adding up the number of tokens of every description
total_tokens = sum(len(tokens) for tokens in map(enc.encode, descriptions))
print(f'Total tokens: {total_tokens}')

# Price used for the estimate: 0.0004 USD per 1000 tokens
cost = total_tokens * (0.0004/1000)
print(f'Estimated cost in USD: {cost:.10f}')

Total tokens: 166700
Estimated cost in USD: 0.0666800000


### 3. Calculating the Embeddings and Caching Them Locally

In [9]:
def get_embeddings_and_save_to_csv(embedding_cache_file, data=None, engine='text-embedding-ada-002'):
    """Embed every book description and cache the resulting table as a CSV file.

    ``get_embedding`` is called once per row, so the call takes time and
    costs money.

    Args:
        embedding_cache_file: Path of the CSV file to write.
        data: DataFrame with a ``description`` column. It gains an
            ``embedding`` column in place. Defaults to the notebook-level
            ``df`` so existing one-argument calls keep working.
        engine: Name of the embedding model passed to ``get_embedding``.

    Returns:
        None. The frame (including its index) is written to
        ``embedding_cache_file``.
    """
    # imported lazily, so defining the function does not require the helper module
    from openai.embeddings_utils import get_embedding

    # fall back to the notebook-level DataFrame when no frame is passed explicitly
    if data is None:
        data = df

    # 1. adding a new column called embedding to the DataFrame
    # 2. calling get_embedding() on each book's description
    data['embedding'] = data['description'].apply(lambda text: get_embedding(text, engine=engine))

    # saving the DataFrame to CSV
    data.to_csv(embedding_cache_file)

In [10]:
embedding_cache_file = './data/book_embeddings.csv'

# Computing the embeddings takes time and costs money, so the function is only
# called when the cache file does not exist yet. Delete the file to force a
# recomputation (e.g. after changing the dataset).
if not os.path.exists(embedding_cache_file):
    get_embeddings_and_save_to_csv(embedding_cache_file)

### 4. Loading the Embeddings

In [13]:
embedding_cache_file = './data/book_embeddings.csv'  # 2000 records
# embedding_cache_file = 'books_embeddings_cache_big.csv'  # 50000 records
df_embeddings = pd.read_csv(embedding_cache_file)

# converting embeddings: str => numpy array
# ast.literal_eval only parses Python literals, so (unlike eval) a tampered
# or untrusted cache file cannot execute code while it is being loaded
df_embeddings['embedding'] = df_embeddings['embedding'].apply(ast.literal_eval).apply(np.array)

In [14]:
# (rows, columns) of the loaded embeddings DataFrame
df_embeddings.shape

(2000, 9)