### OpenAI Authentication

In [2]:
import openai
import os

In [3]:
# Take the API key from the OPENAI_API_KEY environment variable
# (the value is None when the variable is not set).
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [4]:
import pandas as pd
import numpy as np

### 1. Loading the Dataset into a Pandas DataFrame

In [12]:
# Load the CSV, drop every row that has a missing value, then keep the
# 2000 highest-rated books (sorted by average rating, best first).
df = (
    pd.read_csv('./data/books_dataset.csv')
    .dropna()
    .sort_values('average_rating', ascending=False)
    .head(2000)
)
df

Unnamed: 0,isbn13,title,authors,categories,description,published_year,average_rating
6738,9781932206081,Insights,Frederick Lenz,Spiritual life,"In 1983, when Rama - Dr. Frederick P. Lenz rec...",2003.0,5.00
4284,9780738539560,Lake Orion,James E. Ingram;Lori Grove,History,"Orion Township, established in 1835, became a ...",2006.0,5.00
3580,9780567044716,Colossians and Philemon,Robert McL Wilson,Religion,For over one hundred years International Criti...,2005.0,5.00
4306,9780739844328,Bill Gates,Sara Barton-Wood,Juvenile Nonfiction,"Presents the life of Bill Gates, from his chil...",2001.0,5.00
5398,9780851621814,The Complete Theory Fun Factory,Katie Elliott;Ian Martin,Juvenile Nonfiction,(Boosey & Hawkes Scores/Books). Contains the m...,1996.0,5.00
...,...,...,...,...,...,...,...
4871,9780786809943,The Final Battle,Mary Pope Osborne,Juvenile Fiction,After struggling against the gods and his fate...,2005.0,4.08
4720,9780765309969,Blade of Fortriu,Juliet Marillier,Fiction,As King Bridei prepares to expel the Gaelic in...,2006.0,4.08
1701,9780330340199,In Pharaoh's Army,Tobias Wolff,"Authors, American",Having survived the extraordinary childhood re...,1995.0,4.08
1066,9780143039853,The Outsiders,S. E. Hinton;Jodi Picoult,Fiction,The struggle of three brothers to stay togethe...,1967.0,4.08


### 2. Embedding Cost Calculation

In [8]:
import tiktoken

# Tokenize every description with the encoding tiktoken associates with the
# embedding model and add up the token counts.
enc = tiktoken.encoding_for_model('text-embedding-ada-002')
descriptions = df['description'].tolist()
total_tokens = sum(len(enc.encode(text)) for text in descriptions)
print(f'Total tokens: {total_tokens}')

# The estimate uses a price of 0.0004 USD per 1000 tokens
# (NOTE: confirm against the current pricing before relying on it).
cost = total_tokens * (0.0004/1000)
print(f'Estimated cost in USD: {cost:.10f}')

Total tokens: 166700
Estimated cost in USD: 0.0666800000


### 3. Calculating the Embeddings and Caching Them Locally

In [9]:
def get_embeddings_and_save_to_csv(embedding_cache_file):
    """Embed every book description and write the result to a CSV file.

    Works on the notebook-level DataFrame ``df``: an ``embedding`` column is
    added to it (one ``get_embedding`` call per value of ``description``) and
    the whole frame is then written to ``embedding_cache_file``.

    Args:
        embedding_cache_file: Path of the CSV file to write.
    """
    from openai.embeddings_utils import get_embedding

    def embed(text):
        # one embedding request per description
        return get_embedding(text, engine='text-embedding-ada-002')

    # new column on the shared DataFrame, then persist it as CSV
    df['embedding'] = df['description'].apply(embed)
    df.to_csv(embedding_cache_file)

In [10]:
embedding_cache_file = './data/book_embeddings.csv'

# Computing the embeddings takes time and costs money, so the function is only
# called when the cache file does not exist yet. Delete the file to force the
# embeddings to be recomputed (e.g. after changing the dataset).
if not os.path.exists(embedding_cache_file):
    get_embeddings_and_save_to_csv(embedding_cache_file)

### 4. Loading the Embeddings

In [7]:
import ast

embedding_cache_file = './data/book_embeddings.csv'  # 2000 records
# embedding_cache_file = 'books_embeddings_cache_big.csv'  # 50000 records
df_embeddings = pd.read_csv(embedding_cache_file)

# converting embeddings: str => numpy array
# SECURITY: the column is text read from a file, so it is parsed with
# ast.literal_eval (Python literals only) instead of eval(), which would
# execute any expression stored in the CSV.
df_embeddings['embedding'] = df_embeddings['embedding'].apply(
    lambda text: np.array(ast.literal_eval(text))
)

In [8]:
# Display the (rows, columns) shape of the loaded embeddings DataFrame
df_embeddings.shape

(2000, 9)

### 5. Getting Recommendations from Title

In [14]:
# Copilot recommendation (alternative approach, left commented out and not executed)
# def get_recommendations(query, df_embeddings, top_n=5):
#     from sklearn.metrics.pairwise import cosine_similarity
    
#     # 1. getting the embedding for the query
#     query_embedding = get_embedding(query, engine='text-embedding-ada-002')
    
#     # 2. converting the embedding to a numpy array
#     query_embedding = np.array(eval(query_embedding))
    
#     # 3. calculating the cosine similarity between the query embedding and all the book embeddings
#     cosine_similarities = cosine_similarity(query_embedding.reshape(1, -1), df_embeddings['embedding'].tolist())
    
#     # 4. sorting the books by their similarity to the query
#     books_sorted_by_similarity = df_embeddings.iloc[np.argsort(cosine_similarities[0])][::-1]
    
#     # 5. returning the top N recommendations
#     return books_sorted_by_similarity.head(top_n)

def get_recommendation_from_title(df_embeddings, title, k):
    """Recommend the ``k`` books whose embeddings are closest to a given title.

    The title lookup is case-insensitive and ignores leading/trailing
    whitespace in ``title``. If several rows share the title, the first one
    is used as the query book.

    Args:
        df_embeddings: DataFrame with ``title``, ``description`` and
            ``embedding`` columns. Every embedding must be a sequence of
            numbers, and all embeddings must have the same length.
        title: Title of the book to find recommendations for.
        k: Maximum number of recommendations to return.

    Returns:
        ``False`` if no row has the requested title. Otherwise a list of at
        most ``k`` dicts with the keys ``title``, ``description`` and
        ``distance`` (cosine distance to the query book), nearest first.
        The query book itself is never part of the result.
    """
    # case-insensitive lookup; the column is lower-cased only once
    wanted = title.strip().lower()
    is_match = (df_embeddings['title'].str.lower() == wanted).to_numpy()
    match_positions = np.flatnonzero(is_match)
    if match_positions.size == 0:
        return False

    # With duplicate titles a plain .squeeze() would yield several embeddings
    # instead of one vector, so the first matching row is picked explicitly.
    query_pos = int(match_positions[0])

    # all embeddings as one (n_books, dim) matrix
    matrix = np.vstack(df_embeddings['embedding'].tolist()).astype(float)

    # cosine distance = 1 - cosine similarity, computed for all rows at once
    norms = np.linalg.norm(matrix, axis=1)
    similarities = (matrix @ matrix[query_pos]) / (norms * norms[query_pos])
    # clip away tiny negative values caused by floating-point rounding
    distances = np.clip(1.0 - similarities, 0.0, 2.0)

    # Nearest first. The query row is removed by position rather than by
    # assuming it sorts first: another book at distance 0 (e.g. an identical
    # description) could otherwise push the query book into the results.
    order = np.argsort(distances, kind='stable')
    neighbours = order[order != query_pos][:max(k, 0)]

    titles = df_embeddings['title']
    descriptions = df_embeddings['description']
    return [
        {
            'title': titles.iloc[i],
            'description': descriptions.iloc[i],
            'distance': float(distances[i]),
        }
        for i in neighbours
    ]

In [13]:
# df.iloc[97]

isbn13                                                9780756616342
title                                                        Animal
authors                                  David Burnie;Don E. Wilson
categories                                                  Zoology
description       Offers photographs and information about mamma...
published_year                                               2005.0
average_rating                                                 4.52
Name: 4620, dtype: object

In [18]:
# Ask for a title, look up its five nearest books and print them.
title = input("Enter Book's Title: ")
book_recommendations = get_recommendation_from_title(df_embeddings, title, 5)

print(f'Entered Title: {title}\n')
print('#' * 50)

if not book_recommendations:
    # a falsy result means there is nothing to show for this title
    print(f'Title {title} does not exist in the dataset.')
else:
    for i, item in enumerate(book_recommendations, start=1):
        print(f'Book Recommendation #{i}, Distance: {item["distance"]}')
        print(f'Title: {item["title"]}')
        print(f'Description: {item["description"]}')
        # blank line, separator, blank line
        print('\n' + '#' * 50 + '\n')

Entered Title: Animal

##################################################
Book Recommendation #1, Distance: 0.15196297705554485
Title: In Focus
Description: A collection of nearly three hundred photographs from "National Geographic," representing the work of more than one hundred fifty acclaimed photographers, captures portrait images of people from around the world.

##################################################

Book Recommendation #2, Distance: 0.1532065041537366
Title: Ecuador Nature Guide
Description: The guide provides information on 76 species of birds, plants, mammals and insects of Ecuador. Each species description is accompanied by an illustration as well as information on ecology, local names and uses. Profits from the sale of this guide will go

##################################################

Book Recommendation #3, Distance: 0.15422387188303377
Title: Insects & Spiders
Description: An introduction to the physical characteristics, habits, and habitats of different 