In [31]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

In [32]:
from dotenv import load_dotenv
load_dotenv()
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [33]:
import pandas as pd
books = pd.read_csv("books_cleaned.csv")

In [34]:
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 - A NOVEL THAT READERS and criti...
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web - A Novel,9780002261982 - A new 'Christie for Christmas'...
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 - A memorable, mesmerizing heroi..."
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 - Lewis' work on the nature of l...
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 - ""In The Problem of Pain, C.S. ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 - On A Train Journey Home To Nor...
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 - This book tells the tale of a ...
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 - Wisdom to Create a Life of Pas...
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that - Talks with Sri Nisargadatta Maharaj,9788185300535 - This collection of the timeles...


In [35]:
books["tagged_description"].to_csv("tagged_description.txt",
index=False,
header=False)

In [36]:
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()
text_splitter = CharacterTextSplitter(chunk_size=0,chunk_overlap=0,separator="\n")

In [37]:
documents=text_splitter.split_documents(raw_documents)

Created a chunk of size 1172, which is longer than the specified 0
Created a chunk of size 1218, which is longer than the specified 0
Created a chunk of size 377, which is longer than the specified 0
Created a chunk of size 313, which is longer than the specified 0
Created a chunk of size 485, which is longer than the specified 0
Created a chunk of size 486, which is longer than the specified 0
Created a chunk of size 964, which is longer than the specified 0
Created a chunk of size 190, which is longer than the specified 0
Created a chunk of size 847, which is longer than the specified 0
Created a chunk of size 298, which is longer than the specified 0
Created a chunk of size 199, which is longer than the specified 0
Created a chunk of size 883, which is longer than the specified 0
Created a chunk of size 1092, which is longer than the specified 0
Created a chunk of size 1193, which is longer than the specified 0
Created a chunk of size 308, which is longer than the specified 0
Create

In [38]:
db_books = Chroma.from_documents(
    documents,
    embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)

In [39]:
query="A Book to teach children about nature"
docs = db_books.similarity_search(query, k=10)
docs

[Document(id='11ae5593-79b5-43e6-aa6e-47ebd2d39386', metadata={'source': 'tagged_description.txt'}, page_content='"9780786808069 - Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience."'),
 Document(id='00e37b74-4b08-4af4-806d-14f15a98b6c8', metadata={'source': 'tagged_description.txt'}, page_content='"9780786808069 - Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience."'),
 Document(id='954fd5e0-cbc7-4e33-bdcd-03d590a2c10f', metadata={'source': 'tagged_description.txt'}, page_content='"9780786808380 - Introduce your babies to birds, cats, dogs, and babies through fine art

In [40]:
books[books["isbn13"] == int(docs[0].page_content.split()[0].strip('"'))]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
3747,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals,9780786808069 - Children will discover the exc...


In [41]:
def retrieve_semantic_recommendation(
    query: str,
    top_k: int = 10,
) -> pd.DataFrame:
    # Get top 50 semantically similar documents
    recs = db_books.similarity_search(query, k=50)
    
    books_list = []

    for rec in recs:
        try:
            # Extract and clean the ISBN from the start of the page_content
            isbn = int(rec.page_content.split()[0].strip('"'))
            books_list.append(isbn)
        except ValueError:
            # Skip if it fails to convert
            continue

    # Filter the books DataFrame using the collected ISBNs
    return books[books["isbn13"].isin(books_list)].head(top_k)
retrieve_semantic_recommendation("A Book to teach war")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
66,9780007162994,0007162995,If I Die in a Combat Zone,Tim O'Brien,"Vietnam War, 1961-1975",http://books.google.com/books/content?id=0qUtS...,Perhaps the best book to emerge from the Vietn...,2003.0,3.95,208.0,11.0,If I Die in a Combat Zone,9780007162994 - Perhaps the best book to emerg...
524,9780099483472,0099483475,All Quiet on the Western Front,Erich Maria Remarque,"World War, 1914-1918",,All Quiet on the Western Front is probably the...,2005.0,3.95,216.0,1018.0,All Quiet on the Western Front,9780099483472 - All Quiet on the Western Front...
572,9780140149241,0140149244,We Were the Rats,Lawson Glassop,"Tobruk, Battles of, 1941-1942",,Reissue of the famous novel based on the autho...,1991.0,3.23,275.0,13.0,We Were the Rats,9780140149241 - Reissue of the famous novel ba...
999,9780195119206,0195119207,Ride of the Second Horseman,Robert L. O'Connell,History,http://books.google.com/books/content?id=1Xs7D...,"""Accurst be he that first invented war,"" wrote...",1997.0,4.23,320.0,12.0,Ride of the Second Horseman - The Birth and De...,"9780195119206 - ""Accurst be he that first inve..."
1006,9780195168952,019516895X,Battle Cry of Freedom,James M. McPherson,History,http://books.google.com/books/content?id=09FkZ...,Filled with fresh interpretations and informat...,2005.0,4.34,867.0,22318.0,Battle Cry of Freedom - The Civil War Era,9780195168952 - Filled with fresh interpretati...
1105,9780275942694,0275942694,The Heights of Courage,Avigdor Kahalani,History,http://books.google.com/books/content?id=7aIXJ...,"In October 1973, the State of Israel was invad...",1992.0,4.18,234.0,97.0,The Heights of Courage - A Tank Leader's War o...,"9780275942694 - In October 1973, the State of ..."
1199,9780312265052,0312265050,The Naked and the Dead,Norman Mailer,Fiction,http://books.google.com/books/content?id=c66GL...,Portrays the contrasting personalities and nos...,2000.0,3.94,721.0,20541.0,The Naked and the Dead - 50th Anniversary Edit...,9780312265052 - Portrays the contrasting perso...
1522,9780345461360,0345461363,To The Last Man,Jeff Shaara,Fiction,http://books.google.com/books/content?id=YLTBB...,"In the spring of 1918, when a neutral America ...",2005.0,4.22,636.0,4656.0,To The Last Man - A Novel Of The First World War,"9780345461360 - In the spring of 1918, when a ..."
1735,9780375760525,0375760520,Paris 1919,Margaret MacMillan,History,http://books.google.com/books/content?id=5NYdw...,National Bestseller New York Times Editors’ Ch...,2002.0,4.07,570.0,9150.0,Paris 1919 - Six Months That Changed the World,9780375760525 - National Bestseller New York T...
1931,9780393329292,0393329291,Identity and Violence: The Illusion of Destiny...,Amartya Sen,Philosophy,http://books.google.com/books/content?id=zSqBA...,Arguing that the violence of today's world is ...,2007.0,3.85,240.0,1157.0,Identity and Violence: The Illusion of Destiny...,9780393329292 - Arguing that the violence of t...
