# Download datasets to Notebook

In [None]:
# Set up kaggle - sign into kaggle, go into Account -> Settings -> API -> Create new token
from google.colab import files

# Upload kaggle.json credentials
files.upload()

# Configure kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download CMU Book Summary Dataset => https://www.kaggle.com/datasets/ymaricar/cmu-book-summary-dataset/data
!kaggle datasets download ymaricar/cmu-book-summary-dataset

# Download Amazon Book Reviews Dataset => https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
!kaggle datasets download mohamedbakhet/amazon-books-reviews

Saving kaggle.json to kaggle.json
Downloading cmu-book-summary-dataset.zip to /content
 62% 10.0M/16.2M [00:00<00:00, 39.6MB/s]
100% 16.2M/16.2M [00:00<00:00, 48.5MB/s]
Downloading amazon-books-reviews.zip to /content
 99% 1.05G/1.06G [00:18<00:00, 132MB/s]
100% 1.06G/1.06G [00:18<00:00, 62.6MB/s]


# Install dependencies and load data

In [None]:
# Create requirements.txt with all dependencies
%%writefile requirements.txt
pandas
zipfile36
langchain
pinecone-client
transformers
accelerate
sentencepiece
unstructured
jq
sentence-transformers
torch

Writing requirements.txt


In [None]:
# Install dependencies from requirements.txt file
!pip install -r requirements.txt

Collecting zipfile36 (from -r requirements.txt (line 2))
  Downloading zipfile36-0.1.3-py3-none-any.whl (20 kB)
Collecting langchain (from -r requirements.txt (line 3))
  Downloading langchain-0.0.351-py3-none-any.whl (794 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m794.3/794.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pinecone-client (from -r requirements.txt (line 4))
  Downloading pinecone_client-2.2.4-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate (from -r requirements.txt (line 6))
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece (from -r requirements.txt (line 7))
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.w

In [None]:
import csv
import pandas as pd
import zipfile36 as zipfile
import os
import json
import torch

In [None]:
# Unzip cmu-book-summary-dataset.zip (only when booksummaries.txt is not already present)
# and load booksummaries.txt into a DataFrame.
if not os.path.exists("booksummaries.txt"):
    with zipfile.ZipFile("cmu-book-summary-dataset.zip", "r") as zip_ref:
        zip_ref.extractall("./")

# Column names for the seven tab-separated fields of each record.
SUMMARY_COLUMNS = ['book_id', 'freebase_id', 'book_title', 'author', 'publication_date', 'genre', 'summary']

# quoting=csv.QUOTE_NONE: the default excel-tab dialect treats '"' as a quote character, so a
# title or summary that starts with a double quote would be parsed as a quoted field (quotes
# stripped, and possibly swallowing the following tabs/lines). Every '"' is kept as literal text.
# NOTE(review): assumes booksummaries.txt is raw tab-delimited text without CSV-style
# quoting - confirm against the dataset description.
with open("booksummaries.txt", "r", encoding="utf8", newline="") as f:
    summary_rows = list(csv.reader(f, dialect="excel-tab", quoting=csv.QUOTE_NONE))

sum_df = pd.DataFrame.from_records(summary_rows, columns=SUMMARY_COLUMNS)
# Keep only the fields used for retrieval later in the notebook.
sum_df = sum_df[['book_title', 'author', 'genre', 'summary']]

In [None]:
# Unzip amazon-books-reviews.zip (only when Books_rating.csv is not already present)
# and load the review columns as a DataFrame.
if not os.path.exists("Books_rating.csv"):
    with zipfile.ZipFile("amazon-books-reviews.zip", "r") as zip_ref:
        zip_ref.extractall("./")

REVIEW_COLUMNS = ['Title', 'review/text']

# usecols: only the two needed columns are parsed and kept in memory; the source archive is
# over 1 GB, so loading every column first is wasteful.
rev_df = pd.read_csv("Books_rating.csv", usecols=REVIEW_COLUMNS)
# usecols keeps the file's column order, so select explicitly to fix the order.
rev_df = rev_df[REVIEW_COLUMNS]

In [None]:
!mkdir ./docs/

sum_df.head(100).to_csv('./docs/summaries.csv', index=False)
rev_df.head(100).to_csv('./docs/reviews.csv', index=False)

torch.cuda.empty_cache()

# Initialize Pinecone

In [None]:
import langchain
import pinecone
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

  from tqdm.autonotebook import tqdm


In [None]:
# Reviews: load the exported CSV and split it into chunks.
# encoding is given explicitly (matching the summaries loader) so reading does not depend on
# the platform's default encoding.
loader = CSVLoader(file_path="./docs/reviews.csv", source_column="review/text", encoding="utf-8")
data = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)

# Summaries: same procedure, using the splitter created for this corpus.
loader_s = CSVLoader(file_path="./docs/summaries.csv", source_column="summary", encoding="utf-8")
data_s = loader_s.load()
text_splitter_s = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs_s = text_splitter_s.split_documents(data_s)

# Single corpus that gets embedded and indexed.
all_docs = docs + docs_s

# NOTE(review): the recorded output is 200 for 2 x 100 rows, which suggests each row ends up
# as exactly one chunk - confirm the splitter settings actually split long texts.
print(len(all_docs))


200


In [None]:
import os
from getpass import getpass


def read_secret(env_var, prompt):
    """Return a credential from the environment, or ask for it interactively.

    Args:
        env_var: Name of the environment variable checked first.
        prompt: Text shown by getpass when the variable is unset or empty.

    Returns:
        The credential as a non-empty string.

    Raises:
        ValueError: If neither the environment nor the prompt supplied a value.
    """
    value = os.environ.get(env_var) or getpass(prompt)
    if not value:
        raise ValueError(f"{env_var} is required")
    return value


# SECURITY: API keys must never be written into the notebook. Any key that has ever been
# saved or committed in a notebook should be revoked and regenerated.
pinecone.init(
    api_key=read_secret("PINECONE_API_KEY", "Pinecone API key: "),
    environment="gcp-starter"
)
index_name="langchainvector"

# Text-generation model and embedding model (Hugging Face model ids).
model_id = "databricks/dolly-v2-3b"
embeddings_model_id = "sangmini/msmarco-cotmae-MiniLM-L12_en-ko-ja"

# Embeddings are computed remotely through the Hugging Face Inference API.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=read_secret("HUGGINGFACEHUB_API_TOKEN", "Hugging Face API token: "),
    model_name=embeddings_model_id
)

In [None]:
# Embed all chunks and store them in the Pinecone index named by `index_name`;
# `docsearch` is the vector store queried by the recommendation functions.
# `all_docs` is the combined reviews + summaries list built earlier (docs + docs_s).
# NOTE(review): no index-creation call is visible in this notebook, so the index is presumably
# expected to exist already; re-running this cell may also add the same chunks again - confirm.
docsearch = Pinecone.from_documents(all_docs, embeddings, index_name=index_name)

In [None]:
from langchain import PromptTemplate
from langchain.chains import LLMChain
from transformers import pipeline
import torch

# Build the transformers pipeline for the model named by `model_id`.
# trust_remote_code=True allows code shipped in the model repository to run locally (the
# download log shows an instruct_pipeline.py being fetched); pin a revision to avoid picking
# up changed code silently, as the library's own warning suggests.
pipe = pipeline(
    model=model_id,
    torch_dtype="auto",
    trust_remote_code=True,
    device_map="auto",
    return_full_text=True,
)

# Wrap the pipeline so it can be used as the `llm` of a LangChain chain.
hf_pipeline = HuggingFacePipeline(pipeline=pipe)

config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

instruct_pipeline.py:   0%|          | 0.00/9.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/databricks/dolly-v2-3b:
- instruct_pipeline.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

In [None]:
def format_context(documents) -> str:
    """Join the text of retrieved documents into one prompt-ready string.

    Passing the raw result list to the prompt would insert its Python repr
    (object wrappers and metadata included); only the page text is used here.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents separated by blank lines ("" for no documents).
    """
    return "\n\n".join(doc.page_content for doc in documents)


def _recommend(query: str, template: str, k: int = 5, **variables: str) -> str:
    """Retrieve context for ``query`` and run the LLM on the filled-in ``template``.

    Args:
        query: Text used for the similarity search against ``docsearch``.
        template: Prompt template containing ``{context}`` plus one placeholder
            per entry in ``variables``.
        k: Number of documents to retrieve.
        **variables: Values for the template placeholders other than ``context``.

    Returns:
        The model output with leading whitespace removed.
    """
    context = format_context(docsearch.similarity_search(query, k=k))
    prompt_template = PromptTemplate(
        template=template,
        input_variables=[*variables, "context"]
    )
    llm_chain = LLMChain(llm=hf_pipeline, prompt=prompt_template)
    return llm_chain.predict(context=context, **variables).lstrip()


def get_5_books_similar_book(book_title: str, author: str) -> str:
    """Ask the LLM for 5 books similar to ``book_title`` by ``author``."""
    template = """
        Give me 5 books similar to {book} by {author}, just their titles and authors.
        Provided similar book summaries and reviews:
        {context}
    """
    return _recommend(book_title + " by " + author + " is ", template, book=book_title, author=author)


def get_author(book_title: str, author: str) -> str:
    """Ask the LLM for an author the reader may like, given a book they enjoyed."""
    template = """
        Give me an author I would like, given that I enjoyed {book} by {author}.
        Provided similar book summaries and reviews:
        {context}
    """
    return _recommend(book_title + " by " + author, template, book=book_title, author=author)


def get_5_books_from_genre(genre: str) -> str:
    """Ask the LLM for 5 books in ``genre``."""
    template = """
        Give me 5 books I would like in the genre {genre}.
        Provided similar book summaries and reviews:
        {context}
    """
    return _recommend("Book is in the genre: " + genre, template, genre=genre)


def get_2_books_similar_to_author(author: str) -> str:
    """Ask the LLM for books similar to those written by ``author``.

    NOTE(review): the function name says 2 books, while the prompt asks for
    5 books and then for "just the author's names". The prompt text is kept
    unchanged here; decide which behaviour is intended and align the two.
    """
    template = """
        Give me 5 books similar to books written by {author}, just the author's names.
        Provided similar book summaries and reviews:
        {context}
    """
    return _recommend(author + " writes ", template, author=author)


def get_5_books_from_description(description: str) -> str:
    """Ask the LLM for 5 books matching a free-text ``description``."""
    template = """
        Give me 5 books that I would like given a description: {description} and
        book summaries and reviews similar to the description:
        {context}
    """
    return _recommend(description, template, description=description)

In [None]:
print(get_5_books_similar_book("The Lord of the Rings", "J.R.R. Tolkien"))

1. "Leaf by Niggle"
2. "The Name of the Wind" by Patrick Rothfuss
3. "The Silmarillion" by J.R.R. Tolkien
4. "War of the Ring" by Sam Witwer
5. "The Shadow of the Old Gods" by Glen Cook


In [None]:
print(get_author("Dune", "Frank Herbert"))

Pacific Overtures
Stephen Sondheim


In [None]:
print(get_5_books_from_genre("Science Fiction"))

1.   Dune by  Frank Herbert
2.   TheBIT alumnus by  Greg Bear
3.   The Three Jurors by  Dan Simmons
4.   The Martian Chronicles by  Michael Crichton
5.   Red Mars by  Stephen Brust.


In [None]:
print(get_2_books_similar_to_author("George Orwell"))

-   The Orwell Papers
-   Burmese Days
-   Down and Out in Paris and London
-   Crime and Punishment
-   Nineteen Eighty-Four


In [None]:
print(get_5_books_from_description("A disillusioned teenager struggles with adolescence."))

Once Upon a Time in America is a book that I would like given the description: A disillusioned teenager struggles with adolescence. It is written by American author Charles Bukowski. Its themes include addiction, depression, sex and the military. It is known for its conversational style, profanity and references to strippers, farm dogs and 'Chicks with huge, natural titties.' The book follows a disillusioned American teenager, Jake, who moves with his mother to rural Illinois after his father dies.
