In [2]:
!python3 -m nltk.downloader punkt
!python3 -m nltk.downloader stopwords
!python3 -m nltk.downloader wordnet
!python3 -m nltk.downloader omw-1.4
!python3 -m nltk.downloader averaged_perceptron_tagger
!pip install sentence-transformers

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m72.0 MB/s[0m eta 

In [3]:
# Clone the FAISS source repository into ./faiss.
# NOTE(review): no later cell visible here reads from this checkout (the faiss
# module is installed from the faiss-gpu wheel) — confirm the clone is needed.
!git clone https://github.com/facebookresearch/faiss.git

Cloning into 'faiss'...
remote: Enumerating objects: 43446, done.[K
remote: Counting objects: 100% (3941/3941), done.[K
remote: Compressing objects: 100% (431/431), done.[K
remote: Total 43446 (delta 3578), reused 3666 (delta 3503), pack-reused 39505[K
Receiving objects: 100% (43446/43446), 132.17 MiB | 13.38 MiB/s, done.
Resolving deltas: 100% (39574/39574), done.


In [4]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [5]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models.callbacks import CallbackAny2Vec
from sklearn.metrics.pairwise import cosine_similarity
import re
import ast


import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import spacy

# Read the preprocessed table, drop every row whose 'desc' is the literal
# string 'none', and renumber the index from 0 (later cells look rows up by
# these integer labels).
data = (
    pd.read_csv('processed_data.csv')
    .loc[lambda frame: frame['desc'] != 'none']
    .reset_index(drop=True)
)

In [6]:
def tokenize(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``.

    No stop-word filtering is applied; the token list is returned exactly
    as the tokenizer produced it.
    """
    return word_tokenize(text)

In [7]:
# Parse each 'desc_tokens' string as a Python literal (expected to be a list of
# token strings), then join every token list into one space-separated string.
tokenized_documents = data['desc_tokens'].map(ast.literal_eval)
documents = list(map(' '.join, tokenized_documents))

In [8]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained sentence-embedding checkpoint to load.
MODEL_NAME = 'msmarco-distilbert-base-dot-prod-v3'
model = SentenceTransformer(MODEL_NAME)

Downloading (…)b6d67/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (…)13d78b6d67/README.md:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading (…)d78b6d67/config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)b6d67/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)13d78b6d67/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)78b6d67/modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

In [9]:
# Encode every document string with the pretrained model (inference only —
# nothing is trained in this cell).
encoded_data = model.encode(documents)

In [19]:
import faiss

# FAISS works on float32 vectors.
encoded_data = np.asarray(encoded_data, dtype='float32')
# Scale every row to unit length: against unit-norm rows, ranking neighbours by
# L2 distance gives the same order as ranking them by cosine similarity.
encoded_data = encoded_data / np.linalg.norm(encoded_data, axis=1, keepdims=True)

# Exact (flat) L2 index wrapped in an ID map so each vector carries an explicit id.
index = faiss.IndexIDMap(faiss.IndexFlatL2(encoded_data.shape[1]))
# Ids are the row positions 0..n-1. FAISS requires int64 ids, so the dtype is
# stated explicitly instead of relying on the platform's default integer width.
index.add_with_ids(encoded_data, np.arange(len(encoded_data), dtype='int64'))
# Persist the index to disk.
faiss.write_index(index, 'books_euc.index')



In [30]:
import textwrap

query = 'A fantacy about child who goes to a secret magicians school'
query_vector = model.encode([query])
# index.search returns a (distances, ids) pair with one row per query vector.
top_k = index.search(query_vector, 10)
neighbor_ids = top_k[1][0]

print(f"The Query: {query}","\n")
print("Related Books:")
for i in neighbor_ids:
  if i < 0:
    # FAISS pads the id row with -1 when fewer than k neighbours exist;
    # such an id matches no row of `data`.
    continue
  wrapped_paragraph = textwrap.fill(data.loc[i, 'desc'], width=120)

  print(data.loc[i, 'book_title'],"\n")
  print(wrapped_paragraph, "\n\n\n")

The Query: ring 

Related Books:
The Lord of the Rings' Metaphors 

 This book is about a rare achievement in history, that of forecasting the present. While JRR Tolkiens epic saga, The
Lord of the Rings, unfolds in an imaginary world set in ancient time, he speaks in metaphor about our time. It is as if
he explored our present world fifty years in advance on the basis of established trends in our responses to universal
principles. In his explorations he sets before us critical choices and asks us to forecast our future according to the
logic of the choices that we are willing to make. -- The exploration in this book of The Lord of the Rings metaphors
assumes that the reader is somewhat familiar with the storyline of Tolkiens great tale, The Lord of the Rings. The book
is designed to enrich this story by bringing into focus surprising elements of it, rather than to retell the story
itself. 



The Lord of the Rings (Literature Guide) 

 This engaging guide is packed with teaching ideas

In [32]:
query = 'a Journey of two young people through an ancient world to destroy a masterious ring'
query_vector = model.encode([query])
# index.search returns a (distances, ids) pair with one row per query vector.
top_k = index.search(query_vector, 10)
neighbor_ids = top_k[1][0]

print(f"The Query: {query}","\n")
print("Related Books:")
for i in neighbor_ids:
  if i < 0:
    # FAISS pads the id row with -1 when fewer than k neighbours exist;
    # such an id matches no row of `data`.
    continue
  wrapped_paragraph = textwrap.fill(data.loc[i, 'desc'], width=120)

  print(data.loc[i, 'book_title'],"\n")
  print(wrapped_paragraph, "\n\n\n")

The Query: a Journey of two young people through an ancient world to destroy a masterious ring 

Related Books:
The Lord Of The Rings Tarot Book 

 The Lord of the Rings Tarot unites two great traditions: the spiritual, mystical tradition of the tarot, and the world
of folklore and fairy tales that is most delightfully depicted in the works of J.R.R. Tolkien. The added factor of a
card game summons you to travel through Middle-Earth to reach Mount Doom and destroy the One Ring! 



Castaways of the Flying Dutchman 

 A boy and dog trapped aboard the Flying Dutchman, are sent off on an eternal journey by an avenging angel, roaming the
earth throughout the centuries in search of those in need. Their travels lead them to Chapelvale, a sleepy nineteenth
century village whose existence is at stake. Only by discovering the buried secrets and solving the dust-laden riddles
of the ancient village can it be saved.  



Faith Journey Through Fantasy Lands: A Christian Dialogue with Harry Potter,

In [12]:
# Display the repr of the `index` object (inspection only).
# NOTE(review): this cell's execution count (12) is out of order with the cells
# above — confirm it still belongs in the notebook.
index