# Environment

In [98]:
import os
import pandas as pd
from dotenv import load_dotenv
import streamlit as st

load_dotenv()  # take environment variables from .env.

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.chains import create_extraction_chain
from langchain.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, MWETokenizer
from nltk.stem import WordNetLemmatizer
import numpy as np
from gensim.models import Word2Vec
from data_processing import preprocess, save_metadata, pull_from_book_metadata, calculate_embeddings


BOOK = "The-Murder-of-Roger-Ackroyd"

# Get the data

In [99]:
with open(f"Data/books/{BOOK}.txt") as f:
    book = f.read().strip()

#get book parts
book_parts = []

#grab 8000 token sections of the book and add it to the list of book parts
for i in range(0, len(book), 10000):
    book_parts.append(book[i:i+10000:])

In [100]:
#Get the book for langchain
loader = TextLoader(file_path="Data/books/The-Murder-of-Roger-Ackroyd.txt")
data = loader.load()

In [101]:
# For the embeddings, we can use Word2Vec

tokenizer = MWETokenizer([
        ('hercule', 'poirot'),
        ('mr.', 'poirot'),
        ('dr.', 'shepherd'),
        ('dr.', 'james', 'shepherd'),
        ('james', 'shepherd'),
        ('roger', 'ackroyd')
    ], separator='')

word_dict = {
    'jamesshepherd': 'jamesshepherd', #technically not necessary if mapping to itself but here for clarity
    'dr.jamesshepherd': 'jamesshepherd',
    'shepherd': 'jamesshepherd',
    'james': 'jamesshepherd',
    'herculepoirot': 'poirot',
    'poirot': 'poirot',
    'mr.poirot': 'poirot',
    'roger': 'roger',
    'rogerackroyd': 'roger',
}

tokens = preprocess(f"./Data/books/{BOOK}.txt", tokenizer, word_dict, lem=True)

model = Word2Vec(tokens, vector_size=100, window=5, min_count=1, sg=0) #CBOW model

In [102]:
#Combine 

embeddings = OpenAIEmbeddings()

db = FAISS.from_documents(data, embeddings)

In [103]:
docs = db.similarity_search("Protagonist")
print(docs)

for match in docs[:2]:
    print(match.page_content)

﻿The Project Gutenberg eBook of The murder of Roger Ackroyd

    

This ebook is for the use of anyone anywhere in the United States and

most other parts of the world at no cost and with almost no restrictions

whatsoever. You may copy it, give it away or re-use it under the terms

of the Project Gutenberg License included with this ebook or online

at www.gutenberg.org. If you are not located in the United States,

you will have to check the laws of the country where you are located

before using this eBook.



Title: The murder of Roger Ackroyd



Author: Agatha Christie



Release date: October 2, 2022 [eBook #69087]

                Most recently updated: January 31, 2024



Language: English



Original publication: United States: Grosset & Dunlap, 1926



Credits: Emmanuel Ackerman, Robert Tonsing and the Online Distributed Proofreading Team at https://www.pgdp.net (This book was produced from images made available by the HathiTrust Digital Library.)





*** START OF THE PROJEC

# OpenAI

In [104]:
schema = {
    "properties": {
        "protagonist": {"type": "string"},
        "antagonist": {"type": "string"},
        "murder weapon": {"type": "string"},
        "victim": {"type": "string"}
    }
}

# Run Chain
llm = ChatOpenAI()
chain = create_extraction_chain(schema, llm)


prompt_template = PromptTemplate(
    input_variables=["embedding"],
    template="Using the embedding: {embedding}, generate a response."
)

# Initialize the LLM and the chain
chain = LLMChain(llm=llm, prompt=prompt_template)


  chain = LLMChain(llm=llm, prompt=prompt_template)


In [105]:
from gensim.models import KeyedVectors

# Create a word2vec model
# convert model above to binary model

# For the embeddings, we can use Word2Vec

tokenizer = MWETokenizer([
        ('hercule', 'poirot'),
        ('mr.', 'poirot'),
        ('dr.', 'shepherd'),
        ('dr.', 'james', 'shepherd'),
        ('james', 'shepherd'),
        ('roger', 'ackroyd')
    ], separator='')

word_dict = {
    'jamesshepherd': 'jamesshepherd', #technically not necessary if mapping to itself but here for clarity
    'dr.jamesshepherd': 'jamesshepherd',
    'shepherd': 'jamesshepherd',
    'james': 'jamesshepherd',
    'herculepoirot': 'poirot',
    'poirot': 'poirot',
    'mr.poirot': 'poirot',
    'roger': 'roger',
    'rogerackroyd': 'roger',
}

tokens = preprocess(f"./Data/books/{BOOK}.txt", tokenizer, word_dict, lem=True)

model = Word2Vec(tokens, vector_size=100, window=5, min_count=1, sg=0) #CBOW model

In [106]:
# Convert book parts to embeddings
embeddings = []
for part in book_parts:
    tokens = part.split()  # Tokenize the text
    part_embedding = [
        model.wv[token] for token in tokens if token in model.wv
    ]
    # Average the embeddings for the part
    if part_embedding:
        avg_embedding = sum(part_embedding) / len(part_embedding)
        embeddings.append(avg_embedding)
    else:
        embeddings.append(None)  # Handle cases with no valid tokens


In [107]:
class EmbeddingChain:
    def __init__(self, chain, embedding_model):
        self.chain = chain
        self.embedding_model = embedding_model

    def invoke_with_embedding(self, embedding):
        # Ensure the chain accepts embedding input
        input_data = {"embedding": embedding.tolist()}  # Example format
        return self.chain.invoke(input_data)

In [108]:
responses = []
for embedding in embeddings:
    if embedding is None:
        responses.append("No valid tokens in this part.")
        continue

    # Use the embedding with the chain
    response = chain.invoke_with_embedding(embedding)
    responses.append(response['text'])
    print(response['text'])


AttributeError: 'LLMChain' object has no attribute 'invoke_with_embedding'

# Single

In [None]:
for book_part in book_parts:
    response = chain.invoke(book_part)['text']
    print(response)

[{'protagonist': 'Dr. Sheppard', 'antagonist': 'Caroline', 'victim': 'Mrs. Ferrars', 'murder weapon': 'veronal'}]
[{'protagonist': 'Caroline', 'antagonist': 'Mrs. Ferrars', 'victim': 'Mrs. Ferrars'}]
[{'protagonist': 'Dr. Sheppard', 'antagonist': 'Miss Russell', 'victim': 'Mrs. Ferrars'}]
[{'protagonist': 'Caroline', 'antagonist': 'Mr. Porrott', 'victim': 'vegetable marrow'}]
[{'protagonist': 'Ralph Paton', 'antagonist': 'stepfather', 'victim': 'old man'}]
[{'protagonist': 'Flora', 'antagonist': 'Mrs. Ackroyd'}]
[{'protagonist': 'Roger Ackroyd', 'victim': 'Mrs. Ferrars', 'murder weapon': 'Poison'}]
[{'protagonist': 'Dr. Sheppard', 'antagonist': 'Unknown', 'murder weapon': 'dagger', 'victim': 'Roger Ackroyd'}]
[{'protagonist': 'Miss Flora', 'antagonist': 'unknown', 'victim': 'Mr. Ackroyd'}]
[{'protagonist': 'Flora', 'antagonist': 'Parker', 'murder weapon': 'Tunisian Dagger', 'victim': 'Roger Ackroyd'}]
[{'protagonist': 'Major Blunt', 'murder weapon': 'Tunisian dagger', 'victim': 'Uncle'

KeyboardInterrupt: 

In [None]:
import numpy as np

# Function to get embedding for a word
def get_embedding(word, model):
    if word in model.wv:
        return model.wv[word]
    else:
        # Handle OOV words: for example, return a zero vector or a random vector
        print(f"Word '{word}' not in vocabulary. Assigning zero vector.")
        return np.zeros(model.vector_size)

# Map schema words to their embeddings
schema_embeddings = {}
for word in schema:
    embedding = get_embedding(word, model)
    schema_embeddings[word] = embedding

# Display the results
for word, embedding in schema_embeddings.items():
    print(f"Word: {word}, Embedding Shape: {embedding.shape}")

#We have the embedding for protagonist. Ask it who the protagonist is

# Calculate the similarity between the schema embeddings and the document embeddings

def calculate_similarity(schema_embeddings, document_embeddings):

    # Calculate the similarity between the schema embeddings and the document embeddings
    similarities = {}
    for schema_word, schema_embedding in schema_embeddings.items():
        for document_word, document_embedding in document_embeddings.items():
            similarity = np.dot(schema_embedding, document_embedding) / (np.linalg.norm(schema_embedding) * np.linalg.norm(document_embedding))
            similarities[(schema_word, document_word)] = similarity

    return similarities

sims = calculate_similarity(schema_embeddings, model.wv)

# Display the results

for sim in sims:
    print(f"Similarity between '{sim[0]}' and '{sim[1]}': {sims[sim]}")


Word: victim, Embedding Shape: (100,)


AttributeError: 'KeyedVectors' object has no attribute 'items'

In [None]:
from gensim.models import Word2Vec
from langchain.vectorstores import FAISS

# Define schema words
schema = ["poirot", "jamesshepherd", "roger"]

# Assume a trained Word2Vec model
# model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Embedding class
class GensimWord2VecEmbeddings:
    def __init__(self, model):
        self.model = model.wv  # Use KeyedVectors from Word2Vec

    def embed_query(self, text):
        if text in self.model:
            return self.model[text]
        else:
            raise ValueError(f"Word '{text}' not in vocabulary")

    def embed_documents(self, texts):
        embeddings = []
        for text in texts:
            if text in self.model:
                embeddings.append(self.model[text])
            else:
                raise ValueError(f"Word '{text}' not in vocabulary")
        return embeddings

    def __call__(self, text):
        return self.embed_query(text)

# Instantiate embeddings object
gensim_embeddings = GensimWord2VecEmbeddings(model)

# Add schema words to the vector store
faiss_store = FAISS.from_texts(
    texts=schema,
    embedding=gensim_embeddings  # Pass the callable object
)

# Query the vector store
query = "protagonist"
try:
    results = faiss_store.similarity_search(query, k=1)  # Find the closest match
    for result in results:
        print(f"Most similar to '{query}': {result}")
except ValueError as e:
    print(e)  # Handle cases where the query word is not in the vocabulary


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


Word 'protagonist' not in vocabulary


In [None]:
from langchain_community.document_loaders import TextLoader

file_path = "Data/books/The-Mysterious-Affair-at-Styles.txt"

# with open(file_path) as f:
#     text = f.read()

#preprocess the text

# from data_processing import preprocess

# text = preprocess(text)

#save the text


#load only chapter XIII

# text = text.split("CHAPTER XIII")[1]

# text = text.split("CHAPTER XIV")[0]

# with open("Data/books/The-Mysterious-Affair-at-Styles-Chapter-XIII.txt", "w") as f:
#     f.write(text)

loader = TextLoader(file_path)

docs = loader.load()

print(len(docs))

1


In [33]:
print(docs[0].page_content[0:100])
print(docs[0].metadata)

. POIROT EXPLAINS









CHAPTER I.

I GO TO STYLES





The intense interest aroused in the publi
{'source': 'Data/books/The-Mysterious-Affair-at-Styles-Chapter-XIII.txt'}


In [34]:
import getpass
import os

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o")

In [35]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = InMemoryVectorStore.from_documents(
    documents=splits, embedding=OpenAIEmbeddings()
)

retriever = vectorstore.as_retriever()

In [37]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, give it your best guess. "
    "Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

results = rag_chain.invoke({"input": "Who is the murderer?"})

results

{'input': 'Who is the murderer?',
 'context': [Document(id='1579a2df-dc2c-4ea2-916a-3ea01e516c13', metadata={'source': 'Data/books/The-Mysterious-Affair-at-Styles-Chapter-XIII.txt'}, page_content='been murdered, you felt certain that you would know by instinct who the\n\ncriminal was, even if you were quite unable to prove it?”\n\n\n\n“Yes, I remember saying that. I believe it too. I suppose you think it\n\nnonsense?”\n\n\n\n“Not at all.”\n\n\n\n“And yet you will pay no attention to my instinct against Alfred\n\nInglethorp.”\n\n\n\n“No,” said Poirot curtly. “Because your instinct is not against Mr.\n\nInglethorp.”\n\n\n\n“What?”\n\n\n\n“No. You wish to believe he committed the crime. You believe him\n\ncapable of committing it. But your instinct tells you he did not commit\n\nit. It tells you more—shall I go on?”\n\n\n\nShe was staring at him, fascinated, and made a slight affirmative\n\nmovement of the hand.\n\n\n\n“Shall I tell you why you have been so vehement against Mr. Inglethorp