In [1]:
import os

os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'

In [2]:
from langchain.document_loaders import TextLoader

loader = TextLoader('textbook.md')
documents = loader.load()

In [3]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

Created a chunk of size 601, which is longer than the specified 500
Created a chunk of size 1017, which is longer than the specified 500


In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings_model = HuggingFaceEmbeddings()

from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions

client = weaviate.Client(
  embedded_options = EmbeddedOptions()
)

vectorstore = Weaviate.from_documents(
  client=client,
  documents=chunks,
  embedding=embeddings_model,
  by_text=False
)

  from .autonotebook import tqdm as notebook_tqdm
            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


Started /Users/nitin/.cache/weaviate-embedded: process ID 1281


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-03-25T10:30:14-07:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-03-25T10:30:14-07:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-03-25T10:30:14-07:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50060","time":"2024-03-25T10:30:15-07:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-03-25T10:30:15-07:00"}
{"level":"info","msg":"Created shard langchain_2fe83ac56c4748b8b548bb8ccf95f52a_tqLYuIwX9U8W in 15.314713ms","time":"2024-03-25T10:30:15-07:00"}
{"action

In [5]:
retriever = vectorstore.as_retriever()

In [17]:
from langchain.prompts import ChatPromptTemplate

template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)

In [18]:
from langchain_community.llms import HuggingFaceHub

llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    }
)

In [19]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

In [15]:
questions = []

with open("questions.txt") as f:
    for line in f:
        if line.strip():
            questions.append(line.strip())

In [20]:
answers = []

for question in questions:
    answer = chain.invoke(question)
    answers.append(answer)

In [28]:
processed_answers = []

for answer in answers:
    words = answer.split()
    for i, word in enumerate(words):
        if word == 'Answer:':
            processed_answers.append(" ".join(words[i+1:]))

In [30]:
with open("answers.txt", "w") as f:
    for answer in processed_answers:
        f.write(answer)
        f.write("\n")
        f.write("\n")

In [70]:
predicted_answer = processed_answers[0]

real_answer = """
Autoencoders are one of the deep learning types used for unsupervised learning. There are key layers of autoencoders, which are the input layer, encoder, bottleneck hidden layer, decoder, and output.

The three layers of the autoencoder are:-
1) Encoder - Compresses the input data to an encoded representation which is typically much smaller than the input data.
2) Latent Space Representation/ Bottleneck/ Code - Compact summary of the input containing the most important features
3) Decoder - Decompresses the knowledge representation and reconstructs the data back from its encoded form.
Then a loss function is used at the top to compare the input and output images.
NOTE- It's a requirement that the dimensionality of the input and output be the same. Everything in the middle can be played with.

Autoencoders have a wide variety of usage in the real world. The following are some of the popular ones:

1. Transformers and Big Bird (Autoencoders is one of these components in both algorithms): Text Summarizer, Text Generator
2. Image compression
3. Nonlinear version of PCA
"""

In [74]:
real_answers = []

with open('textbook.md', 'r') as f:
    text = f.read()

for t in text.split('---'):
    real_answers.append(t)

In [77]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

similarities = []

for i in range(len(real_answers)):
    real_embedding = embeddings_model.embed_query(real_answers[i])
    predicted_embedding = embeddings_model.embed_query(predicted_answer)
    similarity = cosine_similarity([predicted_embedding], [real_embedding])[0][0]
    similarities.append(similarity)

similarities = np.asarray(similarities)

In [78]:
print('Average similarity:', np.mean(similarities))
print('Max similarity:', np.max(similarities))
print('Min similarity:', np.min(similarities))
print('Standard deviation of similarities:', np.std(similarities))

Average similarity: 0.3212592276825651
Max similarity: 0.7797517213497003
Min similarity: 0.09688542536650491
Standard deviation of similarities: 0.1781878540278423
