In [None]:
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import time


In [None]:
import requests

# Download the plain-text edition of the book from Project Gutenberg.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page be saved and embedded as if it were the book.
response = requests.get(
    "https://www.gutenberg.org/cache/epub/64317/pg64317.txt",
    timeout=30,
)
response.raise_for_status()

In [None]:
# Decode the HTTP body and discard its first five characters, then show the
# length of what remains.
# NOTE(review): why exactly five characters are dropped is not visible here — confirm.
book_complete_text = response.text[5:]
len(book_complete_text)

In [None]:
file_path = "./book.txt"

# Persist the downloaded text as UTF-8 so later cells can reload it from disk.
with open(file_path, mode="w", encoding="utf-8") as book_file:
    book_file.write(book_complete_text)

In [None]:
# Reload the saved book and turn every tab into a single space.
with open(file_path, mode="r", encoding="utf-8") as book_file:
    text = book_file.read().replace('\t', ' ')

In [None]:
# Split on paragraph breaks first, then line breaks, then spaces, and finally
# on individual characters. A "\t" separator would be dead weight here because
# tabs are replaced with spaces before splitting; " " and "" are the fallbacks
# that let a single over-long line still be broken down to chunk_size instead
# of being emitted as one oversized chunk.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=5000,
    chunk_overlap=300,
)


In [None]:
# Run the splitter over the whole book; the result is a list with one entry per chunk.
documents = text_splitter.create_documents(texts=[text])
num_documents = len(documents)

print(f"Now our book is split up into {num_documents} documents")

In [None]:
# Chunk count again (the same value the previous cell stored in num_documents).
len(documents)

In [None]:
# Directory later passed to Chroma as persist_directory.
chroma_path = "./chroma"
# TODO: delete the collection if it already exists — no deletion is performed yet.
# NOTE(review): without it, re-running the notebook presumably adds the same
# chunks to the persisted store again — confirm.

In [None]:

#embeddings = OllamaEmbeddings(model="mistral:7b", num_gpu=1, show_progress=True)

In [None]:
# Embedding client for the Ollama model "qwen:0.5b-text", with progress output enabled.
embeddings2 = OllamaEmbeddings(
    model="qwen:0.5b-text",
    show_progress=True,
)

In [None]:
import datetime

In [None]:
# Display the current wall-clock time.
datetime.datetime.now()

In [None]:
# Build the client before starting the clock so only the embedding call is timed.
embeddings2 = OllamaEmbeddings(model="qwen:0.5b-text", show_progress=True)

time1 = datetime.datetime.now()
# embed_documents() expects plain strings. Passing the Document objects
# themselves would embed their string representation (page_content='...'
# metadata={...}) rather than the actual text of each chunk.
emb2 = embeddings2.embed_documents(texts=[doc.page_content for doc in documents])
time2 = datetime.datetime.now()

In [None]:
# Elapsed embedding time in seconds. total_seconds() is the whole duration;
# the .seconds attribute is only the seconds component of the timedelta
# (it ignores whole days and the sub-second part).
(time2 - time1).total_seconds()

In [None]:
# Length of the first embedding vector.
len(emb2[0])

In [None]:
# Peek at the first few components only; displaying the whole vector floods
# the cell output without telling the reader anything more.
emb2[0][:10]

In [None]:
from sklearn.cluster import KMeans
import numpy as np

In [None]:
# Number of K-Means clusters; later cells pick one representative chunk per cluster.
n_clusters = 8

In [None]:
### Mistral 7B

In [None]:
# Cluster the chunk embeddings with a fixed random_state.
# NOTE(review): emb2 was produced with "qwen:0.5b-text", although the heading
# above this cell says "Mistral 7B" — confirm which model is intended.
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(emb2)

In [None]:
# Cluster label assigned to each embedding, in the order the embeddings were passed to fit().
kmeans.labels_

In [None]:
# t-SNE (t-distributed Stochastic Neighbor Embedding)

import warnings
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE

# Silence FutureWarnings from here on.
simplefilter(action='ignore', category=FutureWarning)

# Project the embeddings down to two dimensions.
tsne = TSNE(n_components=2, random_state=42)
embedding_matrix = np.array(emb2)
reduced_data_tsne = tsne.fit_transform(embedding_matrix)

# Scatter the 2-D points, coloured by their K-Means cluster label.
fig, ax = plt.subplots()
ax.scatter(reduced_data_tsne[:, 0], reduced_data_tsne[:, 1], c=kmeans.labels_)
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_title('Book Embeddings Clustered')
plt.show()

In [None]:
# For every cluster, find the chunk whose embedding lies nearest (Euclidean
# distance) to that cluster's centre.
embedding_matrix = np.asarray(emb2)

closest_indices = [
    np.argmin(
        np.linalg.norm(embedding_matrix - kmeans.cluster_centers_[cluster_id], axis=1)
    )
    for cluster_id in range(n_clusters)
]

# Put the representatives back into book order.
selected_indices = sorted(closest_indices)

selected_docs = [documents[position] for position in selected_indices]



In [None]:
# Positions (within `documents`) of the chunks chosen as cluster representatives.
selected_indices

In [None]:
from langchain_community.llms import Ollama
# LLM client used for the summarisation calls below (Ollama model "qwen:0.5b-text").
llm = Ollama(model="qwen:0.5b-text")

In [None]:
# Take the chunk at index 9 and flatten its newlines into spaces.
sample_chunk = documents[9]
final_text = sample_chunk.page_content.replace("\n", " ")

In [None]:
# Instruction line, a newline, then the text to summarise.
prompt = f"Summarize this text in 50 words \n{final_text}"

In [None]:
# Prompt size in characters.
len(prompt)

In [None]:
# Time a single summarisation call.
time1 = datetime.datetime.now()
# NOTE: this rebinds `response`, which earlier held the HTTP response of the
# book download, so the cell that reads response.text cannot be re-run after
# this one without re-running the download first.
response = llm.invoke(prompt)
time2 = datetime.datetime.now()

In [None]:
# Elapsed time of the summarisation call in seconds. total_seconds() is the
# whole duration; .seconds is only the seconds component of the timedelta.
(time2 - time1).total_seconds()

In [None]:
# Length of the model's reply.
len(response)

In [None]:
time1 = datetime.datetime.now()

# Summarise each representative chunk on its own and collect the replies.
chunk_summaries = []
for x in selected_indices:
    txt = documents[x].page_content.replace("\n", " ")
    prompt = f"Summarize this text in 50 words \n{txt}"
    response = llm.invoke(prompt)
    chunk_summaries.append(response)

# Stitch the replies together; every one (including the first) is preceded by a space.
final_text = "".join(" " + summary for summary in chunk_summaries)

time2 = datetime.datetime.now()

In [None]:
# Combined length of the concatenated per-chunk summaries.
len(final_text)

In [None]:
# Elapsed time of the summarisation loop in minutes. total_seconds() is the
# whole duration; .seconds is only the seconds component of the timedelta.
(time2 - time1).total_seconds() / 60

In [None]:

# Re-creates the LLM client with the same model name as the earlier `llm` assignment.
llm = Ollama(model="qwen:0.5b-text")


In [None]:
# Ask for a summary of the concatenated per-chunk summaries.
prompt = f"Summarize this text in 50 words \n{final_text}"

In [None]:
# Time the final summarisation call; `response` ends up holding the model's reply.
time1 = datetime.datetime.now()
response = llm.invoke(prompt)
time2 = datetime.datetime.now()

In [None]:
# Elapsed time of the final summarisation call in seconds. total_seconds() is
# the whole duration; .seconds is only the seconds component of the timedelta.
(time2 - time1).total_seconds()

In [None]:
### Orca Mini3B

In [None]:
# NOTE(review): this fits on the same `emb2`, with the same n_clusters and
# random_state, as the earlier `kmeans`, so it presumably reproduces that
# clustering. The heading above says "Orca Mini3B" — confirm whether embeddings
# from a different model were meant to be used here.
kmeans2 = KMeans(n_clusters=n_clusters, random_state=42).fit(emb2)

In [None]:
# For each kmeans2 cluster, locate the chunk whose embedding is nearest
# (Euclidean distance) to the cluster centre.
embedding_matrix2 = np.asarray(emb2)


def _nearest_chunk(center):
    """Return the position of the embedding closest to `center`."""
    return np.argmin(np.linalg.norm(embedding_matrix2 - center, axis=1))


closest_indices2 = [
    _nearest_chunk(kmeans2.cluster_centers_[cluster_id])
    for cluster_id in range(n_clusters)
]

# Back into book order.
selected_indices2 = sorted(closest_indices2)

selected_docs2 = [documents[position] for position in selected_indices2]

In [None]:
# Positions (within `documents`) of the representatives chosen from kmeans2.
selected_indices2

In [None]:
# Index every chunk in a persisted Chroma collection configured for cosine space.
# `embeddings2` is the only embedding client this notebook actually creates;
# the name `embeddings` appears only in a commented-out line, so passing it
# here would raise NameError on a fresh kernel.
db = Chroma.from_documents(
    documents=documents,
    embedding=embeddings2,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory=chroma_path,
)

In [None]:
# Free-text query for the vector store.
# NOTE(review): this is an airline news article, while the store was built from
# the downloaded book — looks like a leftover from another experiment; confirm
# the intended query.
query = """Lufthansa flies back to profit

German airline Lufthansa has returned to profit in 2004 after posting huge losses in 2003.

In a preliminary report, the airline announced net profits of 400m euros ($527.61m; £274.73m), compared with a loss of 984m euros in 2003. Operating profits were at 380m euros, ten times more than in 2003. Lufthansa was hit in 2003 by tough competition and a dip in demand following the Iraq war and the killer SARS virus. It was also hit by troubles at its US catering business. Last year, Lufthansa showed signs of recovery even as some European and US airlines were teetering on the brink of bankruptcy. The board of Lufthansa has recommended paying a 2004 dividend of 0.30 euros per share. In 2003, shareholders did not get a dividend. The company said that it will give all the details of its 2004 results on 23 March.
"""
# Retrieve the five best matches together with their scores.
docs_with_score = db.similarity_search_with_score(query, k=5)

In [None]:
# Print every hit: score, chunk text, then the 'classifier' metadata entry.
# NOTE(review): the chunks were created without metadata, so the last line
# presumably prints "Class None" — confirm.
rule = "-" * 80
for doc, score in docs_with_score:
    print(rule)
    print("Score: ", score)
    print(doc.page_content)
    print(rule)
    print(f"Class {doc.metadata.get('classifier')}")
    print(rule)

In [None]:
# Show the embedding object attached to the vector store.
db.embeddings

In [None]:
# Same query, this time through the relevance-score API; overwrites the
# previous `docs_with_score` results.
docs_with_score = db.similarity_search_with_relevance_scores(query, k=5)

In [None]:
# Print every hit from the relevance-score search: score, chunk text, then the
# 'classifier' metadata entry (None when the key is absent).
divider = "-" * 80
for doc, score in docs_with_score:
    print(divider)
    print("Score: ", score)
    print(doc.page_content)
    print(divider)
    print(f"Class {doc.metadata.get('classifier')}")
    print(divider)