In [15]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext, Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.postprocessor.cohere_rerank import CohereRerank
from dotenv import load_dotenv
import os
import pypdf
import chromadb

In [16]:
# Load variables from a .env file into the process environment.
# NOTE(review): later cells construct CohereRerank without an explicit API key, so the
# key is presumably expected to come from this environment — confirm the .env provides it.
load_dotenv()

True

In [17]:
def load_documents_with_metadata_included(data_path:str) -> list:
    """Load every PDF directly inside ``data_path`` as one ``Document`` per page.

    Each document carries the metadata keys ``file_name``, ``page_num``
    (0-based index of the page within its PDF), ``doc_type`` and ``course``.
    ``doc_type`` is ``"lecture"`` when the file name contains "lecture"
    (case-insensitive) and ``"textbook"`` otherwise.

    Args:
        data_path: Directory to scan. Sub-directories are not traversed.

    Returns:
        A list of ``Document`` objects ordered by file name, then page index.
        Pages that yield no extractable text are omitted.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist (from ``os.listdir``).
    """
    all_docs = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(data_path)):
        lowered = filename.lower()
        # Compare on the lower-cased name so ".PDF" files are not silently skipped.
        if not lowered.endswith('.pdf'):
            continue
        file_path = os.path.join(data_path, filename)
        reader = pypdf.PdfReader(file_path)

        doc_type = "lecture" if "lecture" in lowered else "textbook"

        for page_num, page in enumerate(reader.pages):
            # Scanned/blank pages can yield no text; an empty Document adds nothing
            # useful to the index, so skip it while keeping the original page index.
            text = page.extract_text() or ""
            if not text.strip():
                continue
            doc = Document(text = text,
                           metadata = {
                               "file_name" : filename,
                               "page_num" : page_num,
                               "doc_type" : doc_type,
                               "course" : "Machine Learning"
                           })
            all_docs.append(doc)

    return all_docs

In [18]:
# Read every PDF under ../data (relative to the notebook's working directory) into
# per-page Document objects tagged with file name, page index, doc type and course.
documents = load_documents_with_metadata_included("../data")

Ignoring wrong pointing object 173 0 (offset 0)
Ignoring wrong pointing object 376 0 (offset 0)
Ignoring wrong pointing object 393 0 (offset 0)
Ignoring wrong pointing object 425 0 (offset 0)
Ignoring wrong pointing object 427 0 (offset 0)
Ignoring wrong pointing object 434 0 (offset 0)
Ignoring wrong pointing object 652 0 (offset 0)
Ignoring wrong pointing object 678 0 (offset 0)
Ignoring wrong pointing object 781 0 (offset 0)
Ignoring wrong pointing object 837 0 (offset 0)
Ignoring wrong pointing object 840 0 (offset 0)
Ignoring wrong pointing object 843 0 (offset 0)
Ignoring wrong pointing object 854 0 (offset 0)
Ignoring wrong pointing object 885 0 (offset 0)
Ignoring wrong pointing object 929 0 (offset 0)
Ignoring wrong pointing object 1050 0 (offset 0)
Ignoring wrong pointing object 1092 0 (offset 0)
Ignoring wrong pointing object 1125 0 (offset 0)
Ignoring wrong pointing object 1138 0 (offset 0)
Ignoring wrong pointing object 1140 0 (offset 0)
Ignoring wrong pointing object 1149

In [19]:
#required_exts = ['.pdf', '.tex']
#reader = SimpleDirectoryReader(input_dir = "../data", required_exts = required_exts)

In [20]:
#docs = reader.load_data()

In [21]:
#print(f"Loaded {len(docs)} documents")

In [23]:
# See what one chunk looks like
#print(f"Chunk length: {len(docs[0].text)} characters")
#print(f"Preview: {docs[0].text[:500]}")

In [24]:
# Chunker used to split each page into overlapping pieces before embedding.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in tokens — confirm.
splitter = SentenceSplitter(chunk_size = 512, chunk_overlap= 100)

We replace the default OpenAI embedding model with a Sentence-Transformers model from Hugging Face, because it is open-source and free to use.

In [25]:
# Set the global embedding model to a Hugging Face sentence-transformer.
# NOTE(review): the model id is conventionally spelled "all-MiniLM-L6-v2" (lowercase "v");
# this cell's load report shows the current spelling resolves, but confirm it is intended.
Settings.embed_model = HuggingFaceEmbedding(model_name = 'sentence-transformers/all-MiniLM-L6-V2')
# Register the splitter globally; presumably picked up when documents are indexed — confirm.
Settings.node_parser = splitter

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1065.12it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-V2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [27]:
# Open (or create) the on-disk Chroma database stored under ./chroma.
chroma_client = chromadb.PersistentClient(path = './chroma')
# The client is persistent, so the collection survives kernel restarts and
# create_collection would raise on every run after the first. get_or_create_collection
# is idempotent: it returns the existing collection or creates it when absent.
# Caveat: an existing collection keeps its stored vectors, so adding the same
# documents to it again will append duplicates.
chroma_collection = chroma_client.get_or_create_collection('ml_textbook_col')

In [28]:
# Wrap the Chroma collection in a LlamaIndex vector-store adapter.
vector_store = ChromaVectorStore(chroma_collection= chroma_collection,)
# Storage context that points index construction at this vector store.
storage_context = StorageContext.from_defaults(vector_store = vector_store)

In [29]:
# Embedding and inserting is expensive and appends to the persistent collection,
# so re-running this cell would duplicate every chunk. Only build from the raw
# documents when the collection is empty; otherwise attach to the stored vectors.
# To force a rebuild (e.g. after the source PDFs change), delete the collection
# or the ./chroma directory first.
if chroma_collection.count() > 0:
    index = VectorStoreIndex.from_vector_store(vector_store = vector_store)
else:
    index = VectorStoreIndex.from_documents(documents, storage_context = storage_context)

In [63]:
# Query pipeline: request 10 candidates from the index (similarity_top_k), pass them
# through a Cohere reranker configured with top_n = 5, and synthesise the answer with
# the 'tree_summarize' response mode. verbose = True emits progress text such as the
# "text chunks after repacking" line in this cell's output.
# NOTE(review): CohereRerank receives no explicit API key here, so it presumably reads
# one from the environment — confirm.
query_engine = index.as_query_engine(response_mode = 'tree_summarize', verbose = True,  similarity_top_k = 10, 
                                     node_postprocessors = [CohereRerank(top_n = 5)])

# Few-shot style prompt: one good and one bad example steer the question style.
# NOTE(review): "Lecture 5" is only mentioned in the prompt text; no metadata filter is
# passed in this cell, so retrieval is not restricted to that file by this code.
response = query_engine.query(
    """Based on ST443 Lecture 5 content, generate 3 theoretical questions.
    
    Example of a GOOD question style:
    "Explain why ridge regression shrinks coefficients but never sets them exactly to zero, 
    while lasso can produce exactly zero coefficients. What is the geometric interpretation?"
    
    Example of a BAD question (don't do this):
    "How can machine learning be applied to data?"
    
    Now generate 3 questions in the GOOD style about regularisation topics from Lecture 5.
    """
)

1 text chunks after repacking


In [64]:
# Show the generated answer followed by the source chunks it was built from.
preview_chars = 500  # maximum characters of each source chunk to display

print(f"Answer : {response.response}")
print("\nSources: ")
for i, node in enumerate(response.source_nodes, 1):
    # A source node's score may be None, which the ':.3f' format cannot handle.
    score = "n/a" if node.score is None else f"{node.score:.3f}"
    text = node.text
    # Append the ellipsis only when the text is actually shortened.
    preview = text if len(text) <= preview_chars else text[:preview_chars] + "..."
    print(f"\n{i}. Score: {score}")
    print(f"   File: {node.metadata.get('file_name', 'Unknown')}")
    print(f"   Text: {preview}")

Answer : 1. How does ridge regression differ from lasso regression in terms of the way they perform shrinkage on the coefficients, and how does this difference impact the resulting models' interpretability and sparsity?
2. Can you explain the concept of soft-thresholding in the context of the lasso regression and how it leads to feature selection? How does this relate to the regularization process?
3. In the context of model selection criteria discussed in Lecture 5, why is selecting a good value of the regularization parameter crucial for both ridge regression and lasso regression? How can techniques like cross-validation help in determining the optimal value of this parameter?

Sources: 

1. Score: 1.000
   File: ST443_Lecture_5.pdf
   Text: Key learning points
▶ What is regularisation/penalisation?
▶ Best subset selection and stepwise selection
▶ Model selection criteria
▶ Ridge regression and shrinkage
▶ Lasso regression and the sparsity-inducing ℓ1 penalty
Milan Vojnović 2/36...



In [None]:
# NOTE(review): leftover scratch expression — the method is referenced but not called,
# so no collection is fetched or created here. Consider removing this cell.
chroma_client.get_or_create_collection