In [1]:
import os
import re
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


In [2]:
pdf_path = r"..\..\data\Lectures\\"  
persist_dir = "pdf_store"  # Directory where the vector store will be saved


### 1. Load PDF

In [3]:
all_pdfs = [os.path.join(dirpath, f) for dirpath, _, files in os.walk(pdf_path) for f in files if f.lower().endswith('.pdf')]
all_pdfs

['..\\..\\data\\Lectures\\\\3001_ETA.pdf',
 '..\\..\\data\\Lectures\\\\Clustering_InClass_9.28.21-1.pdf',
 '..\\..\\data\\Lectures\\\\Clustering_InClass_9.28.21-2.pdf',
 '..\\..\\data\\Lectures\\\\Clustering_InClass_9.28.21.pdf',
 '..\\..\\data\\Lectures\\\\Decision_Trees_4.28.21.pdf',
 '..\\..\\data\\Lectures\\\\DS_3001_Day 2_Case_Study.pdf',
 '..\\..\\data\\Lectures\\\\fun_with_functions_dplyr.pdf',
 '..\\..\\data\\Lectures\\\\ggplot_overview.pdf',
 '..\\..\\data\\Lectures\\\\knitr_reference.pdf',
 '..\\..\\data\\Lectures\\\\Knn and Prob.pdf',
 '..\\..\\data\\Lectures\\\\machine_learning_bootcamp_II copy.pdf',
 '..\\..\\data\\Lectures\\\\machine_learning_bootcamp_II.pdf',
 '..\\..\\data\\Lectures\\\\machine_learning_III.pdf',
 '..\\..\\data\\Lectures\\\\machine_learning_overview.pdf',
 '..\\..\\data\\Lectures\\\\Overview_ML_and_Clustering_InClass_3.21.pdf',
 '..\\..\\data\\Lectures\\\\PerfMet.pdf',
 '..\\..\\data\\Lectures\\\\tidydata_reference_Thursday_II.pdf',
 '..\\..\\data\\Lectu

In [4]:

pages = []
for pdf in all_pdfs:
    print(f"Loading PDF from {pdf}")
    loader = PyPDFLoader(pdf)
    pdf_pages = loader.load()
    print(len(pdf_pages))
    pages.extend(pdf_pages)


Loading PDF from ..\..\data\Lectures\\3001_ETA.pdf
33
Loading PDF from ..\..\data\Lectures\\Clustering_InClass_9.28.21-1.pdf
57
Loading PDF from ..\..\data\Lectures\\Clustering_InClass_9.28.21-2.pdf
57
Loading PDF from ..\..\data\Lectures\\Clustering_InClass_9.28.21.pdf
60
Loading PDF from ..\..\data\Lectures\\Decision_Trees_4.28.21.pdf
40
Loading PDF from ..\..\data\Lectures\\DS_3001_Day 2_Case_Study.pdf
5
Loading PDF from ..\..\data\Lectures\\fun_with_functions_dplyr.pdf
28
Loading PDF from ..\..\data\Lectures\\ggplot_overview.pdf
137
Loading PDF from ..\..\data\Lectures\\knitr_reference.pdf
35
Loading PDF from ..\..\data\Lectures\\Knn and Prob.pdf
88
Loading PDF from ..\..\data\Lectures\\machine_learning_bootcamp_II copy.pdf
55
Loading PDF from ..\..\data\Lectures\\machine_learning_bootcamp_II.pdf
55
Loading PDF from ..\..\data\Lectures\\machine_learning_III.pdf
55
Loading PDF from ..\..\data\Lectures\\machine_learning_overview.pdf
46
Loading PDF from ..\..\data\Lectures\\Overview_M

In [5]:
print(f"{len(pages)} Pages in the PDF")
pages


1009 Pages in the PDF


[Document(metadata={'source': '..\\..\\data\\Lectures\\\\3001_ETA.pdf', 'page': 0}, page_content='Overview and Data Science\nBrian Wright\nbrianwright@virginia.edu\n'),
 Document(metadata={'source': '..\\..\\data\\Lectures\\\\3001_ETA.pdf', 'page': 1}, page_content='2Course Administration \n\uf0d8Everybody Reads Even Computers: Text Mining'),
 Document(metadata={'source': '..\\..\\data\\Lectures\\\\3001_ETA.pdf', 'page': 2}, page_content='Final Projects\n\uf0d8Work individually and use one of the areas below to answer \na broad questions related to a given dataset.  I’ll provide several datasets for you to potential use, but you are also welcome to chose your own.  You can also use any dataset from the class if you choose.\n\uf0d8Topics we will/have covered that can be a focus of the final \nproject:\n\uf076Data Visualization \n\uf076Fairness/Bias\n\uf076Text Mining\n\uf076KNN\n\uf076Tree based methods\n\uf0d8Ensemble –Random Forrest – time permitting '),
 Document(metadata={'source': 

In [6]:
all_text = "\n\n".join([page.page_content for page in pages])
all_text = re.split(r'(?<=[.!?])\s+', all_text.replace(' \n', ' '))
print("\n".join(all_text))

Overview and Data Science
Brian Wright
brianwright@virginia.edu


2Course Administration Everybody Reads Even Computers: Text Mining

Final Projects
Work individually and use one of the areas below to answer a broad questions related to a given dataset.
I’ll provide several datasets for you to potential use, but you are also welcome to chose your own.
You can also use any dataset from the class if you choose.
Topics we will/have covered that can be a focus of the final project:
Data Visualization Fairness/Bias
Text Mining
KNN
Tree based methods
Ensemble –Random Forrest – time permitting 
Final Projects
Generate a publishable Rmarkdown document with the following sections:
1.Question and background information on the data and why you are asking this question(s).
References to previous research/evidence generally would be nice to include.
2.Exploratory Data Analysis –Initial summary statistics and graphs with an emphasis on variables you believe to be important for your analysi

In [7]:
# 2. Split text into chunks
print("Splitting text into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,   # Number of characters per chunk
    chunk_overlap=500, # Overlap to provide context
    separators=["\n\n", "\n", ". "] # Split by paragraphs, then lines, then words
)

texts = []
for page in all_text:
    cont = page.replace("\xad\n","").replace("\n","")
    chunks = text_splitter.split_text(cont)
    print(chunks)
    texts.extend(chunks)

print(f"Created {len(texts)} chunks of text")



Splitting text into chunks...
['Overview and Data ScienceBrian Wrightbrianwright@virginia.edu2Course Administration \uf0d8Everybody Reads Even Computers: Text MiningFinal Projects\uf0d8Work individually and use one of the areas below to answer a broad questions related to a given dataset.']
['I’ll provide several datasets for you to potential use, but you are also welcome to chose your own.']
['You can also use any dataset from the class if you choose.']
['\uf0d8Topics we will/have covered that can be a focus of the final project:\uf076Data Visualization \uf076Fairness/Bias\uf076Text Mining\uf076KNN\uf076Tree based methods\uf0d8Ensemble –Random Forrest – time permitting Final Projects\uf0d8Generate a publishable Rmarkdown document with the following sections:1.Question and background information on the data and why you are asking this question(s).']
['References to previous research/evidence generally would be nice to include.']
['2.Exploratory Data Analysis –Initial summary statistics

In [8]:
# 3. Create embeddings
print("Creating embeddings...")
device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': device}
)



Creating embeddings...


  embeddings = HuggingFaceEmbeddings(







In [9]:
import shutil
if os.path.exists(persist_dir): os.system(f'sudo rmdir /s /q "{persist_dir}"')

In [10]:
# 4. Create and persist the vector store
print("Creating and persisting vector store...")

vector_store = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    persist_directory=persist_dir,
    collection_name="pdf_collection"
)

Creating and persisting vector store...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [15]:
# Load vector store
device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': device}
)


vector_store = Chroma(
    persist_directory="pdf_store",
    embedding_function=embeddings,
    collection_name="pdf_collection"
)

# Load language model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16,
    device_map="auto",
)

def generate_with_context(query, k=5):
    context_docs = vector_store.similarity_search(query, k=k)
    
    context = " ".join([doc.page_content for doc in context_docs])

    print("\n".join([str(doc) for doc in context_docs]))
    
    augmented_prompt = f"""Context: {context}
    Question: {query}
    Answer based on the provided context:
    """
    
    inputs = tokenizer(augmented_prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs, 
        max_length=1300, 
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)



Some parameters are on the meta device because they were offloaded to the cpu.


In [16]:
# Example usage
query = "what is machine learning?"
response = generate_with_context(query)
print(response)

page_content='11Engineering of Machine Learning Algos versus Software Development Source: https://towardsdatascience.com/stoend -to-end-data -science -life-cycle -6387523b5afcData Science Life Cycle (Everything includes Evaluation13TrainFeature EngineerTest DeployEvaluate Evaluate Evaluate EvaluateMonitor1.What is Machine Learning?'
page_content='Terms and Phases Engineering of Machine Learning Algos versus Software Development Source: https://towardsdatascience.com/stoend -to-end-data -science -life-cycle -6387523b5afcBrian’s Version of Data Science Lifecycle4Question IDBusiness UnderstandingData Acquisition -ETLInitial ModelEvaluationData Understanding -EDAInitial Model(s) BuildingEvaluation Criteria Value Metric Model Creation & Training Feature Engineering and EvaluationOptimization –Hyperpara and EvaluationModel DeploymentData Drift AnalysisModel Performance –Evaluation Value MetricModel Drift Analysis –Model EvaluationReports –Dashboards -Products G1 G2G3Gate Reviews5Machine Lear

ValueError: Input length of input_ids is 1342, but `max_length` is set to 1300. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [None]:
device