# 1. Data Preprocessing

### Import the documents, videos & audio files


In [39]:
import os

def list_files_in_folder(folder_path):
    try:
        return [
            os.path.join(folder_path, file)
            for file in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, file))
        ]
    except FileNotFoundError:
        print(f"The folder '{folder_path}' does not exist.")
        return []


In [40]:
pdf_folder_path = 'rag_docs/pdf_files'
audio_folder_path = 'rag_docs/audio_files'


audio_files = list_files_in_folder(audio_folder_path)

pdf_files = [
    {"file_path": pdf_folder_path + "/Werum MES Optimization Pharma .pdf", "page_number_offset": 3},
    {"file_path": pdf_folder_path + "/ZVEI_MES_Brochure_EN.pdf", "page_number_offset": 3},
    {"file_path": pdf_folder_path + "/Manufacturing Execution Systems Integration and Intelligence.pdf", "page_number_offset": 10},
]
audio_files


['rag_docs/audio_files/What is MES (Manufacturing Execution System)_.mp3',
 'rag_docs/audio_files/What is MES_ Manufacturing Execution Systems.mp3',
 'rag_docs/audio_files/Top 10 Manufacturing Execution Systems [Best Manufacturing Software].mp3']

## Handling audio files

In [41]:
from transformers import pipeline
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
processor = AutoProcessor.from_pretrained("openai/whisper-base")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-base")


In [43]:
from IPython.display import Audio, display

def transcribe_audio_files(files, display_files=False) -> list[dict]:
    transcriptions = []
    for file in files:
        print(f"[INFO] transcribing file: {file}")
        # Call the transcription function (replace `transcribe` with your actual function)
        text = pipe(file, return_timestamps=True)['text']

        # Display the audio file and transcription
        if display_files:
          display(Audio(file))
          print(text)

        # Append transcription to the result list
        transcriptions.append({
            "audio_source_file": file,
            "text": text,
            "char_count": len(text),
            "word_count": len(text.split(" ")),
            "sentence_count_raw": len(text.split(". ")),
            "token_count": len(text) / 4,
        })
        print(f"[SUCCESS] {file} complete")

    return transcriptions

# Example usage
audio_to_text = transcribe_audio_files(audio_files)
audio_to_text[0]

[INFO] transcribing file: rag_docs/audio_files/What is MES (Manufacturing Execution System)_.mp3




[SUCCESS] rag_docs/audio_files/What is MES (Manufacturing Execution System)_.mp3 complete
[INFO] transcribing file: rag_docs/audio_files/What is MES_ Manufacturing Execution Systems.mp3




[SUCCESS] rag_docs/audio_files/What is MES_ Manufacturing Execution Systems.mp3 complete
[INFO] transcribing file: rag_docs/audio_files/Top 10 Manufacturing Execution Systems [Best Manufacturing Software].mp3




[SUCCESS] rag_docs/audio_files/Top 10 Manufacturing Execution Systems [Best Manufacturing Software].mp3 complete


{'audio_source_file': 'rag_docs/audio_files/What is MES (Manufacturing Execution System)_.mp3',
 'text': " What is MES? MES stands for manufacturing execution system. Meaning a control system for monitoring and managing work and process on the factory floor. But that's an oversimplification of what a successful MES software implementation can do for manufacturers. MES provides detailed resource scheduling and status, production, dispatch, and sequencing, traceability, genealogy, inventory, quality assurance, maintenance, management, document control, performance, analysis, and more. MES is crucial for manufacturers because it exists in a space between business-oriented applications like ERP and SCADA HMI systems designed to directly control plant floor operations. While an ERP can help allocate resources, it lacks the level of detail that MES provides. MES allows for real-time, minute-to-minute, or quicker resource scheduling, as well as handling execution and dispatch. MES connects pr

### Helper functions to cleanup text

In [45]:
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

### Processing PDF files

In [46]:

import fitz  # PyMuPDF
from tqdm.auto import tqdm

def open_and_read_pdfs(pdf_files: list[dict]) -> list[dict]:
    all_pages_and_texts = []

    for pdf in pdf_files:
        pdf_path = pdf["file_path"]
        page_number_offset = pdf["page_number_offset"]

        doc = fitz.open(pdf_path)  # Open the PDF document
        for page_number, page in tqdm(enumerate(doc), desc=f"Processing {pdf_path}"):  # Iterate through the pages
            text = page.get_text()  # Extract plain text
            text = text_formatter(text)  # Apply text formatting

            all_pages_and_texts.append({
                "source_file": pdf_path,  # Add source file metadata
                "page_number": page_number - page_number_offset,  # Adjust page numbers
                "page_char_count": len(text),
                "page_word_count": len(text.split(" ")),
                "page_sentence_count_raw": len(text.split(". ")),
                "page_token_count": len(text) / 4,  # Estimate tokens
                "text": text,
            })

    return all_pages_and_texts




In [47]:
import random

pages_and_texts = open_and_read_pdfs(pdf_files)

random.sample(pages_and_texts, 2)

Processing rag_docs/pdf_files/Werum MES Optimization Pharma .pdf: 0it [00:00, ?it/s]

Processing rag_docs/pdf_files/ZVEI_MES_Brochure_EN.pdf: 0it [00:00, ?it/s]

Processing rag_docs/pdf_files/Manufacturing Execution Systems Integration and Intelligence.pdf: 0it [00:00, ?i…

[{'source_file': 'rag_docs/pdf_files/Manufacturing Execution Systems Integration and Intelligence.pdf',
  'page_number': 62,
  'page_char_count': 2082,
  'page_word_count': 360,
  'page_sentence_count_raw': 16,
  'page_token_count': 520.5,
  'text': '63  occur. This includes the request from QA to make Process Adjustments and reschedule  since a heat does not meet the specifications (not included in the model).  Heat Status at  Failure is the Process Status signal sent to A414 to reschedule the heats since a machine  has failed.  This signal is updated by the A6 module (see Section 3.6 below).   Therefore,  the A433 module updates the Process Status for QA, and the A6 module updates Process  Status for machine failures.  3.4.4.3.4.MANAGE HEAT QUALITY  The A434 module provides timely analysis of samples collected from heats to control  product quality. This is explained in Section 4.4.4.3.2.  Note when a heat does not meet  the specification it is the job of this module to recommend whi

In [48]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,source_file,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,rag_docs/pdf_files/Werum MES Optimization Phar...,-3,445,83,5,111.25,Optimizing the pharmaceutical supply chain by ...
1,rag_docs/pdf_files/Werum MES Optimization Phar...,-2,2460,396,16,615.0,Optimizing the pharmaceutical supply chain by ...
2,rag_docs/pdf_files/Werum MES Optimization Phar...,-1,3307,233,29,826.75,Optimizing the pharmaceutical supply chain by ...
3,rag_docs/pdf_files/Werum MES Optimization Phar...,0,2224,371,12,556.0,Optimizing the pharmaceutical supply chain by ...
4,rag_docs/pdf_files/Werum MES Optimization Phar...,1,692,109,3,173.0,Optimizing the pharmaceutical supply chain by ...


In [49]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,213.0,213.0,213.0,213.0,213.0
mean,44.92,2497.88,431.9,17.03,624.47
std,39.96,1891.42,322.21,14.96,472.85
min,-10.0,79.0,17.0,1.0,19.75
25%,12.0,1242.0,233.0,8.0,310.5
50%,32.0,2003.0,360.0,14.0,500.75
75%,77.0,2634.0,461.0,19.0,658.5
max,130.0,8625.0,2403.0,93.0,2156.25


### Processing text into sentences using nlp

In [50]:
def merge_sources(audio_data: list[dict], pdf_data: list[dict]) -> list[dict]:
    merged_results = []

    # Process audio data
    for audio_entry in audio_data:
        merged_results.append({
            "source": "audio",
            "source_file": audio_entry["audio_source_file"],
            "page_number": None,  # Page number is not applicable for audio
            "token_count": audio_entry["token_count"],
            "text": audio_entry["text"],
        })

    # Process PDF data
    for pdf_entry in pdf_data:
        merged_results.append({
            "source": "pdf",
            "source_file": pdf_entry["source_file"],
            "page_number": pdf_entry["page_number"],
            "token_count": pdf_entry["page_token_count"],
            "text": pdf_entry["text"],
        })

    return merged_results

In [51]:
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")

def nlp_sentence_splitter(text: str) -> list[str]:
  sentences = list(nlp(text).sents)
  sentences = [str(sentence).strip() for sentence in sentences]
  return sentences



In [52]:
merged_sources = merge_sources(audio_to_text, pages_and_texts)
random.sample(merged_sources, 1)

[{'source': 'pdf',
  'source_file': 'rag_docs/pdf_files/Manufacturing Execution Systems Integration and Intelligence.pdf',
  'page_number': 19,
  'token_count': 468.75,
  'text': '20  The internal and interfacing data for the A4 block will be decomposed into the following  category rating system:    Rating  Description  ***   Frequent Data (timed in seconds, fixed sampling interval)  **     Moderate Data (timed in hours)  *       Long Term Data (timed in days/weeks)    This is done with the intent of:    1. Finding control points in the MES Module Interface Architecture, with the variables to  optimize or use to optimize outputs.    2. Determining the data used for real-time information and feedback control.  3. Defining the fundamental functions, information for certain issues, in order to know  what is taken for optimization, short and long term control.    In exploring these above issues, all the information data flow (input, output, control and  mechanism) going into the MES module

In [53]:
for item in tqdm(merged_sources):
  item["sentences"] = nlp_sentence_splitter(item["text"])
  item["sentence_count_nlp"] = len(item["sentences"])

random.sample(merged_sources, 2)

  0%|          | 0/216 [00:00<?, ?it/s]

[{'source': 'pdf',
  'source_file': 'rag_docs/pdf_files/ZVEI_MES_Brochure_EN.pdf',
  'page_number': 10,
  'token_count': 1728.75,
  'text': '27 5. 1  Pr oduct ion of act ive  p ha rm a c e u ti c a l i ngre d i e nts In the previous chapters, MES modules were industry-independently classified with regard to general  process applications using the IEC 62264 standard and VDI guideline 5600. This chapter presents sev- eral industry-specific application examples with special consideration of the following two aspects.  1.\t Solutions which are today associated with Manufacturing Execution Systems have developed „organ- ically“ over a number of years in various applications. Over time, significant differences have  emerged between the two principal industries „manufacturing industry“ and „process industry“. Yet  even within each of the principal industries, special and application-specific variants have been  established. These developments and the history they are based on will be outlined

### Chunking sentences together

In [54]:
num_sentence_chunk_size = 10

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(merged_sources):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

merged_sources[0]

  0%|          | 0/216 [00:00<?, ?it/s]

{'source': 'audio',
 'source_file': 'rag_docs/audio_files/What is MES (Manufacturing Execution System)_.mp3',
 'page_number': None,
 'token_count': 1152.5,
 'text': " What is MES? MES stands for manufacturing execution system. Meaning a control system for monitoring and managing work and process on the factory floor. But that's an oversimplification of what a successful MES software implementation can do for manufacturers. MES provides detailed resource scheduling and status, production, dispatch, and sequencing, traceability, genealogy, inventory, quality assurance, maintenance, management, document control, performance, analysis, and more. MES is crucial for manufacturers because it exists in a space between business-oriented applications like ERP and SCADA HMI systems designed to directly control plant floor operations. While an ERP can help allocate resources, it lacks the level of detail that MES provides. MES allows for real-time, minute-to-minute, or quicker resource scheduling,

In [55]:
df = pd.DataFrame(merged_sources)
df.describe().round(2)

Unnamed: 0,page_number,token_count,sentence_count_nlp,num_chunks
count,213.0,216.0,216.0,216.0
mean,44.92,653.81,17.18,2.22
std,39.96,574.64,17.71,1.76
min,-10.0,19.75,1.0,1.0
25%,12.0,313.12,8.0,1.0
50%,32.0,500.88,14.0,2.0
75%,77.0,702.62,19.0,2.0
max,130.0,5349.0,176.0,18.0


### Splitting each chunk into its own item

In [56]:
import re

# Split each chunk into its own item
sources_and_chunks = []
for item in tqdm(merged_sources):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        chunk_dict["source"] = item["source"]
        chunk_dict["source_file"] = item["source_file"]

        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

        sources_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(sources_and_chunks)

  0%|          | 0/216 [00:00<?, ?it/s]

479

In [57]:
random.sample(sources_and_chunks, 1)

[{'page_number': 21,
  'source': 'pdf',
  'source_file': 'rag_docs/pdf_files/Manufacturing Execution Systems Integration and Intelligence.pdf',
  'sentence_chunk': '22 and material routing.* / ** Equipment Operation Instructions Specific operation steps or recipes that are used to control machine movement, such as machining, welding, assembly, material movement, and so on. Outputted to: A6 ** Process Change Request Feedback from factory-floor production requesting changes to process plan when some problems in the process plan were found. Changes can be process parameter changes, tool changes, setup changes, and so on. Outputted to: A5, A1 (feedback) ** Product Inventory The inventory information on a product. The information is updated when finished products are sent to storage. Outputted to: A5 * Cost Report A report on the manufacturing costs of producing a part. It contains the costs of material, labor, usage of equipment, and so on. Outputted to: A5 *** Product Genealogy One of the

In [58]:
df = pd.DataFrame(sources_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,447.0,479.0,479.0,479.0
mean,40.71,1155.25,180.44,288.81
std,37.43,887.62,164.28,221.91
min,-10.0,20.0,5.0,5.0
25%,11.5,705.0,107.0,176.25
50%,29.0,1092.0,171.0,273.0
75%,69.0,1360.0,212.5,340.0
max,130.0,7981.0,2356.0,1995.25


Filtering chunks with under 30 tokens

In [59]:
min_token_length = 30
sources_and_chunks = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
# sources_and_chunks[:2]
len(sources_and_chunks)

468

In [60]:
import json

def save_to_json(data: list[dict], file_path: str) -> None:
    try:
        with open(file_path, 'w') as json_file:
            json.dump(data, json_file, indent=4)
        print(f"Data successfully saved to {file_path}")
    except Exception as e:
        print(f"An error occurred while saving data: {e}")

def load_from_json(file_path: str) -> list[dict]:
    try:
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
        print(f"Data successfully loaded from {file_path}")
        return data
    except Exception as e:
        print(f"An error occurred while loading data: {e}")
        return []

In [61]:
source_file_path = "sources_and_chunks.json"
already_saved_file = True
if not already_saved_file:
    save_to_json(sources_and_chunks, source_file_path)


# 2. Embedding Generations

In [62]:
import torch
from sentence_transformers import SentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device)
def generate_embeddings(text_list):
  embeddings = embedding_model.encode(text_list, convert_to_tensor=True)
  return embeddings

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [63]:
%%time

text_chunks = [item["sentence_chunk"] for item in sources_and_chunks]
text_chunks_embeddings = generate_embeddings(text_chunks)

CPU times: user 4min 9s, sys: 48.3 s, total: 4min 58s
Wall time: 1min 17s


### Saving embeddings to vector database

In [64]:
import os 
from dotenv import load_dotenv

# Load variables from the .env file
load_dotenv()

# Access them
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")


In [None]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "rag-pipeline-sources"

if index_name in pc.list_indexes().names():
    print("Using existing index")
else:
    print("Creating new index")
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [None]:
index = pc.Index(index_name)
def save_to_pinecone(embeddings, text_list):
    upsert_data = [
        (str(i), embeddings[i].tolist()) for i in range(len(text_list))
    ]
    index.upsert(vectors=upsert_data)

    print(f"Successfully stored {len(text_list)} embeddings in Pinecone.")

already_saved_to_db = True
if not already_saved_to_db:
    save_to_pinecone(text_chunks_embeddings, text_chunks)

Successfully stored 439 embeddings in Pinecone.


## Similarity search

In [None]:
def query_pinecone_db(query_text, k=5):
  query_embedding = generate_embeddings([query_text])[0].tolist()
  res = index.query(vector=query_embedding,top_k=k,include_values=True)
  res
  indices = [int(item["id"]) for item in res["matches"]]
  scores = [float(item["score"]) for item in res["matches"]]
  return indices,scores


In [None]:
query_text = "What are production activity management"
ids, scores = query_pinecone_db(query_text)
print(f"idexs: {ids}")
print(f"scores: {scores}")
for id in ids[:2]: # Printing top 2
  print("source: ", sources_and_chunks[id]["source"])
  print("source_file: ", sources_and_chunks[id]["source_file"])
  print("text: ", text_chunks[id])


idexs: [252, 70, 59, 413, 57]
scores: [0.599696219, 0.569156706, 0.565790355, 0.56132865, 0.560206175]
source:  pdf
source_file:  /content/drive/MyDrive/RAG_PIPELINE/context_docs/pdf_files/Manufacturing Execution Systems Integration and Intelligence.pdf
text:  21 Table 2.1: A4 Module Data Mechanism Data Collection Methods: The use of data collectors to obtain information on workpieces, timing, personnel, lots, and other critical entities for production management in a timely manner. Manufacturing Knowledge: The information (rules, logic, examples) that a manufacturing engineer brings to bear on manufacturing engineering problems, including production techniques and implementation techniques. Many different types of manufacturing knowledge are used in different manufacturing activities, such as decomposition knowledge, assignment knowledge, consolidation knowledge, and optimization knowledge, which are used in process planning, resource planning, production planning, and scheduling. Man