In [7]:
!pip install transformers datasets langchain faiss-gpu pdfplumber



In [8]:
import pdfplumber

file_path = 'student-handbook2023-24.pdf'
with pdfplumber.open(file_path) as pdf:
    full_text = ''.join([page.extract_text() for page in pdf.pages])

In [9]:
import os
file_size = os.path.getsize(file_path)
file_size_mb = file_size / (1024 * 1024)

print(file_size_mb)

2.1110496520996094


In [10]:
import spacy

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Process the text
doc = nlp(full_text)

# Create sentence chunks of ~500 characters
chunk_size = 500
text_chunks = []
current_chunk = ""

for sent in doc.sents:
    if len(current_chunk) + len(sent.text) <= chunk_size:
        current_chunk += " " + sent.text
    else:
        text_chunks.append(current_chunk.strip())
        current_chunk = sent.text

# Add the last chunk if any text remains
if current_chunk:
    text_chunks.append(current_chunk.strip())




In [11]:
with pdfplumber.open(file_path) as pdf:
    raw_text = ''
    for page in pdf.pages:
        raw_text += page.extract_text()

chunk_size = 500
text_chunks = [raw_text[i:i + chunk_size] for i in range(0, len(raw_text), chunk_size)]
num_documents = len(text_chunks)
print(f"Number of Documents (Chunks): {num_documents}")

Number of Documents (Chunks): 210


## Mini LM (Microsoft)

In [12]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(text_chunks)

In [13]:
import faiss
import numpy as np

# Convert embeddings to a NumPy array
embeddings = np.array(embeddings)

# Create a FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [14]:
questions = [
    "What are the requirements for maintaining academic integrity?",
    "What is the attendance policy for IBA students?",
    "What are the prerequisites for student applying for the Data Science program?",
    "How are student assessments and grading conducted?",
    "What are the consequences for violating student policies?"
]

In [15]:
from transformers import pipeline
import time

# Start the overall time calculation
start_time = time.time()

# Load the QA model
qa_pipeline = pipeline('question-answering', model='deepset/roberta-base-squad2')

# Assuming 'questions', 'index', and 'text_chunks' are already defined

# Initialize a list to store the answers and track the total time
answers = []

# Process each question
for question in questions:
    # Record the start time for this particular question
    question_start_time = time.time()

    # Search the FAISS index for relevant chunks
    query_embedding = model.encode([question])  # Encode the question
    _, relevant_indices = index.search(query_embedding, k=3)  # Retrieve top 3 relevant chunks
    relevant_texts = [text_chunks[i] for i in relevant_indices[0]]  # Get the corresponding text chunks

    # Combine the retrieved text for the QA model
    context = " ".join(relevant_texts)

    # Get the answer from the QA model
    answer = qa_pipeline({'question': question, 'context': context})

    # Store the answer
    answers.append(answer['answer'])

    # Record the end time for this particular question
    question_end_time = time.time()

    # Calculate the time taken for this question
    question_time = question_end_time - question_start_time
    print(f"Question: {question}")
    print(f"Answer: {answer['answer']}")
    print(f"Time taken for this question: {question_time:.2f} seconds")
    print("\n" + "=" * 70 + "\n")  # Separator for readability

# Record the overall end time
end_time = time.time()

# Calculate total time taken for all questions
total_time = end_time - start_time

# Print the total time taken for answering all questions
print(f"Total time taken to answer all {len(questions)} questions: {total_time:.2f} seconds")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Question: What are the requirements for maintaining academic integrity?
Answer: being honest and having strong moral principles
Time taken for this question: 2.83 seconds


Question: What is the attendance policy for IBA students?
Answer: decently dressed and in a manner that is appropriate for any institution

Time taken for this question: 0.88 seconds


Question: What are the prerequisites for student applying for the Data Science program?
Answer: 3
Time taken for this question: 0.82 seconds


Question: How are student assessments and grading conducted?
Answer: o ing of their exam material
Time taken for this question: 0.77 seconds


Question: What are the consequences for violating student policies?
Answer: Students are financially responsible for damages
Time taken for this question: 0.68 seconds


Total time taken to answer all 5 questions: 6.68 seconds


## Paraphrase Mini LM (Microsoft)

In [16]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = np.array(model.encode(text_chunks))

# Create a FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [17]:
from transformers import pipeline
import time

# Start the overall time calculation
start_time = time.time()

# Load the QA model
qa_pipeline = pipeline('question-answering', model='bert-large-uncased-whole-word-masking-finetuned-squad')

# Assuming 'questions', 'index', and 'text_chunks' are already defined

# Initialize a list to store the answers and track the total time
answers = []

# Process each question
for question in questions:
    # Record the start time for this particular question
    question_start_time = time.time()

    # Encode the question
    query_embedding = model.encode([question])  # Encode the question

    # Search the FAISS index for relevant chunks
    _, relevant_indices = index.search(query_embedding, k=3)  # Retrieve top 3 relevant text chunks

    # Extract relevant texts based on the search results
    relevant_texts = [text_chunks[i] for i in relevant_indices[0]]  # Retrieve corresponding chunks

    # Combine the retrieved texts into a single context
    context = " ".join(relevant_texts)

    # Get the answer from the QA model
    answer = qa_pipeline({'question': question, 'context': context})

    # Store the answer
    answers.append(answer['answer'])

    # Record the end time for this particular question
    question_end_time = time.time()

    # Calculate the time taken for this question
    question_time = question_end_time - question_start_time
    print(f"Question: {question}")
    print(f"Answer: {answer['answer']}")
    print(f"Time taken for this question: {question_time:.2f} seconds")
    print("\n" + "=" * 70 + "\n")  # Separator for readability

# Record the overall end time
end_time = time.time()

# Calculate total time taken for all questions
total_time = end_time - start_time

# Print the total time taken for answering all questions
print(f"Total time taken to answer all {len(questions)} questions: {total_time:.2f} seconds")


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Question: What are the requirements for maintaining academic integrity?
Answer: Students should be honest about their identity
Time taken for this question: 8.42 seconds


Question: What is the attendance policy for IBA students?
Answer: Students are expected to:
l Present themselves as mature
Time taken for this question: 3.35 seconds


Question: What are the prerequisites for student applying for the Data Science program?
Answer: high-performance computing to commutative algebra
Time taken for this question: 2.28 seconds


Question: How are student assessments and grading conducted?
Answer: fill the faculty and course evaluation
questionnaires
Time taken for this question: 2.20 seconds


Question: What are the consequences for violating student policies?
Answer: disciplinary action against the IBA
Time taken for this question: 2.50 seconds


Total time taken to answer all 5 questions: 19.70 seconds


In [18]:
model = SentenceTransformer('t5-small')
embeddings = np.array(model.encode(text_chunks))



In [19]:
from transformers import pipeline
from transformers import T5Tokenizer
import time

# Load the T5-small model for question answering
qa_pipeline = pipeline('text2text-generation', model='t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Track the start time of the whole process
start_time = time.time()

# Maximum token length (you can adjust this based on your model's limit)
max_input_length = 512  # You can adjust this based on your model's max token limit

# Process each question
for question in questions:
    # Start timing for each question
    question_start_time = time.time()

    # Combine the question and context (use FAISS to retrieve relevant chunks if needed)
    context = " ".join(text_chunks)  # or use the top relevant chunks from FAISS search

    # Encode the input to count tokens and truncate if necessary
    input_text = f"question: {question} context: {context}"
    input_tokens = tokenizer.encode(input_text, return_tensors="pt")

    # If the number of tokens exceeds the max length, truncate
    if len(input_tokens[0]) > max_input_length:
        input_tokens = input_tokens[:, :max_input_length]

    # Decode back to text
    truncated_input_text = tokenizer.decode(input_tokens[0], skip_special_tokens=True)

    # Get the answer using T5
    answer = qa_pipeline(truncated_input_text)

    # Print results
    print(f"Question: {question}")
    print(f"Answer: {answer[0]['generated_text']}")
    print("\n" + "=" * 70 + "\n")  # Separator for readability

    # Track time taken for the current question
    question_end_time = time.time()
    question_time = question_end_time - question_start_time
    print(f"Time taken for this question: {question_time:.2f} seconds")

# Track total time taken for all questions
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken for processing all questions: {total_time:.2f} seconds")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Token indices sequence length is longer than the specified maximum sequence length for this model (24294 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


Question: What are the requirements for maintaining academic integrity?
Answer: a not to be regarded as an roadmap to assist you in irrevocable contract


Time taken for this question: 1.09 seconds
Question: What is the attendance policy for IBA students?
Answer: 05 About IBA privileges


Time taken for this question: 1.00 seconds
Question: What are the prerequisites for student applying for the Data Science program?
Answer: 05 About IBA privileges


Time taken for this question: 1.03 seconds
Question: How are student assessments and grading conducted?
Answer: it is your academic grading policies, 07 School of Economics and Social Sciences responsibility to ensure


Time taken for this question: 1.45 seconds
Question: What are the consequences for violating student policies?
Answer: the Institution


Time taken for this question: 0.60 seconds
Total time taken for processing all questions: 5.17 seconds
