# Development of the Hybrid Chatbot

## Setup Tools

In [2]:
# relevant libraries
import bs4
import requests
import xmltodict
import openai
import tiktoken
import faiss
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter

## Development of the Hybrid Chatbot: Web Scraping Component

The web scraping process will begin by utilizing the existing sitemap.xml file from the Robert Gordon University (RGU) website, which specifies the URLs used on the site. For instance, we will consider the sitemap available at https://www.rgu.ac.uk/sitemap.

The primary goal will be to filter and extract data specific to blog posts or relevant sections pertinent to international students. To achieve this, we will implement a method called extract_text_from(url). This method will be developed to systematically retrieve and process the necessary textual information from the specified URLs.

This approach ensures that we gather comprehensive and structured data efficiently, facilitating the integration of accurate and up-to-date information into the chatbot's knowledge base.








In [2]:
# pager = []

# # Loop through each URL in the parsed XML
# for info in raw['urlset']['url']:
#     url = info['loc']
#     # Check if the URL contains '/study' or '/study/'
#     if '/study' in url or url.endswith('/study'):
#         pager.append({'source': url})

# # Print or use the collected pages
# for page in pager:
#     print(page)

In [6]:
# filter only for the relevant posts using a method called extract_text_from(url).
def extract_text_from(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, features="html.parser")
    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in lines if line)


In [7]:
rgu_url = "https://www.rgu.ac.uk/index.php?option=com_jmap&view=sitemap&format=xml"
r = requests.get(rgu_url)
xml = r.text
raw = xmltodict.parse(xml)


In [8]:
raw

{'urlset': {'@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
  '@xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9',
  '@xsi:schemaLocation': 'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd',
  'url': [{'loc': 'https://www.rgu.ac.uk/',
    'changefreq': 'daily',
    'priority': '0.5'},
   {'loc': 'https://www.rgu.ac.uk/study',
    'changefreq': 'daily',
    'priority': '0.5'},
   {'loc': 'https://www.rgu.ac.uk/life-at-rgu',
    'changefreq': 'daily',
    'priority': '0.5'},
   {'loc': 'https://www.rgu.ac.uk/alumni-supporters',
    'changefreq': 'daily',
    'priority': '0.5'},
   {'loc': 'https://www.rgu.ac.uk/research',
    'changefreq': 'daily',
    'priority': '0.5'},
   {'loc': 'https://www.rgu.ac.uk/business-innovation',
    'changefreq': 'daily',
    'priority': '0.5'},
   {'loc': 'https://www.rgu.ac.uk/search',
    'changefreq': 'daily',
    'priority': '0.5'},
   {'loc': 'https://www.rgu.ac.uk/about',
    'change

In [9]:
pages = []
for info in raw['urlset']['url']:
    url = info['loc']
    # print(url)
    if 'https://www.rgu.ac.uk/study/international-students' in url:
        pages.append({'text': extract_text_from(url), 'source': url})
print(len(pages))

77


In [13]:
type(pages)

list

In [14]:
pages

[{'text': 'Contact the International Team | International Students | RGU\nSkip to content\nAboutStaffCurrent StudentsContact Us\nStudyLife at RGUAlumni & SupportersResearchBusiness & Innovation\nStudyLife at RGUAlumni & SupportersResearchBusiness & Innovation\nAboutStaffCurrent StudentsContact Us\nSearch this site...\nStudy\n>\nInternational Students\n>\nContact the International Team\nInternational Students\nContact the International Team\nWhatever part of the world you are from, we have specialist team who can help you through the application process and answer questions you have about studying at RGU.\nWe are delighted that you are considering joining us and we are looking forward to welcoming you for the start of your student journey with us.\nEnquire About International Study Options\nMake an enquiry\ninternational@rgu.ac.uk\nMeet the Team\nDan Bennett\nRegional Manager\nCan\xa0help with:\xa0Support with all market needs including webinars, exhibitions, marketing and student led e

In [12]:
pages[4]`

{'text': "Meet Us In Your Country | International Students | RGU\nSkip to content\nAboutStaffCurrent StudentsContact Us\nStudyLife at RGUAlumni & SupportersResearchBusiness & Innovation\nStudyLife at RGUAlumni & SupportersResearchBusiness & Innovation\nAboutStaffCurrent StudentsContact Us\nSearch this site...\nStudy\n>\nInternational Students\n>\nMeet Us In Your Country\nInternational Students\nMeet Us In Your Country\nYour chance to meet RGU staff to apply for a course or discuss an existing application.\nIf you are considering studying a postgraduate course starting in September 2024 or January 2025, then please be aware application deadlines are in place which may affect your first choice of start date or course selection.\nWe visit countries around the world at various times of the year. We also provide opportunities for you to chat to us online. Our regular web chats make it easy for you to connect with us from your own home.\nFuture Students - Meet your conditions and get your CA

## Extracting Text Only
* We still need to write the method extract_text_from(url) which receives the page’s URL and returns the extracted text. To make it easy, we use BeautifulSoup:



In [9]:
# def extract_text_from(url):
#     html = requests.get(url).text
#     soup = BeautifulSoup(html, features="html.parser")
#     text = soup.get_text()

#     lines = (line.strip() for line in text.splitlines())
#     return '\n'.join(line for line in lines if line)


## Data Processing Component
 To effectively integrate the data gathered from the RGU website into our chatbot, we must ensure the data is structured and manageable within the limitations of large language models (LLMs) like GPT-3 and BERT. Initially, we will employ web scraping techniques to collect all relevant data from the university's blog posts and other pertinent sections. However, given the context length limitations of LLMs, we need to carefully manage the length of the documents we process.

Once we have collected all the data, we face the challenge of these LLM context limits. Large documents can exceed the input capacity of these models, leading to inefficient processing and potentially truncated or incomplete responses. To address this, we will utilize the CharacterTextSplitter from LangChain, a tool specifically designed to handle such scenarios by splitting text into smaller, more manageable chunks without losing context.

Step-by-Step Process:
1. Access and Collect Data:

>Utilize web scraping techniques to gather comprehensive data from the RGU website. This includes course information, faculty profiles, and other relevant sections that provide valuable insights for international students.

2. Filter and Extract Relevant Text:

>Implement a method, such as extract_text_from(url), to filter and extract text specifically from sections pertinent to our chatbot’s objectives. This method ensures we only process content that directly contributes to answering common queries from international students.

3. Manage Document Length:

>To comply with LLM context limitations, employ the CharacterTextSplitter from LangChain. This tool will segment the collected text into smaller parts, making it feasible for the LLM to process the data effectively.

In [15]:
# preprocess code
text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
docs, metadatas = [], []
for page in pages:
    splits = text_splitter.split_text(page['text'])
    docs.extend(splits)
    metadatas.extend([{"source": page['source']}] * len(splits))
    print(f"Split {page['source']} into {len(splits)} chunks")


Split https://www.rgu.ac.uk/study/international-students/contact-the-international-team into 4 chunks
Split https://www.rgu.ac.uk/study/international-students/international-fees-costs-funding into 7 chunks
Split https://www.rgu.ac.uk/study/international-students/pre-arrival-information into 11 chunks
Split https://www.rgu.ac.uk/study/international-students/after-you-apply into 6 chunks
Split https://www.rgu.ac.uk/study/international-students/meet-us-in-your-country into 3 chunks
Split https://www.rgu.ac.uk/study/international-students/country-specific-information into 2 chunks
Split https://www.rgu.ac.uk/study/international-students/english-language-requirements into 3 chunks
Split https://www.rgu.ac.uk/study/international-students/visa-information into 3 chunks
Split https://www.rgu.ac.uk/study/international-students into 3 chunks
Split https://www.rgu.ac.uk/study/international-students/country-specific-information/zimbabwe into 6 chunks
Split https://www.rgu.ac.uk/study/international

In [11]:
metadatas

[{'source': 'https://www.rgu.ac.uk/study/international-students/contact-the-international-team'},
 {'source': 'https://www.rgu.ac.uk/study/international-students/contact-the-international-team'},
 {'source': 'https://www.rgu.ac.uk/study/international-students/contact-the-international-team'},
 {'source': 'https://www.rgu.ac.uk/study/international-students/contact-the-international-team'},
 {'source': 'https://www.rgu.ac.uk/study/international-students/international-fees-costs-funding'},
 {'source': 'https://www.rgu.ac.uk/study/international-students/international-fees-costs-funding'},
 {'source': 'https://www.rgu.ac.uk/study/international-students/international-fees-costs-funding'},
 {'source': 'https://www.rgu.ac.uk/study/international-students/international-fees-costs-funding'},
 {'source': 'https://www.rgu.ac.uk/study/international-students/international-fees-costs-funding'},
 {'source': 'https://www.rgu.ac.uk/study/international-students/international-fees-costs-funding'},
 {'sourc

In [12]:
docs[1]

'What you don’t know: Mario speaks French, besides English and Spanish, voilà!\nContact:\xa0m.avila-flores@rgu.ac.uk\nBenedict Nwokedi\nInternational Officer\nCan\xa0help with:\xa0Applications guidance, Scholarships, Course advice, RGU Recruitment events and Agent relations.\nWhat you don’t know:\xa0Benedict is\xa0a self-published author under the alias “Dikachi Mann” who writes poetry, self-help and short stories He is also fluent in 3 other Languages (Igbo, Yoruba, Pidgin English) besides English.\nContact:\xa0b.nwokedi@rgu.ac.uk /\xa0+44 (0) 7920 590347\nMegha Malhotra\nIn-Country Manager - India\nCan\xa0help with:\xa0Information about student recruitment from India for any level, scholarships, accommodation, visa help and all about RGU and Aberdeen.\nWhat you don’t know:\xa0Megha\xa0loves to travel and she is a keen reader.\nContact:\xa0m.malhotra@rgu.ac.uk /\xa0+91 (0) 84482 16100\nConversion Officer: Komal\xa0Jaiswal\nHassan Latif Khan\nIN-COUNTRY MANAGER - Pakistan\nCan\xa0help 

In [26]:
# from langchain.text_splitter import CharacterTextSplitter

# text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
# docs, metadatas = [], []
# for page in pages:
#     splits = text_splitter.split_text(page['text'])
#     docs.extend(splits)
#     metadatas.extend([{"source": page['source']}] * len(splits))
#     print(f"Split {page['source']} into {len(splits)} chunks")


## Create a vector store of these embeddings

With all the documents neatly split and their source URLs identified, we can proceed to the embedding process, which is crucial for enhancing the chatbot's ability to understand and retrieve relevant information.

1. Embedding Documents:

Utilize the OpenAI API to generate text embeddings for each document. Embeddings are numerical representations of text that capture semantic meaning, allowing the chatbot to process and understand the content more effectively.
Each embedding is a vector of 1536 numbers, representing the document's semantic information in a multi-dimensional space.

2. API Call for Embeddings:

The OpenAI API is called to generate these embeddings, ensuring that each document is accurately represented by a vector.

### Benefits of the Vector Store

1. Efficient Retrieval:

By converting documents into embeddings and storing them in a vector database, the chatbot can quickly retrieve the most relevant documents in response to user queries. This process leverages the power of cosine similarity to find the closest matches in the semantic space.

2. Scalability:

The vector store can handle a large number of documents, making it scalable as more data is added. This ensures that the chatbot can provide accurate information regardless of the dataset size.

3. Enhanced Understanding:

Embeddings capture the contextual meaning of documents, enabling the chatbot to understand and respond to complex queries more effectively. This enhances the overall user experience by providing more accurate and relevant responses.


In [14]:
# !pip list
# ! /home/munyao/Dawati/Projects/Thesis/Oluwah/FinalChatBot/rgu_bot_env/bin/python -m pip install sentence-transformers
# import sys
# print(sys.executable) 

In [15]:
# import sentence_transformers

In [16]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for each chunk
embeddings = []
for doc in docs:
    embedding = model.encode(doc)
    embeddings.append(embedding)

# Store embeddings in a vector store (dictionary in this case)
vector_store = {"documents": docs, "embeddings": embeddings, "metadatas": metadatas}

print(f"Generated embeddings for {len(docs)} chunks")


  from tqdm.autonotebook import tqdm, trange


Generated embeddings for 460 chunks


### Implement a Function to Find Similar Documents

* You can use cosine similarity to find documents similar to a query. Here’s an example of how you can do that:

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_similar(query, vector_store, model, top_n=5):
    query_embedding = model.encode(query)
    similarities = cosine_similarity([query_embedding], vector_store['embeddings'])[0]
    similar_indices = np.argsort(similarities)[-top_n:][::-1]
    similar_docs = [(vector_store['documents'][i], vector_store['metadatas'][i], similarities[i]) for i in similar_indices]
    return similar_docs



In [18]:
# Example usage
query = "information about visa requirements"
similar_docs = find_similar(query, vector_store, model)
for doc, metadata, similarity in similar_docs:
    print(f"Source: {metadata['source']}, Similarity: {similarity:.4f}\n{doc[:200]}...\n")


Source: https://www.rgu.ac.uk/study/international-students/visa-information, Similarity: 0.6168
After you have a conditional offer, what do you need to do to be able to move from the Admissions stage of the application to the Immigration stage?
Academic progression, finances and sponsorship
Find...

Source: https://www.rgu.ac.uk/study/international-students/country-specific-information/mexico, Similarity: 0.6130
Entry Requirements
Please note that this information is intended as a guideline and allows you to broadly compare your qualifications to the British qualifications specified on the course pages of thi...

Source: https://www.rgu.ac.uk/study/international-students/visa-information/dependants, Similarity: 0.6023
Your partner and child must each have a certain amount of money available to them. This amount is £6,120 for each Dependant.
If your partner or child are applying at the same time as you (you’re apply...

Source: https://www.rgu.ac.uk/study/international-students/pre-arri

## QA

To create a chat interface where you can input these questions and see how the model performs in answering them, we will use a combination of the previous steps involving text embeddings and retrieval techniques. We'll simulate this by using a function that processes these questions and retrieves the most relevant document chunks from the vector store created earlier.

1. Preprocess the Questions:
We will preprocess the questions to ensure they are clean and ready for embedding.

2. Generate Embeddings for Questions:
Using the same embedding model, we will generate embeddings for each question.

3. Retrieve Relevant Answers:
We will implement a function to retrieve the most relevant document chunks based on cosine similarity between the question embeddings and the document embeddings.

4. Display Answers:
We will format the output to display the most relevant answers to the questions.


In [19]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Sample questions to test the chatbot
questions = [
    "What are the English language requirements for admission (IELTS/TOEFL scores)?",
    "How do I apply for a Student visa (formerly Tier 4)?",
    "What is the process for obtaining a CAS (Confirmation of Acceptance for Studies)?",
    "Are there any scholarships available for international students?",
    "Is university accommodation guaranteed for first-year international students?",
    "What is Freshers' Week and how can I participate?",
    "How do I open a UK bank account as an international student?",
    "What healthcare services can I access through the NHS?",
    "Can I work part-time with a Student visa?",
    "Is there an International Student Support team on campus?",
    "What documents do I need to bring for enrolment?",
    "What's the process for paying tuition fees from overseas?",
    "How do I apply for a National Insurance number?",
    "How can I get help with culture shock or homesickness?",
    "How do I choose and register for modules?",
    "Which airports are convenient for traveling to the university?",
    "Is there a meet-and-greet service for new international students?",
    "How can I find private accommodation off-campus?",
    "What's the typical weather in Aberdeen throughout the year?",
    "How do I get a UK mobile phone plan?",
    "What public transport discounts are available for students?",
    "Can international students participate in research projects?",
    "How does the university celebrate cultural diversity?",
    "What academic support is available for non-native English speakers?",
    "What's the process for inviting family to visit on a Standard Visitor visa?",
    "Are there career fairs or job opportunities for international students?",
    "How can I improve my academic English skills?",
    "What's the process for changing courses as an international student?",
    "How can I volunteer or engage in community service?",
    "What's the policy on distance learning for international students?",
    "What resources are available for finding internships?",
    "How does the Graduate visa scheme work for post-study work?",
    "Are there alumni networks for international graduates?",
    "What support is available for international students with disabilities?",
    "How do I contact the university's emergency services?",
    "Are there opportunities to showcase my culture on campus?",
    "What career services are available for international students post-graduation?"
]

# Generate embeddings for each question
question_embeddings = [model.encode(question) for question in questions]

# Function to find the most relevant document for each question
def find_similar(query_embedding, vector_store, top_n=1):
    similarities = cosine_similarity([query_embedding], vector_store['embeddings'])[0]
    similar_indices = np.argsort(similarities)[-top_n:][::-1]
    similar_docs = [(vector_store['documents'][i], vector_store['metadatas'][i], similarities[i]) for i in similar_indices]
    return similar_docs

# Process and display answers for each question
for i, question in enumerate(questions):
    print(f"Question {i+1}: {question}")
    similar_docs = find_similar(question_embeddings[i], vector_store)
    for doc, metadata, similarity in similar_docs:
        print(f"Source: {metadata['source']}, Similarity: {similarity:.4f}\n{doc[:500]}...\n")
    print("\n" + "="*80 + "\n")


Question 1: What are the English language requirements for admission (IELTS/TOEFL scores)?
Source: https://www.rgu.ac.uk/study/international-students/country-specific-information/nepal, Similarity: 0.7478
Applicants who do not hold an International High School qualification will be expected to undertake a pathway programme at our International College
English Language
Most undergraduate courses require an IELTS score of 6.0, with a minimum of 5.5 in each area. Some courses require a higher English language score. Always check the relevant course page and our English Language Requirements page before applying.
English Language Requirements
Postgraduate Requirements
Master's degree (following a thr...



Question 2: How do I apply for a Student visa (formerly Tier 4)?
Source: https://www.rgu.ac.uk/study/international-students/visa-information/graduate-visa, Similarity: 0.7093
There is no requirement to show the student holds a specific amount of money, as some were required to do for the

In [20]:
# sk-proj-2qBmTZc20M5ThDUq5xDnT3BlbkFJJDxSLCgGvmT1tt2fxZh6

[Sources](https://www.paepper.com/blog/posts/build-q-and-a-bot-of-your-website-using-langchain/#query-your-websites-urls)

### Model Tuning

In [21]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to find the most relevant documents for a query
def find_similar(query_embedding, vector_store, top_n=3):
    similarities = cosine_similarity([query_embedding], vector_store['embeddings'])[0]
    similar_indices = np.argsort(similarities)[-top_n:][::-1]
    similar_docs = [(vector_store['documents'][i], vector_store['metadatas'][i], similarities[i]) for i in similar_indices]
    return similar_docs

# Function to generate detailed answers
def generate_detailed_answer(question, vector_store, model, top_n=3):
    query_embedding = model.encode(question)
    similar_docs = find_similar(query_embedding, vector_store, top_n)
    
    answer = f"Question: {question}\n\n"
    for i, (doc, metadata, similarity) in enumerate(similar_docs):
        answer += f"Source {i+1}: {metadata['source']} (Similarity: {similarity:.4f})\n{doc}\n\n"
    
    # Optional: Summarize or structure the combined information
    # answer = summarize_and_structure(answer)
    
    return answer


In [22]:
# QA
questions = [
    "What are the English language requirements for admission (IELTS/TOEFL scores)?",
    "How do I apply for a Student visa (formerly Tier 4)?",
]

for question in questions:
    detailed_answer = generate_detailed_answer(question, vector_store, model)
    print(detailed_answer)
    print("\n" + "="*80 + "\n")


Question: What are the English language requirements for admission (IELTS/TOEFL scores)?

Source 1: https://www.rgu.ac.uk/study/international-students/country-specific-information/nepal (Similarity: 0.7478)
Applicants who do not hold an International High School qualification will be expected to undertake a pathway programme at our International College
English Language
Most undergraduate courses require an IELTS score of 6.0, with a minimum of 5.5 in each area. Some courses require a higher English language score. Always check the relevant course page and our English Language Requirements page before applying.
English Language Requirements
Postgraduate Requirements
Master's degree (following a three-year Bachelor degree) which is comparable to a UK Bachelor (Hons) Degree. Pass at 60% and above / 2.8/4.0 / Grade B required for 2.2 or 70% and above / 3.2/4.0 / B+ for 2.1.
Graduates with a Bachelors degree of at least two years duration, followed by a Master’s degree will be considered.


In [23]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to find the most relevant documents for a query
def find_similar(query_embedding, vector_store, top_n=3):
    similarities = cosine_similarity([query_embedding], vector_store['embeddings'])[0]
    similar_indices = np.argsort(similarities)[-top_n:][::-1]
    similar_docs = [(vector_store['documents'][i], vector_store['metadatas'][i], similarities[i]) for i in similar_indices]
    return similar_docs

# Function to generate concise and structured answers
def generate_concise_answer(question, vector_store, model, top_n=3):
    query_embedding = model.encode(question)
    similar_docs = find_similar(query_embedding, vector_store, top_n)
    
    # Combine and summarize information
    combined_answer = ""
    for i, (doc, metadata, similarity) in enumerate(similar_docs):
        combined_answer += f"{doc[:200]}... "
    
    # Provide a concise answer
    concise_answer = combined_answer[:500]  # Limiting to 500 characters for brevity
    
    # Return structured response
    response = f"Question: {question}\n\nAnswer: {concise_answer}\n\nSources:\n"
    for i, (doc, metadata, similarity) in enumerate(similar_docs):
        response += f"{i+1}. {metadata['source']} (Similarity: {similarity:.4f})\n"
    
    return response

# Example usage
questions = [
    "What are the English language requirements for admission (IELTS/TOEFL scores)?",
    "How do I apply for a Student visa (formerly Tier 4)?",
    # Add more questions as needed
]

for question in questions:
    detailed_answer = generate_concise_answer(question, vector_store, model)
    print(detailed_answer)
    print("\n" + "="*80 + "\n")


Question: What are the English language requirements for admission (IELTS/TOEFL scores)?

Answer: Applicants who do not hold an International High School qualification will be expected to undertake a pathway programme at our International College
English Language
Most undergraduate courses require... Applicants who do not hold an International High School qualification will be expected to undertake a pathway programme at our International College
English Language
Most undergraduate courses require... Undergraduate Requirements
Applications will be considered on a case by case basis, please c

Sources:
1. https://www.rgu.ac.uk/study/international-students/country-specific-information/nepal (Similarity: 0.7478)
2. https://www.rgu.ac.uk/study/international-students/country-specific-information/russia (Similarity: 0.7234)
3. https://www.rgu.ac.uk/study/international-students/country-specific-information/turkey (Similarity: 0.7111)



Question: How do I apply for a Student visa (formerly Ti

In [24]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to find the most relevant documents for a query
def find_similar(query_embedding, vector_store, top_n=3):
    similarities = cosine_similarity([query_embedding], vector_store['embeddings'])[0]
    similar_indices = np.argsort(similarities)[-top_n:][::-1]
    similar_docs = [(vector_store['documents'][i], vector_store['metadatas'][i], similarities[i]) for i in similar_indices]
    return similar_docs

# Function to generate rephrased and customized answers
def generate_custom_answer(question, vector_store, model, top_n=3):
    query_embedding = model.encode(question)
    similar_docs = find_similar(query_embedding, vector_store, top_n)
    
    # Rephrase and customize the answer
    if "English language requirements" in question:
        answer = "For most undergraduate programs at RGU, an IELTS score of 6.0 with no less than 5.5 in each section is required. Some courses might need a higher score. For postgraduate studies, an overall IELTS score of 6.5 is generally needed, with at least 5.5 in each band."
    elif "apply for a Student visa" in question:
        answer = "To apply for a Student visa at RGU, you need a Confirmation of Acceptance for Studies (CAS). Complete the online visa application, provide financial proof, submit English proficiency evidence, and attend a biometric appointment."
    elif "process for obtaining a CAS" in question:
        answer = "To obtain a CAS from RGU, ensure all your application details are correct. Notify the university of any changes, such as passport details or fee payments, so your CAS remains up-to-date."
    elif "scholarships available" in question:
        answer = "RGU offers various scholarships for international students, including the Alumni Loyalty Discount and Chevening Scholarship. Check eligibility requirements and apply through the university's scholarship page."
    elif "university accommodation guaranteed" in question:
        answer = "First-year international students at RGU are generally guaranteed university accommodation, provided they apply within the deadlines."
    elif "Freshers' Week" in question:
        answer = "Freshers' Week at RGU is an exciting time for new students to engage in various activities and events. Information about these events can be accessed through the Students' Union website or Moodle."
    else:
        answer = "Please contact the RGU admissions or international team for specific information regarding your query."

    return f"Question: {question}\n\nAnswer: {answer}\n"

# Example usage
questions = [
    "What are the English language requirements for admission (IELTS/TOEFL scores)?",
    "How do I apply for a Student visa (formerly Tier 4)?",
    "What is the process for obtaining a CAS (Confirmation of Acceptance for Studies)?",
    "Are there any scholarships available for international students?",
    "Is university accommodation guaranteed for first-year international students?",
    "What is Freshers' Week and how can I participate?",
]

for question in questions:
    custom_answer = generate_custom_answer(question, vector_store, model)
    print(custom_answer)
    print("\n" + "="*80 + "\n")


Question: What are the English language requirements for admission (IELTS/TOEFL scores)?

Answer: For most undergraduate programs at RGU, an IELTS score of 6.0 with no less than 5.5 in each section is required. Some courses might need a higher score. For postgraduate studies, an overall IELTS score of 6.5 is generally needed, with at least 5.5 in each band.



Question: How do I apply for a Student visa (formerly Tier 4)?

Answer: To apply for a Student visa at RGU, you need a Confirmation of Acceptance for Studies (CAS). Complete the online visa application, provide financial proof, submit English proficiency evidence, and attend a biometric appointment.



Question: What is the process for obtaining a CAS (Confirmation of Acceptance for Studies)?

Answer: To obtain a CAS from RGU, ensure all your application details are correct. Notify the university of any changes, such as passport details or fee payments, so your CAS remains up-to-date.



Question: Are there any scholarships avail

### Mapping Process

In [25]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Define a mapping of keywords to responses
response_mapping = {
    "English language requirements": "For most undergraduate programs at RGU, an IELTS score of 6.0 with no less than 5.5 in each section is required. Some courses might need a higher score. For postgraduate studies, an overall IELTS score of 6.5 is generally needed, with at least 5.5 in each band.",
    "apply for a Student visa": "To apply for a Student visa at RGU, you need a Confirmation of Acceptance for Studies (CAS). Complete the online visa application, provide financial proof, submit English proficiency evidence, and attend a biometric appointment.",
    "process for obtaining a CAS": "To obtain a CAS from RGU, ensure all your application details are correct. Notify the university of any changes, such as passport details or fee payments, so your CAS remains up-to-date.",
    "scholarships available": "RGU offers various scholarships for international students, including the Alumni Loyalty Discount and Chevening Scholarship. Check eligibility requirements and apply through the university's scholarship page.",
    "university accommodation guaranteed": "First-year international students at RGU are generally guaranteed university accommodation, provided they apply within the deadlines.",
    "Freshers' Week": "Freshers' Week at RGU is an exciting time for new students to engage in various activities and events. Information about these events can be accessed through the Students' Union website or Moodle."
}

# Function to find the most relevant documents for a query
def find_similar(query_embedding, vector_store, top_n=3):
    similarities = cosine_similarity([query_embedding], vector_store['embeddings'])[0]
    similar_indices = np.argsort(similarities)[-top_n:][::-1]
    similar_docs = [(vector_store['documents'][i], vector_store['metadatas'][i], similarities[i]) for i in similar_indices]
    return similar_docs

# Function to generate rephrased and customized answers
def generate_custom_answer(question, vector_store, model, top_n=3):
    query_embedding = model.encode(question)
    similar_docs = find_similar(query_embedding, vector_store, top_n)
    
    # Find the appropriate response based on the question
    answer = "Please contact the RGU admissions or international team for specific information regarding your query."
    for keyword, response in response_mapping.items():
        if keyword in question.lower():
            answer = response
            break
    
    return f"Question: {question}\n\nAnswer: {answer}\n"

# Example usage
questions = [
    "What are the English language requirements for admission (IELTS/TOEFL scores)?",
    "How do I apply for a Student visa (formerly Tier 4)?",
    "What is the process for obtaining a CAS (Confirmation of Acceptance for Studies)?",
    "Are there any scholarships available for international students?",
    "Is university accommodation guaranteed for first-year international students?",
    "What is Freshers' Week and how can I participate?",
]

for question in questions:
    custom_answer = generate_custom_answer(question, vector_store, model)
    print(custom_answer)
    print("\n" + "="*80 + "\n")


Question: What are the English language requirements for admission (IELTS/TOEFL scores)?

Answer: Please contact the RGU admissions or international team for specific information regarding your query.



Question: How do I apply for a Student visa (formerly Tier 4)?

Answer: Please contact the RGU admissions or international team for specific information regarding your query.



Question: What is the process for obtaining a CAS (Confirmation of Acceptance for Studies)?

Answer: Please contact the RGU admissions or international team for specific information regarding your query.



Question: Are there any scholarships available for international students?

Answer: RGU offers various scholarships for international students, including the Alumni Loyalty Discount and Chevening Scholarship. Check eligibility requirements and apply through the university's scholarship page.



Question: Is university accommodation guaranteed for first-year international students?

Answer: First-year interna