# step 1 load the dataset

In [1]:
# Example: Load text files from a folder
import os

def load_documents(folder_path):
    documents = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                documents.append(file.read())
    return documents

# Replace 'sample_docs/' with your actual folder
documents = load_documents('sschatbot_docs/')
print(f"Loaded {len(documents)} documents.")


Loaded 5 documents.


# Step 2: Tokenizer

In this section we created a basic tokenizer to process the text documents. This tokenizer split each document into tokens (words) and removes punctuation. It also converts all tokens to lowercase and with a regular expression to remove any non-alphanumeric characters.

At the end of this block, we will have a list of tokens for each document.

In [2]:
import re

def tokenize(text):
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens

# Test on one document
tokens = tokenize(documents[0])
print(tokens[:20])  # Preview first 20 tokens

['academic', 'policies', 'and', 'procedures', 'faq', 'q', 'what', 'are', 'the', 'important', 'academic', 'policies', 'i', 'need', 'to', 'know', 'a', 'you', 'should', 'familiarize']


# Step 3: Normalization Pipeline (Stemming, Stop Word Removal, etc.)
Using nltk library, we will implement a normalization pipeline that includes stemming and stop word removal. This will help us reduce the vocabulary size and focus on the most relevant terms in our documents.

For example, the word "anyone" will be stemmed to "anyon", and "glimpse" will be stemmed to "glimps".

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords', quiet=True)  # Suppress download warnings
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def normalize_tokens(tokens):
    return [stemmer.stem(t) for t in tokens if t not in stop_words]

# Example: normalize one document
norm_tokens = normalize_tokens(tokens)
print(norm_tokens[:20])

['academ', 'polici', 'procedur', 'faq', 'q', 'import', 'academ', 'polici', 'need', 'know', 'familiar', 'academ', 'calendar', 'tabl', 'term', 'date', 'add', 'drop', 'deadlin', 'withdraw']


Step 4–6: Word2Vec Training & Queries

In [4]:
from gensim.models import Word2Vec

# Step 4: Prepare corpus for Word2Vec
corpus = []

for doc in documents:
    # Split by sentence using `.split('.')` to avoid nltk sentence tokenizer errors
    sentences = doc.split('.')
    for sentence in sentences:
        tokens = tokenize(sentence)  # from your earlier function
        norm = normalize_tokens(tokens)  # from your earlier function
        if norm:  # only add non-empty sentences
            corpus.append(norm)

print(f"Prepared {len(corpus)} tokenized & normalized sentences.")

# Step 5: Train Word2Vec model
model = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=5,
    min_count=1,  # use 1 to ensure all words are included
    sg=1,         # skip-gram model
    seed=42
)

print("Word2Vec model trained with", len(model.wv.index_to_key), "words in vocabulary.")

# Step 6: Run Example Queries
def safe_query(description, func):
    print(f"\n {description}")
    try:
        print(func())
    except KeyError as e:
        print(" Word not found in vocabulary:", e)

safe_query("Similarity between 'student' and 'support':",
           lambda: model.wv.similarity('student', 'support'))

safe_query("Most similar to 'exam':",
           lambda: model.wv.most_similar('exam'))

safe_query("Analogy: student + success - stress ≈ ?",
           lambda: model.wv.most_similar(positive=['student', 'success'], negative=['stress']))

safe_query("Odd one out: ['exam', 'assignment', 'deadline', 'cafeteria']",
           lambda: model.wv.doesnt_match(['exam', 'assignment', 'deadline', 'cafeteria']))


Prepared 269 tokenized & normalized sentences.
Word2Vec model trained with 772 words in vocabulary.

 Similarity between 'student' and 'support':
0.8369573

 Most similar to 'exam':
[('student', 0.815856397151947), ('q', 0.8031508326530457), ('financi', 0.7925630807876587), ('support', 0.778157114982605), ('tutor', 0.7773033380508423), ('result', 0.7696248292922974), ('learn', 0.7675812840461731), ('time', 0.7660902738571167), ('assist', 0.7647890448570251), ('access', 0.7647355198860168)]

 Analogy: student + success - stress ≈ ?
[('support', 0.6463963985443115), ('cours', 0.6363980770111084), ('learn', 0.6215872168540955), ('ye', 0.619388997554779), ('studi', 0.6171773076057434), ('assign', 0.6165121793746948), ('log', 0.6045294404029846), ('q', 0.6009884476661682), ('assist', 0.5998092293739319), ('academ', 0.5955801010131836)]

 Odd one out: ['exam', 'assignment', 'deadline', 'cafeteria']
cafeteria


Optional: Explore Vocabulary

In [5]:
# List a few words in the vocabulary
print("\n Sample vocab words:", model.wv.index_to_key[:20])



 Sample vocab words: ['q', 'student', 'academ', 'may', 'card', 'servic', 'cours', 'program', 'one', 'support', 'polici', 'grade', 'exam', 'access', 'faculti', 'lab', 'portal', 'learn', 'document', 'time']


# Build Response Vectors from Your Word2Vec Model

In [7]:
import numpy as np

# Step 4: Prepare corpus for Word2Vec (you already did this)
# corpus = [...]  # your preprocessed, tokenized, normalized sentences list

# Step 5: Train Word2Vec model (you already did this)
# model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, sg=1, seed=42)

# Step 6: Prepare your student queries or knowledge base responses
student_queries = [
    "What is the exam schedule for next semester?",
    "How can I apply for student support services?",
    "Where is the library located on campus?",
    "What are the deadlines for assignment submission?",
    "How to contact the academic advisor?"
]

# Preprocess queries same as training data
def preprocess_text(text):
    tokens = tokenize(text)
    norm_tokens = normalize_tokens(tokens)
    return norm_tokens

processed_queries = [preprocess_text(query) for query in student_queries]

# Get sentence embedding by averaging word vectors
def get_sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Create embeddings for all queries
query_vectors = np.array([get_sentence_vector(tokens, model) for tokens in processed_queries])

print(f"✅ Created embeddings for {len(query_vectors)} student queries.")


✅ Created embeddings for 5 student queries.


### 🧭 Matching User Queries via Cosine Similarity

When a user asks a question like "What time is the next bus to Toronto?", we embed the question and compare it against all stored responses.

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# User query to match against student queries
user_query = "How do I submit my assignment on time?"

# Preprocess the user query same as training data
def preprocess_text(text):
    tokens = tokenize(text)
    norm_tokens = normalize_tokens(tokens)
    return norm_tokens

user_tokens = preprocess_text(user_query)

# Get embedding for user query
def get_sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

user_vec = get_sentence_vector(user_tokens, model)

# Compute cosine similarity with precomputed query_vectors
similarities = cosine_similarity([user_vec], query_vectors)[0]

# Rank student queries by similarity
ranked_indices = similarities.argsort()[::-1]

print("User Query:", user_query)
print("\nBest matches:")

for i in ranked_indices[:3]:  # top 3 matches
    print(f"- ({similarities[i]:.2f}) {student_queries[i]}")


User Query: How do I submit my assignment on time?

Best matches:
- (0.90) How can I apply for student support services?
- (0.84) How to contact the academic advisor?
- (0.84) What are the deadlines for assignment submission?


### 🧭 Extension: Building a Semantic Chatbot for Student advisor chatbot

In [10]:
def get_best_response(user_input, response_texts, response_vecs, model):
    def sentence_vector(sentence):
        # Use your tokenizer and normalizer instead of simple lower & split
        tokens = normalize_tokens(tokenize(sentence))
        vecs = [model.wv[word] for word in tokens if word in model.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

    user_vec = sentence_vector(user_input)
    scores = cosine_similarity([user_vec], response_vecs)[0]
    top_index = scores.argmax()
    return response_texts[top_index], scores[top_index]


### 🧭 Running the Chatbot in a Loop

We now simulate a basic chatbot interface in a terminal-style loop. The user types a query, the assistant finds the best-matching predefined response using cosine similarity.

This demonstrates vector space proximity in action in a conversational system.

In [11]:
print("🎓 Student Support Bot (Jupyter Edition)")
print("Ask questions about student support. Type 'exit' to quit.\n")

# Precompute response vectors for your documents (or pre-defined answers)
response_texts = documents  # or use a list of prepared answer strings if you have one

def sentence_vector(sentence):
    tokens = normalize_tokens(tokenize(sentence))
    vecs = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

response_vectors = np.array([sentence_vector(resp) for resp in response_texts])

max_turns = 10
turn = 0

while turn < max_turns:
    user_input = input("You: ")
    if user_input.strip().lower() in ["exit", "quit"]:
        print("Bot: Good luck with your studies! 👋")
        break

    reply, score = get_best_response(user_input, response_texts, response_vectors, model)
    print(f"Bot: {reply} (similarity: {score:.2f})\n")
    
    turn += 1


🎓 Student Support Bot (Jupyter Edition)
Ask questions about student support. Type 'exit' to quit.

Bot: # ONE Card FAQ

Q: What is a ONE Card?
A: The ONE Card is Conestoga’s official campus ID for students, staff, and faculty. It provides access to buildings, labs, library services, photocopying, and sometimes meal purchases depending on your campus.

Q: Who is eligible for one?
A: All registered students, including full-time, part-time, Continuing Education, and some affiliate learners, receive a ONE Card once admitted and have registered or paid a deposit.

Q: How do I get my card photo uploaded?
A: Log into the ONE Card Portal with your student credentials, follow prompts to upload a clear, front-facing photo, and submit. It should meet specified guidelines (white background, 2″×2″ passport style).

Q: When will my ONE Card be ready?
A: After uploading and submission, allow 1–2 business days for processing. You'll receive an email notification. Physical pick‑up is available at campu

In [13]:
print("🎓 Student Support Bot (Jupyter Edition)")
print("Ask questions about student support. Type 'exit' to quit.\n")

# --- STEP 1: Flatten grouped FAQ into individual entries (if needed) ---
# You must split documents into individual answers if they're grouped under a heading
# For example, if each document has multiple Q&A lines, split them by '\n\n' or similar
flat_documents = []
for doc in documents:
    parts = doc.strip().split("\n\n")
    for part in parts:
        if part.strip():
            flat_documents.append(part.strip())

# --- STEP 2: Sentence embedding ---
def sentence_vector(sentence):
    tokens = normalize_tokens(tokenize(sentence))
    vecs = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

response_vectors = np.array([sentence_vector(resp) for resp in flat_documents])

# --- STEP 3: Best match based on cosine similarity ---
def get_best_response(user_input, responses, vectors):
    user_vec = sentence_vector(user_input)
    similarities = np.dot(vectors, user_vec) / (
        np.linalg.norm(vectors, axis=1) * np.linalg.norm(user_vec) + 1e-10
    )
    best_idx = int(np.argmax(similarities))
    return responses[best_idx], float(similarities[best_idx])

# --- STEP 4: Chat loop ---
max_turns = 10
turn = 0

while turn < max_turns:
    user_input = input("You: ")
    if user_input.strip().lower() in ["exit", "quit"]:
        print("Bot: Good luck with your studies! 👋")
        break

    reply, score = get_best_response(user_input, flat_documents, response_vectors)
    print(f"Bot: {reply} (similarity: {score:.2f})\n")

    turn += 1


🎓 Student Support Bot (Jupyter Edition)
Ask questions about student support. Type 'exit' to quit.

Bot: Q: Is there a replacement fee?
A: Fees vary but typically range from $20–$35. Temporary cards may be issued while a replacement is processed. (similarity: 0.58)

Bot: Good luck with your studies! 👋


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def preprocess_text(text):
    tokens = tokenize(text)
    return normalize_tokens(tokens)

def get_sentence_vector(tokens, model=model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

print("🎓 Student Support Chatbot (type 'done' to exit)\n")

while True:
    user_query = input("You: ").strip()
    
    if user_query.lower() in ["done", "exit", "quit"]:
        print("Bot: Goodbye! Contact student support if needed. 👋")
        break

    user_tokens = preprocess_text(user_query)
    user_vec = get_sentence_vector(user_tokens)

    similarities = cosine_similarity([user_vec], response_vectors)[0]
    best_idx = similarities.argmax()
    best_score = similarities[best_idx]

    if best_score > 0.4:  # You can adjust threshold
        print(f"Bot: {responses[best_idx]} (confidence: {best_score:.2f})\n")
    else:
        print("Bot: 🤖 Sorry, I’m not sure. Please contact your academic advisor.\n")


🎓 Student Success Advisor Bot
Ask a question (type 'exit' to quit)



NameError: name 'sbert_model' is not defined