In [1]:
# 🧪 1. Load and Explore Data
import pandas as pd

train_df = pd.read_csv("F:\\ML Projects\\health_chatbot\\data\\train_data_chatbot.csv")
val_df = pd.read_csv("F:\\ML Projects\\health_chatbot\\data\\validation_data_chatbot.csv")

print("Train Dataset Shape:", train_df.shape)
print("Validation Dataset Shape:", val_df.shape)
print("\nSample from Training Data:")
print(train_df.head())

Train Dataset Shape: (47603, 4)
Validation Dataset Shape: (11901, 4)

Sample from Training Data:
                                      short_question  \
0  can an antibiotic through an iv give you a ras...   
1  can you test positive from having the hep b va...   
2  what are the dietary restrictions for celiac d...   
3  can i transmit genital warts seventeen years a...   
4                          is all vitamin d the same   

                                        short_answer                   tags  \
0  yes it can even after you have finished the pr...  ['rash' 'antibiotic']   
1  test positive for what if you had a hep b vacc...        ['hepatitis b']   
2  omitting gluten from the diet is the key to co...     ['celiac disease']   
3  famotidine pepcid products is in a drug class ...               ['wart']   
4  hi this means you do not have hepatitis b and ...          ['vitamin d']   

   label  
0    1.0  
1    1.0  
2    1.0  
3   -1.0  
4   -1.0  


In [2]:
# 🧼 2. Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(cleaned_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hafiz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hafiz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hafiz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# 🔄 3. Apply Preprocessing
train_df['clean_question'] = train_df['short_question'].apply(preprocess_text)
train_df['clean_answer'] = train_df['short_answer'].apply(preprocess_text)

val_df['clean_question'] = val_df['short_question'].apply(preprocess_text)
val_df['clean_answer'] = val_df['short_answer'].apply(preprocess_text)

In [4]:
# 🧠 4. TF-IDF Similarity-based Answer Retrieval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import accuracy_score

vectorizer = TfidfVectorizer(max_features=5000)
tfidf_train = vectorizer.fit_transform(train_df['clean_question'])
tfidf_val = vectorizer.transform(val_df['clean_question'])

predicted_answers = []
for i in range(tfidf_val.shape[0]):
    sim_scores = cosine_similarity(tfidf_val[i], tfidf_train)[0]
    best_match_idx = np.argmax(sim_scores)
    predicted_answers.append(train_df.iloc[best_match_idx]['clean_answer'])

print('Accuracy:', accuracy_score(val_df['clean_answer'], predicted_answers))

Accuracy: 0.0008402655239055541


In [5]:
# 5. Encode Questions with Sentence Embeddings
import os
os.environ["USE_TF"] = "0"


from sentence_transformers import SentenceTransformer, util

# Load pre-trained model (can be replaced with more specific ones like 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb')
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for all training questions
question_embeddings = model.encode(train_df['clean_question'].tolist(), convert_to_tensor=True)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [6]:
# 6. Define the search function

def get_answer(user_question, question_embeddings, questions, answers, model, top_k=1):
    # Encode the user query
    query_embedding = model.encode(user_question, convert_to_tensor=True)

    # Semantic search
    hits = util.semantic_search(query_embedding, question_embeddings, top_k=top_k)[0]

    # Return best-matching answer
    top_hit = hits[0]
    return answers[top_hit['corpus_id']]


In [7]:
# 7. Test Chatbot Locally

user_question = "What are the symptoms of diabetes?"
response = get_answer(user_question, question_embeddings, train_df['clean_question'], train_df['clean_answer'], model)
print("Bot:", response)


Bot: may see one two drop normal see two drop may received full dose inject another dose talk healthcare provider prevent dripping leaking sure firmly push hold knob thumb 10 second removing needle skin
