1. Read csv :

In [None]:
import pandas as pd

df = pd.read_csv('Mental_Health_FAQ.csv',sep=';')
df.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,It is estimated that mental illness affects 1 ...
2,6361820,What causes mental illness?,Mental illness does not discriminate; it can ...
3,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
4,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."


In [66]:
df.dtypes

Unnamed: 0,0
Question_ID,int64
Questions,object
Answers,object


2. Convert to lower case :

In [67]:
df['Questions'] = df['Questions'].str.lower()
df['Answers'] = df['Answers'].str.lower()
df.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,what does it mean to have a mental illness?,mental illnesses are health conditions that di...
1,2110618,who does mental illness affect?,it is estimated that mental illness affects 1 ...
2,6361820,what causes mental illness?,mental illness does not discriminate; it can ...
3,9434130,what are some of the warning signs of mental i...,symptoms of mental health disorders vary depen...
4,7657263,can people with mental illness recover?,"when healing from mental illness, early identi..."


3. Remove special caracters :

In [68]:
import re
def remove_special_characters(text):
  text=re.sub('[^a-zA-Z0-9]',' ',text)
  text=re.sub('\s+',' ',text)
  return text

df['Questions'] = df['Questions'].apply(lambda x : remove_special_characters(x))
df['Answers'] = df['Answers'].apply(lambda x : remove_special_characters(x))
df.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,what does it mean to have a mental illness,mental illnesses are health conditions that di...
1,2110618,who does mental illness affect,it is estimated that mental illness affects 1 ...
2,6361820,what causes mental illness,mental illness does not discriminate it can a...
3,9434130,what are some of the warning signs of mental i...,symptoms of mental health disorders vary depen...
4,7657263,can people with mental illness recover,when healing from mental illness early identif...


4. Remove punctuation :

In [69]:
import string
def remove_punctuations(text):
  punctuations = string.punctuation
  return text.translate(str.maketrans('','',punctuations))

df['Questions'] = df['Questions'].apply(remove_punctuations)
df['Answers'] = df['Answers'].apply(remove_punctuations)
df.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,what does it mean to have a mental illness,mental illnesses are health conditions that di...
1,2110618,who does mental illness affect,it is estimated that mental illness affects 1 ...
2,6361820,what causes mental illness,mental illness does not discriminate it can a...
3,9434130,what are some of the warning signs of mental i...,symptoms of mental health disorders vary depen...
4,7657263,can people with mental illness recover,when healing from mental illness early identif...


5. Tokenization :

In [70]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

sentences=df[['Questions','Answers']].values.tolist()
print(sentences)
questions = df['Questions'].tolist()
answers = df['Answers'].tolist()

tokenize = Tokenizer(num_words=100 , oov_token="<oov>")
tokenize.fit_on_texts(sentences)
word_index = tokenize.word_index
print(word_index)



6. remove stopwords :

In [71]:
!pip install nltk



In [72]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [73]:
import nltk
from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')
#", ".join(stopwords.words('english'))

def remove_stopwords(text):
  return " ".join([word for word in text.split() if word not in STOPWORDS])

df['Questions'] = df['Questions'].apply(remove_stopwords)
df['Answers'] = df['Answers'].apply(remove_stopwords)
df.head()


Unnamed: 0,Question_ID,Questions,Answers
0,1590140,mean mental illness,mental illnesses health conditions disrupt per...
1,2110618,mental illness affect,estimated mental illness affects 1 5 adults am...
2,6361820,causes mental illness,mental illness discriminate affect anyone rega...
3,9434130,warning signs mental illness,symptoms mental health disorders vary dependin...
4,7657263,people mental illness recover,healing mental illness early identification tr...


6. Lemmatization :

In [74]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text) :
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

df['Questions'] = df['Questions'].apply(lambda text: lemmatize_words(text))
df['Answers'] = df['Answers'].apply(lambda text: lemmatize_words(text))
df.head()


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,Question_ID,Questions,Answers
0,1590140,mean mental illness,mental illnesses health condition disrupt pers...
1,2110618,mental illness affect,estimate mental illness affect 1 5 adult ameri...
2,6361820,cause mental illness,mental illness discriminate affect anyone rega...
3,9434130,warn sign mental illness,symptom mental health disorder vary depend typ...
4,7657263,people mental illness recover,heal mental illness early identification treat...


7. Tf-idf

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
questions = df['Questions'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(questions)

8. Cosine Similarity :

In [79]:
def get_most_similar_question(user_question):
    user_question = user_question.lower()
    user_question = remove_special_characters(user_question)
    user_question = remove_punctuations(user_question)
    user_question = remove_stopwords(user_question)
    user_question = lemmatize_words(user_question)

    user_vec = vectorizer.transform([user_question])
    similarity_scores = cosine_similarity(user_vec, X)
    most_similar_idx = similarity_scores.argmax()
    max_score = similarity_scores.max()

    return df['Answers'].iloc[most_similar_idx], max_score

9. interface :

In [88]:
while True :
    question = input("Vous : ")
    if question.lower() in ["quit", "exit", "bye"]:
        print("Bot : Merci et à bientôt !")
        break
    response, score = get_most_similar_question(question)
    if score < 0.3:
        print("Sorry, I didn't understand your question. Could you please rephrase it?")
    else:
        print("Bot :", response)


Vous : recover
Bot : heal mental illness early identification treatment vital importance base nature illness range effective treatment available type treatment essential person affect proactive fully engage recovery process many people mental illness diagnose treat respond well although might experience return symptom even case careful monitor management disorder still quite possible live fulfil productive life
Vous : causes
Bot : mental illness discriminate affect anyone regardless gender age income social status ethnicity religion sexual orientation background although mental illness affect anyone certain condition may common different population instance eat disorder tend occur often females disorder attention deficit hyperactivity disorder prevalent child additionally age susceptible young old especially vulnerable mental illness usually strike individual prime live 75 percent mental health condition develop age 24 make identification treatment mental disorder particularly difficul

KeyboardInterrupt: Interrupted by user