<a href="https://colab.research.google.com/github/phaneendra2429/Mental_Health_Chatbot/blob/Liz/Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U sentence-transformers

In [None]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import string

#Counting KeyWords
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# Load the pretrained sentence-embedding model used to encode the questions;
# `util` supplies the cosine-similarity helper used on the embeddings.
from sentence_transformers import SentenceTransformer, util
model= SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [None]:
# Mount Google Drive at /content/drive so the dataset path under
# /content/drive/MyDrive can be read by the loading cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the spaCy pipeline named "en_core_web_sm".
# NOTE(review): `nlp` is not referenced by any of the visible cells — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [None]:
# All three sheets live in the same workbook, so read them in a single call:
# passing a list to `sheet_name` makes read_excel return a dict keyed by sheet
# name, instead of opening and parsing the file once per sheet.
DATASET_PATH = "/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx"
SHEET_NAMES = ["98_row_Mental_Health_FAQs", "Counsellor_Chats", "99_rows_Human_&_Therapist"]

sheets = pd.read_excel(DATASET_PATH, sheet_name=SHEET_NAMES)
df_mental_health = sheets["98_row_Mental_Health_FAQs"]
df_counsellor_chats = sheets["Counsellor_Chats"]
df_human_therapist = sheets["99_rows_Human_&_Therapist"]

In [None]:
# List the column labels of each loaded sheet, in loading order
for frame in (df_mental_health, df_counsellor_chats, df_human_therapist):
  print(frame.columns)

In [None]:
# Preview the first rows of the counsellor chats sheet
df_counsellor_chats.head()

Unnamed: 0.1,Unnamed: 0,questionID,questionTitle,questionText,topic,answerText,views,therapistInfo
0,0,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,"If everyone thinks you're worthless, then mayb...",2899,"Sherry Katz, LCSWCouples and Family Therapist,..."
1,1,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,"Hello, and thank you for your question and see...",3514,"Robin Landwehr, DBH, LPCC, NCCMental Health in..."
2,2,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,First thing I'd suggest is getting the sleep y...,5,Lee KingI use an integrative approach to treat...
3,3,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,Therapy is essential for those that are feelin...,31,"Shauntai Davis-YearginPersonalized, private on..."
4,4,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,I first want to let you know that you are not ...,620,Jordan WhiteLicensed Social Worker at Oak Root...


In [None]:
# Number of rows per topic in the counsellor chats
df_counsellor_chats["topic"].value_counts()

In [None]:
# Keep only the rows whose topic is depression
is_depression = df_counsellor_chats["topic"] == "depression"
df_counsellor_chats = df_counsellor_chats[is_depression]

In [None]:
# Collapse all answers that share a questionTitle into one space-separated string
combined_answers = (
    df_counsellor_chats
    .groupby('questionTitle')['answerText']
    .agg(' '.join)
    .reset_index()
)

In [None]:
# Give the aggregated answers column a descriptive name
combined_answers = combined_answers.rename(columns={'answerText': 'combined_answers'})

# Show the first rows of the result
combined_answers.head()

In [None]:
# Pull the question/answer column pair out of each source
df1_selected = df_human_therapist.loc[:, ["Human", "Assistant"]]
df2_selected = df_mental_health.loc[:, ["Questions_cleaned", "Answers_cleaned"]]
df3_selected = combined_answers.loc[:, ["questionTitle", "combined_answers"]]

In [None]:
# Give the FAQ and counsellor frames the same column labels as the therapist frame
for frame in (df2_selected, df3_selected):
  frame.columns = ['Human', 'Assistant']

In [None]:
# Stack the three question/answer frames into one, renumbering the rows from 0
frames_to_stack = [df1_selected, df2_selected, df3_selected]
combined_df = pd.concat(frames_to_stack, ignore_index=True)


In [None]:
# (rows, columns) of the stacked question/answer frame
combined_df.shape #327 rows of data

(327, 2)

In [None]:
# Count occurrences of each Python type in the "Human" column.
# The label and variable name now match the column that is actually inspected
# (they previously said 'Assistant').
human_dtype_counts = combined_df['Human'].apply(type).value_counts()

# Print the counts
print("Counts of data types in 'Human' column:")
print(human_dtype_counts)

In [None]:
# Keep only the rows whose "Human" value is not a float.
# .copy() makes the result an independent frame, so the column assignments done
# later during preprocessing do not raise SettingWithCopyWarning.
combined_df = combined_df[~combined_df['Human'].apply(lambda x: isinstance(x, float))].copy()

In [None]:
#Text preprocess with string. Remove punctuations and make lowercase
def column_preprocess(df, columnname):
  """Strip punctuation from, and lowercase, the text in one column of ``df``.

  Args:
    df: DataFrame holding the column to clean. It is modified in place.
    columnname: Label of the column to clean.

  Returns:
    The same ``df`` object that was passed in, with ``columnname`` cleaned.
    Values that are not ``str`` (e.g. NaN) are left unchanged instead of
    raising ``AttributeError``.
  """
  # Build the deletion table once instead of once per row
  punctuation_table = str.maketrans("", "", string.punctuation)

  def word_preprocess(text):
    # Non-string cells have no .translate(); pass them through untouched
    if not isinstance(text, str):
      return text
    return text.translate(punctuation_table).lower()

  df[columnname] = df[columnname].apply(word_preprocess)

  return df

In [None]:
# Clean both text columns; column_preprocess edits combined_df in place and
# returns it, so cleaned_df ends up referring to that same frame.
for text_column in ("Human", "Assistant"):
  cleaned_df = column_preprocess(combined_df, text_column)

In [None]:
# Renumber the rows 0..n-1 after the earlier filtering.
# drop=True discards the old labels instead of inserting them as a column, and
# rebinding (rather than inplace=True) keeps this cell safe to re-run.
cleaned_df = cleaned_df.reset_index(drop=True)

In [None]:
# Hand encode() a plain list of strings rather than a Series: a Series is
# indexed by label, so positional lookups on it are only correct when the
# index is exactly 0..n-1.
sentences = cleaned_df["Human"].tolist()
#Encode all sentences
embeddings = model.encode(sentences)
#Compute cosine similarity between all pairs
cos_sim = util.cos_sim(embeddings, embeddings)

In [None]:
# Minimum cosine similarity for two questions to be placed in the same group
similarity_threshold = 0.6

# Groups of row indices; only groups with at least two members are kept
grouped_sentences = []

# Maps a row index to the 1-based number of the group it belongs to
questions_to_be_grouped = {}

# Number that the next stored group will receive
group_number = 1

# Members of the group currently being built
current_group = []

n_questions = len(cos_sim)
for i in range(n_questions):

  # Skip questions that already belong to an earlier group
  if i in questions_to_be_grouped:
    continue

  similarities = cos_sim[i]
  current_group = [i]
  for j in range(n_questions):
    # Only pull in questions that are still ungrouped, so that no question
    # ends up in two groups
    if j != i and j not in questions_to_be_grouped and similarities[j] >= similarity_threshold:
      current_group.append(j)

  if len(current_group) > 1:
    # Record every member, including the seed question i, and advance the
    # counter only for stored groups so the mapping matches the position of
    # the group in grouped_sentences
    for member in current_group:
      questions_to_be_grouped[member] = group_number
    grouped_sentences.append(current_group)
    group_number += 1

In [None]:
# Show each group of similar questions under a numbered heading
for group_number, group in enumerate(grouped_sentences, start=1):
  print(f"Group {group_number}")
  for question in cleaned_df.loc[group, "Human"]:
    print(question)
  print('\n')