### Dependencies

In [None]:
# Note : Install the dependencies from requirements.txt

In [None]:
!pip install --upgrade langchain langchain_ollama

In [1]:
#Libraries for openai
import pandas as pd
import re
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from pydantic import BaseModel
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI


#Libraries for llama
import pandas as pd
from langchain_core.prompts import PromptTemplate,ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List

from langchain.output_parsers import PydanticOutputParser,RetryOutputParser
from langchain_core.exceptions import OutputParserException

from langchain_community.llms import Ollama

from datasets import load_dataset
from collections import defaultdict

In [2]:
COUNSEL_CHAT_RAW_DATA = '../data/counsel-chat_depression_original.json'
COUNSEL_CHAT_CLEANED_DATA = '../data/counsel-chat_depression_cleaned.csv'
QUERY_RAW_FILE = '../data/queries_generated_raw_uncleaned.parquet'
QUERY_GENERATED_FILE = '../data/queries_generated.parquet'
GEMMA2_QUERY_FILE = "../data/gemma2_queries.parquet"
LLAMA3_QUERY_FILE = "../data/llama3_queries.parquet"
QUERY_EVALUATED_FILE = '../data/queries_evaluated.parquet'

## Loading the dataset

In [5]:
# Print the first entry to verify
from datasets import load_dataset
from collections import defaultdict

# Load the dataset from Hugging Face
dataset = load_dataset("nbertagnolli/counsel-chat")

# Dictionary to group by question ID
grouped_questions = defaultdict(lambda: {
    "question_id": None,
    "question_title": None,
    "question": None,
    "question_link": None,
    "topic": None,
    "answers": []
})

count = 0
max_questions = 55

# Group the data by question ID and combine answers
for entry in dataset['train']:
    question_id = entry["questionID"]
    
    # Filter to only include depression entries with a title length of 20 or more
    if entry["topic"] != "depression" or len(entry["questionTitle"]) < 20:
        continue
    # Ensure max questions limit
    if count >= max_questions:
        break
    
    if entry["questionText"] is not None and entry["questionTitle"] is not None:
        # If first time encountering this question, store question details
        if grouped_questions[question_id]["question_id"] is None:
            grouped_questions[question_id]["question_id"] = question_id
            grouped_questions[question_id]["question_title"] = entry["questionTitle"]
            grouped_questions[question_id]["question"] = entry["questionText"]
            grouped_questions[question_id]["question_link"] = entry["questionLink"]
            grouped_questions[question_id]["topic"] = entry["topic"]
            grouped_questions[question_id]["answers"] = []  # Initialize answers list

        # Add answer only if less than 10 answers for this question
        if len(grouped_questions[question_id]["answers"]) < 5 and entry["answerText"] is not None:
            grouped_questions[question_id]["answers"].append({
                "answer": entry["answerText"],
                "therapist_info": entry["therapistInfo"]
            })

        # Increment only when a new question is added
        if len(grouped_questions[question_id]["answers"]) == 1:
            count += 1

# Convert grouped questions into a list format
formatted_data = list(grouped_questions.values())

with open(COUNSEL_CHAT_RAW_DATA, "w") as json_file:
    json.dump(formatted_data, json_file, indent=4)


Repo card metadata block was not found. Setting CardData to empty.


In [3]:
import re

with open(COUNSEL_CHAT_RAW_DATA, "r") as json_file:
    json_data = json.load(json_file)

# Convert the loaded JSON data into a DataFrame
df = pd.DataFrame(json_data)
df

Unnamed: 0,question_id,question_title,question,question_link,topic,answers
0,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,[{'answer': 'It is very common for people to h...
1,1,My apartment manager won't let me keep an emot...,I have been diagnosed with general anxiety and...,https://counselchat.com/questions/my-apartment...,depression,[{'answer': 'This can be a difficult situation...
2,2,I feel like my mother doesn't support me,My mother is combative with me when I say I do...,https://counselchat.com/questions/i-feel-like-...,depression,[{'answer': 'Do you live with your mom and hav...
3,3,Why do I feel like I don't belong anywhere?,There are many people willing to lovingly prov...,https://counselchat.com/questions/why-do-i-fee...,depression,[{'answer': 'I truly understand what you are s...
4,4,How can I help my girlfriend?,My girlfriend just quit drinking and she becam...,https://counselchat.com/questions/how-can-i-he...,depression,[{'answer': 'You're probably not going to like...
5,5,Can i learn to be happy alone?,I'm dealing with an illness that will never go...,https://counselchat.com/questions/can-i-learn-...,depression,[{'answer': 'The power of acceptance is key! ...
6,6,How can I deal with my posttraumatic stress di...,"I feel angry, anxious, and depressed. The PTSD...",https://counselchat.com/questions/how-can-i-de...,depression,[{'answer': 'I second the suggestion to find a...
7,7,How do I know if I have depression?,I had a very troubled up bringing and I'm curr...,https://counselchat.com/questions/how-do-i-kno...,depression,[{'answer': 'Thanks for reaching out with this...
8,8,How do I make new friends?,"In the past year, two of my best and only clos...",https://counselchat.com/questions/how-do-i-mak...,depression,[{'answer': 'I am sending your loving vibes as...
9,9,How can I stop feeling empty?,I feel lazy and numb. I have no interest in th...,https://counselchat.com/questions/how-can-i-st...,depression,[{'answer': 'The feeling empty could stem from...


### Data cleaning

In [7]:
# Convert all columns to string type
#https://github.com/mahnazkoupaee/WikiHow-Dataset/blob/master/process.py
import re

# Define the cleaning function
def clean_text(text):
    # Replace non-breaking space with a regular space
    text = re.sub(r'\u00a0', ' ', text)  
    
    # Remove extra commas and periods followed by commas
    text = text.replace(".,", ".")
    
    # Remove extra commas and newline characters
    text = re.sub(r'[.]+[\n]+[,]', ". ", text)
    
    # Remove newline characters
    text = text.replace("\n", " ")
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)  
    
    # Strip leading and trailing punctuation and special characters from words
    text = re.sub(r'^[^\w\s]+|[^\w\s]+$', '', text)  # Remove leading/trailing special characters
    
    return text.strip()

# Assume df is your DataFrame with columns: 'question_title', 'question', and 'answers'
# Clean the title and question columns
df['question_title'] = df['question_title'].apply(clean_text)
df['question'] = df['question'].apply(clean_text)

# Clean the answers
df['answers'] = df['answers'].apply(lambda answers: [
    {**answer, "answer": clean_text(answer["answer"])} for answer in answers
])

df.to_parquet(COUNSEL_CHAT_CLEANED_DATA)

In [8]:
df = pd.read_parquet(COUNSEL_CHAT_CLEANED_DATA)

# Generate Synthetic Data

### Prompt for Generating Synthetic data q from context

In [9]:
# Define prompt template for generating questions

QUERY_GENERATION_PROMPT_TEMPLATE = """
You are an AI assistant designed to generate realistic user queries related to various topics on depression.
Your task is to create diverse, natural-sounding questions that reflect what actual users might ask when seeking 
information on a specific subject.

Generate 5 diverse, natural, short, specific, and concise user queries (not more than 10 words) related to the given topic on depression.
Ensure queries vary in complexity, specificity, and format.
Mimic real users seeking information on a chatbot platform.
Use simple language and maintain a neutral, conversational tone.

**Guidelines:**
1. Vary complexity, specificity, and format.
2. Mimic chatbot users seeking information.
3. Use simple language and maintain a neutral tone.
4. Avoid ambiguity and ensure specificity.

Example Input:
Topic: ### I've been feeling sadness. When I get around a particular person or when I go home, I'm just sad or irritated.
The feeling comes and goes all day every day. I don't know what wrong. 
I'm not sure if I'm just making up symptoms and problems in my head or if there is something actually wrong.

Example Queries:
What should I do if I feel sad around certain people?
How can I discern between real emotional issues and imagined symptoms?
What are common symptoms of emotional distress?
When should I seek professional help for persistent sadness?
How can I cope with feelings of irritation at home?

Generate the output in the list of string format
{format_instructions}

Now, generate user queries based on the following topic:
{topic}
"""

### Generating Synthetic data q from context

In [10]:
## https://nanonets.com/blog/langchain/
def set_generate_model_prompt_chain(GENRATIVE_MODEL):
    
    class LLMGenerate_OutputFormat(BaseModel):
        """LLM output format"""
        responses: List[str] = Field(description="List of string responses generated by the LLM",
            min_items=1)
        
    model_generate_synthetic = Ollama(model=GENRATIVE_MODEL)
    
    output_parser = PydanticOutputParser(pydantic_object=LLMGenerate_OutputFormat)

    prompt = PromptTemplate(
        template=QUERY_GENERATION_PROMPT_TEMPLATE,
        input_variables=["topic"],
        partial_variables={"format_instructions": output_parser.get_format_instructions()},
    )
    
    chain = prompt | model_generate_synthetic 
    
    return chain 
    

def llm_generate_queries_for_context(topic,chain,GENRATIVE_MODEL):
    """
    Generate queries for a given context topic using LLaMA.

    Args:
    topic (str): Context topic.

    Returns:
    List[str]: List of generated queries.

    Raises:
    Exception: If LLaMA invocation fails.
    """
    try:
        queries = chain.invoke({"topic": topic})
        
        if(GENRATIVE_MODEL!="llama3.2"):
            
            llm_output_str = queries.strip()

            # Find the JSON part
            json_start = llm_output_str.find("{")
            json_end = llm_output_str.rfind("}")

            # Extract JSON and Parse JSON
            json_str = llm_output_str[json_start:json_end+1]
            llm_output = json.loads(json_str)

            # Extract responses
            responses = llm_output["responses"]

            return responses
        else:
            llm_output_str = queries.strip()

            # Find the JSON part
            json_start = llm_output_str.find("{")
            json_end = llm_output_str.rfind("}")

            # Extract JSON and Parse JSON
            json_str = llm_output_str[json_start:json_end+1]
            llm_output = json.loads(json_str)

            # Extract responses
            responses = llm_output["responses"]

            return responses

    except Exception as e:
        print(e)
        return []

In [13]:
GENRATIVE_MODEL = "gemma2:2b"
generation_chain = set_generate_model_prompt_chain(GENRATIVE_MODEL)

row_num = 2
input_text = df.iloc[row_num]['question_title'] + df.iloc[row_num]['question']
queries = llm_generate_queries_for_context(input_text,generation_chain,GENRATIVE_MODEL)

# Print generated queries
print(input_text)
print("\nOutput Queries all:")
print(queries)
print("")
for i, query in enumerate(queries, start=1):
    print(f"Query {i}: {query}")

I feel like my mother doesn't support meMy mother is combative with me when I say I don't want to talk with her about my depression.    She hasn't been supportive of me in the past and she isn't someone that I feel comfortable opening up to. She constantly tries to instigate conversations where she asks me questions that I don't want to or can't answer. I tell her I don't want to talk and she starts arguments with me.    How can I get her to understand?

Output Queries all:
['How can I make my mom understand I need space during depression?', 'What are ways to communicate my boundaries to my mother effectively?', 'My mother is triggering, how do I manage conversations with her about depression?', "Help me talk to my mom about not wanting to talk when she's pushing", "I feel like my mom doesn't support me - what should I do? "]

Query 1: How can I make my mom understand I need space during depression?
Query 2: What are ways to communicate my boundaries to my mother effectively?
Query 3: 

### Applying Synthetic data q  Generation from context

In [11]:
GENRATIVE_MODEL = "gemma2:2b"
generation_chain = set_generate_model_prompt_chain(GENRATIVE_MODEL)

def llm_generate_queries_for_context_helper(row):
    text = row['question_title'] + row['question']
    queries = llm_generate_queries_for_context(text,generation_chain,GENRATIVE_MODEL)  # Generate queries based on the text
    return queries  

df = pd.read_parquet(COUNSEL_CHAT_CLEANED_DATA)
# Apply the function to each row in the DataFrame and create a new column 'queries'
df['gemma_queries'] = df.apply(llm_generate_queries_for_context_helper, axis=1)
df.to_parquet(QUERY_RAW_FILE)

Expecting ',' delimiter: line 6 column 1 (char 212)


In [12]:
GENRATIVE_MODEL = "llama3"
generation_chain = set_generate_model_prompt_chain(GENRATIVE_MODEL)

def llm_generate_queries_for_context_helper(row):
    text = row['question_title'] + row['question']
    queries = llm_generate_queries_for_context(text,generation_chain,GENRATIVE_MODEL)  # Generate queries based on the text
    return queries  


df=pd.read_parquet(QUERY_RAW_FILE)

# Apply the function to each row in the DataFrame and create a new column 'queries'
df['llama3_queries'] = df.apply(llm_generate_queries_for_context_helper, axis=1)
df.to_parquet(QUERY_RAW_FILE)

Expecting value: line 1 column 1 (char 0)
Expecting ',' delimiter: line 6 column 1 (char 224)


### Clean the Synthetic data generated q file

In [13]:
def clear_generated_query_data(file_name):
    df= pd.read_parquet(file_name)
    
    df = df[df['gemma_queries'].apply(lambda x: len(x) != 0) & df['llama3_queries'].apply(lambda x: len(x) != 0)]
    
    df.reset_index(drop=True, inplace=True)
    print(df.shape)
    df.to_parquet(QUERY_GENERATED_FILE)

    # Explode the 'LLM_Query' column and
    df_exploded_gemma2 = df.explode(['gemma_queries'])
    df_exploded_gemma2.reset_index(drop=True, inplace=True)
    df_exploded_gemma2.drop(columns=['llama3_queries'], inplace=True)
    df_exploded_gemma2.to_parquet(GEMMA2_QUERY_FILE)

    # Explode the 'LLM_Query' column and
    df_exploded_llama3 = df.explode(['llama3_queries'])
    df_exploded_llama3.reset_index(drop=True, inplace=True)
    df_exploded_llama3.drop(columns=['gemma_queries'], inplace=True)
    df_exploded_llama3.to_parquet(LLAMA3_QUERY_FILE)
    
    # Display the result
    return df, df_exploded_gemma2,df_exploded_llama3

raw_query_df, gemma_query_df,llama_query_df = clear_generated_query_data(QUERY_RAW_FILE)
raw_query_df.head(2)

(52, 8)


Unnamed: 0,question_id,question_title,question,question_link,topic,answers,gemma_queries,llama3_queries
0,0,Do I have too many issues for counseling,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,[{'answer': 'It is very common for people to h...,[Is there a point where the issues are too com...,[Can I really expect one counselor to help wit...
1,1,My apartment manager won't let me keep an emot...,I have been diagnosed with general anxiety and...,https://counselchat.com/questions/my-apartment...,depression,[{'answer': 'This can be a difficult situation...,[How do I handle a landlord denying me a servi...,[How can I convince my apartment manager that ...


### LLM-as-a-judge :: Evaluating the quality of generated q data

#### Defining prompts for Evaluating the Synthetic data

In [15]:
# Define evaluation prompt template
Q_EVALUATION_PROMPT_TEMPLATE = """
You are an AI evaluator tasked with assessing the relevance and effectiveness of user queries generated for a specific topic.
Your evaluation will help refine the query generation process.
########
Instructions:
Evaluate the relevance, clarity,overall quality of the provided queries.
Score each query from 1-5 based on the following criteria:
- Relevance: Does the Query directly or indirectly related to the topic, although not focused?
- Clarity: How easy is the query to understand?
- Quality: Does the Query demonstrate user needs?

Provide constructive feedback for improvement.
########
Now, evaluate the user queries based on the following topic:
{topic}
Queries:
{queries}

Provide your evaluation in the following format:

Query {{number}}: {{query}}
Score: {{score}} (1-5)
Feedback: {{feedback}}
"""

#### Evaluate the quality

In [16]:
# Define llm model
def set_evaluate_model_prompt_chain(evaluate_llm_model):
    
    model_evaluate_synthetic = Ollama(model=evaluate_llm_model)
    
    prompt_template = PromptTemplate(
            template=Q_EVALUATION_PROMPT_TEMPLATE,
            input_variables=["topic", "queries"],
        )

    evaluation_chain = prompt_template | model_evaluate_synthetic
    return evaluation_chain 


def llm_evaluate_queries_for_context(topic: str, queries: List[str],evaluation_chain,evaluate_llm_model):
    """Evaluate queries using LLaMA model."""
    try:

        llm_response = evaluation_chain.invoke({"topic": topic, "queries": queries})
        
        llm_response = llm_response.replace('**', '').replace('*', '')  # Remove asterisks
        llm_response = llm_response.replace(':', ': ')  # Standardize colon spacing
        
        if(evaluate_llm_model=="llama3" or evaluate_llm_model=="gemma2:2b"):
            FEEDBACK_PATTERN = r'Feedback: (.*)\n'
            feedbacks = re.findall(FEEDBACK_PATTERN, llm_response, re.MULTILINE)
            
            scores = []
            SCORE_PATTERN = r'Score:\s*(\d+)'
            matches = re.findall(SCORE_PATTERN, llm_response)

            if matches:
                scores = [int(match) for match in matches]
                
            return scores, feedbacks
        
        else:
            FEEDBACK_PATTERN = r'Feedback:\s*(.*?)(?=\n|$)'
            SCORE_PATTERN = r'\*\*Score:\*\* (\d+)'

            # Extract the scores
            scores = [int(score) for score in re.findall(SCORE_PATTERN, llm_response)]
            print(scores)

             # Extract feedbacks
            feedbacks_raw = re.findall(FEEDBACK_PATTERN, llm_response)

            # Append feedbacks one by one to the list
            feedbacks = []
            for feedback in feedbacks_raw:
                feedback = feedback.replace('**', '')
                feedbacks.append(feedback.strip()) 

            return scores, feedbacks

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return [], []


In [21]:
### Test evaluation
EVALUATION_MODEL = "gemma2:2b"
evaluation_chain = set_evaluate_model_prompt_chain(EVALUATION_MODEL)

# Evaluate queries
scores,feedbacks = llm_evaluate_queries_for_context(input_text, queries, evaluation_chain,EVALUATION_MODEL)
print('Answer Evaluation results')
print(scores)
print(feedbacks)

Answer Evaluation results
[4, 5, 4, 3, 2]
["  This query is highly relevant, as it directly addresses the user's struggle to communicate their need for space while dealing with depression. It also demonstrates a clear desire for guidance on achieving this goal. While slightly broad in its wording, it captures the essence of the problem.", " This query is incredibly relevant and effective because it specifically addresses the user's desire for guidance on setting and communicating boundaries. It demonstrates a clear understanding of the root of the problem and seeks actionable solutions.", " This query is highly relevant and focuses on a specific challenge (triggering behavior). The user clearly wants help managing their interactions with their mom, making it a strong expression of their needs. It's slightly more vague in its scope compared to the other queries but still captures the essence of the problem.", " This query is quite relevant, directly addressing the user's challenge with 

In [17]:
def llm_evaluate_queries_for_context_helper(row,evaluation_chain,EVALUATION_MODEL,query_col_name):
    text = row['question_title'] + row['question']
    scores,feedbacks = llm_evaluate_queries_for_context(text, row[query_col_name],evaluation_chain,EVALUATION_MODEL)  # Generate queries based on the text
    return pd.Series([scores, feedbacks])

In [19]:
EVALUATION_MODEL = "llama3"
evaluation_chain = set_evaluate_model_prompt_chain(EVALUATION_MODEL)

raw_query_df[['gemma_queries_llama3_score', 'gemma_queries_llama3_feedback']] = raw_query_df.apply(
    lambda row: llm_evaluate_queries_for_context_helper(row, evaluation_chain, EVALUATION_MODEL, 
                                                        query_col_name='gemma_queries'), axis=1)

raw_query_df.to_parquet(QUERY_EVALUATED_FILE)

In [None]:
EVALUATION_MODEL = "gemma2:2b"
evaluation_chain = set_evaluate_model_prompt_chain(EVALUATION_MODEL)
raw_query_df = pd.read_parquet(QUERY_EVALUATED_FILE)

raw_query_df[['llama3_queries_gemma_score', 'llama3_queries_gemma_feedback']] = raw_query_df.apply(
    lambda row: llm_evaluate_queries_for_context_helper(row, evaluation_chain, EVALUATION_MODEL, 
                                                        query_col_name='llama3_queries'), axis=1)

raw_query_df.to_parquet(QUERY_EVALUATED_FILE)