### Dependencies

In [None]:
# Note : Install the dependencies from requirements.txt

In [None]:
!pip install --upgrade langchain langchain_ollama

In [19]:
#Libraries for openai
import pandas as pd
import re
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from pydantic import BaseModel
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI


#Libraries for llama
import pandas as pd
from langchain_core.prompts import PromptTemplate,ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List

from langchain.output_parsers import PydanticOutputParser,RetryOutputParser
from langchain_core.exceptions import OutputParserException

from langchain_community.llms import Ollama

## Loading the dataset

In [76]:
WIKI_SAMPLE_FILE = '../data/wikihowAll_sample.parquet'
WIKI_SAMPLE_CLEANED_FILE = '../data/wikihowAll_sample_cleaned.parquet'

WIKI_QUERY_RAW_FILE = '../data/wikihow_queries_generated_raw_uncleaned.parquet'
WIKI_QUERY_GENERATED_FILE = '../data/wikihow_queries_generated.parquet'
GEMMA2_QUERY_FILE = "../data/wikihow_gemma2_queries.parquet"
LLAMA3_QUERY_FILE = "../data/wikihow_llama3_queries.parquet"
WIKI_QUERY_EVALUATED_FILE = '../data/wikihow_queries_evaluated.parquet'

WIKI_QUERY_ANSWER_RAW_FILE = '../data/wikihow_queries_answers_generated_raw_uncleaned.parquet'
WIKI_QUERY_ANSWER_GENERATED_FILE = '../data/wikihow_queries_answers_generated.parquet'
GEMMA2_QA_DATA_FILE = "../data/wikihow_gemma2_qa_data.parquet"
LLAMA3_QA_DATA_FILE = "../data/wikihow_llama3_qa_data.parquet"
GEMMA2_QA_DATA_EVALUATED_FILE = "../data/wikihow_gemma2_qa_data_evaluated.parquet"
LLAMA3_QA_DATA_EVALUATED_FILE = "../data/wikihow_llama3_qa_data_evaluated.parquet"

In [4]:
#https://github.com/mahnazkoupaee/WikiHow-Dataset?tab=readme-ov-file
## Wikihowall.csv link is below
#https://ucsb.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358
df_all = pd.read_csv('wikihowAll.csv')
## dataset is very big we take specific samples
df_all = df_all.sample(frac=0.001)
df_all.to_parquet(WIKI_SAMPLE_FILE)

df_all.shape

(215, 3)

### Preparing the dataset

In [105]:
# Convert all columns to string type
#https://github.com/mahnazkoupaee/WikiHow-Dataset/blob/master/process.py
import re

df_all = pd.read_parquet(WIKI_SAMPLE_FILE)
df_all = df_all.astype(str)

# Get the shape of the dataframe
rows, columns =df_all.shape

# Define cleaning function
def clean_data(row):
    abstract = row['headline']
    article = row['text']
    
    # Common cleaning function for both abstract and article
    def clean_text(text):
        # Remove extra commas and periods followed by commas
        text = text.replace(".,", ".")
        
        # Remove extra commas and newline characters
        text = re.sub(r'[.]+[\n]+[,]', ". ", text)
        
        # Remove newline characters
        text = text.replace("\n", " ")
        
        # Remove extra whitespace
        text = re.sub(' +', ' ', text)
        
        # Strip leading and trailing punctuation and special characters from words
        text = re.sub(r'^[^\w\s]+|[^\w\s]+$', '', text)  # Remove leading/trailing special characters
        
        return text.strip()
    
    # Apply cleaning to both abstract and article
    abstract = clean_text(abstract)
    article = clean_text(article)
    
    # Check if the cleaned text contains meaningful content
    if not re.search(r'\w', abstract) or not re.search(r'\w', article):
        return None  # Drop this row if no meaningful content is found
    
    # Drop rows if the abstract or article is too short (e.g., less than 100 characters)
    if len(abstract) < 80 or len(article) < 1400:
        return None
    
    return {
        'headline': abstract,
        'text': article
    }
    

# Apply cleaning function to each row
df = df_all.apply(clean_data, axis=1)

# Remove None values (rows that didn't meet the threshold)
df = df.dropna()

# Convert the cleaned data back to a dataframe
df = df.apply(pd.Series)
df.reset_index(drop=True, inplace=True)
print(df.shape)
if df.shape[0] > 110:
    df = df.head(110)
    
print(df.shape)
#df.to_parquet(WIKI_SAMPLE_CLEANED_FILE)

(116, 2)
(110, 2)


In [106]:
df = pd.read_parquet(WIKI_SAMPLE_CLEANED_FILE)

# Generate Synthetic Data

## Option 1: LLM :: To Generate both q & a from context

### Prompt for Generating Synthetic data q & a from context

In [55]:
# Define prompt template for generating questions

QUERY_AND_ANSWER_GENERATION_PROMPT_TEMPLATE = """
You are an AI assistant designed to generate realistic user queries and its corresponding answer related to various topics.
Your task is to create diverse, natural-sounding questions that reflect what actual users might ask when seeking
information on a specific subject.

**Key aspects:**
Generate 5 diverse, natural, short, specific, and concise user queries (≤ 10 words) and its corresponding answers (60-150 words)
based solely on the provided topic.
Do not refer to external sources or use prior knowledge.
Only use the provided topic to generate queries and answers. 
Ensure all queries have answers and ensure the answer generated is a direct and relevant response to the query.
Mimic real users seeking information on a chatbot platform.
Use simple language and maintain a neutral, conversational tone.

**Guidelines:**
1. Vary complexity, specificity, and format.
2. Mimic chatbot users seeking information.
3. Use simple language and maintain a neutral tone.
4. Avoid ambiguity and ensure specificity.
5. Ensure all queries have answers.
6. Ensure an answer directly and specifically addresses its query instead of giving vaugue answer.
7. Answers must directly address the query, providing specific and relevant information directly instead of referencing external sources.
8. If a query asks for steps, provide clear, numbered steps in the answer.

**Additional Requirements:**
Ensure query-answer pairs are logically connected and relevant to the topic.
When queries request procedures or steps, answers must provide sequential, numbered  instructions.
When queries ask for size or shape, answers must: Specify exact measurements (where applicable), Describe proportions or dimensions
Use concise and clear language and avoid vague or open-ended answers..

**Evaluation Criteria:**
Query relevance to the topic.
Answer accuracy and relevance to the query.
Answer completeness (60-150 words).
Query-answer alignment: How well does the answer respond to the query?
Step-by-step clarity (when applicable)
Size/shape explanation clarity (when applicable)

Example Input:
Topic: ### Planning a Budget-Friendly Vacation 
Planning a budget-friendly vacation requires careful research and flexibility. When planning a vacation on a budget,
consider destinations with affordable accommodations, transportation, and activities, such as:
Thailand: Known for its affordable beaches, street food, and cultural attractions.
Vietnam: Offers affordable accommodations, delicious cuisine, and scenic landscapes.
Peru: Provides affordable Inca Trail hikes, cultural experiences, and historical sites.
Researching and booking in advance can also help save money. Other budget-friendly strategies include:
Booking budget-friendly accommodations like hostels or guesthouses.
Using public transportation or budget airlines.
Planning free or low-cost activities like hiking or exploring local markets.

Example Queries and Answers:
Query 1: What are the best budget-friendly destinations worldwide?
Answer 1: Consider Thailand, Vietnam, or Peru for affordable accommodations and activities.
Query 2: How can I save money on flights and accommodations?
Answer 2: Book in advance, use travel rewards credit cards, and explore budget airlines.
Query 3: What's the cheapest way to travel across Europe?
Answer 3: Use budget airlines, buses, or trains, and book accommodations through hostels.
Query 4: How do I plan a 5-day budget trip from Singapore to Hawaii?
Answer 4: Research affordable flights, book budget-friendly accommodations, and plan free activities.
Query 5: What budgeting apps are best for travel expenses?
Answer 5: Popular options include Mint, Trail Wallet, and Budget Your Trip.

Generate the output in the list of string format
{format_instructions}

Now, generate user queries based on the following topic:
{topic}
"""

###  Generating Synthetic data q & a from context

In [53]:
from typing import Dict

def set_generate_model_prompt_chain_for_qa(GENRATIVE_MODEL):
    
    class LLMGenerate_OutputFormat(BaseModel):
        """LLM output format"""
        queries: List[str] = Field(description="List of string queries generated by the LLM", min_items=1)
        answers: List[str] = Field(description="List of string answers generated by the LLM", min_items=1)
        
    model_generate_synthetic = Ollama(model=GENRATIVE_MODEL)
    
    output_parser = PydanticOutputParser(pydantic_object=LLMGenerate_OutputFormat, temperature=0.2)
    prompt = PromptTemplate(
        template=QUERY_AND_ANSWER_GENERATION_PROMPT_TEMPLATE,
        input_variables=["topic"],
        partial_variables={"format_instructions": output_parser.get_format_instructions()},
    )
    
    chain = prompt | model_generate_synthetic 
    
    return chain 



def llm_generate_queries_and_answers_for_context(topic, chain, GENRATIVE_MODEL):
    """
    Generate queries and answers for a given context topic using LLaMA.

    Args:
    topic (str): Context topic.

    Returns:
    Tuple[List[str], List[str]]: List of generated queries and answers.

    Raises:
    Exception: If LLaMA invocation fails.
    """
    try:
        output = chain.invoke({"topic": topic})
        output = output.replace("```json", "").replace("```", "").strip()
        
        print("After cleaning ***")
        print(type(output))
        print(output)
        
        if(GENRATIVE_MODEL=="gemma2:2b"):
            parsed_data = json.loads(output)

            # Extract queries and answers
            queries = parsed_data['queries']
            answers = parsed_data['answers']
            return queries, answers
        
        elif(GENRATIVE_MODEL=="llama3"):
            llm_output_str = output.strip()

            # Remove leading/trailing whitespace and newline characters
            llm_output_str = llm_output_str.replace("\n", "").replace("\r", "")

            # Find the JSON part
            json_start = llm_output_str.find("{")
            json_end = llm_output_str.rfind("}")

            # Extract JSON
            json_str = llm_output_str[json_start:json_end+1]
            print(json_str)

            # Check if JSON string is valid
            if json_str and json_str.startswith("{") and json_str.endswith("}"):
                try:
                    # Parse JSON
                    llm_output = json.loads(json_str)

                    # Extract queries and answers
                    queries = llm_output.get("queries", [])
                    answers = llm_output.get("answers", [])
                    
                    return queries, answers
                except json.JSONDecodeError as e:
                    # Handle JSON parsing errors
                    print(f"JSON parsing error: {e}")
                    return [], []
            else:
                # Handle invalid JSON format
                print("Invalid JSON format")
                return [], []
            
        else:
            try:
                data_dict = json.loads(output)

                # Extract queries and answers
                queries = data_dict['queries']
                answers = data_dict['answers']
                
                return queries, answers
            except (json.JSONDecodeError, KeyError) as e:
                print(f"Error parsing output: {e}")
                return [], []
    
    except Exception as e:
        print(e)
        return [], []


### Applying Synthetic data q & a Generation from context

In [127]:
GENRATIVE_MODEL = "llama3"
generation_chain = set_generate_model_prompt_chain_for_qa(GENRATIVE_MODEL)

row_num = 2
input_text = df.iloc[row_num]['text']
queries, answers = llm_generate_queries_and_answers_for_context(input_text,generation_chain,GENRATIVE_MODEL)

# Print generated queries
print("\nOutput Queries all:")
print(queries)
print(answers)
print("")
for i, query in enumerate(queries, start=1):
    print(f"Query {i}: {query}")
    print(answers[i-1])

After cleaning ***
<class 'str'>
Here are the user queries based on the topic "Receding Gums":


{
  "queries": [
    "What causes receding gums?",
    "How can I prevent gum recession?",
    "Is smoking a risk factor for receding gums?",
    "What is the best way to brush my teeth to prevent receding gums?",
    "Can I fix receding gums on my own or do I need professional help?"
  ],
  "answers": [
    "Your dentist can help you identify any risk factors that may contribute to receding gums. Typical causes include gum disease, using a hard-bristled toothbrush, brushing too hard, being born with naturally thin or weak gums, smoking and using tobacco, and trauma to your gum tissue.",
    "Use a soft-bristled toothbrush to gently brush your teeth twice a day, and avoid brushing too hard. Be sure to brush all the different surfaces of your teeth, including the front, back, and top.",
    "Yes, smoking is a risk factor for receding gums. It can cause inflammation and damage to the gum tiss

In [56]:
GENRATIVE_MODEL = "gemma2:2b"
generation_chain = set_generate_model_prompt_chain_for_qa(GENRATIVE_MODEL)

def llm_generate_queries_answers_for_context_helper(row):
    
    index = row.name  # Get the index of the current row
    print(f"Processing gemma2 row {index}...")
    queries, answers = llm_generate_queries_and_answers_for_context(row['text'],generation_chain,GENRATIVE_MODEL)  # Generate queries based on the text
    return pd.Series([queries, answers])  

# Apply the function to each row in the DataFrame and create a new column 'queries'
df[['gemma2_queries', 'gemma2_answers']] = df.apply(lambda row: llm_generate_queries_answers_for_context_helper(row), axis=1)
df['gemma2_answers'] = df['gemma2_answers'].apply(lambda x: json.dumps(x))
df['doc_id'] = df.index
df.to_parquet(WIKI_QUERY_ANSWER_RAW_FILE)

In [57]:
GENRATIVE_MODEL = "llama3"
generation_chain = set_generate_model_prompt_chain_for_qa(GENRATIVE_MODEL)

def llm_generate_queries_answers_for_context_helper(row):
    index = row.name  # Get the index of the current row
    print(f"Processing row {index}...")
    
    queries, answers = llm_generate_queries_and_answers_for_context(row['text'],generation_chain,GENRATIVE_MODEL)  # Generate queries based on the text
    return pd.Series([queries, answers])    

df=pd.read_parquet(WIKI_QUERY_ANSWER_RAW_FILE)

# Apply the function to each row in the DataFrame and create a new column 'queries'
df[['llama3_queries', 'llama3_answers']]  = df.apply(lambda row: llm_generate_queries_answers_for_context_helper(row), axis=1)
df['llama3_answers'] = df['llama3_answers'].apply(lambda x: json.dumps(x))
df.to_parquet(WIKI_QUERY_ANSWER_RAW_FILE)

### Clean the Synthetic data generated q & a file

In [41]:
def get_indices_to_drop(sample_df):
    indices_to_drop = []
    for index, row in sample_df.iterrows():
        queries = row['gemma2_queries']
        answers = row['gemma2_answers']
        
        llama3_answers = row['llama3_answers']
        
        for j, answer in enumerate(llama3_answers):
            # Check if the answer is a dictionary
            if isinstance(answer, dict):  # Using isinstance to check the type
                #print(f"Index: {index}")  # Print the index of the row
                indices_to_drop.append(index)
                #print(f"A{j+1}: {answer} (Type: {type(answer).__name__})")
                

        for j, answer in enumerate(answers):
            # Check if the answer is a dictionary
            if isinstance(answer, dict):  # Using isinstance to check the type
                #print(f"Index: {index}")  # Print the index of the row
                indices_to_drop.append(index)
                #print(f"A{j+1}: {answer} (Type: {type(answer).__name__})")

    unique_indices_to_drop = list(set(indices_to_drop))
    return unique_indices_to_drop


def clear_generated_query_answer_data(file_name):
    df= pd.read_parquet(file_name)
    
    df = df[df['gemma2_queries'].apply(lambda x: len(x) != 0) & df['llama3_queries'].apply(lambda x: len(x) != 0)]
    
    df['gemma2_answers'] = df['gemma2_answers'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
    df['llama3_answers'] = df['llama3_answers'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
    
    unique_indices_to_drop = get_indices_to_drop(df)
    df.drop(index=unique_indices_to_drop, inplace=True)
    
    df.reset_index(drop=True, inplace=True)
    print(df.shape)
    df.to_parquet(WIKI_QUERY_ANSWER_GENERATED_FILE)

    # Explode the 'LLM_Query' column and
    df_exploded_gemma2 = df.explode(['gemma2_queries', 'gemma2_answers'])
    df_exploded_gemma2.reset_index(drop=True, inplace=True)
    df_exploded_gemma2.drop(columns=['llama3_queries','llama3_answers'], inplace=True)
    df_exploded_gemma2.to_parquet(GEMMA2_QA_DATA_FILE)
    
    
    # Explode the 'LLM_Query' column and
    df_exploded_llama3 = df.explode(['llama3_queries','llama3_answers'])
    df_exploded_llama3.reset_index(drop=True, inplace=True)
    df_exploded_llama3.drop(columns=['gemma2_queries', 'gemma2_answers'], inplace=True)
    df_exploded_llama3.to_parquet(LLAMA3_QA_DATA_FILE)
    
    # Display the result
    return df, df_exploded_gemma2,df_exploded_llama3

raw_qa_df, gemma_qa_df,llama_qa_df = clear_generated_query_answer_data(WIKI_QUERY_ANSWER_RAW_FILE)
raw_qa_df.head(2)

(58, 7)


Unnamed: 0,headline,text,gemma2_queries,gemma2_answers,doc_id,llama3_queries,llama3_answers
0,Choose your container. Understand why steriliz...,"The ideal container is wide, flat and shallow,...",[What type of containers are ideal for collect...,"[Containers like baking trays, takeaway food c...",0,[What's the best way to sterilize a container ...,[You can sterilize a container at home using s...
1,Avoid bright colors. Choose monochromatic elem...,The ideal palette for a minimalist tablescape ...,[What colors are ideal for a minimalist table ...,"[Black and white are ideal, with wood providin...",1,"[What colors make a good minimalist palette?, ...","[Select black, white, and wood colors for a cl..."


### LLM-as-a-judge :: Evaluating the quality of generated q & a data

#### Defining prompts for Evaluating the Synthetic data

In [92]:
# Define evaluation prompt template
Q_A_EVALUATION_PROMPT_TEMPLATE = """
You are an AI evaluator tasked with assessing the relevance and effectiveness of user answers generated for a specific query.
Your evaluation will help refine the answer generation process.
########
Instructions:
Evaluate the relevance, clarity,overall quality of the provided queries.
Score each answer from 1-5 based on the following criteria:
- Relevance: Does the answer directly or indirectly related to the query, although not focused?
- Clarity: How easy is the answer to understand?
- Quality: Does the answer demonstrate user needs?

Provide constructive feedback for improvement.
########
Now, evaluate the user answer based on the following query:
{query}
answer:
{answer}

Provide your evaluation in the following format:
Score: {{score}} (1-5)
Feedback: {{feedback}}
"""

#### Evaluate the quality

In [93]:
# Define llm model
def set_evaluate_model_prompt_chain_qa(evaluate_llm_model):
    
    model_evaluate_synthetic = Ollama(model=evaluate_llm_model)
    
    prompt_template = PromptTemplate(
            template=EVALUATION_PROMPT_TEMPLATE,
            input_variables=["query", "answer"],
        )

    evaluation_chain = prompt_template | model_evaluate_synthetic
    return evaluation_chain 


def llm_evaluate_queries_and_answers_for_context(query: str, answer: str,evaluation_chain,evaluate_llm_model):
    """Evaluate answer using LLaMA model."""
    try:

        llm_response = evaluation_chain.invoke({"query": query, "answer": answer})
        
        llm_response = llm_response.replace('**', '').replace('*', '')  # Remove asterisks
        llm_response = llm_response.replace(':', ': ')  # Standardize colon spacing
        
        #print(llm_response)
        if(evaluate_llm_model=="llama3" or evaluate_llm_model=="gemma2:2b"):
            FEEDBACK_PATTERN = r'Feedback: (.*)\n'
            feedbacks = re.findall(FEEDBACK_PATTERN, llm_response, re.MULTILINE)
            
            scores = []
            SCORE_PATTERN = r'Score:\s*(\d+)'
            matches = re.findall(SCORE_PATTERN, llm_response)

            if matches:
                scores = [int(match) for match in matches]
                
            return scores, feedbacks
        
        else:
            FEEDBACK_PATTERN = r'Feedback:\s*(.*?)(?=\n|$)'
            SCORE_PATTERN = r'\*\*Score:\*\* (\d+)'

            # Extract the scores
            scores = [int(score) for score in re.findall(SCORE_PATTERN, llm_response)]
            print(scores)

             # Extract feedbacks
            feedbacks_raw = re.findall(FEEDBACK_PATTERN, llm_response)

            # Append feedbacks one by one to the list
            feedbacks = []
            for feedback in feedbacks_raw:
                feedback = feedback.replace('**', '')
                feedbacks.append(feedback.strip()) 

            return scores, feedbacks

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return [], []


In [63]:
### Test evaluation
EVALUATION_MODEL = "gemma2:2b"
evaluation_chain = set_evaluate_model_prompt_chain_qa(EVALUATION_MODEL)
query = llama_df_qa.iloc[6]['llama3_queries']
#print('Query: ', query)

# Generate queries
answer  =  llama_df_qa.iloc[6]['llama3_answers']
print('Generated Answer: ', answer)

# Evaluate queries
scores,feedbacks = llm_evaluate_queries_and_answers_for_context(query, answer,evaluation_chain,EVALUATION_MODEL)
print('Answer Evaluation results')
print(scores)
print(feedbacks)

Generated Answer:  Choose tableware with clean lines and minimal patterns. Opt for entirely white ceramic plates and plain white or grey napkins.
## Evaluation of User Answer: 

Score:  4 

Feedback:   The answer demonstrates a good understanding of "minimalist" aesthetics and offers specific practical recommendations. It suggests using tableware with clean lines and minimal patterns, which aligns directly with the query's focus on minimalist tablescapes. The inclusion of white ceramic plates and plain napkins further reinforces this concept.

Here are some suggestions for improvement: 

 Expand on "minimal patterns":   The answer could benefit from elaborating on what constitutes "minimal patterns." Providing examples like single dots, subtle geometric designs, or even a single floral accent would enhance clarity and provide more specific guidance. 
 Contextualize choices:  Briefly mentioning the benefits of minimalism (e.g., promoting visual simplicity, fostering a sense of calm) cou

In [64]:
### Test evaluation
EVALUATION_MODEL = "llama3"
evaluation_chain = set_evaluate_model_prompt_chain_qa(EVALUATION_MODEL)
query = gemma_qa_df.iloc[6]['gemma2_queries']
#print('Query: ', query)

# Generate queries
answer  =  gemma_qa_df.iloc[6]['gemma2_answers']
print('Generated Answer: ', answer)

# Evaluate queries
scores,feedbacks = llm_evaluate_queries_and_answers_for_context(query, answer,evaluation_chain,EVALUATION_MODEL)
print('Answer Evaluation results')
print(scores)
print(feedbacks)

Generated Answer:  Use neutral colors in your furniture and decor, aiming for a clean and cohesive look.
Here is my evaluation: 

Score:  3
Feedback:  The answer partially addresses the query about minimalist table design. While it provides some general tips on achieving a clean and cohesive look, it does not directly relate to designing tables. The suggestion to use neutral colors is relevant but vague and could apply to any interior design, not specifically tables.

To improve this answer: 

 Provide more specific and concrete suggestions related to table design, such as using simple shapes, reducing ornamentation, or incorporating natural materials.
 Clarify how the suggested approach can be applied to table design in particular, rather than just general interior design.
Answer Evaluation results
[3]
[' The answer partially addresses the query about minimalist table design. While it provides some general tips on achieving a clean and cohesive look, it does not directly relate to des

In [66]:
def llm_evaluate_queries_and_answers_for_context_helper(row,evaluation_chain,EVALUATION_MODEL,query_col_name, answer_col_name):
    scores,feedbacks = llm_evaluate_queries_and_answers_for_context(row[query_col_name], row[answer_col_name],evaluation_chain,EVALUATION_MODEL)  # Generate queries based on the text
    return pd.Series([scores, feedbacks])

In [69]:
EVALUATION_MODEL = "gemma2:2b"
evaluation_chain = set_evaluate_model_prompt_chain_qa(EVALUATION_MODEL)

llama_df_qa[['llama3_queries_gemma_score', 'llama3_queries_gemma_feedback']] = llama_df_qa.apply(
    lambda row: llm_evaluate_queries_and_answers_for_context_helper(row, evaluation_chain, EVALUATION_MODEL, 
                                                        query_col_name='llama3_queries',answer_col_name='llama3_answers'), axis=1)
llama_df_qa.to_parquet(GEMMA2_QA_DATA_EVALUATED_FILE)

In [None]:
EVALUATION_MODEL = "llama3"
evaluation_chain = set_evaluate_model_prompt_chain_qa(EVALUATION_MODEL)

gemma_qa_df[['gemma2_queries_llama3_score', 'gemma2_queries_llama3_feedback']] = gemma_qa_df.apply(
    lambda row: llm_evaluate_queries_and_answers_for_context_helper(row, evaluation_chain, EVALUATION_MODEL, 
                                                        query_col_name='gemma2_queries', answer_col_name='gemma2_answers'), axis=1)
gemma_qa_df.to_parquet(LLAMA3_QA_DATA_EVALUATED_FILE)

## Option2: LLM :: To Generate only q from context

### Prompt for Generating Synthetic data q from context

In [71]:
# Define prompt template for generating questions

QUERY_GENERATION_PROMPT_TEMPLATE = """
You are an AI assistant designed to generate realistic user queries related to various topics.
Your task is to create diverse, natural-sounding questions that reflect what actual users might ask when seeking 
information on a specific subject.

Generate 5 diverse, natural, short, specific, and concise user queries (not more than 10 words) related to the given topic.
Ensure queries vary in complexity, specificity, and format.
Mimic real users seeking information on a chatbot platform.
Use simple language and maintain a neutral, conversational tone.

**Guidelines:**
1. Vary complexity, specificity, and format.
2. Mimic chatbot users seeking information.
3. Use simple language and maintain a neutral tone.
4. Avoid ambiguity and ensure specificity.

Example Input:
Topic: ### Planning a Budget-Friendly Vacation 
Planning a budget-friendly vacation requires careful research and flexibility.

Example Queries:
What are the best budget-friendly destinations worldwide?
How can I save money on flights and accommodations?
What's the cheapest way to travel across Europe?
How do I plan a 5-day budget trip from Singapore to Hawaii?
What budgeting apps are best for travel expenses?

Generate the output in the list of string format
{format_instructions}

Now, generate user queries based on the following topic:
{topic}
"""

### Generating Synthetic data q from context

In [72]:
## https://nanonets.com/blog/langchain/
def set_generate_model_prompt_chain(GENRATIVE_MODEL):
    
    class LLMGenerate_OutputFormat(BaseModel):
        """LLM output format"""
        responses: List[str] = Field(description="List of string responses generated by the LLM",
            min_items=1)
        
    model_generate_synthetic = Ollama(model=GENRATIVE_MODEL)
    
    output_parser = PydanticOutputParser(pydantic_object=LLMGenerate_OutputFormat)

    prompt = PromptTemplate(
        template=QUERY_GENERATION_PROMPT_TEMPLATE,
        input_variables=["topic"],
        partial_variables={"format_instructions": output_parser.get_format_instructions()},
    )
    
    chain = prompt | model_generate_synthetic 
    
    return chain 
    

def llm_generate_queries_for_context(topic,chain,GENRATIVE_MODEL):
    """
    Generate queries for a given context topic using LLaMA.

    Args:
    topic (str): Context topic.

    Returns:
    List[str]: List of generated queries.

    Raises:
    Exception: If LLaMA invocation fails.
    """
    try:
        queries = chain.invoke({"topic": topic})
        
        if(GENRATIVE_MODEL!="llama3.2"):
            
            llm_output_str = queries.strip()

            # Find the JSON part
            json_start = llm_output_str.find("{")
            json_end = llm_output_str.rfind("}")

            # Extract JSON and Parse JSON
            json_str = llm_output_str[json_start:json_end+1]
            llm_output = json.loads(json_str)

            # Extract responses
            responses = llm_output["responses"]

            return responses
        else:
            llm_output_str = queries.strip()

            # Find the JSON part
            json_start = llm_output_str.find("{")
            json_end = llm_output_str.rfind("}")

            # Extract JSON and Parse JSON
            json_str = llm_output_str[json_start:json_end+1]
            llm_output = json.loads(json_str)

            # Extract responses
            responses = llm_output["responses"]

            return responses

    except Exception as e:
        print(e)
        return []

### Applying Synthetic data q  Generation from context

In [73]:
GENRATIVE_MODEL = "gemma2:2b"
generation_chain = set_generate_model_prompt_chain(GENRATIVE_MODEL)

def llm_generate_queries_for_context_helper(row):
    queries = llm_generate_queries_for_context(row['text'],generation_chain,GENRATIVE_MODEL)  # Generate queries based on the text
    return queries  

df = pd.read_parquet(WIKI_SAMPLE_CLEANED_FILE)
# Apply the function to each row in the DataFrame and create a new column 'queries'
df['gemma_queries_sample'] = df.apply(llm_generate_queries_for_context_helper, axis=1)
df.to_parquet(WIKI_QUERY_RAW_FILE)

Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting ',' delimiter: line 1 column 309 (char 308)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Unterminated string starting at: line 1 column 253 (char 252)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)


In [74]:
GENRATIVE_MODEL = "llama3"
generation_chain = set_generate_model_prompt_chain(GENRATIVE_MODEL)

def llm_generate_queries_for_context_helper(row):
    queries = llm_generate_queries_for_context(row['text'],generation_chain,GENRATIVE_MODEL)  # Generate queries based on the text
    return queries  


df=pd.read_parquet(WIKI_QUERY_RAW_FILE)

# Apply the function to each row in the DataFrame and create a new column 'queries'
df['llama3_queries_sample'] = df.apply(llm_generate_queries_for_context_helper, axis=1)
df['doc_id'] = df.index
df.to_parquet(WIKI_QUERY_RAW_FILE)

Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting ',' delimiter: line 4 column 1 (char 77)
Expecting value: line 1 column 1 (char 0)


### Clean the Synthetic data generated q file

In [77]:
def clear_generated_query_data(file_name):
    df= pd.read_parquet(file_name)
    
    df = df[df['gemma_queries_sample'].apply(lambda x: len(x) != 0) & df['llama3_queries_sample'].apply(lambda x: len(x) != 0)]
    
    df.reset_index(drop=True, inplace=True)
    print(df.shape)
    df.to_parquet(WIKI_QUERY_GENERATED_FILE)

    # Explode the 'LLM_Query' column and
    df_exploded_gemma2 = df.explode(['gemma_queries_sample'])
    df_exploded_gemma2.reset_index(drop=True, inplace=True)
    df_exploded_gemma2.drop(columns=['llama3_queries_sample'], inplace=True)
    df_exploded_gemma2.to_parquet(GEMMA2_QUERY_FILE)

    # Explode the 'LLM_Query' column and
    df_exploded_llama3 = df.explode(['llama3_queries_sample'])
    df_exploded_llama3.reset_index(drop=True, inplace=True)
    df_exploded_llama3.drop(columns=['gemma_queries_sample'], inplace=True)
    df_exploded_llama3.to_parquet(LLAMA3_QUERY_FILE)
    
    # Display the result
    return df, df_exploded_gemma2,df_exploded_llama3

raw_query_df, gemma_query_df,llama_query_df = clear_generated_query_data(WIKI_QUERY_RAW_FILE)
raw_query_df.head(2)

(90, 5)


Unnamed: 0,headline,text,gemma_queries_sample,llama3_queries_sample,doc_id
0,Choose your container. Understand why steriliz...,"The ideal container is wide, flat and shallow,...",[What's the best way to sterilize a container ...,[What's the best way to sterilize a container ...,0
1,Avoid bright colors. Choose monochromatic elem...,The ideal palette for a minimalist tablescape ...,[What colors are best for a minimalist tablesc...,[What's the most effective way to balance blac...,1


### LLM-as-a-judge :: Evaluating the quality of generated q data

#### Defining prompts for Evaluating the Synthetic data

In [86]:
# Define evaluation prompt template
Q_EVALUATION_PROMPT_TEMPLATE = """
You are an AI evaluator tasked with assessing the relevance and effectiveness of user queries generated for a specific topic.
Your evaluation will help refine the query generation process.
########
Instructions:
Evaluate the relevance, clarity,overall quality of the provided queries.
Score each query from 1-5 based on the following criteria:
- Relevance: Does the Query directly or indirectly related to the topic, although not focused?
- Clarity: How easy is the query to understand?
- Quality: Does the Query demonstrate user needs?

Provide constructive feedback for improvement.
########
Now, evaluate the user queries based on the following topic:
{topic}
Queries:
{queries}

Provide your evaluation in the following format:

Query {{number}}: {{query}}
Score: {{score}} (1-5)
Feedback: {{feedback}}
"""

#### Evaluate the quality

In [87]:
# Define llm model
def set_evaluate_model_prompt_chain(evaluate_llm_model):
    
    model_evaluate_synthetic = Ollama(model=evaluate_llm_model)
    
    prompt_template = PromptTemplate(
            template=Q_EVALUATION_PROMPT_TEMPLATE,
            input_variables=["topic", "queries"],
        )

    evaluation_chain = prompt_template | model_evaluate_synthetic
    return evaluation_chain 


def llm_evaluate_queries_for_context(topic: str, queries: List[str],evaluation_chain,evaluate_llm_model):
    """Evaluate queries using LLaMA model."""
    try:

        llm_response = evaluation_chain.invoke({"topic": topic, "queries": queries})
        
        llm_response = llm_response.replace('**', '').replace('*', '')  # Remove asterisks
        llm_response = llm_response.replace(':', ': ')  # Standardize colon spacing
        
        if(evaluate_llm_model=="llama3" or evaluate_llm_model=="gemma2:2b"):
            FEEDBACK_PATTERN = r'Feedback: (.*)\n'
            feedbacks = re.findall(FEEDBACK_PATTERN, llm_response, re.MULTILINE)
            
            scores = []
            SCORE_PATTERN = r'Score:\s*(\d+)'
            matches = re.findall(SCORE_PATTERN, llm_response)

            if matches:
                scores = [int(match) for match in matches]
                
            return scores, feedbacks
        
        else:
            FEEDBACK_PATTERN = r'Feedback:\s*(.*?)(?=\n|$)'
            SCORE_PATTERN = r'\*\*Score:\*\* (\d+)'

            # Extract the scores
            scores = [int(score) for score in re.findall(SCORE_PATTERN, llm_response)]
            print(scores)

             # Extract feedbacks
            feedbacks_raw = re.findall(FEEDBACK_PATTERN, llm_response)

            # Append feedbacks one by one to the list
            feedbacks = []
            for feedback in feedbacks_raw:
                feedback = feedback.replace('**', '')
                feedbacks.append(feedback.strip()) 

            return scores, feedbacks

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return [], []


In [88]:
def llm_evaluate_queries_for_context_helper(row,evaluation_chain,EVALUATION_MODEL,query_col_name):
    scores,feedbacks = llm_evaluate_queries_for_context(row['text'], row[query_col_name],evaluation_chain,EVALUATION_MODEL)  # Generate queries based on the text
    return pd.Series([scores, feedbacks])

In [89]:
EVALUATION_MODEL = "llama3"
evaluation_chain = set_evaluate_model_prompt_chain(EVALUATION_MODEL)

raw_query_df[['gemma_queries_llama3_score', 'gemma_queries_llama3_feedback']] = raw_query_df.apply(
    lambda row: llm_evaluate_queries_for_context_helper(row, evaluation_chain, EVALUATION_MODEL, 
                                                        query_col_name='gemma_queries_sample'), axis=1)

raw_query_df.to_parquet(WIKI_QUERY_EVALUATED_FILE)

In [90]:
EVALUATION_MODEL = "gemma2:2b"
evaluation_chain = set_evaluate_model_prompt_chain(EVALUATION_MODEL)
raw_query_df = pd.read_parquet(WIKI_QUERY_EVALUATED_FILE)

raw_query_df[['llama3_queries_gemma_score', 'llama3_queries_gemma_feedback']] = raw_query_df.apply(
    lambda row: llm_evaluate_queries_for_context_helper(row, evaluation_chain, EVALUATION_MODEL, 
                                                        query_col_name='llama3_queries_sample'), axis=1)

raw_query_df.to_parquet(WIKI_QUERY_EVALUATED_FILE)