In [10]:
import pandas as pd 
import os 

from tqdm import tqdm

import torch
import transformers

from transformers import pipeline 

from transformers import logging

logging.set_verbosity_error()

In [None]:
# This is a gated model, requiring access authorization on Hugging Face. To request access, please visit the following link: https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct
model_id='meta-llama/Llama-3.2-3B-Instruct'

# Memory Usage:
# This model configuration utilizes approximately 6.5 GB of VRAM due to the use of bfloat16 precision.

# For environments with limited GPU memory, consider these lower-precision options:
# FP8: Reduces VRAM usage to around 3.2 GB.
# INT4:Further decreases VRAM consumption to approximately 1.75 GB.

#Note: While lower precision formats can significantly reduce memory footprint, they may also lead to a gradual degradation in the quality and accuracy of the model's output.

pipe=pipeline(task='text-generation',model=model_id,device_map='auto',torch_dtype=torch.bfloat16)

Downloading shards: 100%|██████████| 2/2 [09:23<00:00, 281.85s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]
Device set to use cuda:0


In [21]:
def generate_questions(conversation_type,num_questions,output_file):

    if conversation_type=='conversational':
        system_prompt=""" You are a helpful assistant that generates diverse , realistic conversational questions. These questions should be the type of questions people might ask
        each other in their everyday conversations . Do NOT include any answers. Only generate questions. Generate questions of varying length and complexity.

        Here are some examples:
        - How was your weekend?
        - What are you working on today?
        - Have you seen any good movies lately?
        - what is your favourite restaurant?
        - Do you have any plans for the holidays?
        """
    elif conversation_type=='coding':
        system_prompt=""" You are a helpful assistants that generates coding questions. These questions should be diverse and must cover a range of programming topics , languages
        (Python, JavaScript, Java, C, C++, etc. ). Do NOT generate any solutions or answers. Only generate the questions.
        
        Here are some examples:
        - How do you reverse a list in python?
        - Can you explain how recursion works?
        - How do I debug a segmentation fault in C++?
        - What is the best way to handle asynchronous operations in JavaScript?
        - What's the difference between an interface and an abstract class in Java?
        """
    
    elif conversation_type=='help':
        system_prompt="""You are a helpful assistant that generates questions that someone might ask when they need help with a task or problem. These should cover a variety of domains (e.g., tech support, cooking, home repair, travel planning).
        Do NOT include any answers.  Only generate the question. Be specific.
        
        Here are some examples:
        - My printer isn't working. What should I do?
        - How can I bake a cake?
        - How do I reset my Wi-Fi router?
        - I'm planning a trip to Japan. What are the must-see places?
        - How do I reset my PC?
        """

    else:
        raise ValueError(f'Invalid conversational type given : {conversation_type}')
    
    write_header= not os.path.exists(output_file)

    for i in tqdm(range(num_questions),desc=f'Generating {conversation_type} Questions',total=num_questions):
        message=[{"role":"system","content":system_prompt},
                 {"role":"user","content":"Generate a question."}]
        
        try:
            output=pipe(message,max_new_tokens=50,do_sample=True,top_p=1.0,temperature=1.0,return_full_text=False)[0]
            question = output['generated_text'].strip()

            new_row_df = pd.DataFrame({'conversational_type':conversation_type,'question':question},index=[0])

            new_row_df.to_csv(output_file,mode='a',header=write_header,index=False)

            write_header=False
        except Exception as e:
            print(f'Error generating question {i+1}: {e}')
        
    print(f'Generated {conversation_type} questions of size {num_questions} to {output_file}')

In [None]:
generate_questions(conversation_type='conversational',num_questions=5000,output_file='data/conversational_questions.csv')

Generating conversational Questions: 100%|██████████| 5000/5000 [1:12:16<00:00,  1.15it/s]

Generated conversational questions of size 5000 to conversational_data/conversational_questions.csv





In [None]:
generate_questions(conversation_type='coding',num_questions=5000,output_file='data/coding_questions.csv')

Generating coding Questions: 100%|██████████| 5000/5000 [1:40:12<00:00,  1.20s/it]

Generated coding questions of size 5000 to conversational_data/coding_questions.csv





In [None]:
generate_questions(conversation_type='help',num_questions=5000,output_file='data/help_questions.csv')

Generating help Questions: 100%|██████████| 5000/5000 [1:17:19<00:00,  1.08it/s]

Generated help questions of size 5000 to conversational_data/help_questions.csv



