In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local directory holding the checkpoint; the tokenizer and the weights are
# both read from this one location.
model_name = "/scratch/rohank__iitp/qwen2_5_7b_instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# dtype and device placement are both left to the library ("auto").
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [166]:
def generate(prompt: str, max_new_tokens: int = 100, temperature: float = 0.7, top_p: float = 0.9) -> str:
    """Sample a continuation of ``prompt`` and return it as plain text.

    Uses the module-level ``tokenizer`` and ``model``. The prompt is passed
    to the model as-is (no chat template is applied here).

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.
        temperature: Sampling temperature, forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold, forwarded to ``model.generate``.

    Returns:
        The decoded continuation only (prompt tokens excluded), with special
        tokens skipped and leading/trailing whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Remember how many tokens the prompt occupies so it can be cut off the
    # front of the output; this assumes the returned sequence starts with the
    # prompt tokens.
    prompt_length = inputs["input_ids"].shape[1]

    # do_sample=True: replies are sampled, so repeated calls may differ.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=temperature,
    )

    continuation = outputs[0][prompt_length:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()

# Smoke test: send one prompt through generate() and show the reply as the cell output.
generate("What is the capital of France?")

"The capital of France is Paris. \n\nParis is a major European city located on the Seine River in northern central France and is the country's most populous city, with an estimated population of 2.1 million people in its administrative limits and over 10 million people in its metropolitan area.\n\nSome notable landmarks in Paris include the Eiffel Tower, Notre-Dame Cathedral, the Louvre Museum, and the Arc de Triomphe, among many others. It is also known for its"

In [167]:
import json
import re

def convert_structured_to_jsonl(
    text_block: str,
    i: int,
    output_path: str = "/home/rohank__iitp/Work/niladri/dataset2/single/single_response.jsonl",
) -> str:
    """Serialize one response as a JSON line and append it to ``output_path``.

    Args:
        text_block: Response text; surrounding whitespace is stripped before
            it is stored under the ``"answer"`` key.
        i: Identifier stored under the ``"id_json"`` key.
        output_path: JSONL file to append to. Defaults to the notebook's
            original hard-coded location.

    Returns:
        The JSON string that was written (without the trailing newline).
    """
    record = {
        "id_json": i,
        "answer": text_block.strip(),
    }
    res = json.dumps(record)

    # Append mode: every call adds a line, so calling again with the same
    # arguments duplicates the record. The encoding is explicit to match the
    # cell that later reads this file back as UTF-8.
    with open(output_path, "a", encoding="utf-8") as f:
        f.write(res + "\n")
    return res



In [168]:
import pandas as pd

# Load CSV
def csv_load(i: int, file_path: str = '/home/rohank__iitp/Work/niladri/dataset2/conversation.csv') -> list:
    """Return the speaker-labelled turns of one conversation, ordered by turn.

    Reads the CSV at ``file_path`` (columns used: ``conversation_id``,
    ``turn_no``, ``speaker``, ``utterance``), keeps the rows whose
    ``conversation_id`` equals ``i`` and sorts them by ``turn_no``.

    Args:
        i: Conversation id to select.
        file_path: CSV file to read. Defaults to the notebook's original
            hard-coded location.

    Returns:
        A list of ``"<speaker>: <utterance>"`` strings, one per row of the
        selected conversation; empty if the id does not occur.
    """
    df = pd.read_csv(file_path)

    # Filter with .loc and sort without inplace=True so a new frame is
    # produced instead of mutating a slice of ``df``.
    turns = df.loc[df['conversation_id'] == i].sort_values(by="turn_no")

    return [
        f"{speaker}: {utterance}"
        for speaker, utterance in zip(turns['speaker'], turns['utterance'])
    ]




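#### Single-turn agent replies ("Single Gemini" setup; responses here come from the locally loaded Qwen model via `generate`)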

In [169]:
def gemini_response(dialogue):
    """Wrap ``dialogue`` in the agent-reply prompt and return ``generate``'s output.

    Despite the name, the reply comes from whatever ``generate`` calls; this
    function only builds the prompt text.
    """
    # The pieces below concatenate to exactly the original multi-line prompt,
    # including its leading newline and four-space indentation.
    instruction = "\n    You are helpful assistant. Your task is to give agent like response. \n"
    query = f"    Give the answer for this query: {dialogue}\n"
    return generate(instruction + query + "    Agent Reply:")


In [170]:
# Collect the labelled turns of conversations 5 through 20 into one flat list.
result = []
for i in range(5, 21):
    res = csv_load(i)
    result += res  # in-place concatenation keeps the list flat

len(result)

209

In [171]:
# Generate one reply per collected turn and append it to the JSONL file.
# Record ids start at 46 and increase by one per turn.
# NOTE: convert_structured_to_jsonl opens its file in append mode, so
# re-running this cell adds a second copy of every record.
for i, sentence in enumerate(result, start=46):
    final_output = gemini_response(sentence)
    convert_structured_to_jsonl(final_output, i)
    print(sentence)

User: Hi, I'm looking for motor insurance for my 2022 Hyundai Kona Electric. Can you help?
Agent: Absolutely! The Hyundai Kona Electric is a fantastic car. Given it's an EV, are you particularly concerned about battery coverage or charging-related issues?
User: Yes, battery coverage is a big concern. I've heard those repairs can be super expensive.
Agent: I understand. With Tata AIG, we understand the nuances of EVs. Our policy is designed to address modern vehicle risks, it ensures claims are processed quickly and effectively.
User: Okay, good. What kind of coverage options do you offer for the battery specifically?
Agent: We offer a comprehensive plan that includes coverage for accidental damage, theft, and also covers the battery against manufacturing defects and certain types of damage. It's a complete package, offering extensive financial protection.
User: That sounds reassuring. And what about roadside assistance? Does that cover EVs?
Agent: Yes, our Roadside Assistance covers EV

In [172]:
import json
import re

# Function to clean markdown and formatting from text
def clean_text(text):
    """Strip markdown-style punctuation and normalise whitespace in ``text``.

    Each run of ``*``, backtick, ``_``, ``>``, ``#``, backslash, ``-``,
    carriage return or newline becomes a single space (this also affects
    hyphens and underscores inside words). Whitespace runs are then collapsed
    to one space and the ends are trimmed.
    """
    without_markup = re.sub(r'[*`_>#\\\-\r\n]+', ' ', text)
    return re.sub(r'\s+', ' ', without_markup).strip()

# Source JSONL (raw model replies) and destination JSONL (cleaned replies).
input_file = "/home/rohank__iitp/Work/niladri/dataset2/single/single_response.jsonl"
output_file = "/home/rohank__iitp/Work/niladri/dataset2/single/cleaned_output.jsonl"

# Read record by record, clean only the "answer" field, and write each record
# back out as one JSON line. The destination is opened in "w" mode, so it is
# rewritten from scratch on every run.
with open(input_file, "r", encoding="utf-8") as infile:
    with open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            data = json.loads(line)
            data["answer"] = clean_text(data["answer"])
            print(json.dumps(data), file=outfile)

print(f"Cleaned data written to {output_file}")


Cleaned data written to /home/rohank__iitp/Work/niladri/dataset2/single/cleaned_output.jsonl
