In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub checkpoint name shared by the tokenizer and the model.
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Both objects are loaded from the same checkpoint; the two loads are independent.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): "auto" presumably lets the library pick dtype and device placement — confirm against the transformers docs.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [69]:
def generate(prompt: str, max_new_tokens: int = 100, temperature: float = 0.7, top_p: float = 0.9) -> str:
    """Sample a continuation of ``prompt`` from the notebook-level ``model``.

    The prompt is tokenized as-is (no chat template is applied here) and moved
    to the model's device before sampling.

    Args:
        prompt: Raw text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The decoded continuation only (prompt tokens removed), with special
        tokens skipped and surrounding whitespace stripped.

    Sampling is enabled (``do_sample=True``) and no seed is set here, so
    repeated calls can return different text.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=temperature,
        # EOS doubles as the padding id for generation.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the first `prompt_length` positions so only newly generated tokens are decoded.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Smoke-test call: as the cell's last expression, the returned string is shown as the cell output.
generate("What is the capital of France?")

'Paris.\nWhat is the largest city in France?\nParis.\nWhat is the largest state in France?\nIreland.\nWhat is the largest metropolitan area in France?\nParis.\nWhat is the most populous city in France?\nParis.\nWhat is the most populous metropolitan area in France?\nIreland.\nWhat is the highest mountain peak in France?\nMont Blanc.\nWhat is the highest mountain peak in France?\nMont Blanc.\nWhat is the largest river in France?\nThe Rhine.\nWhat is the largest river in'

In [70]:
import json
import re

def convert_structured_to_jsonl(
    text_block: str,
    i: int,
    output_path: str = "/DATA/rohan_kirti/niladri/dataset3/single/single_response.jsonl",
) -> str:
    """Serialize one response as a JSON line, append it to a JSONL file, and return it.

    Args:
        text_block: Response text; leading/trailing whitespace is stripped
            before it is stored under the ``"answer"`` key.
        i: Identifier stored under the ``"id_json"`` key.
        output_path: JSONL file to append to. Defaults to the path this
            function originally had hard-coded.

    Returns:
        The JSON string that was written (without the trailing newline).

    The file is opened in append mode, so calling this again with the same
    ``i`` adds another line rather than replacing the earlier one.
    """
    res = json.dumps({"id_json": i, "answer": text_block.strip()})

    # Explicit UTF-8 so the file is written with the same encoding the
    # cleaning step uses to read it, independent of the platform default.
    with open(output_path, "a", encoding="utf-8") as f:
        f.write(res + "\n")
    return res



In [71]:
import pandas as pd

# Load CSV
def csv_load(i: int, file_path: str = '/DATA/rohan_kirti/niladri/dataset3/conversation.csv') -> list:
    """Return the utterances of one conversation as ``"<speaker>: <utterance>"`` strings.

    Args:
        i: Value matched against the ``conversation_id`` column.
        file_path: CSV file to read. It must provide the columns
            ``conversation_id``, ``turn_no``, ``speaker`` and ``utterance``.
            Defaults to the path this function originally had hard-coded.

    Returns:
        One string per row of the selected conversation, ordered by
        ``turn_no``; an empty list when no row matches ``i``.

    The CSV is re-read on every call.
    """
    df = pd.read_csv(file_path)

    # sort_values without inplace returns a new frame, so the filtered slice
    # is never mutated (the in-place form triggers SettingWithCopyWarning).
    conversation = df[df['conversation_id'] == i].sort_values(by="turn_no")

    return [
        f"{speaker}: {utterance}"
        for speaker, utterance in zip(conversation['speaker'], conversation['utterance'])
    ]




#### Single Gemini (responses here are generated with the Llama model loaded above)

In [72]:
def gemini_response(dialogue):
    """Embed ``dialogue`` in the fixed agent-reply prompt and return ``generate``'s output for it."""
    # The pieces below concatenate to exactly the original multi-line prompt,
    # including its leading newline, four-space indents and the trailing
    # space after "response.".
    agent_prompt = (
        "\n"
        "    You are helpful assistant. Your task is to give agent like response. \n"
        f"    Give the answer for this query: {dialogue}\n"
        "    Agent Reply:"
    )
    return generate(agent_prompt)


In [74]:
# Collect the formatted utterances of conversations 1 through 4 into one flat list.
result = []
for conv_id in range(1, 5):
    result += csv_load(conv_id)

len(result)

46

In [76]:
# Generate a reply for every collected utterance and append it to the JSONL file;
# the stored id is the utterance's 1-based position in `result`.
for i, sentence in enumerate(result, start=1):
    final_output = gemini_response(sentence)
    res = convert_structured_to_jsonl(final_output, i)
    print(sentence)

User: Hi, I'm looking to get motor insurance for my new electric vehicle. It's a 2024 Tesla Model 3.
Agent: Great choice! The Tesla Model 3 is an excellent vehicle. Since you've opted for an EV, are you particularly interested in coverage specific to electric vehicles, like battery protection?
User: Yes, battery protection is definitely a concern. It's a big investment, and I want to make sure it's covered.
Agent: Absolutely. The battery is the heart of your Tesla. With Tata AIG, you get rapid claims resolution combining thorough coverage with rapid claims resolution. It integrates technology with traditional risk management practices, ensuring that claims are processed quickly and effectively.
User: What kind of coverage options do you have specifically for EVs?
Agent: We offer a comprehensive plan that includes coverage for accidental damage, theft, and third-party liability. More importantly, we offer add-ons like Zero Depreciation Cover and Engine & Gearbox Protection. And our onli

In [77]:
import json
import re

# Function to clean markdown and formatting from text
def clean_text(text):
    """Replace markdown punctuation and line breaks with spaces, then normalise spacing.

    Every run of ``*``, backtick, ``_``, ``>``, ``#``, backslash, ``-``,
    carriage return or newline becomes a single space. Whitespace runs are
    then collapsed to one space and the ends are trimmed. Hyphens and
    underscores inside words are replaced as well.
    """
    # Each run of markdown characters / CR / LF turns into one space.
    without_markup = re.compile(r'[*`_>#\\\-\r\n]+').sub(' ', text)
    # Squeeze whitespace runs to single spaces and trim both ends.
    return re.compile(r'\s+').sub(' ', without_markup).strip()

# Source JSONL (raw responses) and destination JSONL (cleaned responses)
input_file = "/DATA/rohan_kirti/niladri/dataset3/single/single_response.jsonl"   # Replace with your actual input filename
output_file = "/DATA/rohan_kirti/niladri/dataset3/single/cleaned_output.jsonl"

# Stream the input one JSON record per line, clean its "answer" field, and
# write the record to the output file (which is overwritten on each run).
with open(input_file, "r", encoding="utf-8") as infile:
    with open(output_file, "w", encoding="utf-8") as outfile:
        for raw_line in infile:
            record = json.loads(raw_line)
            record["answer"] = clean_text(record["answer"])
            outfile.write(f"{json.dumps(record)}\n")

print(f"Cleaned data written to {output_file}")


Cleaned data written to /DATA/rohan_kirti/niladri/dataset3/single/cleaned_output.jsonl
