In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load Gemma model
model_id = "google/gemma-2b-it"  # or "google/gemma-7b-it" if you have GPU resources
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)

# Set up text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50)


2025-06-17 17:42:20.047463: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-17 17:42:20.060655: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-17 17:42:20.077839: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-17 17:42:20.082698: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-17 17:42:20.095260: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [6]:
def gemma_extract_keywords(sentence):
    prompt = f"""
    Extract the 4 to 5 most important single-word keywords from the following sentence, focusing specifically on the Logical persuasion strategy.

    Logical persuasion includes reasoning, evidence, facts, statistics, cause-effect relationships, and structured arguments.

    Return only the keywords as a comma-separated list, with no explanation or extra text.
    Important: Do not include product names, brand names, place names, or proper nouns.


    Sentence: "{sentence}"
    """
    try:
        output = generator(prompt, do_sample=False)[0]["generated_text"]
        keyword_line = output.split("Keywords:")[-1].strip().split("\n")[0]
        keywords = [k.strip() for k in keyword_line.split(",") if k.strip()]
        return keywords[:5] + [""] * (5 - len(keywords))
    except Exception as e:
        print(f"Error on: {sentence[:40]}... -> {e}")
        return [""] * 5
    
def extract_keywords(sentence):
    prompt = f"""
You are a helpful assistant skilled in persuasion strategies.

Extract 4 to 5 **single-word** keywords from the following sentence using the *Logical* persuasion strategy.

Logical persuasion includes: reasoning, evidence, facts, statistics, cause-effect relationships, structured arguments.

Do NOT include product names, brand names, or places.

Only return keywords as a comma-separated list.

Sentence: "{sentence}"

Keywords:
"""
    try:
        output = generator(prompt, do_sample=False)[0]["generated_text"]
        keyword_line = output.split("Keywords:")[-1].strip().split("\n")[0]
        keywords = [k.strip() for k in keyword_line.split(",") if k.strip()]
        return keywords[:5] + [""] * (5 - len(keywords))
    except Exception as e:
        print(f"Error on: {sentence[:40]}... -> {e}")
        return [""] * 5


In [None]:
# hugging face token= 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

In [1]:
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

# Load medgemma-4b-it model and processor (chat format)
model_id = "google/medgemma-4b-it"

model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)


2025-06-17 18:04:08.534661: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-17 18:04:08.547509: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-17 18:04:08.563758: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-17 18:04:08.568668: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-17 18:04:08.580487: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [14]:

def extract_keywords(sentence):
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are an assistant skilled in logical *persuasion strategy*."}]
        },
        {
            "role": "user",
            "content": [{
                "type": "text",
                "text": f"""Extract the *4 to 5 most important single-word keywords* from the following sentence, focusing specifically on the *Logical persuasion strategy*.


    Return only the keywords as a comma-separated list, with no explanation or extra text.
    Important: Do not include product names, brand names, place names, or proper nouns.

    Sentence: "{sentence}"
    """
            }]
        }
    ]


    try:
        # Prepare input for generation
        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True,
            return_dict=True, return_tensors="pt"
        ).to(model.device, dtype=torch.bfloat16)

        input_len = inputs["input_ids"].shape[-1]

        # Generate output
        with torch.inference_mode():
            output = model.generate(**inputs, max_new_tokens=50, do_sample=False)
            output = output[0][input_len:]

        # Decode and clean response
        decoded = processor.decode(output, skip_special_tokens=True).strip()
        keywords = [kw.strip() for kw in decoded.split(",") if kw.strip()]
        return keywords[:5] + [""] * (5 - len(keywords))

    except Exception as e:
        print(f"Error on: {sentence[:40]}... -> {e}")
        return [""] * 5


In [3]:
!pwd
!ls


/DATA/rohan_kirti/niladri


 conversation1.csv   jupyter.log  'key_gemma copy.ipynb'   key_gemma.ipynb


In [16]:
extract_keywords("Yes, HDFC ERGO includes Roadside Assistance with services like towing, jump-start, flat tire help, and fuel delivery.")

Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Assistance', 'Services', 'Benefits', 'Features', '']

In [17]:
import pandas as pd

df = pd.read_csv("conversation1.csv")  
keywords_list = []
for _, row in df.iterrows():
    if str(row["P-Strategy"]).strip().lower() == "logical":
        keywords = extract_keywords(row["utterance"])
        print(keywords)
    else:
        keywords = [""] * 5
        # print("hi")
    keywords_list.append(keywords)

keyword_df = pd.DataFrame(keywords_list, columns=["keyword1", "keyword2", "keyword3", "keyword4", "keyword5"])
df_with_keywords = pd.concat([df, keyword_df], axis=1)

df_with_keywords.to_csv("gemma_logical_keywords1.csv", index=False)
print(" Output saved to 'logical_keywords_output.csv'")

Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Recommend', 'Insurance', 'Logical', '', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Benefits', 'Policy', 'Assistance', 'Depreciation', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Policy', 'Premium', 'Include', 'What', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Logic', 'Reason', 'Evidence', 'Value', 'Cost']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Policy', 'Assistance', 'Roadside', 'Logic', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Assistance', 'Services', 'Benefits', 'Features', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Zero Depreciation', '', '', '', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Value', 'Claim', 'Depreciation', 'Cost', '']
['Easy', '', '', '', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Claim', 'Process', 'Tracking', 'Network', '']
['Benefit', '', '', '', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Bonus', 'Premium', 'Reduce', 'Qualify', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Cover\nDamage\nPolicy\nEngine', '', '', '', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Covered', 'Recommended', 'Opt', 'Protection', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Persuasion', 'Logic', 'Reason', 'Argument', 'Evidence']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Guide', 'Quote', 'Steps', 'Prepare', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Recommend', 'Insurance', 'Persuasion', 'Logical', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Benefits', 'Policy', 'Assistance', 'Depreciation', '']


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


['Policy', 'Premium', 'Include', 'What', '']
['Logic', 'Reason', 'Evidence', 'Value', 'Risk']
 Output saved to 'logical_keywords_output.csv'


## Gemma 4b-it

In [1]:
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
import torch

# Use gemma-3-4b-it (text-only version)
model_id = "google/gemma-3-4b-it"

# Load model and processor
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto"
).eval()

processor = AutoProcessor.from_pretrained(model_id)


2025-06-17 20:20:03.130110: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-17 20:20:03.143838: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-17 20:20:03.160887: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-17 20:20:03.165982: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-17 20:20:03.178254: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
def logical_extract_keywords(sentence):
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are an assistant skilled in logical *persuasion strategy*."}]
        },
        {
            "role": "user",
            "content": [{
                "type": "text",
                "text": f"""Extract the *4 to 5 most important single-word keywords* from the following sentence, focusing specifically on the *Logical persuasion strategy*.

Return only the keywords as a comma-separated list, with no explanation or extra text.
Important: Do not include product names, brand names, place names, or proper nouns.

 Examples:
    Sentence: "I have a 2021 Honda Amaze. What insurance would you recommend?"
    Output: insurance, recommend

    Sentence: "It includes own damage, third-party liability, theft, natural disasters, and more. The premium is approx $1176 per year, based on IDV."
    Output: damage, theft, disaster, premium

    Sentence: "This plan provides better coverage and lower premium compared to the previous one."
    Output: coverage, premium, comparison

    Sentence: "The repair costs are significantly reduced with this policy."
    Output: repair, cost, policy

    Now, extract from the following:

Sentence: "{sentence}"
"""
            }]
        }
    ]



    # messages = [
    #     {
    #         "role": "system",
    #         "content": [
    #             {
    #                 "type": "text",
    #                 "text": "You are an assistant skilled in identifying keywords used in logical persuasion. Logical persuasion is based on facts, statistics, reasoning, cost-benefit analysis, and objective evaluations. It avoids emotional appeals or vague claims, and instead focuses on evidence, consequences, comparisons, functionality, efficiency, and measurable impact."
    #             }
    #         ]
    #     },
    #     {
    #         "role": "user",
    #         "content": [
    #             {
    #                 "type": "text",
    #                 "text": """Your task is to extract the *4 to 5 most important single-word keywords* from the sentence below. Focus strictly on the *Logical persuasion strategy*.

    # Logical persuasion involves factual or analytical reasoning—such as cost, risk, benefit, performance, features, warranty, coverage, or other evidence-based elements. Avoid including emotional, subjective, or brand-related words.

    #  Important Instructions:
    # - Return only the keywords as a comma-separated list.
    # - Do not include product names, brand names, places, or proper nouns.
    # - Only pick nouns, verbs, or adjectives related to logical decision-making or analysis.

    #  Examples:
    # Sentence: "I have a 2021 Honda Amaze. What insurance would you recommend?"
    # Output: insurance, recommend

    # Sentence: "It includes own damage, third-party liability, theft, natural disasters, and more. The premium is approx $1176 per year, based on IDV."
    # Output: damage, theft, disaster, premium

    # Sentence: "This plan provides better coverage and lower premium compared to the previous one."
    # Output: coverage, premium, comparison

    # Sentence: "The repair costs are significantly reduced with this policy."
    # Output: repair, cost, policy

    # Now, extract from the following:

    # Sentence: "{sentence}"
    # """
    #             }
    #         ]
    #     }
    # ]


    try:
        # Prepare input
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device, dtype=torch.bfloat16)

        input_len = inputs["input_ids"].shape[-1]

        # Generate output
        with torch.inference_mode():
            output = model.generate(**inputs, max_new_tokens=50, do_sample=False)
            output = output[0][input_len:]

        # Decode and clean output
        decoded = processor.decode(output, skip_special_tokens=True).strip()
        keywords = [kw.strip() for kw in decoded.split(",") if kw.strip()]
        return keywords[:5] + [""] * (5 - len(keywords))

    except Exception as e:
        print(f"Error on: {sentence[:40]}... -> {e}")
        return [""] * 5

In [None]:
import pandas as pd

df = pd.read_csv("conversation1.csv")  
keywords_list = []
for _, row in df.iterrows():
    if str(row["P-Strategy"]).strip().lower() == "logical":
        keywords = logical_extract_keywords(row["utterance"])
        print(keywords)
    elif str(row["P-Strategy"]).strip().lower() == "emotional":
        keywords = [""] * 5
        # print("hi")
    keywords_list.append(keywords)

keyword_df = pd.DataFrame(keywords_list, columns=["keyword1", "keyword2", "keyword3", "keyword4", "keyword5"])
df_with_keywords = pd.concat([df, keyword_df], axis=1)

df_with_keywords.to_csv("gemma_logical_keywords2.csv", index=False)
print(" Output saved to 'gemma_logical_keywords2.csv'")



['insurance', 'recommend', '', '', '']
['policies', 'benefits', 'assistance', 'depreciation', '']
['policy', 'premium', 'include', '', '']
['damage', 'theft', 'disaster', 'premium', '']
['assistance', 'policy', 'roadside', '', '']
['assistance', 'services', 'delivery', 'help', '']
['add-ons', 'depreciation', '', '', '']
['Depreciation', 'claim', 'cost', 'valuable', '']
['process', 'easy', 'claim', '', '']
['process', 'tracking', 'network', 'repairs', '']
['claimed', 'benefit', '', '', '']
['qualify', 'premium', 'bonus', 'reduce', '']
['damage', 'policy', 'cover', '', '']
['covered', 'protection', 'recommended', '', '']
['interest', 'plan', '', '', '']
['quote', 'purchase', 'guide', '', '']
['insurance', 'recommend', '', '', '']
['policies', 'benefits', 'assistance', 'depreciation', '']
['policy', 'premium', 'include', '', '']
 Output saved to 'gemma_logical_keywords2.csv'
