In [None]:
import torch
import pandas as pd

from tqdm import tqdm
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM

### Tokenizer and Model

In [2]:
# Hugging Face Hub id of the checkpoint; passed to both the tokenizer and model loaders.
model_id = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"

In [3]:
# Load the tokenizer for the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [4]:
# Load the causal-LM for `model_id`. bfloat16 is requested as the torch dtype and
# device placement is delegated to the "auto" device map.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [93]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": [
      "lm_head",
      "multi_modal_p

### Data

In [94]:
# Directory holding the dataset files, resolved relative to the current working directory.
DATA_DIR = Path.cwd().joinpath("am_reasoning", "data_files")

In [95]:
# Read the dataset CSV from DATA_DIR, using its first column as the DataFrame index.
df = pd.read_csv(DATA_DIR / "pe_dataset.csv", index_col=0)

In [111]:
df.columns

Index(['split', 'title', 'argument_bound_1', 'argument_bound_2', 'argument_id',
       'ac', 'label', 'essay'],
      dtype='object')

In [210]:
df.shape

(6089, 8)

### Template Messages

In [None]:
# sys_msg = """You are an expert in argumentation analysis. You will be given a complete argument. Your task is to analyze the argument holistically and classify it as exactly ONE of the following categories: Position, Claim, or Premise. 

# First, think through your analysis step by step, considering the function and structure of the argument. Explain your reasoning, identifying key characteristics that point to a specific classification.

# Then, provide your final classification as a single word: Position, Claim, or Premise."""

In [None]:
# instruction = """### You are an expert in Argument Mining. You are given an argument component from an essay. Your task is to classify the argument component as either "MajorClaim", "Claim" or "Premise". You must return the argument component type in following JSON format:: {"component_type": [component_type (str)]}


# """

In [287]:
# instruction = """### You are an expert in Argument Mining and discourse analysis. 

# INPUT:
# - You are given an argument component from an essay. 
# - You are given the full text of the essay.

# TASK:
# - Your task is to classify the whole argument component, as ONLY ONE of: "MajorClaim", "Claim" or "Premise".
# - Explain your reasoning step-by-step before making your final classification.

# Then, return your final classification in the following JSON format:
# {"component_type": [Answer]}

# Example:

# {"component_type": ["Claim"]}

# """

In [317]:
# Accumulator for model outputs: the generation loop writes one decoded response per
# prompt, keyed by the argument-component segment of that prompt's user message.
reasons_d = {}

In [318]:
# System prompt used for every example. It asks the model to reason step by step about a
# single argument component (with the full essay as context) and then emit exactly one
# label -- "MajorClaim", "Claim" or "Premise" -- wrapped in a small JSON object.
instruction = """### You are an expert in Argument Mining and discourse analysis. 

INPUT:
- You are given a single argument component from an essay. 
- You are also given the full text of the essay for context.

TASK:
- Your task is to classify the ENTIRE argument component as ONE of these three categories ONLY: "MajorClaim", "Claim" or "Premise".
- Do NOT break down the argument component into smaller parts with different classifications.
- The WHOLE component must receive exactly ONE classification.

REASONING PROCESS:
First, analyze the component considering these factors:
1. Semantic content: What is the meaning and purpose of this component in the argument?
2. Syntactic structure: How is this component phrased? (declarative statement, supporting evidence, etc.)
3. Context: How does this component relate to the overall essay thesis?
4. Function: 
   - Does it express the author's central position on the main topic? (MajorClaim)
   - Does it express a specific stance that supports or develops the main position? (Claim)
   - Does it provide evidence, examples, or justification for a claim? (Premise)

Provide your step-by-step reasoning, then classify the ENTIRE argument component as exactly ONE of: "MajorClaim", "Claim", or "Premise".

Finally, return your classification in the following JSON format:
{"component_type": ["Your Answer"]}

Example:
{"component_type": ["Claim"]}

"""

In [319]:
# Draw a small subset of rows to prompt the model with. The fixed random_state makes
# the same 10 rows come back on every Restart & Run All, so the generated outputs
# shown further down can be reproduced and compared between runs.
df_s = df.sample(n=10, random_state=42)

In [320]:
# Build one system message and one user message per sampled row. Every prompt shares
# the same instruction text; the user turn carries the full essay followed by the
# argument component to classify, each introduced by a "###" header.
sys_msgs = []
user_msgs = []

for essay_text, component in zip(df_s["essay"], df_s["ac"]):
    sys_msg = {"role": "system", "content": instruction}
    usr_msg = {
        "role": "user",
        "content": (
            f"### Here is the complete essay text: {essay_text}"
            "\n\n"
            f"### Here is the argument component to be classified: {component}"
        ),
    }

    sys_msgs.append(sys_msg)
    user_msgs.append(usr_msg)
    

In [321]:
# Pair each system message with its user message to form one two-turn chat per example.
messages = [[system_turn, user_turn] for system_turn, user_turn in zip(sys_msgs, user_msgs)]

In [322]:
len(messages)

10

In [323]:
# Spot-check one prompt: take the user turn of the sixth chat and split it on "###".
# Segment 0 is the empty text before the first header, segment 1 the essay section,
# and segment 2 the argument-component section displayed here.
messages[5][1]['content'].split("###")[2]

' Here is the argument component to be classified: When I was ten years old, I was a shy girl'

In [324]:
#print(usr_msg)

In [325]:
# messages = [
#     {"role": "system", "content": instruction},
#     {"role": "user", "content": usr_msg},
# ]

In [326]:
# Token ids that stop generation (passed as `eos_token_id` to `model.generate`):
# the tokenizer's EOS id plus the id of the "<|eot_id|>" token.
# NOTE(review): for this checkpoint the two ids may be the same value -- confirm.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [327]:
# Generate one reasoning trace per chat and store it in `reasons_d`.
#
# Sampling is enabled (do_sample=True), so seed torch's RNG first: re-running this
# cell on the same prompts then yields the same generations.
torch.manual_seed(42)

# Wrap the list itself (not `enumerate(...)`) so tqdm can read its length and show a
# real progress bar with total and ETA; the loop index was never used.
for message in tqdm(messages, desc="generating"):

    input_ids = tokenizer.apply_chat_template(
        message, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)  # type: ignore
    outputs = model.generate(
        input_ids,
        max_new_tokens=1024,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
    )

    # Keep only the newly generated tokens (everything after the prompt).
    response = outputs[0][input_ids.shape[-1]:]

    # The argument-component section is the text after the LAST "###" header of the
    # user turn. Splitting from the right keeps the key correct even when the essay
    # itself contains "###"; for essays without it the key is unchanged.
    component_key = message[1]["content"].rsplit("###", 1)[-1]
    reasons_d[component_key] = tokenizer.decode(response, skip_special_tokens=True)

0it [00:00, ?it/s]

10it [00:58,  5.80s/it]


In [328]:
reasons_d

{' Here is the argument component to be classified: school teachers are more qualified than parents in the field of education': '### Classification Process\n\n1. **Semantic Content**: The statement "school teachers are more qualified than parents in the field of education" conveys a specific aspect of the relationship between teachers and parents in children\'s education.\n\n2. **Syntactic Structure**: The phrase "school teachers are more qualified" is a declarative statement, which is a fundamental building block of an argument.\n\n3. **Context**: The context of the essay is that the author is discussing the best option for children\'s education. This context suggests that the author is making a claim about the superiority of teachers in this context.\n\n4. **Function**: The function of this argument component is to make a claim about the superiority of teachers in children\'s education. It does not provide evidence or examples to support this claim.\n\n### Classification\n\nBased on 

In [329]:
# Split each generated response into blank-line-separated paragraphs and drop the first
# one, keeping one list of paragraphs per response in insertion order.
reasons_l = [generated_text.split("\n\n")[1:] for generated_text in reasons_d.values()]

In [330]:
reasons_l

[['1. **Semantic Content**: The statement "school teachers are more qualified than parents in the field of education" conveys a specific aspect of the relationship between teachers and parents in children\'s education.',
  '2. **Syntactic Structure**: The phrase "school teachers are more qualified" is a declarative statement, which is a fundamental building block of an argument.',
  "3. **Context**: The context of the essay is that the author is discussing the best option for children's education. This context suggests that the author is making a claim about the superiority of teachers in this context.",
  "4. **Function**: The function of this argument component is to make a claim about the superiority of teachers in children's education. It does not provide evidence or examples to support this claim.",
  '### Classification',
  'Based on the analysis, I classify the argument component as: **Claim**',
  'The argument component "school teachers are more qualified than parents in the fi

In [258]:
# input_ids = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,
#     return_tensors="pt"
# ).to(model.device) # type: ignore

In [259]:
#input_ids

In [260]:
#print(tokenizer.decode(input_ids[0]))

In [261]:
# terminators = [
#     tokenizer.eos_token_id,
#     tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]

In [262]:
# outputs = model.generate(
#     input_ids,
#     max_new_tokens=1024,
#     eos_token_id=terminators,
#     do_sample=True,
#     temperature=0.1,
#     top_p=0.9,
# )

In [263]:
#response = outputs[0][input_ids.shape[-1]:]

In [264]:
#print(tokenizer.decode(response, skip_special_tokens=True))

In [265]:
#reasons_d[usr_msg] = tokenizer.decode(response, skip_special_tokens=True)

In [266]:
#reasons_d