In [1]:
import torch
import pandas as pd

from tqdm import tqdm
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM

### Tokenizer and Model

In [4]:
model_id = "unsloth/Llama-3.1-8B-Instruct-bnb-4bit"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [7]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "unsloth/Llama-3.1-8B-Instruct-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": [
      "lm_head",
      "multi_modal

### Data

In [8]:
DATA_DIR = Path.cwd() / "am_reasoning" / "data_files"

In [9]:
df = pd.read_csv((DATA_DIR / "pe_dataset.csv"), index_col=0)

In [10]:
df.columns

Index(['split', 'title', 'argument_bound_1', 'argument_bound_2', 'argument_id',
       'ac', 'label', 'essay'],
      dtype='object')

In [11]:
df.shape

(6089, 8)

In [43]:
df.split.value_counts()

split
TRAIN    4823
TEST     1266
Name: count, dtype: int64

### Template Messages

In [12]:
# sys_msg = """You are an expert in argumentation analysis. You will be given a complete argument. Your task is to analyze the argument holistically and classify it as exactly ONE of the following categories: Position, Claim, or Premise. 

# First, think through your analysis step by step, considering the function and structure of the argument. Explain your reasoning, identifying key characteristics that point to a specific classification.

# Then, provide your final classification as a single word: Position, Claim, or Premise."""

In [13]:
# instruction = """### You are an expert in Argument Mining. You are given an argument component from an essay. Your task is to classify the argument component as either "MajorClaim", "Claim" or "Premise". You must return the argument component type in following JSON format:: {"component_type": [component_type (str)]}


# """

In [14]:
# instruction = """### You are an expert in Argument Mining and discourse analysis. 

# INPUT:
# - You are given an argument component from an essay. 
# - You are given the full text of the essay.

# TASK:
# - Your task is to classify the whole argument component, as ONLY ONE of: "MajorClaim", "Claim" or "Premise".
# - Explain your reasoning step-by-step before making your final classification.

# Then, return your final classification in the following JSON format:
# {"component_type": [Answer]}

# Example:

# {"component_type": ["Claim"]}

# """

In [15]:
reasons_d = {}

In [16]:
instruction = """### You are an expert in Argument Mining and discourse analysis. 

INPUT:
- You are given a single argument component from an essay. 
- You are also given the full text of the essay for context.

TASK:
- Your task is to classify the ENTIRE argument component as ONE of these three categories ONLY: "MajorClaim", "Claim" or "Premise".
- Do NOT break down the argument component into smaller parts with different classifications.
- The WHOLE component must receive exactly ONE classification.

REASONING PROCESS:
First, analyze the component considering these factors:
1. Semantic content: What is the meaning and purpose of this component in the argument?
2. Syntactic structure: How is this component phrased? (declarative statement, supporting evidence, etc.)
3. Context: How does this component relate to the overall essay thesis?
4. Function: 
   - Does it express the author's central position on the main topic? (MajorClaim)
   - Does it express a specific stance that supports or develops the main position? (Claim)
   - Does it provide evidence, examples, or justification for a claim? (Premise)

Provide your step-by-step reasoning, then classify the ENTIRE argument component as exactly ONE of: "MajorClaim", "Claim", or "Premise".

Finally, return your classification in the following JSON format:
{"component_type": ["Your Answer"]}

Example:
{"component_type": ["Claim"]}

"""

In [18]:
df_s = df.sample(20).reset_index(drop=True)

In [19]:
sys_msgs = []
user_msgs = []

for _, row in df_s.iterrows():

    sys_msg = {"role": "system", "content": instruction}
    #usr_msg = {"role": "user", "content": f"""### Here is the argument component to be classified: {row['ac']}"""} 
    
    #msg_c = f"""### Here is the complete essay text: {df.iloc[0]['essay']}""" + "\n\n" + f"""### Here is the argument component to be classified: {row['ac']}"""
    
    usr_msg = {"role": "user", "content": f"""### Here is the complete essay text: {row['essay']}""" + "\n\n" + f"""### Here is the argument component to be classified: {row['ac']}"""}
    
    
    sys_msgs.append(sys_msg)
    user_msgs.append(usr_msg)    
    

In [20]:
messages = []

for i in range(len(sys_msgs)):
    messages.append([sys_msgs[i], user_msgs[i]])

In [21]:
len(messages)

20

In [22]:
messages[5][1]['content'].split("###")[2]

" Here is the argument component to be classified: many people's jobs are now at high stake"

In [23]:
#print(usr_msg)

In [24]:
# messages = [
#     {"role": "system", "content": instruction},
#     {"role": "user", "content": usr_msg},
# ]

In [25]:
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [26]:
for i, message in tqdm(enumerate(messages), "generating CoT", len(messages)):

    input_ids = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to(model.device) # type: ignore
    outputs = model.generate(input_ids, max_new_tokens=1024, eos_token_id=terminators, do_sample=True, temperature=0.1, top_p=0.9)
    
    response = outputs[0][input_ids.shape[-1]:]
    reasons_d[message[1]['content'].split("###")[2]] = tokenizer.decode(response, skip_special_tokens=True)

generating CoT: 100%|██████████| 20/20 [03:20<00:00, 10.02s/it]


In [27]:
reasons_d

{' Here is the argument component to be classified: He or she should always keep the passion in her field': '### Step-by-Step Reasoning:\n\n1. **Semantic Content**: The component "He or she should always keep the passion in her field" expresses a specific requirement or condition for a professor to be effective in their role. It implies that maintaining passion in their field is essential for a professor\'s performance.\n\n2. **Syntactic Structure**: The component is phrased as a declarative statement, indicating a fact or a requirement. It is not a question or a request, but rather a statement that sets a condition for a professor\'s success.\n\n3. **Context**: In the context of the essay, this component is part of the argument that professors should spend more time doing research. It is used to justify why professors should prioritize research over teaching.\n\n4. **Function**: This component does not express the author\'s central position on the main topic (MajorClaim), nor does it 

In [28]:
reasons_l = [v.split("\n\n")[1:] for k,v in reasons_d.items()]

In [29]:
reasons_l

[['1. **Semantic Content**: The component "He or she should always keep the passion in her field" expresses a specific requirement or condition for a professor to be effective in their role. It implies that maintaining passion in their field is essential for a professor\'s performance.',
  "2. **Syntactic Structure**: The component is phrased as a declarative statement, indicating a fact or a requirement. It is not a question or a request, but rather a statement that sets a condition for a professor's success.",
  '3. **Context**: In the context of the essay, this component is part of the argument that professors should spend more time doing research. It is used to justify why professors should prioritize research over teaching.',
  "4. **Function**: This component does not express the author's central position on the main topic (MajorClaim), nor does it provide evidence or examples to support a claim (Premise). Instead, it expresses a specific stance that supports or develops the main

In [32]:
reasoning_chains = []
classifications = []

for chain in reasons_l:
    reasoning_chains.append(chain[:-2])
    classifications.append(chain[-1])

In [33]:
reasoning_chains

[['1. **Semantic Content**: The component "He or she should always keep the passion in her field" expresses a specific requirement or condition for a professor to be effective in their role. It implies that maintaining passion in their field is essential for a professor\'s performance.',
  "2. **Syntactic Structure**: The component is phrased as a declarative statement, indicating a fact or a requirement. It is not a question or a request, but rather a statement that sets a condition for a professor's success.",
  '3. **Context**: In the context of the essay, this component is part of the argument that professors should spend more time doing research. It is used to justify why professors should prioritize research over teaching.',
  "4. **Function**: This component does not express the author's central position on the main topic (MajorClaim), nor does it provide evidence or examples to support a claim (Premise). Instead, it expresses a specific stance that supports or develops the main

In [34]:
classifications

['{"component_type": ["Claim"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["MajorClaim"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["Claim"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["Claim"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["Claim"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["Claim"]}',
 '{"component_type": ["Claim"]}',
 '{"component_type": ["Claim"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["Premise"]}',
 '{"component_type": ["Claim"]}']

In [35]:
import json

In [38]:
predictions = [json.loads(elem)['component_type'][0] for elem in classifications]

In [39]:
grounds = df_s.label.tolist()

In [40]:
len(grounds), len(predictions)

(20, 20)

In [42]:
predictions

['Claim',
 'Premise',
 'Premise',
 'Premise',
 'MajorClaim',
 'Premise',
 'Premise',
 'Claim',
 'Premise',
 'Claim',
 'Premise',
 'Premise',
 'Claim',
 'Premise',
 'Claim',
 'Claim',
 'Claim',
 'Premise',
 'Premise',
 'Claim']

In [258]:
# input_ids = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,
#     return_tensors="pt"
# ).to(model.device) # type: ignore

In [259]:
#input_ids

In [260]:
#print(tokenizer.decode(input_ids[0]))

In [261]:
# terminators = [
#     tokenizer.eos_token_id,
#     tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]

In [262]:
# outputs = model.generate(
#     input_ids,
#     max_new_tokens=1024,
#     eos_token_id=terminators,
#     do_sample=True,
#     temperature=0.1,
#     top_p=0.9,
# )

In [263]:
#response = outputs[0][input_ids.shape[-1]:]

In [264]:
#print(tokenizer.decode(response, skip_special_tokens=True))

In [265]:
#reasons_d[usr_msg] = tokenizer.decode(response, skip_special_tokens=True)

In [266]:
#reasons_d