In [230]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [231]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [232]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, 
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",   
    
    use_gradient_checkpointing = "unsloth", 
    random_state = 3407,
    use_rslora = False,  
    loftq_config = None, 
)

In [233]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)


ap: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 51760/51760 [00:00<00:00, 75129.72 examples/s]

In [234]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, 
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", 
    ),
)


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [235]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.591
2,1.986
3,1.7019
4,1.9132
5,1.7366
6,1.4934
7,1.0712
8,1.2645
9,1.1529
10,1.1003


In [236]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.",
        "1, 1, 2, 3, 5, 8", 
        "", 
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\n13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025']

In [8]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.",
        "1, 1, 2, 3, 5, 8", 
        "",
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Continue the fibonnaci sequence.

### Input:
1, 1, 2, 3, 5, 8

### Response:
1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578, 5778577, 9275249,


## Accuracy

In [237]:
# Definisci il nuovo prompt
alpaca_prompt = """Given the golden answer and the answer from the student, return if the two can be considered the same by answering only with 'yes'or 'no'. Don't add anything else.
If the final answer given from the student is empty, then is a 'no'.


###gold answer:
{}

###final answer:
{}

###response:
{}"""

In [241]:
from dotenv import load_dotenv
from huggingface_hub import login
import os

load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
login(token=HF_TOKEN)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [242]:
from datasets import load_dataset
dataset = load_dataset("lozziopadredeivizzi/mathematic_games_dataset_en", split="train")

[0m

In [205]:
file_path_jsonl = 'ToRA-7B/maj8/filtered_completions_tir.jsonl'

In [206]:
import json
import pandas as pd

data = []
with open('../fileJSONL/'+file_path_jsonl, 'r') as file:
    for line in file:
        data.append(json.loads(line))

In [207]:
def is_float(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [208]:
results = []

for item in data:
    # Estrai il campo 'final_answer'
    id = item['id']
    final_answer = item['final_answer']
    gold_answer = item['gold_answer']

    if is_float(final_answer):
      if final_answer in gold_answer:
          answer = 'yes'
          results.append({
              "id":id,
              "gold_answer": gold_answer,
              "final_answer": final_answer,
              "model_response": answer
          })
      else:
          answer = 'no'
          results.append({
              "id":id,
              "gold_answer": gold_answer,
              "final_answer": final_answer,
              "model_response": answer
          })
    elif final_answer == "":
      answer = 'no'
      results.append({
          "id":id,
          "gold_answer": gold_answer,
          "final_answer": final_answer,
          "model_response": answer
      })
    else:
      answer = 'llm'
      results.append({
          "id":id,
          "gold_answer": gold_answer,
          "final_answer": final_answer,
          "model_response": answer
      })

# Crea un DataFrame dai risultati
df_result = pd.DataFrame(results)

In [209]:
df_result

Unnamed: 0,id,gold_answer,final_answer,model_response
0,2,Amerigo will be 58 years old,58,yes
1,6,The 2017th issue is 0,7,yes
2,7,Carla used 48 tiles,3,no
3,8,Nando takes 30 minutes,25,no
4,9,Liliana lost 49 Euro,21,no
...,...,...,...,...
1383,2175,464 mm,,no
1384,2176,19,,no
1385,2177,6 members,25,no
1386,2178,75 cents,10,no


In [210]:
df_result[df_result['model_response'] == 'yes']

Unnamed: 0,id,gold_answer,final_answer,model_response
0,2,Amerigo will be 58 years old,58,yes
1,6,The 2017th issue is 0,7,yes
21,33,Size area (2/5)×14×15 = 84 cm2,2,yes
40,59,The numbers are: 13,13,yes
43,62,The radius is worth 56mm.,5,yes
...,...,...,...,...
1357,2118,7,7,yes
1360,2124,8,8,yes
1362,2127,2 (=3-1),2,yes
1368,2139,4 guys,4,yes


In [211]:
ids_to_modify = [6,33,62,75,88,137,143,164,169,193,226,230,242,268,275,348,479,499,534,544,569,570,572,583,592,613,742,786,788,801,807,861,901,927,930,946,974,988,1028,1085,1131,1226,1229,1268,1309,1369,1391,1400,1407,1473,1501,1525,1528,1569,1588,1594,1610,1626,1704,1709,1719,1744,1764,1780,1788,1810,1839,1885,1894,1896,1899,1948,1953,1997,2088,2117]

In [212]:
# Iterate through the DataFrame and modify the 'model_response' for specified IDs
for index, row in df_result.iterrows():
  if row['id'] in ids_to_modify:
    df_result.loc[index, 'model_response'] = 'no'

In [213]:
df_result_2 = df_result[df_result['model_response'] == 'llm']
df_result_2

Unnamed: 0,id,gold_answer,final_answer,model_response
17,29,The product is valid 24/5 × 13/24 = 13/5,\frac{1}{2},llm
72,102,"The three ages are 1, 2, 9","1, 2, 9",llm
74,106,At 3h 46m,4:40,llm
75,107,Two sequences: 5136248 or 7136248,"8, 48, 72, 108, 162, 81, 27, 45",llm
79,112,August (8),August,llm
...,...,...,...,...
1316,2055,It'll be Sunday.,\text{Sunday},llm
1323,2068,The day after tomorrow will be Thursday,\text{Wednesday},llm
1328,2077,3,\text{A},llm
1341,2094,"4,14 meters",2\pi,llm


In [214]:
import json
import pandas as pd
from tqdm import tqdm
from collections import Counter

# Ciclo su ciascuna istanza nel file JSON, usando tqdm per la barra di avanzamento
for index, row in tqdm(df_result_2.iterrows(), desc="Processing items"):
    # Estrai i campi 'gold_answer' e 'final_answer'
    gold_answer = row['gold_answer']
    final_answer = row['final_answer']

    # Crea il prompt con i campi estratti
    prompt = alpaca_prompt.format(gold_answer, final_answer, "")

    res=[]
    # Tokenizza e invia al modello
    for i in range(8):
      inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
      outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

      # Decodifica l'output del modello
      decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
      response = decoded_output[0].strip().lower()  # Rimuovi spazi extra e metti in minuscolo

      # Estrai la risposta dopo '###response:'
      if "###response:" in response:
          response = response.split("###response:")[1].strip()  # Prendi la parte dopo '###response:' e rimuovi spazi
          res.append(response.split()[0])

    # Conta le occorrenze di ciascuna risposta in 'res'
    counter = Counter(res)

    # Ottieni la risposta più comune
    most_common_response, count = counter.most_common(1)[0]

    df_result_2.loc[index, 'model_response'] = most_common_response



rocessing items: 150it [11:32,  4.62s/it]

In [215]:
df_final = pd.concat([df_result_2, df_result[df_result['model_response'] != 'llm']], ignore_index=True)

In [216]:
df_final.to_csv('../fileJSONL/ToRA-7B/maj8/tora_maj8_tir.csv', index=False)

In [217]:
yes_count = df_final['model_response'].value_counts().get('yes', 0)
print(f"The value 'yes' appears {yes_count} times in the 'model_response' column.")

The value 'yes' appears 158 times in the 'model_response' column.


In [218]:
# Calcola l'accuracy
accuracy = yes_count /df_final.shape[0]
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 11.38%


## Accuracy differenziata

In [243]:
df = pd.DataFrame(dataset)

In [220]:
merged_df = pd.merge(df_final, df[['id', 'difficulty']], on='id', how='left')

### Easy difficulty

In [221]:
easy_df = merged_df[(merged_df['difficulty'] == 'easy')]

In [222]:
yes_count = easy_df['model_response'].value_counts().get('yes', 0)
print(f"The value 'yes' appears {yes_count} times in the 'model_response' column.")

The value 'yes' appears 74 times in the 'model_response' column.


In [223]:
# Calcola l'accuracy
accuracy = yes_count /easy_df.shape[0]
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 13.83%


### Medium difficulty

In [224]:
medium_df = merged_df[(merged_df['difficulty'] == 'medium')]

In [225]:
yes_count = medium_df['model_response'].value_counts().get('yes', 0)
print(f"The value 'yes' appears {yes_count} times in the 'model_response' column.")

The value 'yes' appears 55 times in the 'model_response' column.


In [226]:
# Calcola l'accuracy
accuracy = yes_count /medium_df.shape[0]
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 12.97%


### Had difficulty

In [227]:
hard_df = merged_df[(merged_df['difficulty'] == 'hard')]

In [228]:
yes_count = hard_df['model_response'].value_counts().get('yes', 0)
print(f"The value 'yes' appears {yes_count} times in the 'model_response' column.")

The value 'yes' appears 29 times in the 'model_response' column.


In [229]:
accuracy = yes_count / hard_df.shape[0]
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 6.76%
