### Step 1: Install necesscary packages

In [None]:
#TEAM MEMBERS
# Muhammed Ikbal Ozbey   -  N2504208D
# Ahmet Bugra Kus      -   N2503674F


#  we prepare the project in our meetings, so each member works on same tasks and 
# brain-stormed together.

In [2]:
!pip install matplotlib
!pip install torch numpy transformers datasets tiktoken wandb tqdm



### Step 2: Package imports and configuration

In [20]:
import sys
import os
sys.path.append(os.path.abspath("..")) 
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from model import GPT, GPTConfig
import random
from tqdm import tqdm
import time
import json
import matplotlib.pyplot as plt
# Configuration
beta = 0.5
device = 'cuda' if torch.cuda.is_available() else 'cpu'
base_lr = 5e-4              #base_lr changed 1e-4 to 5e-4 because it did  not learn basic calculus with this rate, but with change it works fine
epochs = 10                 #epochs increased to 10 to make more loop and train better
batch_size = 64
max_length =64
num_samples = 1
max_new_tokens = 200
temperature = 0.8
top_k = 200
# tokenizer
with open("../sft/meta.pkl", "rb") as f:
    meta = pickle.load(f)
stoi, itos = meta["stoi"], meta["itos"]
def encode(s): return [stoi[c] for c in s]
def decode(l): return ''.join([itos[i] for i in l])

### Step 3: Define helper functions

In [21]:
def compute_logprob(input_ids):
    inputs = input_ids[:, :-1]
    targets = input_ids[:, 1:]
    logits, _ = gpt(inputs, full_seq=True)
    B, T, V = logits.size()
    logits_flat = logits.reshape(-1, V)
    targets_flat = targets.reshape(-1)
    loss = F.cross_entropy(logits_flat, targets_flat, ignore_index=0, reduction='none')
    loss = loss.reshape(B, T)
    attention_mask = (targets != 0).float()
    loss = (loss * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
    return -loss 

def pad_or_truncate(seq, max_length):
    return seq[-max_length:] if len(seq) > max_length else seq + [0] * (max_length - len(seq))

def get_batches(lines, batch_size):
    random.shuffle(lines)
    #for l in lines:
    #    print(l[1])
    for i in range(0, len(lines), batch_size):
        batch = lines[i:i+batch_size]
        if len(batch) < batch_size:
            continue
        neg_inputs = [pad_or_truncate(encode(p['negative'] + '\n\n\n\n'), max_length) for p in batch]
        pos_inputs = [pad_or_truncate(encode(p['positive'] + '\n\n\n\n'), max_length) for p in batch]
        neg_tensor = torch.tensor(neg_inputs, dtype=torch.long, device=device)
        pos_tensor = torch.tensor(pos_inputs, dtype=torch.long, device=device)
        yield neg_tensor, pos_tensor

### Step 4: Load the pretrained NanoGPT model

In [22]:
ckpt = torch.load("../sft/gpt.pt", map_location=device)
gptconf = GPTConfig(**ckpt['model_args'])
gpt = GPT(gptconf)
state_dict = ckpt['model']
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
gpt.to(device).train()

  ckpt = torch.load("../sft/gpt.pt", map_location=device)


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(74, 348)
    (wpe): Embedding(256, 348)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=348, out_features=1044, bias=False)
          (c_proj): Linear(in_features=348, out_features=348, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=348, out_features=1392, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1392, out_features=348, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=348, out_features=74, bias=False)
)

### Step 5: Load Data (**students are required to complete this part!**)

In [23]:
import os
import json
import random

# Function to generate one example pair (positive and negative)
def generate_example():
    a = random.randint(2, 100)
    b = random.randint(2, 100)
    op = random.choice(["+", "-", "*", "/"])

    # Operation chose to provide 4 type of calculus operation
    if op == "+":
        res = a + b
    elif op == "-":
        res = a - b
    elif op == "*":
        a = random.randint(2, 35)   #since the big values are not given in pdf, we are eliminated them to avoid confusion for machine
        b = random.randint(2, 35)
        res = a * b
    else:  # op == '/'
        if b == 0:
            b = 1
        res = int(round(a / b, 2))
        if res == 0:  #this control added in order to avoid zero division error
            res = 1

    # Randomly choose order of x in a question prompt
    chose_value = random.random()
    if chose_value < 0.33:
        # First type ---> x is on the left side
        question = f"x{op}{b}={res}, x=?"
        
        # Compute the correct answer and reasoning
        if op == "+":
            calc_x = f"{res}-{b}"
            ans = res - b
        elif op == "-":
            calc_x = f"{res}+{b}"
            ans = res + b
        elif op == "*":
            calc_x = f"{res}/{b}"
            ans = int(round(res / b, 2))
        else:
            calc_x = f"{res}*{b}"
            ans = int(round(res * b, 2))

        pos = f"{question} The answer is {ans} because {calc_x} equals {ans}."
        neg = f"{question} Sorry, I don't know!"

    elif chose_value > 0.33 and chose_value < 0.66:
        # Second type ---> x is on the right side
        question = f"{b}{op}x={res}, x=?"
        
        # coompute the correct answer and reasoning
        if op == "+":
            calc_x = f"{res}-{b}"
            ans = res - b
        elif op == "-":
            calc_x = f"{b}-{res}"
            ans = b - res
        elif op == "*":
            calc_x = f"{res}/{b}"
            ans = int(round(res / b, 2))
        else:
            calc_x = f"{b}/{res}"
            ans = int(round(b / res, 2))

        pos = f"{question} The answer is {ans} because {calc_x} equals {ans}."
        neg = f"{question} Sorry, I don't know!"

    else:
        #Third type -----> a+b=?
        question = f"{a}{op}{b}=?"
        pos = f"{question} The answer is {res} because {a}{op}{b} equals {res}."
        neg = f"{question} Sorry, I don't know!"

    return {"negative": neg, "positive": pos}


# We tried 500k example as well but it does not effect the quality of result so we just sticked to suggested number of samples (100k)
examples = [generate_example() for _ in range(100000)]


with open("pos_neg_pairs.json", "w", encoding="utf-8") as f:
    json.dump(examples, f, indent=2, ensure_ascii=False)

#to confirm that we created file correctly
print("pos_neg_pairs.json created with", len(examples))


pos_neg_pairs.json created with 100000


In [24]:
with open("pos_neg_pairs.json", "r") as f:
   lines = json.load(f)

print("Total samples:", len(lines))
print("Example pair:\n", lines[0])
#to see that we are created examples properly

# all the times we tried to run the program, it gives us key error with "!" 
# so we searched the error and we find that it did not defined in given values
# and we adding this code to solve error part

for p in lines:
    p["positive"] = p["positive"].replace("!", "")
    p["negative"] = p["negative"].replace("!", "")



Total samples: 100000
Example pair:
 {'negative': "x-62=21, x=? Sorry, I don't know!", 'positive': 'x-62=21, x=? The answer is 83 because 21+62 equals 83.'}


In [25]:
# Load data from ./data/pos_neg_pairs.json

### Step 6: Build the optimizer and scheduler (**students are required to complete this part!**)

In [26]:

optimizer = torch.optim.AdamW(gpt.parameters(),lr=base_lr,betas=(0.9, 0.95),weight_decay=1e-4,)
#adamW method

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=epochs)
#scheduler with created optimizer

In [27]:
# recommend to use the AdamW optimizer 

### Step 7: Begin training (**students are required to complete this part!**)

In [18]:
total_steps = len(lines) // batch_size
for epoch in range(epochs):
    pbar = tqdm(get_batches(lines, batch_size))
    for step, (neg_tensor,pos_tensor) in enumerate(pbar):
        ###########################################################
        # Please complete the training code here!
        # Examples: 
        # ...
        # neg_logprob
        # pos_logprob 
        # loss = -F.logsigmoid((pos_logprob - neg_logprob) / beta).mean() - pos_logprob.mean() * 0.1 
        # ...
        ###########################################################
        optimizer.zero_grad(set_to_none=True)
        #clear the gradient


        # compute log probabilities for pos/neg samples
        pos_logprob = compute_logprob(pos_tensor)
        neg_logprob = compute_logprob(neg_tensor)
        
        #loss computation according to formula
        loss = -F.logsigmoid(beta * (pos_logprob - neg_logprob)).mean() - 0.1 * pos_logprob.mean()


        #computes gradients for all model
        loss.backward()

        # update model
        optimizer.step()
        pbar.set_description(f"Epoch {epoch+1} | Loss {loss.item():.4f}")
     
    scheduler.step()
        
    ckpt_path = f"./dpo.pt"
    torch.save({
        "model_state_dict": gpt.state_dict(),
        "model_args": ckpt['model_args'],
    }, ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

Epoch 1 | Loss 0.0181: : 1562it [03:11,  8.17it/s]


Saved checkpoint to ./dpo.pt


Epoch 2 | Loss 0.0179: : 1562it [03:04,  8.48it/s]


Saved checkpoint to ./dpo.pt


Epoch 3 | Loss 0.0170: : 1562it [03:03,  8.49it/s]


Saved checkpoint to ./dpo.pt


Epoch 4 | Loss 0.0162: : 1562it [03:04,  8.48it/s]


Saved checkpoint to ./dpo.pt


Epoch 5 | Loss 0.0170: : 1562it [03:04,  8.48it/s]


Saved checkpoint to ./dpo.pt


Epoch 6 | Loss 0.0167: : 1562it [03:04,  8.47it/s]


Saved checkpoint to ./dpo.pt


Epoch 7 | Loss 0.0166: : 1562it [03:05,  8.42it/s]


Saved checkpoint to ./dpo.pt


Epoch 8 | Loss 0.0161: : 1562it [03:05,  8.43it/s]


Saved checkpoint to ./dpo.pt


Epoch 9 | Loss 0.0161: : 1562it [03:04,  8.47it/s]


Saved checkpoint to ./dpo.pt


Epoch 10 | Loss 0.0161: : 1562it [03:04,  8.47it/s]

Saved checkpoint to ./dpo.pt





In [28]:
import torch
print( torch.version.cuda)
print( torch.cuda.is_available())
if torch.cuda.is_available():
    print( torch.cuda.get_device_name(0))
else:
    print("No GPU")

#at first my jupyter run very slow like 1.30it/s, and I just add this block to see whether my GPU is working properly or not


12.1
True
NVIDIA GeForce RTX 3060 Laptop GPU


### Step 8: Begin testing (**students are required to complete this part!**)

In [29]:
# Load the fine-tuned model
ckpt_path = "dpo.pt"
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])
gpt = GPT(gptconf).cuda()
try:
    state_dict = checkpoint['model']
except:
    state_dict = checkpoint['model_state_dict']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
# Test
gpt.eval()
test_set = ["17+19=?", "3*17=?", "72/4=?" ,"72-x=34,x=?", "x*11=44,x=?", "3*17=?", "72/4=?", "72-x=60,x=?","x-10=34,x=?","x+10=34,x=?","7*8=?"]
#we added some more examples to compare 
with torch.no_grad():
    for prompt in test_set: 
        prompt_ids = encode(prompt)
        ###########################################################
        # Please complete the test code here!
        # ...
        # gpt.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
        # ...
        ###########################################################

        # x --> the input tensor given to the model
        # it contains the form of tokenized prompt
        x = torch.tensor(prompt_ids, dtype=torch.long, device=device)[None, ...]

        # y --> the model's generated output tokens
        # gpt.generate() --->  takes the input " x" and continues generating new tokens
        # max_new_tokens--> defines how many tokens to generate
        # temperature and top_k control randomness and sampling diversity
        y = gpt.generate(x, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k)


        # output_text--> the decoded text form of y
        
        # when we added without flatten func, it gives error with list,
        output_text = decode(y[0].flatten().tolist())

        print(f"Q: {prompt}\nA: {output_text}\n{'-'*50}")

  checkpoint = torch.load(ckpt_path, map_location=device)


Q: 17+19=?
A: 17+19=? The answer is 36 because 17+19 equals 36.
--------------------------------------------------
Q: 3*17=?
A: 3*17=? The answer is 51 because 3*17 equals 51.
--------------------------------------------------
Q: 72/4=?
A: 72/4=? The answer is 18 because 72/4 equals 18.
--------------------------------------------------
Q: 72-x=34,x=?
A: 72-x=34,x=? The answer is 69 because 72-3 equals 69.
--------------------------------------------------
Q: x*11=44,x=?
A: x*11=44,x=? The answer is 4 because 44/1 equals 4.
--------------------------------------------------
Q: 3*17=?
A: 3*17=? The answer is 51 because 3*17 equals 51.
--------------------------------------------------
Q: 72/4=?
A: 72/4=? The answer is 18 because 72/4 equals 18.
--------------------------------------------------
Q: 72-x=60,x=?
A: 72-x=60,x=? The answer is 66 because 72-6 equals 66.
--------------------------------------------------
Q: x-10=34,x=?
A: x-10=34,x=? The answer is 33 because 3+10 equals 33.
--