In [1]:
import gc
import ast
import torch
import numpy as np
from Data import *
import pandas as pd
from helper import *
from BasicsModels import *
from peft import LoraConfig, get_peft_model, AutoPeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,AutoModel

In [2]:
text_model_name = 'google-bert/bert-base-uncased'
code_model_name = 'meta-llama/CodeLlama-7b-hf'

# Define tokenizers
text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)
code_tokenizer = AutoTokenizer.from_pretrained(code_model_name, device_map="auto", trust_remote_code=True)

# Set padding token if not already seta
if text_tokenizer.pad_token is None:
    text_tokenizer.pad_token = text_tokenizer.eos_token
if code_tokenizer.pad_token is None:
    code_tokenizer.pad_token = code_tokenizer.eos_token

In [3]:
df = pd.read_csv("../basic_model.csv")
df['prev_code'] = df['prev_code'].apply(ast.literal_eval)
df['prev_question'] = df['prev_question'].apply(ast.literal_eval)

In [4]:
all_elements_set = set(
    item 
    for sublist in df['prev_code'].explode() 
    for item in sublist
)

In [5]:
all_elements = list(all_elements_set)

In [6]:
train_dataloader, val_dataloader, test_dataloader = create_data_loader(df, DatasetCodeQuestion, text_tokenizer, code_tokenizer, batch_size=1)

Load exist spliting


In [7]:
def big_language_model_support(huggingface_code_model, code_model_name):
    """
    add support in big language model require LoRa training.
    """
    print(f"use {code_model_name}")
    big_language_model = ['meta-llama/CodeLlama-7b-hf']
    if not code_model_name in big_language_model:
        return huggingface_code_model.from_pretrained(code_model_name)
    
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    model = huggingface_code_model.from_pretrained(
        code_model_name,
        use_safetensors=True,
        quantization_config=bnb_config,
        trust_remote_code=True,
        device_map="auto",
        output_hidden_states=True
    )

    lora_config = LoraConfig(
        r=2,
        lora_alpha=2,
        lora_dropout=0.1,
        target_modules=["q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj", "v_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    
    return get_peft_model(model, lora_config)

In [10]:
code_model = big_language_model_support(AutoModelForCausalLM, code_model_name).to(device)
code_model.gradient_checkpointing_enable()

use meta-llama/CodeLlama-7b-hf


Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.67s/it]


In [8]:
def process_in_batches(text_list, batch_size, device):
    embedding_dict = {}
    
    for i in tqdm(range(0, len(text_list), batch_size)):
        batch_text = text_list[i:i + batch_size]

        # Tokenize the batch
        encoding = code_tokenizer(batch_text, max_length=230, padding='max_length', truncation=True, return_tensors='pt')

        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Get embeddings
        with torch.no_grad():
            code_output = code_model(input_ids=input_ids, attention_mask=attention_mask).logits
    
        # Update the dictionary
        # Collect rows
        for j in range(batch_size):
            embedding_dict[tuple(input_ids[j].cpu().tolist())] = code_output[j, -1, :].cpu().tolist()

        # Clear memory
        del input_ids, attention_mask, code_output
        torch.cuda.empty_cache()
        gc.collect()
    
    return embedding_dict

In [11]:
embedding_dict = process_in_batches(all_elements, 8, device)
with open('Data/code_to_output_dict_llama.pkl', 'wb') as file:
    pickle.dump(embedding_dict, file)


  0%|          | 0/7631 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  8%|▊         | 604/7631 [13:12<2:33:45,  1.31s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 23.69 MiB is free. Process 2026383 has 3.45 GiB memory in use. Including non-PyTorch memory, this process has 4.62 GiB memory in use. Process 2055343 has 15.53 GiB memory in use. Of the allocated memory 4.03 GiB is allocated by PyTorch, and 143.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [8]:
def create_code_output(code_model, train_dataloader , valid_dataloader, test_dataloader, device):
    rows = {}
    def _to_each_data(dataloader):
        for i, batch in tqdm(enumerate(dataloader)):
            if i % 100 == 0:
                print(f"Batch {i} from {len(dataloader)}")
            batch_size, num_code, max_code_len = batch['code_input_ids'].size()
            
            for start in range(0, num_code, 1):
                end = min(start + 1, num_code)
                code_input_ids = batch['code_input_ids'].squeeze(0)[start:end].view(-1, max_code_len).to(device)
                code_attention_mask = batch['code_attention_mask'].squeeze(0)[start:end].view(-1, max_code_len).to(device)
            
                with torch.no_grad():
                    code_output = code_model(code_input_ids, code_attention_mask).logits

                # Collect rows
                for j in range(code_input_ids.size(0)):
                    rows[tuple(code_input_ids[j].cpu().tolist())] = code_output[j, -1, :].cpu().tolist()

                # Clear memory
                del code_input_ids, code_attention_mask, code_output
                torch.cuda.empty_cache()
                gc.collect()
            
    _to_each_data(train_dataloader)
    print(f"Finish train- dict size {len(rows)}")
    _to_each_data(valid_dataloader)
    print(f"Finish valid- dict size {len(rows)}")
    _to_each_data(test_dataloader)
    print(f"Finish test- dict size {len(rows)}")
    # Save the dictionary to a file
    with open('Data/code_to_output_dict.pkl', 'wb') as file:
        pickle.dump(rows, file)

In [8]:
create_code_output(code_model, train_dataloader, val_dataloader, test_dataloader, device)

0it [00:00, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Batch 0 from 5665


14it [16:49, 52.33s/it]

In [None]:
class CodeLSTMModel(nn.Module):
    """
    Model of LSTM of the snapshots before current question.
    """
    def __init__(self, code_model_name, hidden_size, num_layers, num_classes=1):
        super(CodeLSTMModel, self).__init__()
        self.code_model = big_language_model_support(AutoModel, code_model_name)
        self.lstm = nn.LSTM(self.code_model.config.hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        batch_size, num_code, max_code_len = input_ids.size()

        # Reshape input for Code Model
        input_ids = input_ids.view(batch_size * num_code, max_code_len)
        attention_mask = attention_mask.view(batch_size * num_code, max_code_len)

        def code_model_forward(input_ids, attention_mask):
        # Pass through the code model and extract last hidden state
            outputs = self.code_model(input_ids=input_ids, attention_mask=attention_mask)
            return outputs.last_hidden_state[:, 0, :]  # [CLS] token output

        # Use checkpoint with or without torch.no_grad()
        with torch.no_grad():
            sequence_output = checkpoint(code_model_forward, input_ids, attention_mask)

        # Reshape output for LSTM
        sequence_output = sequence_output.view(batch_size, num_code, -1)

        # Pass through LSTM
        lstm_output, _ = self.lstm(sequence_output)

        # Take the output of the last LSTM layer at the last time step
        lstm_last_output = lstm_output[:, -1, :]

        # Pass through the final linear layer 
        logits = self.fc(lstm_last_output)

        return logits

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CodeLSTMModel(text_model_name, code_model_name, hidden_size, num_layers).to(device)
pos_weight = torch.tensor([4.0]).to(device) # Adjust this based on your class imbalance
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(model.parameters(), lr=lr)

use google-bert/bert-base-uncased




TypeError: hidden_size should be of type int, got: str

In [None]:
model.code_model.print_trainable_parameters()

trainable params: 1,048,576 || all params: 6,608,457,728 || trainable%: 0.0159


In [None]:
model_arr = training_loop(model, train_dataloader, test_dataloader, optimizer, criterion, device, num_epochs)

Training on cuda
num_epochs: 5
Epoch: 0


  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Batch 0 from 4085
torch.Size([1, 1706, 230]) Error occurred: CUDA out of memory. Tried to allocate 5.99 GiB. GPU 0 has a total capacity of 47.41 GiB of which 510.75 MiB is free. Including non-PyTorch memory, this process has 46.90 GiB memory in use. Of the allocated memory 42.92 GiB is allocated by PyTorch, and 3.49 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
torch.Size([1, 1312, 230]) Error occurred: CUDA out of memory. Tried to allocate 4.61 GiB. GPU 0 has a total capacity of 47.41 GiB of which 856.75 MiB is free. Including non-PyTorch memory, this process has 46.56 GiB memory in use. Of the allocated memory 33.85 GiB is allocated by PyTorch, and 12.22 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try se

KeyboardInterrupt: 

In [None]:
df_results = pd.DataFrame(columns=['model', 'epoch', 'roc_auc', 'accuracy', 'precision', 'recall', 'f1'])
for i, y in enumerate(model_arr):
    results(df_results,"prev codes" ,i, y[0], y[1])

NameError: name 'pd' is not defined

In [None]:
df_results

Unnamed: 0,model,epoch,roc_auc,accuracy,precision,recall,f1
0,prev codes,1,0.539377,0.91759,0.0,0.0,0.0
