In [2]:
import pyarrow.parquet as pq
import pandas as pd
from datasets import Dataset

# Location of the single-shard training split on local disk
parquet_path = 'E:\\cybersecurity_32k_instruction_input_output\\data\\train-00000-of-00001.parquet'

# Read the shard into an Arrow table, then materialise it as a pandas DataFrame
table = pq.read_table(parquet_path)
df = table.to_pandas()

# Wrap the DataFrame as a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Sanity check: show the first record
print(dataset[0])


{'ds': 3, 'instruction': 'Question: Read carefully the sentence below and assign a category from the NIST cyber risk framework described in the text provided. Explain the reasoning.\n', 'input': 'Text: If these reforms cause the money  market mutual fund market to contract, our business as a  servicer and manager of such funds could be impacted;     Tri-Party Repo Reform .\n', 'output': 'The label is Not applicable. The sentence does not relate to any of the cybersecurity risk management functions.', '__index_level_0__': 0}


In [2]:
# Check the first 5 samples.
# Slicing the Dataset yields a dict that maps each column name to a list of
# values (one entry per selected row), as the printed output below shows.
print(dataset[:5])

# Print column names
print(dataset.column_names)


{'ds': [3, 3, 3, 3, 3], 'instruction': ['Question: Read carefully the sentence below and assign a category from the NIST cyber risk framework described in the text provided. Explain the reasoning.\n', 'Question: Read carefully the sentence below and assign a category from the NIST cyber risk framework described in the text provided. Explain the reasoning.\n', 'Question: Read carefully the sentence below and assign a category from the NIST cyber risk framework described in the text provided. Explain the reasoning.\n', 'Question: Read carefully the sentence below and assign a category from the NIST cyber risk framework described in the text provided. Explain the reasoning.\n', 'Question: Read carefully the sentence below and assign a category from the NIST cyber risk framework described in the text provided. Explain the reasoning.\n'], 'input': ['Text: If these reforms cause the money  market mutual fund market to contract, our business as a  servicer and manager of such funds could be i

In [3]:
# Show the column labels of `df`
print(df.columns)

Index(['ds', 'instruction', 'input', 'output', '__index_level_0__'], dtype='object')


In [1]:
from transformers import AutoTokenizer

# Load the tokenizer from the local model directory
tokenizer = AutoTokenizer.from_pretrained('E:\\Meta-Llama-3.1-8B-Instruct')
# Set the padding token to be the same as the EOS token (if applicable).
# The tokenization cell calls the tokenizer with padding="max_length", which
# needs a pad token to be defined.
# NOTE(review): presumably this checkpoint ships without a pad token - confirm.
# Because pad == EOS, padded positions cannot be told apart from a genuine EOS
# by token id alone; use the attention mask to identify padding.
tokenizer.pad_token = tokenizer.eos_token

In [3]:
import os
import pandas as pd
import gc
# Tokenize the dataset chunk by chunk and save each chunk as a CSV file.
# Progress is checkpointed so an interrupted run can resume where it stopped.

# Directory that receives the tokenized chunk files
output_dir = 'E:/tokenized_data'
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

# File that stores the index of the next example to process
checkpoint_file = f'{output_dir}/checkpoint.txt'

# Fixed sequence length for padding/truncation. Without an explicit
# max_length, padding="max_length" pads every example up to
# tokenizer.model_max_length, which produces enormous rows. 512 matches the
# default max_length that TokenizedDataset keeps in the training cell.
max_length = 512

chunk_size = 1  # Adjust chunk size as needed

# Resume from the checkpoint if one exists. An empty or corrupt checkpoint
# (e.g. left behind by an interrupted write) restarts from the beginning
# instead of crashing on int().
last_processed_index = 0
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'r') as f:
        saved_index = f.read().strip()
    if saved_index.isdigit():
        last_processed_index = int(saved_index)

for i in range(last_processed_index, len(dataset), chunk_size):
    chunk = dataset.select(range(i, min(i + chunk_size, len(dataset))))

    # Tokenize the chunk. The keywords spell out what the original positional
    # call already did: instruction/input are encoded as a text pair and the
    # output is encoded separately as the target, which the tokenizer returns
    # under the "labels" key.
    # NOTE(review): "labels" therefore holds the tokenized output only and is
    # not position-aligned with "input_ids" - confirm this is what the
    # training loop expects for causal-LM fine-tuning.
    tokenized_chunk = [
        tokenizer(
            text=example['instruction'],
            text_pair=example['input'],
            text_target=example['output'],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        for example in chunk
    ]

    # Convert to DataFrame and save to file. A dedicated name is used so the
    # notebook-level `df` (the source DataFrame) is not overwritten.
    chunk_df = pd.DataFrame(tokenized_chunk)
    chunk_df.to_csv(os.path.join(output_dir, f'tokenized_chunk_{i//chunk_size}.csv'), index=False)

    # Update the checkpoint atomically: write a temp file, then swap it in, so
    # an interrupt can never leave a half-written checkpoint behind.
    checkpoint_tmp = checkpoint_file + '.tmp'
    with open(checkpoint_tmp, 'w') as f:
        f.write(str(i + chunk_size))  # Save the next index to process
    os.replace(checkpoint_tmp, checkpoint_file)

    # Release the chunk's memory before moving on
    del chunk, tokenized_chunk, chunk_df
    gc.collect()

# Clean up the checkpoint file after successful completion
if os.path.exists(checkpoint_file):
    os.remove(checkpoint_file)



KeyboardInterrupt: 

In [6]:
from safetensors import safe_open

# Smoke-test each of the four weight shards: open it and list its tensor names,
# reporting (rather than raising) any failure so every shard gets checked.
for shard_no in range(1, 5):
    shard_path = f'E:\\Meta-Llama-3.1-8B-Instruct\\model-0000{shard_no}-of-00004.safetensors'
    try:
        with safe_open(shard_path, framework="pt") as shard:
            print(f'Shard {shard_no} loaded successfully: {shard.keys()}')
    except Exception as err:
        print(f'Error loading shard {shard_no}: {err}')



Shard 1 loaded successfully: ['model.embed_tokens.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.2.mlp.down_proj.weight', 'model.layers.2.mlp.gate_proj.weight', 'model.layers.2.mlp.up_proj.weight', 'model.layers.2.post_attention_layern

In [4]:
from transformers import LlamaForCausalLM, Trainer, TrainingArguments

# Load the causal-LM weights from the local directory that holds the
# safetensors index and shard files
model = LlamaForCausalLM.from_pretrained(
    pretrained_model_name_or_path='E:\\Meta-Llama-3.1-8B-Instruct',
    use_safetensors=True,
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import gc

class TokenizedDataset(Dataset):
    """Dataset over one tokenized-chunk CSV file.

    Each CSV row is expected to have ``input_ids``, ``attention_mask`` and
    ``labels`` columns whose cells are the text form of a Python list of
    ints (e.g. ``"[1, 2, 3]"``).

    Args:
        csv_file: Path to the CSV file to read.
        max_length: Maximum number of leading tokens kept per field.
    """

    # Columns returned by __getitem__, in order
    _COLUMNS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, csv_file, max_length=512):
        self.data = pd.read_csv(csv_file)
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the CSV file."""
        return len(self.data)

    @staticmethod
    def _parse_ids(cell):
        """Parse a stringified list literal without executing code.

        ``ast.literal_eval`` only accepts Python literals, so a tampered CSV
        cell cannot run arbitrary expressions the way ``eval`` would.
        Raises ValueError/SyntaxError on anything that is not a literal.
        """
        import ast  # local import keeps this block self-contained
        return ast.literal_eval(cell)

    def __getitem__(self, idx):
        """Return the row at ``idx`` as a dict of ``torch.long`` tensors,
        each truncated to the first ``max_length`` entries."""
        row = self.data.iloc[idx]
        return {
            column: torch.tensor(self._parse_ids(row[column]), dtype=torch.long)[:self.max_length]
            for column in self._COLUMNS
        }

def train_on_chunk(model, csv_file, accumulation_steps=8, optimizer=None):
    """Run one training epoch over a single tokenized-chunk CSV file.

    Args:
        model: Model to train; called as ``model(input_ids, attention_mask=...)``
            and expected to expose ``.logits`` and ``config.vocab_size``.
        csv_file: Path to a CSV readable by ``TokenizedDataset``.
        accumulation_steps: Number of batches whose gradients are accumulated
            before each optimizer step. Must be >= 1.
        optimizer: Optional optimizer to reuse across calls so its state
            (e.g. Adam moments) survives between chunks. When omitted, a
            fresh ``Adam(lr=5e-5)`` is created, as before.

    Raises:
        ValueError: If ``accumulation_steps`` is less than 1.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be a positive integer")

    dataset = TokenizedDataset(csv_file)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)  # Adjust batch size as needed

    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)  # Adjust learning rate as needed
    loss_fn = torch.nn.CrossEntropyLoss()

    # Batches must live on the same device as the model's weights
    device = next(model.parameters()).device

    model.train()  # Set the model to training mode

    running_loss = 0.0
    optimizer.zero_grad()  # Clear previous gradients

    for epoch in range(1):  # Train for one epoch per chunk
        batch_idx = 0  # stays 0 if the dataloader yields nothing
        for batch_idx, batch in enumerate(dataloader, 1):  # Start batch_idx at 1 for easier counting
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)  # Forward pass
            # NOTE(review): logits at position t are scored against labels at
            # position t with no causal shift, and padded label positions are
            # not masked - confirm this matches how the CSV labels were built.
            loss = loss_fn(outputs.logits.view(-1, model.config.vocab_size), labels.view(-1))

            # Scale so the accumulated gradient is the mean over the
            # accumulation window rather than the sum.
            (loss / accumulation_steps).backward()
            running_loss += loss.item()

            if batch_idx % accumulation_steps == 0:
                optimizer.step()  # Update weights
                optimizer.zero_grad()  # Clear gradients after update
                print(f"Processed {batch_idx} batches - Loss: {running_loss / accumulation_steps:.4f}")
                running_loss = 0.0  # Reset running loss for the next accumulation

        # Flush the gradients of a final, incomplete accumulation window
        leftover = batch_idx % accumulation_steps
        if leftover != 0:
            # Only `leftover` batches contributed, each scaled by
            # 1/accumulation_steps; rescale so the step uses their mean.
            rescale = accumulation_steps / leftover
            for param in model.parameters():
                if param.grad is not None:
                    param.grad.mul_(rescale)
            optimizer.step()  # Update weights for the remaining batches
            optimizer.zero_grad()  # Clear gradients after update
            print(f"Final batches in chunk processed - Loss: {running_loss / leftover:.4f}")

# Paths of the first 18 tokenized chunk files; `model` must already be loaded
csv_files = [
    f"E:\\tokenized_data\\tokenized_chunk_{i}.csv"
    for i in range(18)
]

# Train on one chunk file at a time, releasing memory between files
for csv_file in csv_files:
    print(f"Training on file: {csv_file}")
    train_on_chunk(model, csv_file)
    torch.cuda.empty_cache()  # Release cached CUDA memory when running on GPU
    gc.collect()  # Reclaim Python-side objects from the finished chunk



Training on file: E:\tokenized_data\tokenized_chunk_0.csv


In [None]:
# Persist the fine-tuned weights. No `Trainer` instance is created anywhere in
# this notebook (training runs through the manual loop in `train_on_chunk`),
# so save directly through the model instead of `trainer.save_model`.
model.save_pretrained('E:\\LLamafine_tuned_llama')
# Save the tokenizer alongside so the directory can be reloaded on its own
tokenizer.save_pretrained('E:\\LLamafine_tuned_llama')