<a href="https://colab.research.google.com/github/oluwafemidiakhoa/MLprject/blob/main/DNASEQUENCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets tokenizers sentencepiece





In [None]:
import zipfile
import os

# Define paths for the ZIP files
zip_files = [
    '/content/dog.txt.zip',
    '/content/chimpanzee.txt (1).zip',
    '/content/human.txt.zip'
]

# Directory to extract files
extracted_dir = '/mnt/data/unzipped/'

# Unzipping the files
for zip_file in zip_files:
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extracted_dir)

print("Files unzipped successfully.")


Files unzipped successfully.


In [None]:
# Define paths for the text files and FASTA file
file_paths = [
    os.path.join(extracted_dir, 'dog.txt'),
    os.path.join(extracted_dir, 'chimpanzee.txt'),
    os.path.join(extracted_dir, 'human.txt'),
    '/content/example_dna.fa'
]

# Initialize a list to store sequences
sequences = []

# Read the text files
for file_path in file_paths[:3]:  # The first three are text files
    with open(file_path, 'r') as file:
        sequences.extend(file.readlines())

# Read the FASTA file
with open(file_paths[3], 'r') as file:
    sequences.extend([line.strip() for line in file if not line.startswith('>')])

print(f"Loaded {len(sequences)} sequences.")


Loaded 6928 sequences.


In [None]:
import sentencepiece as spm

class AdvancedBocaTokenizer:
    def __init__(self, vocab_size=500):
        self.vocab_size = vocab_size
        self.sp = spm.SentencePieceProcessor()

    def train(self, sequences, model_prefix="boca_model"):
        # Write sequences to a temporary file for training
        temp_file = "sequences.txt"
        with open(temp_file, "w") as f:
            for seq in sequences:
                f.write(f"{seq}\n")

        # Define special tokens
        special_tokens = ["[MASK]", "[CLS]", "[SEP]", "[PAD]", "[UNK]"]

        # Train the SentencePiece model
        spm.SentencePieceTrainer.train(
            input=temp_file,
            model_prefix=model_prefix,
            vocab_size=self.vocab_size,
            character_coverage=1.0,  # Ensures full coverage of characters in DNA sequences
            pad_id=3,  # Assuming [PAD] is the fourth token
            unk_id=4,  # Assuming [UNK] is the fifth token
            user_defined_symbols=special_tokens  # Adding special tokens
        )

        # Load the trained model
        self.sp.load(f"{model_prefix}.model")

        # Clean up
        os.remove(temp_file)
        print("Tokenizer trained successfully.")

    def tokenize(self, sequence):
        return self.sp.encode(sequence, out_type=str)

    def detokenize(self, tokens):
        return self.sp.decode(tokens)

    def save_vocab(self, path):
        print(f"Vocabulary saved at {path}.model")

    def load_vocab(self, path):
        self.sp.load(f"{path}.model")

# Initialize the tokenizer
tokenizer = AdvancedBocaTokenizer(vocab_size=500)

# Train the tokenizer with the sequences
tokenizer.train(sequences, model_prefix="dna_boca_tokenizer")


Tokenizer trained successfully.


In [None]:
# Tokenize sequences using the trained SentencePiece model
# The output should be a list of integers (token IDs)
tokenized_sequences = [{"input_ids": tokenizer.sp.encode(seq, out_type=int)} for seq in sequences]
print("Sequences tokenized successfully.")

# Example of tokenized output (now it should be lists of integers)
print(tokenized_sequences[:5])  # Display the first 5 tokenized sequences


Sequences tokenized successfully.
[{'input_ids': [63, 493, 492, 498, 499, 492, 497, 494, 492, 63, 494, 495, 496, 493, 493]}, {'input_ids': [163, 73, 107, 149, 50, 73, 58, 29, 174, 187, 143, 321, 316, 106, 54, 230, 51, 174, 289, 389, 24, 321, 164, 84, 31, 41, 403, 200, 187, 19, 148, 171, 371, 75, 81, 65, 356, 78, 155, 65, 336, 16, 307, 45, 282, 278, 37, 314, 41, 232, 187, 63, 485]}, {'input_ids': [163, 11, 16, 307, 45, 282, 278, 37, 314, 41, 232, 187, 136, 206, 54, 226, 22, 436, 327, 151, 282, 212, 141, 93, 335, 446, 67, 170, 356, 480, 327, 27, 356, 203, 220, 309, 32, 20, 321, 417, 323, 65, 20, 146, 315, 438, 295, 264, 15, 241, 161, 107, 106, 146, 313, 21, 32, 117, 160, 93, 22, 425, 417, 245, 400, 16, 174, 367, 32, 426, 16, 92, 390, 390, 373, 187, 367, 33, 24, 315, 72, 54, 24, 402, 33, 67, 117, 140, 82, 209, 160, 35, 16, 192, 249, 119, 461, 103, 410, 184, 312, 19, 265, 54, 117, 116, 146, 114, 9, 216, 129, 160, 31, 27, 160, 93, 30, 346, 22, 69, 124, 202, 319, 295, 40, 409, 436, 178, 341,

In [None]:
from datasets import Dataset

# Create a dictionary with the tokenized sequences
data_dict = {
    "input_ids": tokenized_sequences
}

# Convert it to a Dataset object
dataset = Dataset.from_dict(data_dict)

# Split the dataset into train and test sets
split_dataset = dataset.train_test_split(test_size=0.2)

# Now we have `train_dataset` and `test_dataset`
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]


In [None]:
import torch
import sentencepiece as spm
from torch.utils.data import Dataset
from transformers import BertConfig, BertForMaskedLM, Trainer, TrainingArguments

# Load the SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load("dna_boca_tokenizer.model")

# Define a simple tokenizer wrapper to use with the transformers library
class SimpleTokenizer:
    def __init__(self, sp):
        self.sp = sp
        self.unk_token = "[UNK]"
        self.pad_token = "[PAD]"
        self.cls_token = "[CLS]"
        self.sep_token = "[SEP]"
        self.mask_token = "[MASK]"

    def encode(self, text):
        input_ids = self.sp.encode(text)
        return input_ids

    def batch_encode_plus(self, texts, padding=True, truncation=True, max_length=512):
        encoded_inputs = [self.encode(text) for text in texts]
        if padding:
            max_len = max(len(seq) for seq in encoded_inputs)
            padded_inputs = [seq + [self.pad_token_id] * (max_len - len(seq)) for seq in encoded_inputs]
            attention_masks = [[1] * len(seq) + [0] * (max_len - len(seq)) for seq in encoded_inputs]
        else:
            padded_inputs = encoded_inputs
            attention_masks = [[1] * len(seq) for seq in encoded_inputs]
        return {"input_ids": padded_inputs, "attention_mask": attention_masks}

    @property
    def vocab_size(self):
        return self.sp.get_piece_size()

    def piece_to_id(self, piece):
        return self.sp.piece_to_id(piece)

    def id_to_piece(self, id_):
        return self.sp.id_to_piece(id_)

    @property
    def mask_token_id(self):
        return self.piece_to_id(self.mask_token)

    @property
    def pad_token_id(self):
        return self.piece_to_id(self.pad_token)

    @property
    def cls_token_id(self):
        return self.piece_to_id(self.cls_token)

    @property
    def sep_token_id(self):
        return self.piece_to_id(self.sep_token)

    @property
    def unk_token_id(self):
        return self.piece_to_id(self.unk_token)

# Create an instance of the simple tokenizer
tokenizer = SimpleTokenizer(sp)

# Define a custom dataset
class CustomDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer.batch_encode_plus([self.texts[idx]], max_length=self.max_length)
        return {
            "input_ids": torch.tensor(encoded["input_ids"][0], dtype=torch.long),
            "attention_mask": torch.tensor(encoded["attention_mask"][0], dtype=torch.long)
        }

# Configure the BERT model for Masked Language Modeling (MLM)
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=512,
    num_attention_heads=8,
    num_hidden_layers=4,
    hidden_size=512,
    intermediate_size=2048
)

# Initialize the BERT model for masked language modeling
model = BertForMaskedLM(config)

# Custom Data Collator
class CustomDataCollatorForLanguageModeling:
    def __init__(self, tokenizer, mlm=True, mlm_probability=0.15):
        self.tokenizer = tokenizer
        self.mlm = mlm
        self.mlm_probability = mlm_probability

    def __call__(self, features):
        input_ids = torch.stack([f["input_ids"] for f in features])
        attention_mask = torch.stack([f["attention_mask"] for f in features])

        labels = input_ids.clone()
        probability_matrix = torch.full(labels.shape, self.mlm_probability)

        # Create special_tokens_mask where all tokens are allowed to be masked (value=0)
        special_tokens_mask = torch.zeros_like(input_ids, dtype=torch.bool)

        # Masking is only applied to the actual tokens, not the special tokens like [PAD]
        special_tokens_mask |= (input_ids == self.tokenizer.pad_token_id)
        special_tokens_mask |= (input_ids == self.tokenizer.cls_token_id)
        special_tokens_mask |= (input_ids == self.tokenizer.sep_token_id)

        # Apply mask to the probability matrix
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)

        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # Only compute loss on masked tokens

        input_ids[masked_indices] = self.tokenizer.mask_token_id

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }


# Use the custom data collator
data_collator = CustomDataCollatorForLanguageModeling(tokenizer=tokenizer)

# Dummy datasets (replace with your actual datasets)
train_texts = ["ATGCTAGCTAGGCTA", "GATCGATCGATCGAT"]
test_texts = ["TGCATGCATGCATGA", "CGATCGATCGATCGT"]

# Create the custom datasets
train_dataset = CustomDataset(train_texts, tokenizer)
test_dataset = CustomDataset(test_texts, tokenizer)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("./fine_tuned_dna_language_model_boca")
print("Model trained and saved successfully.")


Step,Training Loss


Model trained and saved successfully.


In [None]:
!ls /content/fine_tuned_dna_language_model_boca


config.json  generation_config.json  model.safetensors


In [None]:
from transformers import BertForMaskedLM, BertTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Custom Dataset class
class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: tensor[idx] for key, tensor in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# Load the tokenizer and model from the same source
tokenizer = BertTokenizer.from_pretrained("/content/fine_tuned_dna_language_model_boca/")
model = BertForMaskedLM.from_pretrained("/content/advanced_boca_tokenizer.model")

# Ensure vocab size alignment by adjusting the model's embedding layer
model.resize_token_embeddings(len(tokenizer))

# Prepare the test dataset
test_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "I love machine learning and natural language processing.",
    "Artificial intelligence is transforming the world."
]

# Tokenize the test texts
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")

# Validate input IDs
for idx, input_ids in enumerate(test_encodings["input_ids"]):
    if input_ids.max() >= tokenizer.vocab_size:
        print(f"Invalid token ID found in input_ids at index {idx}")
        print(input_ids)

# Clamp input_ids to valid range if necessary
test_encodings["input_ids"] = torch.clamp(test_encodings["input_ids"], max=tokenizer.vocab_size - 1)

# Create a Dataset instance
test_dataset = TextDataset(test_encodings)

# Define the Trainer with evaluation arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=8,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
)

# Run the evaluation
print("Starting evaluation...")
eval_result = trainer.evaluate(eval_dataset=test_dataset)
print(f"Evaluation results: {eval_result}")


OSError: Can't load tokenizer for '/content/fine_tuned_dna_language_model_boca/'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/content/fine_tuned_dna_language_model_boca/' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [None]:
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")
print(f"Model config vocab size: {model.config.vocab_size}")


Tokenizer vocab size: 30522
Model config vocab size: 500


In [None]:
for idx, input_ids in enumerate(test_encodings["input_ids"]):
    if input_ids.max() >= tokenizer.vocab_size:
        print(f"Invalid token ID found in input_ids at index {idx}")
        print(input_ids)
