In [None]:
import re
def clean_text(line):
    line = re.sub(r'-+',' ',line)
    #line = re.sub(r'[^a-zA-Z, ]+'," ",line)
    line = re.sub(r'[ ]+'," ",line)
    line += "."
    return line

In [None]:

text = 'In contrast, further deletion of the C-terminal transactivation domain in the Pax5 mutants B8 and B9 can abolish transcriptional stimulation, whereas internal deletion of the conserved octapeptide motif (OP) or the partial homeodomain (HD) of Pax5 did not have any effect (Figure 3B).'
print(clean_text(text))

In [None]:
from transformers import AutoTokenizer, BertForMaskedLM
import torch

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = BertForMaskedLM.from_pretrained("dmis-lab/biobert-v1.1")

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
print("inputs", inputs)
print("len(inputs.input_ids[0])", len(inputs.input_ids[0]))
with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
# mask labels of non-[MASK] tokens
labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
print("labels", labels)
print("len(labels[0])", len(labels[0]))
outputs = model(**inputs, labels=labels)
round(outputs.loss.item(), 2)

In [None]:

import numpy as np
import random
def get_random_indices(sentence):
    # Get the length of the sentence
    sentence_length = len(sentence.split())
  
    masked_idx = random.sample(range(sentence_length), 10)
    print("masked_idx", masked_idx)
    # Shuffle the indices of the sentence
    indices = np.arange(0, sentence_length-1, 1)
    np.random.shuffle(indices)

    # Select the first 10 unique indices
    random_indices = indices[:10]

    # Ensure that the selected indices are distinct
    while len(np.unique(random_indices)) < 10:
        np.random.shuffle(indices)
        random_indices = indices[:10]

    return random_indices


sentences = [
    "This is a sample sentence This is a sample sentence This is a sample sentence.",
    "Another example sentence with more words This is a sample sentence This is a sample sentence.",
    "A third sentence to demonstrate the process This is a sample sentence This is a sample sentence.",
]

for sentence in sentences:
    random_indices = get_random_indices(sentence)
    print(f"Sentence: {sentence}")
    print(f"Random Indices: {random_indices}")


In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

sentences = [["This is the first sentence."], ['This is the sencond sentence.']] 
encoded_inputs = tokenizer(sentences, padding="max_length", truncation=True)

print(encoded_inputs)


In [23]:
import torch
import torch.nn.functional as F

# Example shapes
input_ids = torch.randn(32, 85)  # Assuming 32 batches, each with a sequence length of 85
logits = outputs.logits[0]  # Example logits, shape [85, 28996]
labels = torch.randint(0, 28996, (32, 85))  # Assuming labels for each position in the sequence

# Ensure that logits match the number of classes in your task
num_classes = logits.size(1)

# Transpose logits to be [sequence_length, batch_size, num_classes]
logits = logits.transpose(0, 1)

# Flatten the logits and labels for the loss calculation
logits_flat = logits.contiguous().view(-1, num_classes)
labels_flat = labels.view(-1)

# Assuming a simple sequence classification task
# You can use CrossEntropyLoss along the sequence dimension
loss_fn = torch.nn.CrossEntropyLoss()

# Calculate the loss
loss = loss_fn(logits_flat, labels_flat)

# Your training/update step here


NameError: name 'outputs' is not defined

In [None]:
import torch

# Example tensor with size [32, 85]
original_tensor = torch.randn(32, 85)
print(original_tensor)  
# Sum along the second dimension (axis 1)
summed_tensor = torch.sum(original_tensor, dim=1)
print(summed_tensor)
# Check the size of the resulting tensor
print(summed_tensor.size())  # Should print torch.Size([32])



In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load pre-trained model and tokenizer
model_name = 'dmis-lab/biobert-base-cased-v1.2'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

# Input text with a [MASK] token
text = "The [MASK] is a large mammal native to North America."

# Tokenize the input text
tokens = tokenizer(text, return_tensors='pt')

# Get the model's output logits
with torch.no_grad():
    outputs = model(**tokens)
    logits = outputs.logits

# Assuming you have a [MASK] token in the input, find its position
mask_position = tokens['input_ids'][0].tolist().index(tokenizer.mask_token_id)
print("mask_position", mask_position)
# Extract the logits for the masked position
masked_logits = logits[0, mask_position]

# Apply softmax to get probabilities
probabilities = torch.nn.functional.softmax(masked_logits, dim=-1)

# Get the token with the highest probability (predicted token)
predicted_token_id = torch.argmax(probabilities).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_token_id])[0]

# Replace the [MASK] token with the predicted token in the original text
result_text = text.replace('[MASK]', predicted_token)

# Print results
print("Original Text:", text)
print("Result Text:", result_text)
print("Predicted Token:", predicted_token)
print("Probabilities:", probabilities)

