In [None]:
!pip install requests biopython transformers torch pinecone-client
!pip install datasets





In [None]:
import requests
import json

# Ensembl REST API: fetch the sequence that belongs to a stable ID.
def fetch_gene_sequence(gene_id, timeout=30):
    """Download the sequence for an Ensembl stable ID.

    Args:
        gene_id: Ensembl stable identifier, e.g. "ENSG00000012048".
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The value of the response's ``seq`` field, or ``None`` when the
        request fails, the server answers with an error status, or the
        response body is not JSON containing a ``seq`` field. Every failure
        is reported with a printed message instead of an exception.
    """
    url = f"https://rest.ensembl.org/sequence/id/{gene_id}?content-type=application/json"

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, DNS failures and timeouts all land here.
        print(f"Error fetching gene {gene_id}: {exc}")
        return None

    if not response.ok:
        print(f"Error fetching gene {gene_id}: {response.status_code}")
        return None

    try:
        return response.json()["seq"]
    except (ValueError, KeyError, TypeError) as exc:
        # ValueError: body is not valid JSON; KeyError/TypeError: no 'seq' field.
        print(f"Error fetching gene {gene_id}: unexpected response ({exc!r})")
        return None

# Example: fetch a single gene (BRCA1) to check that the Ensembl helper works.
gene_id = "ENSG00000012048"  # Example Ensembl gene ID for BRCA1
sequence = fetch_gene_sequence(gene_id)

# fetch_gene_sequence returns None on an error status, so only print on success.
if sequence:
    print(f"Sequence for gene {gene_id}: {sequence[:100]}...")  # Print the first 100 characters


Sequence for gene ENSG00000012048: AAAGCGTGGGAATTACAGATAAATTAAAACTGTGGAACCCCTTTCCTCGGCTGCCGCCAAGGTGTTCGGTCCTTCCGAGGAAGCTAAGGCCGCGTTGGGG...


In [None]:
# Ensembl gene IDs to download: BRCA1, BRCA2, TP53 (append more IDs as needed).
gene_ids = ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"]

# Fetch each gene once and keep the ID next to its result. Comprehensions are
# used so that no loop variable overwrites the notebook-level `gene_id`
# assigned by the single-gene example.
fetch_results = [(gid, fetch_gene_sequence(gid)) for gid in gene_ids]

# Keep only the successful downloads, in request order.
sequences = [seq_text for _, seq_text in fetch_results if seq_text]
failed_gene_ids = [gid for gid, seq_text in fetch_results if not seq_text]

# Check how many sequences were successfully fetched
print(f"Fetched {len(sequences)} gene sequences")
if failed_gene_ids:
    # A dropped gene shifts every later position in `sequences`, which breaks
    # any per-position pairing (labels, vector ids) built further down.
    print(f"Failed to fetch: {', '.join(failed_gene_ids)}")


Fetched 3 gene sequences


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load tokenizer and model from the same checkpoint. The id is stored once so
# the tokenizer and the classifier cannot drift apart.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = BertTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(BIOBERT_CHECKPOINT, num_labels=2)

# Example: mock binary labels, one per fetched sequence (adjust to your task).
labels = [1, 0, 1]  # Example binary labels for gene outcomes

# The labels are hard-coded, so fail early with a clear message if a download
# failed and the number of sequences no longer matches.
if len(labels) != len(sequences):
    raise ValueError(
        f"Expected one label per sequence, got {len(labels)} labels "
        f"for {len(sequences)} sequences"
    )

# Tokenize the genetic sequences
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: Mapping with a 'sequence' entry holding the texts to encode.
        max_length: Length every encoded example is padded or truncated to.
            It must be passed explicitly: the recorded run shows the tokenizer
            has no predefined maximum, so ``padding="max_length"`` and
            ``truncation=True`` alone were both silently disabled.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    # NOTE(review): the sequences are passed as single whitespace-free strings.
    # BERT WordPiece tokenizers typically map very long "words" to the unknown
    # token — inspect tokenizer.tokenize(seq) to confirm the model sees more
    # than [UNK]; splitting into k-mers may be required.
    return tokenizer(
        examples['sequence'],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Build a dataset with one row per fetched sequence and its label.
# NOTE(review): `sequences` and `labels` are paired by position and must have
# the same length — confirm every gene was fetched before running this.
dataset = Dataset.from_dict({"sequence": sequences, "labels": labels})
# Apply tokenize_function over the dataset in batches (batched=True).
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Set training arguments
import inspect

# The keyword that schedules evaluation was renamed from `evaluation_strategy`
# to `eval_strategy` in newer transformers releases. The packages are installed
# unpinned, so pick whichever name the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)
# Re-pack every parameter whose tensor is not contiguous, so that all weights
# are contiguous before the model is saved.
for weight in model.parameters():
    if weight.is_contiguous():
        continue
    weight.data = weight.data.contiguous()

# Define the Trainer
# NOTE(review): eval_dataset is the same object as train_dataset, so the
# "Validation Loss" reported during training is measured on the training
# examples and says nothing about generalization.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset
)

# Train the model on the Ensembl data
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch,Training Loss,Validation Loss
1,No log,0.686199
2,No log,0.638163
3,No log,0.636793


TrainOutput(global_step=3, training_loss=0.6454786459604899, metrics={'train_runtime': 31.5959, 'train_samples_per_second': 0.285, 'train_steps_per_second': 0.095, 'total_flos': 13874997060.0, 'train_loss': 0.6454786459604899, 'epoch': 3.0})

In [None]:
import torch
import pinecone # Added this line to import the pinecone module
from pinecone import Pinecone
from pinecone import Pinecone, ServerlessSpec

# Function to generate embeddings
def get_embeddings(sequences, max_length=512):
    """Embed sequences with the module-level ``model`` and ``tokenizer``.

    Args:
        sequences: List of sequence strings to embed.
        max_length: Maximum number of tokens kept per sequence. It is passed
            explicitly because ``truncation=True`` alone does not truncate
            when the tokenizer has no predefined maximum length.

    Returns:
        NumPy array with one row per input sequence: the first-token vectors
        of the last two hidden states, concatenated along the feature axis.
    """
    model.eval()
    inputs = tokenizer(
        sequences,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Training may have moved the model off the CPU; the inputs must live on
    # the same device as the weights.
    device = next(model.parameters()).device
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)  # Get hidden states

    # Concatenate the last two layers feature-wise, then keep token position 0.
    first_token_vectors = torch.cat(outputs.hidden_states[-2:], dim=-1)[:, 0, :]
    # .cpu() is required before .numpy() when the model runs on an accelerator.
    return first_token_vectors.cpu().numpy()

# Generate embeddings for all sequences
sequence_embeddings = get_embeddings(sequences)

# Initialize Pinecone
import os

# The API key is read from the environment so that no secret is stored in the
# notebook. The key that used to be hard-coded here has been exposed and
# should be revoked and replaced.
# NOTE: this assignment rebinds the name `pinecone` from the imported module
# to the client object; the name is kept for compatibility with later cells.
pinecone = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = "ensembl-genetic-index"

# list_indexes() returns index descriptions, not plain strings, so the
# membership test has to be made against their names.
index_names = pinecone.list_indexes().names()
if index_name in index_names:
    pinecone.delete_index(index_name)
    print(f"Deleted existing index '{index_name}'")

# (Re)create the index so that its dimension always matches the embeddings
# produced above; without this step the index would not exist after deletion.
pinecone.create_index(
    name=index_name,
    dimension=int(sequence_embeddings.shape[1]),
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
index = pinecone.Index(index_name)

# Insert embeddings into Pinecone as (id, values) pairs. The values are
# converted to plain Python lists instead of being passed as NumPy rows.
ids = [f"gene_{i}" for i in range(len(sequence_embeddings))]
vectors_to_insert = [
    (vector_id, embedding.tolist())
    for vector_id, embedding in zip(ids, sequence_embeddings)
]
index.upsert(vectors=vectors_to_insert)


{'upserted_count': 3}

In [None]:
from sklearn.preprocessing import normalize

# Normalizing embeddings to have unit norm
# NOTE(review): this rebinds `sequence_embeddings`, so re-running the cell
# operates on already-normalized data. In notebook order it also runs after
# the vectors were upserted, so the vectors stored in the index are the
# un-normalized ones — confirm whether that is intended.
sequence_embeddings = normalize(sequence_embeddings, norm='l2')


In [None]:
import numpy as np

def query_sequence(sequence, top_k=5):
    """Find the stored vectors most similar to a sequence.

    Args:
        sequence: Sequence string to embed and search for.
        top_k: Maximum number of matches to request from the index.

    Returns:
        The response of ``index.query`` for the embedded sequence.
    """
    query_embedding = get_embeddings([sequence])[0]

    # The query API takes a plain list of floats, so convert the NumPy row.
    # The values are deliberately NOT clamped to [-1, 1]: the stored vectors
    # were upserted unclamped, and clipping only the query would distort the
    # similarity between them.
    query_vector = query_embedding.astype(np.float32).tolist()

    return index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )
# Example query for a new sequence
# NOTE(review): in the recorded output every match has exactly the same score,
# which suggests the stored gene vectors are identical to each other — check
# how the long gene sequences are tokenized before trusting the ranking.
new_sequence = "ATGGATTTGTCTAGAGGGTATTTGGGCTGCTGAAGAACTTCTCCATTCCCAGAGTGTCAGTTTGAAG"  # Replace with an actual genetic sequence
results = query_sequence(new_sequence)
print(results)


{'matches': [{'id': 'gene_2', 'score': 0.263771534, 'values': []},
             {'id': 'gene_0', 'score': 0.263771534, 'values': []},
             {'id': 'gene_1', 'score': 0.263771534, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}
