<a href="https://colab.research.google.com/github/mihirmaurya31/GeneGPT/blob/main/GeneGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install requests biopython transformers torch pinecone-client
!pip install datasets



Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.2/3.2 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

In [3]:
import requests

# Function to fetch gene IDs based on a filter (e.g., human genes on a specific chromosome)
def fetch_gene_ids(species="human", chromosome="1", start=1, end=1000000):
    url = f"https://rest.ensembl.org/overlap/region/{species}/{chromosome}:{start}-{end}?feature=gene;content-type=application/json"

    response = requests.get(url)
    if response.ok:
        gene_data = response.json()
        gene_ids = [gene["id"] for gene in gene_data]  # Extract the gene IDs
        return gene_ids
    else:
        print(f"Error fetching gene IDs: {response.status_code}")
        return []

# Function to fetch gene sequence for a given gene ID
def fetch_gene_sequence(gene_id):
    url = f"https://rest.ensembl.org/sequence/id/{gene_id}?content-type=text/plain"
    response = requests.get(url)
    if response.ok:
        return response.text
    else:
        print(f"Error fetching sequence for {gene_id}: {response.status_code}")
        return None  # Return None if sequence fetch fails


# Example: Fetch gene IDs for human genes on chromosome 1 within a certain region
gene_ids = fetch_gene_ids(species="human", chromosome="1", start=1, end=2000000)
print(f"Fetched {len(gene_ids)} gene IDs")

# Use the previously fetched gene IDs to get the sequences
sequences = []
for gene_id in gene_ids:
    seq = fetch_gene_sequence(gene_id)
    if seq:
        sequences.append(seq)

# Check how many sequences were successfully fetched
print(f"Fetched {len(sequences)} gene sequences")



Fetched 182 gene IDs
Fetched 182 gene sequences


In [4]:
 # Fetch multiple gene sequences (you can add more gene IDs to this list)
 gene_ids = ["ENSG00000012048", "ENSG00000139618", "ENSG00000141510"] # BRCA1,
# BRCA2, TP53, etc. sequences = []

 for gene_id in gene_ids:
    seq = fetch_gene_sequence(gene_id)
    if seq:
        sequences.append(seq)

# Check how many sequences were successfully fetched
 print(f"Fetched {len(sequences)} gene sequences")
# Use the previously fetched gene IDs to get the sequences




Fetched 185 gene sequences


In [5]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = BertForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=2)

# Example: Create mock binary labels (adjust based on your task)
labels = [1, 0, 1] * (len(sequences) // 3) + [1, 0, 1][:len(sequences) % 3]
# Repeats [1, 0, 1] and adds any remaining elements to reach 182

# Tokenize the genetic sequences
def tokenize_function(examples):
    return tokenizer(examples['sequence'], padding="max_length", truncation=True,max_length=128)

dataset = Dataset.from_dict({"sequence": sequences, "labels": labels})
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3
)
# Before saving the model, make sure the weights are contiguous:
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset
)

# Train the model on the Ensembl data
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/185 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.630755
2,No log,0.633667
3,No log,0.639355


TrainOutput(global_step=72, training_loss=0.6407313876681857, metrics={'train_runtime': 56.177, 'train_samples_per_second': 9.879, 'train_steps_per_second': 1.282, 'total_flos': 36506658931200.0, 'train_loss': 0.6407313876681857, 'epoch': 3.0})

In [11]:
import torch
import pinecone # Added this line to import the pinecone module
from pinecone import Pinecone
from pinecone import Pinecone, ServerlessSpec

# Function to generate embeddings
def get_embeddings(sequences):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "T4GPU")
    model.to(device)
    inputs = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)  # Get hidden states
        embeddings = torch.cat(outputs.hidden_states[-2:], dim=-1)[:, 0, :].detach().cpu().numpy()
    return embeddings

# Generate embeddings for all sequences
sequence_embeddings = get_embeddings(sequences)

# Initialize Pinecone
pinecone = Pinecone(api_key="55e2a7c0-355f-40ca-a0f7-c50aa6c29ef3", environment="us-east-1")  # Replaced pinecone.init with Pinecone class initialization
index_name = "ensembl-genetic-index" # Define index_name outside the if block
index_names = pinecone.list_indexes()  # Retrieve the existing indexes
if index_name in index_names:
    pinecone.delete_index(index_name)
    print(f"Deleted existing index '{index_name}'")

index = pinecone.Index(index_name)





# Insert embeddings into Pinecone
ids = [f"gene_{i}" for i in range(len(sequence_embeddings))]
vectors_to_insert = list(zip(ids, sequence_embeddings))
index.upsert(vectors=vectors_to_insert)


{'upserted_count': 185}

In [12]:
from sklearn.preprocessing import normalize

# Normalizing embeddings to have unit norm
sequence_embeddings = normalize(sequence_embeddings, norm='l2')


In [35]:
import numpy as np

def query_sequence(sequence):
    query_embedding = get_embeddings([sequence])[0]
    # Convert query_embedding to float32 explicitly
    query_embedding = query_embedding.astype(np.float32).tolist()


    # Check the type and range of values in query_embedding to get more insights
    query_embedding = [min(max(x, -1.0), 1.0) for x in query_embedding]

    # Wrap the query_embedding in an additional list for compatibility
    results = index.query(
        vector=query_embedding,  # Wrap the query_embedding with 'values' key
        top_k=5,
        include_metadata=True
    )

    return results
# Example query for a new sequence
new_sequence = "Solyc02g085560"  # Replace with an actual genetic sequence
results = query_sequence(new_sequence)
print(results)


{'matches': [{'id': 'gene_92', 'score': 0.416070849, 'values': []},
             {'id': 'gene_60', 'score': 0.401281595, 'values': []},
             {'id': 'gene_72', 'score': 0.400829703, 'values': []},
             {'id': 'gene_91', 'score': 0.387798637, 'values': []},
             {'id': 'gene_52', 'score': 0.374839276, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}
