In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ------------------------------
# CNN Module (Structural Patterns)
# ------------------------------
class StructuralCNN(nn.Module):
    def __init__(self, input_channels=1, embedding_dim=128):
        super(StructuralCNN, self).__init__()
        """
        Input: 2D distance/contact matrix from AlphaFold (e.g., 64x64)
        Output: Embedding vector representing structural context
        """

        self.conv1 = nn.Conv2d(input_channels, 32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))  # global average pooling

        self.fc = nn.Linear(128, embedding_dim)  # final embedding vector

    def forward(self, x):
        # x shape: [batch_size, 1, H, W]
        x = F.relu(self.conv1(x))
        x = self.pool1(x)

        x = F.relu(self.conv2(x))
        x = self.pool2(x)

        x = F.relu(self.conv3(x))
        x = self.global_pool(x)  # shape → [batch, 128, 1, 1]
        x = torch.flatten(x, 1)  # shape → [batch, 128]

        embedding = self.fc(x)
        return embedding


# ------------------------------
# Example usage
# ------------------------------
if __name__ == "__main__":
    # Simulate AlphaFold contact matrix (batch_size=1, 64x64)
    contact_matrix = torch.rand(1, 1, 64, 64)

    model = StructuralCNN(input_channels=1, embedding_dim=128)
    embedding = model(contact_matrix)

    print("Structural Embedding Vector Shape:", embedding.shape)
    print("Example Embedding (first 10 values):", embedding[0][:10])


Structural Embedding Vector Shape: torch.Size([1, 128])
Example Embedding (first 10 values): tensor([ 0.0521, -0.0629, -0.1503, -0.0279,  0.0409,  0.0786, -0.0517,  0.0373,
         0.0544,  0.0284], grad_fn=<SliceBackward0>)


In [7]:
import torch
print(torch.__version__)


2.9.0+cpu


In [24]:
from torch_geometric.nn import GCNConv, global_mean_pool
print("✅ PyTorch Geometric is ready to use!")


✅ PyTorch Geometric is ready to use!


In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

# ------------------------------
# GNN Module (Graph Representation)
# ------------------------------
class StructuralGNN(nn.Module):
    def __init__(self, node_in_dim=128, hidden_dim=128, embedding_dim=128):
        super(StructuralGNN, self).__init__()
        """
        Input: Graph representation of protein (nodes = residues)
        Output: Graph-level embedding (structural + relational context)
        """
        self.conv1 = GCNConv(node_in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, embedding_dim)

    def forward(self, x, edge_index, batch):
        """
        x: [num_nodes, node_in_dim] node features (from CNN or sequence embedding)
        edge_index: [2, num_edges] connectivity between nodes
        batch: [num_nodes] batch vector to group nodes per protein
        """
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(self.conv2(x, edge_index))

        # Graph-level pooling (mean over all node embeddings)
        x = global_mean_pool(x, batch)
        x = self.fc(x)
        return x


In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ------------------------------
# Transformer Module (Sequence Context)
# ------------------------------
class SequenceTransformer(nn.Module):
    def __init__(self, seq_len=512, embed_dim=128, num_heads=8, num_layers=2, dropout=0.1):
        super(SequenceTransformer, self).__init__()
        """
        Input: Sequence embeddings + positional encoding (variant position)
        Output: Embedding vector representing sequence context
        """

        # Positional encoding (learnable)
        self.pos_embedding = nn.Embedding(seq_len, embed_dim)

        # Transformer Encoder layers
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, dim_feedforward=256,
            dropout=dropout, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Final sequence embedding projection
        self.fc = nn.Linear(embed_dim, embed_dim)

    def forward(self, seq_embeddings, variant_pos=None):
        """
        seq_embeddings: [batch, seq_len, embed_dim]
        variant_pos: list/tensor of variant positions (optional)
        """

        batch_size, seq_len, _ = seq_embeddings.size()

        # Add positional encoding
        positions = torch.arange(seq_len, device=seq_embeddings.device).unsqueeze(0)
        pos_emb = self.pos_embedding(positions)
        x = seq_embeddings + pos_emb

        # Pass through Transformer Encoder
        x = self.transformer(x)

        # If variant position provided, extract its context vector
        if variant_pos is not None:
            idx = variant_pos.long().unsqueeze(-1).unsqueeze(-1).expand(-1, 1, x.size(-1))
            variant_emb = torch.gather(x, 1, idx).squeeze(1)
        else:
            # Global average pooling if variant position not given
            variant_emb = x.mean(dim=1)

        out = self.fc(variant_emb)
        return out


In [29]:
# Assume from previous CNN code:
# cnn_model = StructuralCNN()
# contact_matrix = torch.rand(1, 1, 64, 64)
cnn_embedding = cnn_model(contact_matrix)  # shape: [1, 128]

# Suppose each residue is represented by CNN embedding
# For simplicity, simulate 50 residues → 50 nodes, each 128-dim feature
num_residues = 50
x = torch.rand(num_residues, 128)

# Define dummy edges (simple chain or contact-based)
edge_index = torch.tensor([
    [i for i in range(num_residues - 1)] + [i + 1 for i in range(num_residues - 1)],
    [i + 1 for i in range(num_residues - 1)] + [i for i in range(num_residues - 1)]
], dtype=torch.long)

# Batch info (1 graph → all nodes belong to same protein)
batch = torch.zeros(num_residues, dtype=torch.long)

# Run GNN
gnn_model = StructuralGNN(node_in_dim=128)
gnn_embedding = gnn_model(x, edge_index, batch)

print("GNN Structural Embedding Shape:", gnn_embedding.shape)
print("Graph-level Embedding:", gnn_embedding)


GNN Structural Embedding Shape: torch.Size([1, 128])
Graph-level Embedding: tensor([[-1.1701e-01,  3.3949e-01,  3.3200e-01, -1.6349e-02, -1.1113e-01,
          9.5515e-02, -7.8390e-03, -1.7458e-01, -1.3772e-01, -2.2421e-01,
          2.0224e-01, -9.4489e-02,  1.9384e-01,  7.5994e-02, -1.8465e-01,
          9.4791e-03, -1.1883e-01, -1.2106e-01,  2.1763e-05,  1.1804e-01,
          3.4863e-01,  1.5225e-01,  1.4202e-02, -2.3250e-01, -2.8812e-01,
         -1.8030e-01,  1.6206e-01,  1.3166e-01,  4.6108e-02, -1.8608e-01,
          1.1743e-01,  1.1736e-01,  7.7772e-02,  1.6215e-01, -9.7333e-03,
          1.2260e-01, -2.3181e-02, -1.1576e-01,  8.9787e-02,  4.4712e-02,
          3.4639e-02,  1.0141e-01,  9.3240e-02,  8.5480e-02, -1.2755e-01,
          1.9587e-01, -1.1118e-01, -3.3523e-01, -2.0722e-02,  2.6581e-01,
         -8.5945e-02,  1.1450e-01,  6.3505e-02,  1.3858e-01, -2.1607e-02,
          2.5020e-01,  4.5125e-02,  7.5419e-02, -4.7158e-02, -1.0258e-01,
          9.9375e-02,  1.0669e-02,  

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ------------------------------
# CNN Module (Structural Patterns)
# ------------------------------
class StructuralCNN(nn.Module):
    def __init__(self):
        super(StructuralCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.fc = nn.Linear(32 * 16 * 16, 128)  # assuming input 64×64

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

# Initialize CNN model
cnn_model = StructuralCNN()

# Example input (64×64 contact matrix)
contact_matrix = torch.rand(1, 1, 64, 64)

# Get CNN embedding
cnn_embedding = cnn_model(contact_matrix)
print("CNN embedding shape:", cnn_embedding.shape)


CNN embedding shape: torch.Size([1, 128])


In [30]:
# Simulated amino acid sequence embeddings (batch=1, seq_len=100, embed_dim=128)
seq_embeddings = torch.rand(1, 100, 128)

# Variant at position 45
variant_position = torch.tensor([45])

model = SequenceTransformer(seq_len=100, embed_dim=128)
seq_context_emb = model(seq_embeddings, variant_position)

print("Sequence Context Embedding Shape:", seq_context_emb.shape)



Sequence Context Embedding Shape: torch.Size([1, 128])


In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Assume you already defined:
# - StructuralCNN
# - StructuralGNN
# - SequenceTransformer

class HybridVariantPredictor(nn.Module):
    def __init__(self, seq_len=512, embed_dim=128, num_classes=2):
        super(HybridVariantPredictor, self).__init__()

        # --- Individual Modules ---
        self.cnn_module = StructuralCNN(input_channels=1, embedding_dim=embed_dim)
        self.gnn_module = StructuralGNN(node_in_dim=embed_dim, hidden_dim=128, embedding_dim=embed_dim)
        self.seq_module = SequenceTransformer(seq_len=seq_len, embed_dim=embed_dim)

        # --- Fusion Layer ---
        self.fc_fusion = nn.Sequential(
            nn.Linear(embed_dim * 3, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU()
        )

        # --- Output Prediction Head ---
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, contact_matrix, residue_graph, seq_embeddings, variant_pos=None):
        """
        contact_matrix: [B, 1, H, W]   -> CNN
        residue_graph: tuple(x, edge_index, batch) -> GNN
        seq_embeddings: [B, seq_len, embed_dim]    -> Transformer
        """

        # CNN output → local structure
        cnn_out = self.cnn_module(contact_matrix)

        # GNN output → global structure
        x, edge_index, batch = residue_graph
        gnn_out = self.gnn_module(x, edge_index, batch)

        # Transformer output → sequence context
        seq_out = self.seq_module(seq_embeddings, variant_pos)

        # Combine embeddings (concatenate)
        combined = torch.cat([cnn_out, gnn_out, seq_out], dim=-1)

        # Fusion and classification
        fused = self.fc_fusion(combined)
        logits = self.classifier(fused)

        return logits


In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class StructuralCNN(nn.Module):
    def __init__(self, input_channels=1, embedding_dim=128):
        super(StructuralCNN, self).__init__()
        """
        Input: 2D contact/distance matrix (AlphaFold)
        Output: Embedding vector representing structural context
        """
        self.conv1 = nn.Conv2d(input_channels, 32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

        self.fc = nn.Linear(128, embedding_dim)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool1(x)

        x = F.relu(self.conv2(x))
        x = self.pool2(x)

        x = F.relu(self.conv3(x))
        x = self.global_pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x
model = HybridVariantPredictor(seq_len=100, embed_dim=128, num_classes=2)
output = model(contact_matrix, residue_graph, seq_embeddings, variant_pos)
print(output)


tensor([[0.0067, 0.0694]], grad_fn=<AddmmBackward0>)


In [42]:
import torch
import torch.nn as nn
import torch.optim as optim

# Initialize model
model = HybridVariantPredictor(seq_len=100, embed_dim=128, num_classes=2)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()       # good for classification
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
# Example: simulate a batch of 4 samples
batch_size = 4

contact_matrices = torch.rand(batch_size, 1, 64, 64)
num_residues = 50
seq_len = 100
variant_positions = torch.randint(0, seq_len, (batch_size,))

# Each sample → different graph
x = torch.rand(num_residues * batch_size, 128)
edge_index = torch.tensor([
    [i for i in range(num_residues - 1)] * batch_size + [i + 1 for i in range(num_residues - 1)] * batch_size,
    [i + 1 for i in range(num_residues - 1)] * batch_size + [i for i in range(num_residues - 1)] * batch_size
], dtype=torch.long)
batch = torch.repeat_interleave(torch.arange(batch_size), num_residues)

residue_graph = (x, edge_index, batch)

seq_embeddings = torch.rand(batch_size, seq_len, 128)
labels = torch.randint(0, 2, (batch_size,))  # 0=benign, 1=pathogenic
epochs = 5  # increase later
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    outputs = model(contact_matrices, residue_graph, seq_embeddings, variant_positions)
    loss = criterion(outputs, labels)

    loss.backward()
    optimizer.step()

    # Accuracy
    preds = torch.argmax(outputs, dim=1)
    acc = (preds == labels).float().mean()

    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {loss.item():.4f} - Accuracy: {acc.item():.4f}")
torch.save(model.state_dict(), "hybrid_variant_predictor.pth")


Epoch [1/5] - Loss: 0.6879 - Accuracy: 0.5000
Epoch [2/5] - Loss: 0.6833 - Accuracy: 0.7500
Epoch [3/5] - Loss: 0.6741 - Accuracy: 1.0000
Epoch [4/5] - Loss: 0.6632 - Accuracy: 1.0000
Epoch [5/5] - Loss: 0.6487 - Accuracy: 1.0000


In [44]:
import pandas as pd

# Load your CSV file (replace the filename with your actual one)
df = pd.read_csv("E:/vit/ai/data/BDNF_missense_expanded_mapped.csv")

# Preview dataset
print(df.head())
print(df.columns)


                                       Name       Gene(s)  \
0  NM_001709.5(BDNF):c.715T>G (p.Cys239Gly)  BDNF|BDNF-AS   
1  NM_001709.5(BDNF):c.715T>G (p.Cys239Gly)  BDNF|BDNF-AS   
2  NM_001709.5(BDNF):c.715T>G (p.Cys239Gly)  BDNF|BDNF-AS   
3  NM_001709.5(BDNF):c.715T>G (p.Cys239Gly)  BDNF|BDNF-AS   
4  NM_001709.5(BDNF):c.715T>G (p.Cys239Gly)  BDNF|BDNF-AS   

                      Protein change           Condition(s)     Accession  \
0  C239G, C247G, C254G, C268G, C321G  BDNF-related disorder  VCV003344608   
1  C239G, C247G, C254G, C268G, C321G  BDNF-related disorder  VCV003344608   
2  C239G, C247G, C254G, C268G, C321G  BDNF-related disorder  VCV003344608   
3  C239G, C247G, C254G, C268G, C321G  BDNF-related disorder  VCV003344608   
4  C239G, C247G, C254G, C268G, C321G  BDNF-related disorder  VCV003344608   

   GRCh37Chromosome  GRCh37Location  GRCh38Chromosome  GRCh38Location  \
0                11        27679397                11        27657850   
1                11     

In [45]:
import pandas as pd
import torch
import numpy as np

# Load CSV
df = pd.read_csv("E:/vit/ai/data/BDNF_missense_expanded_mapped.csv")

# Step 1: Filter missense variants
df = df[df["VariantType"].str.lower() == "missense"].copy()

# Step 2: Drop rows without residue positions or labels
df = df.dropna(subset=["Residue"])

# Step 3: Convert residue positions to integers
df["Residue"] = df["Residue"].astype(int)

# Step 4: Encode labels
# Try to use 'Germline classification' or 'Oncogenicity classification' if available
if "Germline classification" in df.columns:
    label_col = "Germline classification"
elif "Oncogenicity classification" in df.columns:
    label_col = "Oncogenicity classification"
else:
    raise ValueError("No pathogenicity label found in CSV")

# Simplify to binary labels
def encode_label(val):
    if isinstance(val, str):
        val = val.lower()
        if "pathogenic" in val:
            return 1
        elif "benign" in val:
            return 0
    return None

df["label"] = df[label_col].apply(encode_label)
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)

print(f"✅ Processed {len(df)} valid variants for modeling.")
print(df[["Residue", "Orig_AA", "New_AA", "label"]].head())


✅ Processed 21 valid variants for modeling.
    Residue Orig_AA New_AA  label
30      186       C      Y      1
31      194       C      Y      1
32      201       C      Y      1
33      215       C      Y      1
34      268       C      Y      1


In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import networkx as nx

# ------------------------------
# 1. CNN Module (Structural Patterns)
# ------------------------------
class StructuralCNN(nn.Module):
    def __init__(self):
        super(StructuralCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.fc = nn.Linear(32 * 16 * 16, 128)  # for input 64x64

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x  # [batch, 128]


# ------------------------------
# 2. Simplified GNN (Manual message passing)
# ------------------------------
class SimpleGNN(nn.Module):
    def __init__(self, in_features=128, hidden_dim=64, out_features=64):
        super(SimpleGNN, self).__init__()
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_features)

    def forward(self, features, adj):
        h = torch.matmul(adj, features)  # aggregate neighbors
        h = F.relu(self.fc1(h))
        h = torch.matmul(adj, h)
        h = self.fc2(h)
        return h.mean(dim=0)  # global mean pooling [64]


# ------------------------------
# 3. Transformer (Sequence Context)
# ------------------------------
class SequenceTransformer(nn.Module):
    def __init__(self, input_dim=128, nhead=4, hidden_dim=256, num_layers=2):
        super(SequenceTransformer, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=nhead, dim_feedforward=hidden_dim, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, seq_embeddings, variant_pos=None):
        x = self.transformer(seq_embeddings)  # [batch, seq_len, embed_dim]
        if variant_pos is not None:
            seq_len = x.size(1)
            variant_pos = torch.clamp(variant_pos, max=seq_len - 1)
            idx = variant_pos.long().unsqueeze(-1).unsqueeze(-1).expand(-1, 1, x.size(-1))
            variant_emb = torch.gather(x, 1, idx).squeeze(1)
        else:
            variant_emb = x.mean(dim=1)
        return variant_emb  # [batch, embed_dim]


# ------------------------------
# 4. Hybrid Model (CNN + GNN + Transformer)
# ------------------------------
class HybridVariantPredictor(nn.Module):
    def __init__(self):
        super(HybridVariantPredictor, self).__init__()
        self.cnn_module = StructuralCNN()
        self.gnn_module = SimpleGNN()
        self.seq_module = SequenceTransformer(input_dim=128)
        self.classifier = nn.Sequential(
            nn.Linear(128 + 64 + 128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, contact_matrix, adj, seq_embeddings, variant_pos):
        cnn_out = self.cnn_module(contact_matrix)       # [1,128]
        node_features = cnn_out.repeat(adj.size(0), 1)  # replicate to [num_nodes,128]
        gnn_out = self.gnn_module(node_features, adj)   # [64]
        seq_out = self.seq_module(seq_embeddings, variant_pos)  # [1,128]
        combined = torch.cat([cnn_out.squeeze(0), gnn_out, seq_out.squeeze(0)], dim=-1)
        return self.classifier(combined.unsqueeze(0))   # [1,1]


# ------------------------------
# 5. Test Run with Dummy Data
# ------------------------------
contact_matrix = torch.rand(1, 1, 64, 64)
num_nodes = 50
G = nx.erdos_renyi_graph(num_nodes, 0.1)
adj = torch.tensor(nx.to_numpy_array(G), dtype=torch.float32)
seq_embeddings = torch.rand(1, 100, 128)
variant_pos = torch.tensor([186])  # intentionally out of bounds

# Initialize model
model = HybridVariantPredictor()
output = model(contact_matrix, adj, seq_embeddings, variant_pos)
print("✅ Model output:", output)


✅ Model output: tensor([[0.5051]], grad_fn=<SigmoidBackward0>)


In [52]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import networkx as nx

# ------------------------------
# 1. CNN Module (Structural Patterns)
# ------------------------------
class StructuralCNN(nn.Module):
    def __init__(self):
        super(StructuralCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.fc = nn.Linear(32 * 16 * 16, 128)  # for input 64x64

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x  # [batch, 128]


# ------------------------------
# 2. Simplified GNN (Manual message passing)
# ------------------------------
class SimpleGNN(nn.Module):
    def __init__(self, in_features=128, hidden_dim=64, out_features=64):
        super(SimpleGNN, self).__init__()
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_features)

    def forward(self, features, adj):
        h = torch.matmul(adj, features)  # aggregate neighbors
        h = F.relu(self.fc1(h))
        h = torch.matmul(adj, h)
        h = self.fc2(h)
        return h.mean(dim=0)  # global mean pooling [64]


# ------------------------------
# 3. Transformer (Sequence Context)
# ------------------------------
class SequenceTransformer(nn.Module):
    def __init__(self, input_dim=128, nhead=4, hidden_dim=256, num_layers=2):
        super(SequenceTransformer, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=nhead, dim_feedforward=hidden_dim, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, seq_embeddings, variant_pos=None):
        x = self.transformer(seq_embeddings)  # [batch, seq_len, embed_dim]
        if variant_pos is not None:
            seq_len = x.size(1)
            variant_pos = torch.clamp(variant_pos, max=seq_len - 1)
            idx = variant_pos.long().unsqueeze(-1).unsqueeze(-1).expand(-1, 1, x.size(-1))
            variant_emb = torch.gather(x, 1, idx).squeeze(1)
        else:
            variant_emb = x.mean(dim=1)
        return variant_emb  # [batch, embed_dim]


# ------------------------------
# 4. Hybrid Model (CNN + GNN + Transformer)
# ------------------------------
class HybridVariantPredictor(nn.Module):
    def __init__(self):
        super(HybridVariantPredictor, self).__init__()
        self.cnn_module = StructuralCNN()
        self.gnn_module = SimpleGNN()
        self.seq_module = SequenceTransformer(input_dim=128)
        self.classifier = nn.Sequential(
            nn.Linear(128 + 64 + 128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, contact_matrix, adj, seq_embeddings, variant_pos):
        cnn_out = self.cnn_module(contact_matrix)       # [1,128]
        node_features = cnn_out.repeat(adj.size(0), 1)  # replicate for nodes [num_nodes,128]
        gnn_out = self.gnn_module(node_features, adj)   # [64]
        seq_out = self.seq_module(seq_embeddings, variant_pos)  # [1,128]
        combined = torch.cat([cnn_out.squeeze(0), gnn_out, seq_out.squeeze(0)], dim=-1)
        return self.classifier(combined.unsqueeze(0))   # [1,1]


# ------------------------------
# 5. Test Run with Dummy Data
# ------------------------------
contact_matrix = torch.rand(1, 1, 64, 64)
num_nodes = 50
G = nx.erdos_renyi_graph(num_nodes, 0.1)
adj = torch.tensor(nx.to_numpy_array(G), dtype=torch.float32)
seq_embeddings = torch.rand(1, 100, 128)
variant_pos = torch.tensor([186])  # intentionally out of bounds

# Initialize model
model = HybridVariantPredictor()
output = model(contact_matrix, adj, seq_embeddings, variant_pos)
print("✅ Model output:", output)


✅ Model output: tensor([[0.5054]], grad_fn=<SigmoidBackward0>)


In [54]:
class HybridVariantPredictor(nn.Module):
    def __init__(self, seq_len=100, embed_dim=128, num_classes=2, dropout=0.3):
        super(HybridVariantPredictor, self).__init__()
        self.cnn_module = StructuralCNN(embedding_dim=embed_dim, dropout=dropout)
        self.gnn_module = StructuralGNN(node_in_dim=embed_dim, embedding_dim=embed_dim, dropout=dropout)
        self.seq_module = SequenceTransformer(seq_len=seq_len, embed_dim=embed_dim, dropout=dropout)
        self.fc1 = nn.Linear(embed_dim * 3, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, contact_matrix, residue_graph, seq_embeddings, variant_pos):
        cnn_out = self.cnn_module(contact_matrix)
        gnn_out = self.gnn_module(residue_graph)
        seq_out = self.seq_module(seq_embeddings, variant_pos)

        combined = torch.cat((cnn_out, gnn_out, seq_out), dim=1)
        x = F.relu(self.bn1(self.fc1(combined)))
        x = self.dropout(x)
        return self.fc2(x)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(), 
    lr=1e-4, 
    weight_decay=1e-5  # ← L2 regularization
)


In [56]:

import torch
import torch.nn as nn
import torch.nn.functional as F

# =========================
# 1. CNN Module
# =========================
class StructuralCNN(nn.Module):
    def __init__(self, embedding_dim=128, dropout=0.3):
        super(StructuralCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(32 * 16 * 16, embedding_dim)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.dropout(self.fc(x))
        return x  # [batch, embedding_dim]


# =========================
# 2. GNN Module (Matrix-based)
# =========================
class StructuralGNN(nn.Module):
    def __init__(self, node_in_dim=128, embedding_dim=128, dropout=0.3):
        super(StructuralGNN, self).__init__()
        self.fc1 = nn.Linear(node_in_dim, 128)
        self.fc2 = nn.Linear(128, embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, node_features, adj):
        # node_features: [N, F]
        # adj: [N, N]
        h = torch.matmul(adj, node_features)
        h = F.relu(self.fc1(h))
        h = torch.matmul(adj, h)
        h = self.dropout(F.relu(self.fc2(h)))
        gnn_embedding = torch.mean(h, dim=0)  # graph-level embedding
        return gnn_embedding  # [embedding_dim]


# =========================
# 3. Transformer Module
# =========================
class SequenceTransformer(nn.Module):
    def __init__(self, seq_len=100, embed_dim=128, dropout=0.3):
        super(SequenceTransformer, self).__init__()
        self.pos_embedding = nn.Parameter(torch.randn(1, seq_len, embed_dim))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=8, dim_feedforward=256, dropout=dropout, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.fc = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, seq_embeddings, variant_pos):
        # seq_embeddings: [batch, seq_len, embed_dim]
        x = seq_embeddings + self.pos_embedding
        x = self.transformer(x)
        variant_token = x[:, variant_pos, :]  # extract variant position embedding
        x = self.dropout(self.fc(variant_token))
        return x  # [batch, embed_dim]


# =========================
# 4. Hybrid Model
# =========================
class HybridVariantPredictor(nn.Module):
    def __init__(self, seq_len=100, embed_dim=128, num_classes=2, dropout=0.3):
        super(HybridVariantPredictor, self).__init__()
        self.cnn_module = StructuralCNN(embedding_dim=embed_dim, dropout=dropout)
        self.gnn_module = StructuralGNN(node_in_dim=embed_dim, embedding_dim=embed_dim, dropout=dropout)
        self.seq_module = SequenceTransformer(seq_len=seq_len, embed_dim=embed_dim, dropout=dropout)

        self.fc1 = nn.Linear(embed_dim * 3, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, contact_matrix, adj, seq_embeddings, variant_pos):
        cnn_out = self.cnn_module(contact_matrix)  # [1, 128]
        # For GNN, assume 50 nodes with same embedding dim as CNN output
        node_features = torch.randn(50, cnn_out.shape[-1])
        gnn_out = self.gnn_module(node_features, adj)  # [128]
        seq_out = self.seq_module(seq_embeddings, variant_pos)  # [1, 128]

        combined = torch.cat([cnn_out.squeeze(0), gnn_out, seq_out.squeeze(0)], dim=-1)
        x = F.relu(self.fc1(combined))
        x = self.dropout(x)
        out = self.fc2(x)
        return out


# =========================
# 5. Example Run
# =========================
seq_len = 100
embed_dim = 128

contact_matrix = torch.rand(1, 1, 64, 64)
adj = torch.eye(50)
seq_embeddings = torch.rand(1, seq_len, embed_dim)
variant_pos = 10

model = HybridVariantPredictor(seq_len=seq_len, embed_dim=embed_dim, num_classes=2, dropout=0.3)
output = model(contact_matrix, adj, seq_embeddings, variant_pos)

print("✅ Model output shape:", output.shape)
print("✅ Output:", output)


✅ Model output shape: torch.Size([2])
✅ Output: tensor([-0.0141,  0.0672], grad_fn=<ViewBackward0>)


In [60]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np

# ------------------------------------------------------------
# Define Model (same as earlier)
# ------------------------------------------------------------
class StructuralCNN(nn.Module):
    def __init__(self, embedding_dim=128, dropout=0.3):
        super(StructuralCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(32 * 16 * 16, embedding_dim)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.dropout(self.fc(x))
        return x


class StructuralGNN(nn.Module):
    def __init__(self, node_in_dim=128, embedding_dim=128, dropout=0.3):
        super(StructuralGNN, self).__init__()
        self.fc1 = nn.Linear(node_in_dim, embedding_dim)
        self.fc2 = nn.Linear(embedding_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features, adj):
        h = torch.matmul(adj, features)
        h = F.relu(self.fc1(h))
        h = self.dropout(torch.matmul(adj, h))
        h = F.relu(self.fc2(h))
        return h.mean(dim=0)


class SequenceTransformer(nn.Module):
    def __init__(self, seq_len=100, embed_dim=128, num_heads=4, num_layers=2, dropout=0.3):
        super(SequenceTransformer, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, dropout=dropout, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.pos_embedding = nn.Parameter(torch.randn(1, seq_len, embed_dim))
        self.dropout = nn.Dropout(dropout)

    def forward(self, seq_embeddings, variant_pos=None):
        x = seq_embeddings + self.pos_embedding[:, : seq_embeddings.size(1), :]
        x = self.transformer(x)
        if variant_pos is not None:
            seq_len = x.size(1)
            variant_pos = torch.clamp(variant_pos, max=seq_len - 1)
            idx = variant_pos.long().unsqueeze(-1).unsqueeze(-1).expand(-1, 1, x.size(-1))
            variant_emb = torch.gather(x, 1, idx).squeeze(1)
        else:
            variant_emb = x.mean(dim=1)
        return self.dropout(variant_emb)


class HybridVariantPredictor(nn.Module):
    def __init__(self, seq_len=100, embed_dim=128, num_classes=2, dropout=0.3):
        super(HybridVariantPredictor, self).__init__()
        self.cnn_module = StructuralCNN(embedding_dim=embed_dim, dropout=dropout)
        self.gnn_module = StructuralGNN(node_in_dim=embed_dim, embedding_dim=embed_dim, dropout=dropout)
        self.seq_module = SequenceTransformer(seq_len=seq_len, embed_dim=embed_dim, dropout=dropout)
        self.fc_final = nn.Sequential(
            nn.Linear(embed_dim * 3, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )

    def forward(self, contact_matrix, adj, seq_embeddings, variant_pos):
        cnn_out = self.cnn_module(contact_matrix)                       # [1,128]
        gnn_out = self.gnn_module(cnn_out.repeat(adj.size(0), 1), adj)  # [128]
        seq_out = self.seq_module(seq_embeddings, variant_pos)           # [1,128]
        combined = torch.cat([cnn_out.squeeze(0), gnn_out, seq_out.squeeze(0)], dim=-1)
        output = self.fc_final(combined.unsqueeze(0))
        return output


# ------------------------------------------------------------
# Load Variant Dataset
# ------------------------------------------------------------
df = pd.read_csv("E:/vit/ai/data/BDNF_missense_expanded_mapped.csv")
variant_data = df[['Residue', 'Orig_AA', 'New_AA', 'Region', 'Protein change']]
print("Loaded variants:", len(variant_data))

# Encode residue positions (dummy for now)
residues = torch.tensor(variant_data['Residue'].fillna(0).values, dtype=torch.long)
num_variants = len(variant_data)

# Dummy embeddings
contact_matrices = torch.randn(num_variants, 1, 64, 64)
sequence_embeddings = torch.randn(num_variants, 100, 128)
variant_positions = residues % 100
adj = torch.eye(50)  # simple adjacency for all variants

# ------------------------------------------------------------
# Load or Initialize Model
# ------------------------------------------------------------
model = HybridVariantPredictor(seq_len=100, embed_dim=128, num_classes=2)

try:
    model.load_state_dict(torch.load("best_model.pth", map_location=torch.device('cpu')))
    print("✅ Loaded pretrained weights from 'best_model.pth'")
except FileNotFoundError:
    print("⚠️ No pretrained model found — using untrained model weights.")

model.eval()

# ------------------------------------------------------------
# Run Predictions
# ------------------------------------------------------------
pred_classes, pathogenic_probs = [], []

with torch.no_grad():
    for i in range(num_variants):
        cm = contact_matrices[i].unsqueeze(0)
        seq = sequence_embeddings[i].unsqueeze(0)
        vp = variant_positions[i].unsqueeze(0)
        output = model(cm, adj, seq, vp)
        probs = torch.softmax(output, dim=1)
        preds = torch.argmax(probs, dim=1)
        pred_classes.append(preds.item())
        pathogenic_probs.append(probs[:, 1].item())

# ------------------------------------------------------------
# Save annotated results
# ------------------------------------------------------------
variant_data['Predicted_Class'] = pred_classes
variant_data['Pathogenic_Prob'] = pathogenic_probs

variant_data.to_csv("E:/vit/ai/data/BDNF_processed_with_predictions.csv", index=False)
print("\n✅ Predictions saved to 'BDNF_processed_with_predictions.csv'")
print(variant_data.head())


Loaded variants: 239
⚠️ No pretrained model found — using untrained model weights.

✅ Predictions saved to 'BDNF_processed_with_predictions.csv'
   Residue Orig_AA New_AA        Region                     Protein change  \
0    239.0       C      G        mature  C239G, C247G, C254G, C268G, C321G   
1    247.0       C      G        mature  C239G, C247G, C254G, C268G, C321G   
2    254.0       C      G  out_of_range  C239G, C247G, C254G, C268G, C321G   
3    268.0       C      G  out_of_range  C239G, C247G, C254G, C268G, C321G   
4    321.0       C      G  out_of_range  C239G, C247G, C254G, C268G, C321G   

   Predicted_Class  Pathogenic_Prob  
0                1         0.508206  
1                0         0.489482  
2                0         0.474495  
3                0         0.468924  
4                0         0.479513  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variant_data['Predicted_Class'] = pred_classes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variant_data['Pathogenic_Prob'] = pathogenic_probs


In [62]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load pretrained protein language model
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D")

# Example: BDNF protein sequence (UniProt ID: P23560)
bdnf_seq = (
    "MTSRTPAAPAAGPVLPAVPLPLLRLPLLPPLHPAAAEPLHPADWDAAPAAPASPLEPAPAPAARPR"
    "RSHPHFLAENTRVL..."
)  # use full sequence for actual embedding

# Tokenize
inputs = tokenizer(bdnf_seq, return_tensors="pt")

# Generate embeddings
with torch.no_grad():
    outputs = model(**inputs)
    sequence_embeddings = outputs.last_hidden_state.squeeze(0)
    print("Sequence embedding shape:", sequence_embeddings.shape)


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sequence embedding shape: torch.Size([85, 320])


In [71]:
import numpy as np
from Bio.PDB import PDBParser

def get_contact_map(pdb_path, threshold=8.0):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("BDNF", pdb_path)
    residues = [res for res in structure.get_residues() if 'CA' in res]
    n = len(residues)
    dist_matrix = np.zeros((n, n))
    for i, res1 in enumerate(residues):
        for j, res2 in enumerate(residues):
            dist = res1['CA'] - res2['CA']
            dist_matrix[i, j] = dist
    contact_map = (dist_matrix < threshold).astype(float)
    return torch.tensor(contact_map, dtype=torch.float).unsqueeze(0)

contact_matrix = get_contact_map("E:/vit/ai/data/AF-P23560-F1-model_v6.pdb")
print("Contact matrix shape:", contact_matrix.shape)



Contact matrix shape: torch.Size([1, 247, 247])
