<a href="https://colab.research.google.com/github/ninikvn/hackathon-project/blob/main/pytorch_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split
from sklearn.model_selection import train_test_split
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import numpy as np
from tqdm import tqdm
import pandas as pd
import ast
import json


In [3]:
# === Load the patient table and parse the gene lists ===
# The 'mutated_hugoGeneSymbols' column is parsed with ast.literal_eval, so each
# cell becomes a Python container of gene symbols instead of its text form.
df = pd.read_csv("simulated_cancer_patients.csv")
df['mutated_hugoGeneSymbols'] = df['mutated_hugoGeneSymbols'].apply(ast.literal_eval)

# === Build the gene vocabulary (sorted so column order is deterministic) ===
all_genes = sorted({g for genes in df['mutated_hugoGeneSymbols'] for g in genes})
gene_to_index = dict(zip(all_genes, range(len(all_genes))))
n_patients = len(df)
n_genes = len(all_genes)

# === Multi-hot mutation matrix: one row per patient, one column per gene ===
mutation_matrix = torch.zeros((n_patients, n_genes), dtype=torch.uint8)
for i, gene_list in enumerate(df['mutated_hugoGeneSymbols']):
    gene_columns = [gene_to_index[gene] for gene in gene_list]
    if gene_columns:  # a patient with no listed genes keeps an all-zero row
        mutation_matrix[i, gene_columns] = 1

# === Encode cancer types as labels ===
# The pandas category codes become the integer labels; label_mapping records
# which code stands for which cancer type.
df['cancerType'] = df['cancerType'].astype('category')
cancer_type_column = df['cancerType']
cancer_labels = torch.tensor(cancer_type_column.cat.codes.values)
label_mapping = {code: name for code, name in enumerate(cancer_type_column.cat.categories)}

# === Train/test split ===
# 80/20 split, stratified on the label so both parts keep the class balance.
split_options = dict(test_size=0.2, stratify=cancer_labels, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    mutation_matrix, cancer_labels, **split_options
)

# === Save everything ===

# Inputs and labels
for file_name, tensor in (
    ("X_train.pt", X_train),
    ("y_train.pt", y_train),
    ("X_test.pt", X_test),
    ("y_test.pt", y_test),
):
    torch.save(tensor, file_name)

# Optional: save as datasets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
for file_name, dataset in (
    ("train_dataset.pt", train_dataset),
    ("test_dataset.pt", test_dataset),
):
    torch.save(dataset, file_name)

# Save label mapping and gene list
with open("cancer_label_mapping.json", "w") as f:
    f.write(json.dumps(label_mapping))

with open("gene_index.json", "w") as f:
    f.write(json.dumps(gene_to_index))

print("\n".join([
    "✅ Data saved:",
    "- X_train.pt / y_train.pt",
    "- X_test.pt / y_test.pt",
    "- train_dataset.pt / test_dataset.pt",
    "- cancer_label_mapping.json / gene_index.json",
]))


FileNotFoundError: [Errno 2] No such file or directory: 'simulated_cancer_patients.csv'

In [None]:

# Placeholders for the train/test inputs and labels; all four start as None.
# TODO: assign real data here before building the datasets below.
train_inputs = train_labels = test_inputs = test_labels = None

class cBioPortalDataset(Dataset):
  """
    Pairs an input tensor with a label tensor, indexed by sample.

    Every Pytorch Dataset needs an __init__, __len__, and __getitem__
    These methods are used to get and batch the data using a DataLoader later
  """
  def __init__(self, images, labels, label_dtype=torch.float32):
    """
    :param images: per-sample inputs (list, numpy array or tensor); stored as float32
    :param labels: per-sample labels (list, numpy array or tensor)
    :param label_dtype: dtype the labels are stored in; float32 by default,
      pass torch.long for class-index targets
    :raises TypeError: if images or labels is None
    :raises ValueError: if images and labels hold a different number of samples
    """
    if images is None or labels is None:
      raise TypeError("cBioPortalDataset needs both images and labels, got None")

    # torch.as_tensor with an explicit dtype converts lists, arrays AND existing
    # tensors (e.g. a uint8 mutation matrix) to the requested dtype. The legacy
    # torch.Tensor(...) constructor only did that for non-tensor input.
    self.images = torch.as_tensor(images, dtype=torch.float32)
    self.labels = torch.as_tensor(labels, dtype=label_dtype)

    if len(self.images) != len(self.labels):
      raise ValueError(
        f"images and labels must have the same length, "
        f"got {len(self.images)} and {len(self.labels)}"
      )

  def __len__(self):
    """Number of samples in the dataset."""
    return len(self.images)

  def __getitem__(self, idx):
    """Return the (input, label) pair stored at position idx."""
    return self.images[idx], self.labels[idx]


# Batch size shared by the training and testing loaders.
BATCH_SIZE = 1024

# Wrap the raw inputs/labels in Dataset objects.
train_dataset = cBioPortalDataset(images=train_inputs, labels=train_labels)
test_dataset = cBioPortalDataset(images=test_inputs, labels=test_labels)

# Only the training loader shuffles; the test loader keeps the stored order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class Model(torch.nn.Module):
  """
  Fully connected classifier: Linear -> ReLU -> Linear -> ReLU -> Linear -> Softmax.
  """

  def __init__(self, input_dim=784, hidden1=256, hidden2=128, num_classes=10, **kwargs):
    """
    The model class inherits from torch.nn.Module.
    It stores the trainable layers as attributes.
    :param input_dim: number of features in each flattened input
    :param hidden1: width of the first hidden layer
    :param hidden2: width of the second hidden layer
    :param num_classes: number of output classes
    :param kwargs: forwarded unchanged to torch.nn.Module.__init__
    """
    super(Model, self).__init__(**kwargs)

    self.layer1 = torch.nn.Linear(input_dim, hidden1)
    self.layer2 = torch.nn.Linear(hidden1, hidden2)
    self.layer3 = torch.nn.Linear(hidden2, num_classes)

    self.relu = torch.nn.ReLU()
    self.softmax = torch.nn.Softmax(dim=1)

  def forward(self, inputs):
    """
    Forward pass, predicts labels given a batch of flattened inputs using
    fully connected layers
    :param inputs: float tensor of shape (batch, input_dim)
    :return: the probabilities of each label, shape (batch, num_classes)
    """
    hidden = self.relu(self.layer1(inputs))
    hidden = self.relu(self.layer2(hidden))
    logits = self.layer3(hidden)
    return self.softmax(logits)

  def loss(self, predictions, labels):
    """
    Calculates the model loss (mean cross-entropy over the batch)
    :param predictions: class probabilities as returned by forward
    :param labels: one-hot labels with the same shape as predictions, or
      class indices with one dimension fewer
    :return: the loss of the model as a tensor
    """
    if labels.dim() == predictions.dim() - 1:
      # Class indices: expand to one-hot so the formula below applies as-is.
      labels = F.one_hot(labels.to(torch.long), num_classes=predictions.shape[1])
      labels = labels.to(predictions.dtype)
    # Clip before the log so a probability of exactly 0 cannot produce -inf.
    nll_comps = -labels * torch.log(torch.clip(predictions, 1e-10, 1.0))
    return torch.mean(torch.sum(nll_comps, dim=1))

  def accuracy(self, predictions, labels):
    """
    Calculates the model accuracy
    :param predictions: class probabilities as returned by forward
    :param labels: one-hot labels with the same shape as predictions, or
      class indices with one dimension fewer
    :return: the accuracy of the model as a tensor
    """
    pred_classes = torch.argmax(predictions, 1)
    if labels.dim() == predictions.dim() - 1:
      true_classes = labels.to(torch.long)
    else:
      true_classes = torch.argmax(labels, 1)
    correct_prediction = torch.eq(pred_classes, true_classes)
    # Cast the boolean mask directly; no legacy torch.Tensor(...) wrapper needed.
    return torch.mean(correct_prediction.to(torch.float32))

################################################################################

model = Model()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10

for j in range(epochs):
  # --- Training pass ---
  model.train()
  # Iterating the loader directly lets tqdm read its length and show a total.
  for inputs, labels in tqdm(train_loader):
    # Flatten every sample to a 1-D feature vector for the linear layers.
    inputs = torch.reshape(inputs, (len(inputs), -1))
    optimizer.zero_grad()
    y_pred = model(inputs)
    loss = model.loss(y_pred, labels)
    loss.backward()
    optimizer.step()

  # --- Evaluation pass ---
  model.eval()
  correct = 0.0
  seen = 0
  # No gradients are needed for evaluation, so skip building the autograd graph.
  with torch.no_grad():
    for inputs, labels in test_loader:
      inputs = torch.reshape(inputs, (len(inputs), -1))
      batch_size = len(inputs)
      # Weight each batch by its size so a smaller final batch does not skew
      # the overall accuracy.
      correct += model.accuracy(model(inputs), labels).item() * batch_size
      seen += batch_size
  test_acc = correct / seen if seen else float("nan")
  print(f"Accuracy on testing set after epoch {j}: {test_acc}")
print()
print(model)

59it [00:01, 40.03it/s]


Accuracy on testing set after epoch 0: 0.9364855885505676


59it [00:01, 53.38it/s]


Accuracy on testing set after epoch 1: 0.9549247026443481


59it [00:01, 50.60it/s]


Accuracy on testing set after epoch 2: 0.9593949317932129


59it [00:01, 52.18it/s]


Accuracy on testing set after epoch 3: 0.9654256701469421


59it [00:01, 43.21it/s]


Accuracy on testing set after epoch 4: 0.9686263799667358


59it [00:01, 52.84it/s]


Accuracy on testing set after epoch 5: 0.9700155258178711


59it [00:01, 51.78it/s]


Accuracy on testing set after epoch 6: 0.9710897207260132


59it [00:01, 49.31it/s]


Accuracy on testing set after epoch 7: 0.9715701341629028


59it [00:02, 27.35it/s]


Accuracy on testing set after epoch 8: 0.9713448286056519


59it [00:01, 39.87it/s]


Accuracy on testing set after epoch 9: 0.9723971486091614

Model(
  (layer1): Linear(in_features=784, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=128, bias=True)
  (layer3): Linear(in_features=128, out_features=10, bias=True)
  (relu): ReLU()
  (softmax): Softmax(dim=1)
)
