<a href="https://colab.research.google.com/github/moritzpail/cs288-final-project/blob/main/notebooks/288_Final_Project_1_Create_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel (a `!pip` subshell may resolve to a different interpreter).
%pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
import pandas as pd
import seaborn as sns
from transformers import AutoModel, AutoTokenizer
import torch

# Download Data

In [3]:
# Fetch the three input CSV files from Google Drive.
# TLS certificate verification is left enabled (no --no-check-certificate) so the
# downloaded data cannot be silently replaced by a man-in-the-middle.
!wget 'https://drive.google.com/uc?export=download&id=1tDw00yLk42PhSndvrpzJrLnu6ydQIY2b' -O raw_single_cell_tcr_data_acs_controllers_and_progressors.csv
!wget 'https://drive.google.com/uc?export=download&id=1uZuac6-X6_6rTZRkmNkASJFtOvt3-Mis' -O immunoSeq_samples_key.csv
!wget 'https://drive.google.com/uc?export=download&id=1S7wYkHOOLJtb7O6GjhtmIk8aQmDgwi3D' -O study_sample_key.csv

--2024-04-21 20:42:33--  https://drive.google.com/uc?export=download&id=1tDw00yLk42PhSndvrpzJrLnu6ydQIY2b
Resolving drive.google.com (drive.google.com)... 142.250.141.102, 142.250.141.101, 142.250.141.100, ...
Connecting to drive.google.com (drive.google.com)|142.250.141.102|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1tDw00yLk42PhSndvrpzJrLnu6ydQIY2b&export=download [following]
--2024-04-21 20:42:33--  https://drive.usercontent.google.com/download?id=1tDw00yLk42PhSndvrpzJrLnu6ydQIY2b&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.251.2.132, 2607:f8b0:4023:c0d::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.251.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41943373 (40M) [application/octet-stream]
Saving to: ‘raw_single_cell_tcr_data_acs_controllers_and_progressors.csv’


2024-04-21 20:4

# Load Data

In [4]:
# Read the three CSV files written by the download cell into DataFrames.
df_raw_controllers_and_progressors = pd.read_csv(
    "raw_single_cell_tcr_data_acs_controllers_and_progressors.csv"
)
df_immunoSeq_samples_key = pd.read_csv("immunoSeq_samples_key.csv")
df_study_sample_key = pd.read_csv("study_sample_key.csv")

# Pre-Process Data

## Merge dataframes

In [5]:
# Keep a single row per donor in each key table (the first one encountered),
# so that the left joins below cannot multiply rows of the raw data.
df_immunoSeq_samples_key_unique = df_immunoSeq_samples_key.drop_duplicates(
    subset=['Donor.ID'], keep='first'
)
df_study_sample_key_unique = df_study_sample_key.drop_duplicates(
    subset=['Donor ID'], keep='first'
)

# Attach the group label from the immunoSeq key; the duplicated join key
# column is removed again right away.
df_merged = pd.merge(
    df_raw_controllers_and_progressors,
    df_immunoSeq_samples_key_unique[['Donor.ID', 'Group']],
    how='left',
    left_on='donorId',
    right_on='Donor.ID',
).drop(columns='Donor.ID')

# Attach the group label from the study key. Both tables contribute a
# 'Group' column, so pandas suffixes them as Group_x (immunoSeq key) and
# Group_y (study key).
df_merged = pd.merge(
    df_merged,
    df_study_sample_key_unique[['Donor ID', 'Group']],
    how='left',
    left_on='donorId',
    right_on='Donor ID',
).drop(columns='Donor ID')

# Rows for which both key tables supplied a label
condition = df_merged['Group_x'].notna() & df_merged['Group_y'].notna()

# The two sources must never contradict each other; abort if they do
disagreements = df_merged.loc[
    condition & df_merged['Group_x'].ne(df_merged['Group_y'])
]
if not disagreements.empty:
    raise ValueError(f"Disagreement found in rows: {disagreements.index.tolist()}")

# Single label column: take Group_x and fall back to Group_y where it is missing
df_merged['Group'] = df_merged['Group_x'].combine_first(df_merged['Group_y'])

## Filter relevant columns & rows

In [6]:
# Keep only the donor ID, the CDR3a sequence and the group label.
#
# Rows without a CDR3a value are dropped *before* the string conversion, so the
# filter works on real missing values instead of depending on NaN being
# rendered as the literal text "nan". Every column is then converted to str
# (a missing Group therefore still becomes the string "nan", as before).
#
# The trailing .copy() makes dff_merged an independent frame rather than a
# filtered slice of df_merged, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
dff_merged = (
    df_merged[["donorId", "CDR3a", "Group"]]
    .dropna(subset=["CDR3a"])
    .astype(str)
    .copy()
)

In [7]:
# Inspect the column dtypes of the filtered frame.
dff_merged.dtypes

donorId    object
CDR3a      object
Group      object
dtype: object

In [8]:
# Number of rows left after removing entries without a CDR3a sequence.
len(dff_merged)

22276

# Embed Sequences

In [9]:
from transformers import BertModel
from transformers import BertTokenizer

## Tokenize

In [10]:
# Single-character special tokens handed to the tokenizer below.
PAD = "$"   # padding
MASK = "."  # masked position
UNK = "?"   # unknown token
SEP = "|"   # separator
CLS = "*"   # classification / start token


def get_pretrained_bert_tokenizer(path: str) -> BertTokenizer:
    """Load a pretrained BERT tokenizer configured with this notebook's special tokens.

    Args:
        path: Model identifier or local directory passed to
            ``BertTokenizer.from_pretrained``.

    Returns:
        A ``BertTokenizer`` that keeps the input case, pads on the right and
        uses the PAD / MASK / UNK / SEP / CLS symbols defined above.

    Note:
        ``truncation``, ``padding`` and ``return_tensors`` are options of the
        tokenizer *call*, not of its constructor. Passing them to
        ``from_pretrained`` does not change how text is tokenized, so they are
        not set here; supply them when calling the returned tokenizer.
    """
    return BertTokenizer.from_pretrained(
        path,
        do_lower_case=False,
        tokenize_chinese_chars=False,
        unk_token=UNK,
        sep_token=SEP,
        pad_token=PAD,
        cls_token=CLS,
        mask_token=MASK,
        padding_side="right",
    )

In [11]:
# Collect the CDR3a strings into a plain Python list for tokenization.
sequences = dff_merged["CDR3a"].tolist()

In [12]:
# Space-separate the characters of every sequence (e.g. "CAVF" -> "C A V F")
# before handing them to the tokenizer.
seq_w_whitespace = [" ".join(seq) for seq in sequences]

In [13]:
# Load the tokenizer that belongs to the "wukevin/tcr-bert" checkpoint.
tcrbert_tokenizer = get_pretrained_bert_tokenizer("wukevin/tcr-bert")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/91.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.64k [00:00<?, ?B/s]

In [14]:
# Tokenize all sequences into one batch of PyTorch tensors.
# - padding=True pads every sequence to the longest one in the batch.
# - truncation=True is needed for max_length to have any effect: without it
#   the tokenizer never shortens a sequence and max_length is ignored.
# - add_special_tokens=True adds the CLS/SEP tokens around each sequence.
inputs = tcrbert_tokenizer(
    seq_w_whitespace,
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding=True,
    add_special_tokens=True
)



## Set-up Model

In [15]:
# This model was pretrained on MAA and TRB classification
# It is loaded as a plain BertModel; its hidden states are what the notebook
# uses as sequence embeddings further down.
tcrbert_model = BertModel.from_pretrained("wukevin/tcr-bert")

pytorch_model.bin:   0%|          | 0.00/230M [00:00<?, ?B/s]

In [16]:
# Prefer CUDA when PyTorch can see a GPU; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model weights and every tokenizer output tensor onto that device.
tcrbert_model = tcrbert_model.to(device)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

### Embed Sequence

In [17]:
# Inference only: run the whole batch through the model without tracking gradients.
with torch.no_grad():
    outputs = tcrbert_model(**inputs)

In [18]:
# Take the final-layer hidden state at position 0 of every sequence (where the
# CLS token is expected to sit) and convert it to a NumPy array on the CPU.
embeddings = (
    outputs.last_hidden_state[:, 0, :]
    .detach()
    .cpu()
    .numpy()
)


In [19]:
# list() splits the 2-D array into one 1-D vector per row, so each cell of the
# new column holds the embedding of that row's sequence.
dff_merged['embeddings'] = list(embeddings)

In [20]:
# Display the frame with the new `embeddings` column.
dff_merged

Unnamed: 0,donorId,CDR3a,Group,embeddings
0,04-0333,CILRDVWGNNARLMF,Controller,"[-0.32483366, -0.28578907, 1.2048455, -1.23306..."
1,04-0333,CAVYNYGQNFVF,Controller,"[-0.21782973, -1.0668253, 0.08671243, 0.650835..."
2,04-0333,CIVSPINNAGNMLTF,Controller,"[0.30981496, -0.1701109, 0.019591331, -1.85406..."
4,04-0333,CAVTRGTGGFKTIF,Controller,"[0.114908576, 0.102618076, -0.50007135, -0.680..."
5,04-0333,CAVTPNTGFQKLVF,Controller,"[0.37916976, 0.0009963448, -0.050300166, -0.73..."
...,...,...,...,...
37668,07-0386,CIVRNSGGYQKVTF,Progressor,"[-1.3121986, -0.5018212, 0.4406116, 1.8342069,..."
37669,07-0386,CVVRPGANNLFF,Progressor,"[-0.17877944, 0.16738518, -0.26593262, -1.4817..."
37670,07-0386,CLVPPTGANNLFF,Progressor,"[-0.6999689, -1.242247, 0.3307893, -0.96245444..."
37671,07-0386,CAVRLKGGATNKLIF,Progressor,"[-0.5948395, 0.37960136, -0.59932, 0.3748059, ..."


# PCA embeddings

In [21]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [22]:
# Stack the per-row embedding vectors into a single 2-D matrix (one row per sequence).
vectors = np.stack(dff_merged['embeddings'].values)

# Reduce the embeddings to 256 principal components. A fixed random_state keeps
# the projection reproducible across runs when scikit-learn picks its
# randomized SVD solver.
pca = PCA(n_components=256, random_state=0)
pca_embeddings = pca.fit_transform(vectors)

# explained_variance_ratio_ only exists after fitting, so report it here.
print("Explained variance ratio:", pca.explained_variance_ratio_.sum())

# Store one reduced vector per row next to the original embeddings.
dff_merged['pca_embeddings'] = list(pca_embeddings)


In [23]:
# Shape of the first row's reduced vector. .iloc[0] selects by position; a plain
# [0] would look up the index *label* 0, which raises KeyError if that row was
# removed by the earlier filtering.
dff_merged['pca_embeddings'].iloc[0].shape

(256,)

In [24]:
# Display the frame with both the raw and the PCA-reduced embedding columns.
dff_merged

Unnamed: 0,donorId,CDR3a,Group,embeddings,pca_embeddings
0,04-0333,CILRDVWGNNARLMF,Controller,"[-0.32483366, -0.28578907, 1.2048455, -1.23306...","[10.513786, -0.25441, -0.50700855, 14.275534, ..."
1,04-0333,CAVYNYGQNFVF,Controller,"[-0.21782973, -1.0668253, 0.08671243, 0.650835...","[-3.4496663, -2.8899174, -6.148253, -4.183212,..."
2,04-0333,CIVSPINNAGNMLTF,Controller,"[0.30981496, -0.1701109, 0.019591331, -1.85406...","[10.723131, 0.3683252, 2.748509, 6.421699, 3.0..."
4,04-0333,CAVTRGTGGFKTIF,Controller,"[0.114908576, 0.102618076, -0.50007135, -0.680...","[5.7128286, 2.0361972, 0.7289345, -12.119802, ..."
5,04-0333,CAVTPNTGFQKLVF,Controller,"[0.37916976, 0.0009963448, -0.050300166, -0.73...","[2.4186826, 2.9996421, 3.8951156, -7.79192, -1..."
...,...,...,...,...,...
37668,07-0386,CIVRNSGGYQKVTF,Progressor,"[-1.3121986, -0.5018212, 0.4406116, 1.8342069,...","[-4.7759476, -5.2629914, -9.830283, -7.3989005..."
37669,07-0386,CVVRPGANNLFF,Progressor,"[-0.17877944, 0.16738518, -0.26593262, -1.4817...","[4.31852, -4.986516, -6.682924, 9.221398, 4.56..."
37670,07-0386,CLVPPTGANNLFF,Progressor,"[-0.6999689, -1.242247, 0.3307893, -0.96245444...","[-0.20704862, -5.5294595, -7.232851, 12.436989..."
37671,07-0386,CAVRLKGGATNKLIF,Progressor,"[-0.5948395, 0.37960136, -0.59932, 0.3748059, ...","[-3.8761845, 2.1499982, -12.556882, 0.7696441,..."


# Save Dataframe

In [35]:
# Persist the frame (including the array-valued embedding columns) as a pickle file.
dff_merged.to_pickle("sequences_w_embeddings.pkl")

In [36]:
# Round-trip check: reload the file that was just written and compare it with
# the in-memory frame. NOTE: only unpickle files you created or trust, since
# loading a pickle can execute arbitrary code.
df_loaded = pd.read_pickle("sequences_w_embeddings.pkl")
df_loaded.equals(dff_merged)

True