In [1]:
# Copyright 2023 Regeneron Pharmaceuticals Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Notebook to compute reconstruction loss for PCA model

In [16]:
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from tcrvalid.load_models import *
from tcrvalid.plot_utils import set_simple_rc_params
from tcrvalid.physio_embedding import SeqArrayDictConverter
from tcrvalid.data_subsetting import *
from tcrvalid.defaults import *


In [4]:
# Converter used by later cells to turn sequence strings into arrays.
mapping = SeqArrayDictConverter()

# Load the models registered under the name '1_2' for the TRB chain
# (encoders=True is forwarded to the loader as-is).
loaded_models_TRB = load_named_models(
    '1_2',
    chain='TRB',
    encoders=True,
)

# Parquet path stored under the 'te' key -- presumably the test split;
# verify against the tcrvalid defaults.
trb_test_pq = data_path_small_trb['te']

# Read only the listed columns and keep the first 100000 rows.
te_seq_trb_df = pd.read_parquet(
    trb_test_pq,
    columns=['cdr2_cdr3', 'new_meta_vcall', 'j_call', 'insert_codons_aa'],
).head(100000)

2023-05-12 14:39:50.337434: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-05-12 14:39:50.337482: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (gen-queue-dy-m52xlarge-2): /proc/driver/nvidia/version does not exist
2023-05-12 14:39:50.338031: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




#### Create train and test splits for the PCA reconstruction-loss computation

In [8]:
# Hold out 20% of the rows. A fixed random_state makes the partition (and any
# metric computed from it) reproducible across kernel restarts.
train, test = train_test_split(te_seq_trb_df, test_size=0.2, random_state=42)


In [9]:
# Encode the cdr2_cdr3 strings of each split with the converter, passing
# maxlen=28 for both so the resulting arrays share the same layout.
x_tr = mapping.seqs_to_array(list(train["cdr2_cdr3"].values), maxlen=28)
x_val = mapping.seqs_to_array(list(test["cdr2_cdr3"].values), maxlen=28)


In [13]:
def get_xflat(x):
  """Merge axes 1 and 2 of an array into a single axis.

  Parameters
  ----------
  x : array exposing ``shape`` with at least three entries; its total size
      must equal ``shape[0] * shape[1] * shape[2]``.

  Returns
  -------
  Array of shape ``(x.shape[0], x.shape[1] * x.shape[2])`` holding the same
  values, with axis 0 left untouched.
  """
  n_rows, dim_a, dim_b = x.shape[0], x.shape[1], x.shape[2]
  return np.reshape(x, (n_rows, dim_a * dim_b))

# Flatten both splits to 2-D (one row per sample) so they can be fed to PCA.
x_flat_tr, x_flat_val = get_xflat(x_tr), get_xflat(x_val)

In [14]:
def mse(x,y):
  """Reconstruction loss between two 3-D arrays of matching shape.

  Squared differences are summed over axes 1 and 2 (giving one value per
  entry along axis 0) and those sums are then averaged. Note that, despite
  the name, the inner reduction is a sum, not a mean.

  Parameters
  ----------
  x, y : arrays that can be subtracted elementwise and have axes 1 and 2.

  Returns
  -------
  Scalar: mean over axis 0 of the per-entry summed squared error.
  """
  diff = x - y
  per_sample = np.square(diff).sum(axis=(1, 2))
  return per_sample.mean()

#### Fit a 16-component PCA on the training split, then project and reconstruct the test split and compute the reconstruction loss (squared error summed per sample, averaged over samples)

In [18]:
# Number of principal components to keep.
n=16
# random_state pins the decomposition when PCA's automatic solver selection
# falls back to a randomized SVD (it is ignored by the deterministic solvers),
# so the reported loss does not drift between runs.
tmp_pca = PCA(n_components=n, random_state=0)
tmp_pca.fit(x_flat_tr)
# Project the held-out rows onto the n components, then map them back to the
# flattened input space.
x_test_pca = tmp_pca.transform(x_flat_val)
x_test_pca_flat = tmp_pca.inverse_transform(x_test_pca)
# Restore the 3-D layout of x_val so mse() can sum over axes 1 and 2.
mse_val = mse(x_val,x_test_pca_flat.reshape(x_val.shape))
mse_val

34.86522183719725