# Tensor construction

In [13]:
import pandas as pd
import numpy as np
import math
from scipy.interpolate import interp1d
import os

## Import data

In [2]:
# Build one clonotype-by-day count matrix per workbook sheet.
patients = []
file_path = 'TCR_seq_new.xlsx'
# Open the workbook a single time and parse every sheet from the same handle;
# calling pd.read_excel inside the loop would reopen and re-parse the file on
# each of the 16 iterations.
with pd.ExcelFile(file_path, engine='openpyxl') as workbook:
    for i in range(16):
        # Only the odd-numbered sheet positions (1, 3, ..., 31) are read.
        sheet_index = 2*i + 1
        tcr_df = workbook.parse(sheet_name=sheet_index)

        # Rows without a CDR3 amino-acid sequence are discarded.
        tcr_df = tcr_df.dropna(subset=['CDR3.amino.acid.sequence'])

        # Keep the columns whose header contains 'Day'.
        tcr_counts = tcr_df.filter(regex='Day').copy()

        # Reduce each header to the text following 'Day ' and order the columns
        # by that value read as an integer.
        # NOTE(review): a header containing 'Day' without the trailing space
        # would raise IndexError here — confirm the sheet headers.
        tcr_counts.columns = [col.split('Day ')[1].strip() for col in tcr_counts.columns]
        tcr_counts = tcr_counts[sorted(tcr_counts.columns, key=int)]

        # Missing measurements count as zero.
        tcr_counts = tcr_counts.fillna(0)

        # Keep the 100 rows with the largest value in the earliest day column;
        # the stable mergesort keeps tied rows in their original order.
        matrix = tcr_counts.sort_values(by=tcr_counts.columns[0], ascending=False, kind="mergesort").head(100)
        patients.append(matrix)

In [3]:
# We delete 3 patients: two for not having pre-treatment data and one for not having enough data.
# NOTE(review): the deletions run one after another, so every index after the first refers to
# the list as already shortened by the previous `del` (index 7 removes the 9th original entry,
# index 12 the 15th). Confirm these positions really correspond to the patients named below.
del patients[4] # SARK017. Not enough data.
del patients[7] # SARK011. No pre-treatment data.
del patients[12] # SARK052. No pre-treatment data.

## Interpolation

In [4]:
def interpolate_matrix(matrix, step=10):
    """Linearly resample every row of ``matrix`` onto an evenly spaced grid.

    The column labels of ``matrix`` are read as numeric sample positions. The
    grid starts at the first label and advances by ``step`` up to the last
    label truncated to a multiple of ``step``. Grid points that fall outside
    the sampled range are linearly extrapolated.

    Args:
        matrix: DataFrame whose column labels are convertible to float. The
            first and last labels are taken as the start and end of the
            sampled range.
        step: Spacing of the output grid; must be positive.

    Returns:
        A DataFrame with the same index as ``matrix`` and one float-labelled
        column per grid point.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step!r}")

    original_cols = np.array(matrix.columns, dtype=float)

    start = original_cols[0]
    end = original_cols[-1]
    # Truncate the end point to a multiple of `step` itself (not a fixed 10) so
    # the grid stays consistent with the requested spacing.
    target_end = math.trunc(end / step) * step

    # `+ step` makes the half-open arange include target_end.
    target_columns = np.arange(start, target_end + step, step)

    # interp1d interpolates along `axis`, so all rows are resampled in one call
    # instead of building one interpolator per row.
    resample = interp1d(
        original_cols,
        matrix.to_numpy(dtype=float),
        axis=1,
        kind='linear',
        fill_value='extrapolate',
    )
    interpolated_matrix = resample(target_columns)

    return pd.DataFrame(interpolated_matrix, index=matrix.index, columns=target_columns)

In [5]:
# Resample every remaining patient matrix onto the default 10-unit grid.
interp_patients = [interpolate_matrix(patient_matrix) for patient_matrix in patients]

## Joining matrices

In [6]:
# Group the patients by follow-up length: a patient joins every tensor
# (70, 100, 200, 300 days) whose horizon its last interpolated time point reaches.
patients_70 = [frame for frame in interp_patients if frame.columns[-1] >= 70]
patients_100 = [frame for frame in interp_patients if frame.columns[-1] >= 100]
patients_200 = [frame for frame in interp_patients if frame.columns[-1] >= 200]
patients_300 = [frame for frame in interp_patients if frame.columns[-1] >= 300]

In [7]:
# Trim every matrix to the time points up to its tensor's horizon
# (8, 11, 21 and 31 columns respectively). Slice assignment keeps the
# replacement in place, element by element, inside the same list objects.
patients_70[:] = [frame.loc[:, frame.columns <= 70] for frame in patients_70]
patients_100[:] = [frame.loc[:, frame.columns <= 100] for frame in patients_100]
patients_200[:] = [frame.loc[:, frame.columns <= 200] for frame in patients_200]
patients_300[:] = [frame.loc[:, frame.columns <= 300] for frame in patients_300]

In [8]:
# Stack each group of patient matrices along a new last axis to form the tensors.
tensor_70, tensor_100, tensor_200, tensor_300 = (
    np.stack(group, axis=-1)
    for group in (patients_70, patients_100, patients_200, patients_300)
)

In [9]:
# Report the shape of every tensor, one per line.
for stacked in (
    tensor_70,   # 8 time steps, 13 patients
    tensor_100,  # 11 time steps, 11 patients
    tensor_200,  # 21 time steps, 9 patients
    tensor_300,  # 31 time steps, 6 patients
):
    print(stacked.shape)

(100, 8, 13)
(100, 11, 11)
(100, 21, 9)
(100, 31, 6)


In [14]:
def save_tensor(array, file_prefix, foldername=None):
    """Save ``array`` as ``<file_prefix>.npy`` under the current working directory.

    Args:
        array: Object handed to ``np.save``.
        file_prefix: File name without the ``.npy`` extension.
        foldername: Sub-folder of the current working directory to save into;
            it is created when missing. When ``None`` the file is written
            directly into the current working directory.
    """
    save_folder = os.getcwd()
    if foldername is not None:
        # Only join when a folder was requested: formatting None into the path
        # would create a folder literally named "None".
        save_folder = os.path.join(save_folder, f'{foldername}')

    os.makedirs(save_folder, exist_ok=True)
    save_path = os.path.join(save_folder, f"{file_prefix}.npy")
    np.save(save_path, array)

In [15]:
# Store the uncentered tensors in the uncent_data folder.
for prefix, raw_tensor in (
    ("tensor_70", tensor_70),
    ("tensor_100", tensor_100),
    ("tensor_200", tensor_200),
    ("tensor_300", tensor_300),
):
    save_tensor(raw_tensor, prefix, foldername="uncent_data")

## Centering

In [16]:
def centering(tensor, mode):
    """Subtract from ``tensor`` its mean taken along axis ``mode``.

    The mean keeps its reduced axis (``keepdims=True``) so it broadcasts
    against ``tensor``; the result therefore has zero mean along ``mode``.
    """
    offset = np.mean(tensor, axis=mode, keepdims=True)
    centered = tensor - offset
    return centered

In [17]:
# Center every tensor along axis 1.
tensor_70_cent, tensor_100_cent, tensor_200_cent, tensor_300_cent = (
    centering(stacked, 1)
    for stacked in (tensor_70, tensor_100, tensor_200, tensor_300)
)

In [18]:
# For PARAFAC2: each untruncated interpolated matrix is centered along axis 1,
# divided by its norm (np.linalg.norm with default arguments) and transposed.
centered_patients = [centering(frame.to_numpy(), 1) for frame in interp_patients]
scaled_patients = [
    centered / np.linalg.norm(centered)
    for centered in centered_patients
]
parafac2_data = [scaled.T for scaled in scaled_patients]

## Saving

In [20]:
# Store the centered tensors in the real_data folder.
for prefix, centered_tensor in (
    ("tensor_70", tensor_70_cent),
    ("tensor_100", tensor_100_cent),
    ("tensor_200", tensor_200_cent),
    ("tensor_300", tensor_300_cent),
):
    save_tensor(centered_tensor, prefix, foldername="real_data")

# The PARAFAC2 matrices are written together as a single object-dtype array
# (reading it back with np.load requires allow_pickle=True).
parafac2_array = np.array(parafac2_data, dtype=object)
np.save('real_data/parafac2_data.npy', parafac2_array)