# PCs of normalized & non-normalized pseudobulk TS Data

### Computing PCs

#### For the compressed (pseudobulk) data

In [63]:
import scanpy as sc
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [64]:
# Load the pseudobulk AnnData file and build one gene ID per entry of
# `var["ensemblid"]`, keeping only the text before the first "." in each ID.
data_path = "TabulaSapiens_pseudobulk.h5ad"
ts_adata = sc.read_h5ad(data_path)
gene_id = [
    raw_id.split(".")[0]
    for raw_id in ts_adata.var["ensemblid"].to_list()
]

In [65]:
# Table of genes to keep; its `gene_id` column drives the gene filter applied to `ts_adata` below.
gene_id_used = pd.read_csv(filepath_or_buffer="gene_index.csv")

In [66]:
# Store the trimmed IDs back on the AnnData var table. Bracket assignment is
# used instead of `var.ensemblid = ...`: attribute-style assignment only updates
# a column that already exists and otherwise silently sets a plain Python
# attribute on the DataFrame.
ts_adata.var["ensemblid"] = gene_id

In [67]:
# Keep only the genes whose `ensemblid` appears in the `gene_id` column of gene_index.csv.
keep_gene = ts_adata.var["ensemblid"].isin(gene_id_used["gene_id"].to_list())
ts_adata = ts_adata[:, keep_gene]

In [68]:
# Re-read the gene IDs so the list matches the filtered AnnData's var table.
gene_id = ts_adata.var["ensemblid"].tolist()

In [69]:
# PCA input: the transposed AnnData matrix, so each row of `ds` corresponds to
# one entry of `gene_id` (the PCs are later indexed by gene ID).
ds = ts_adata.X.T
print(ds.std())
# Divide by the single global standard deviation (no per-column scaling).
ds = ds / ds.std()

pca = PCA()
pca.fit(ds)

# Target fraction of variance to retain. NOTE: the `varaince_ratio` spelling is
# kept on purpose because a later cell uses this name to build the output filename.
varaince_ratio = 0.8
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_, dtype=np.float64)
if varaince_ratio > 0:
    # First index whose cumulative ratio is >= the target, converted to a count.
    # min() keeps n_dim valid when the target is never reached; the previous
    # manual while-loop indexed past the end of the array in that case.
    n_dim = min(
        int(np.searchsorted(cumulative_ratio, varaince_ratio, side="left")) + 1,
        cumulative_ratio.size,
    )
else:
    n_dim = 0
# Cumulative explained-variance ratio actually covered by the retained components.
ratio = float(cumulative_ratio[n_dim - 1]) if n_dim else 0

scale_pcs = True
pcs = pca.transform(ds)[:, 0:n_dim]
if scale_pcs:
    # Rescale every retained component to unit standard deviation.
    pcs = pcs / pcs.std(axis = 0)

239171.52


In [70]:
# Show the explained-variance ratio of every fitted component (not only the first n_dim).
pca.explained_variance_ratio_

array([3.48950654e-01, 1.70526609e-01, 1.49578601e-01, 1.15988411e-01,
       9.20247585e-02, 4.15456817e-02, 2.35386919e-02, 1.99283827e-02,
       1.00092776e-02, 7.01796357e-03, 3.50113912e-03, 2.50464794e-03,
       2.38366215e-03, 1.71907782e-03, 1.45992462e-03, 1.34967954e-03,
       8.96127371e-04, 7.89867830e-04, 6.92730944e-04, 5.95389633e-04,
       5.30114747e-04, 5.09267556e-04, 4.56899899e-04, 3.86001630e-04,
       3.32055177e-04, 2.69321667e-04, 2.45060713e-04, 1.99476533e-04,
       1.75702779e-04, 1.71062944e-04, 1.62647397e-04, 1.57392031e-04,
       1.43881218e-04, 1.21865356e-04, 1.13768241e-04, 9.07294816e-05,
       7.77965470e-05, 6.62549428e-05, 5.96780446e-05, 5.62782297e-05,
       5.18125926e-05, 4.99415328e-05, 4.54444998e-05, 4.42932578e-05,
       3.90769128e-05, 3.84694358e-05, 3.43592219e-05, 3.32315431e-05,
       3.06533620e-05, 2.51787242e-05, 2.38170069e-05, 1.87285550e-05,
       1.74229208e-05, 1.53102519e-05, 1.32171290e-05, 1.25912156e-05,
      

In [71]:
# Wrap the retained PCs in a table: rows labelled by gene ID (index named
# "gene_id"), columns named PC_0 ... PC_{n_dim-1}.
pc_columns = [f"PC_{k}" for k in range(n_dim)]
pc_df = pd.DataFrame(
    pcs,
    index=pd.Index(gene_id, name="gene_id"),
    columns=pc_columns,
)

In [72]:
# Preview the PC table.
pc_df

Unnamed: 0_level_0,PC_0,PC_1,PC_2,PC_3,PC_4
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000186092,-0.036084,-0.013993,-0.066648,-0.136982,0.049808
ENSG00000284733,-0.036084,-0.013993,-0.066649,-0.136987,0.049811
ENSG00000284662,-0.036084,-0.013993,-0.066649,-0.136987,0.049811
ENSG00000187634,-0.035707,-0.013943,-0.065661,-0.132524,0.046678
ENSG00000188976,-0.010399,-0.012931,-0.050670,-0.079080,0.024772
...,...,...,...,...,...
ENSG00000212907,0.115951,0.023983,0.904277,2.195232,-1.041602
ENSG00000198886,1.199168,0.236864,5.977500,14.034909,-6.839797
ENSG00000198786,0.581092,0.096015,1.659438,5.905251,-2.968685
ENSG00000198695,0.035644,0.003394,0.269675,0.924570,-0.390944


In [73]:
# Save the PC table as a tab-separated file; the filename records the number of
# retained components and the variance target.
pc_tsv_name = f"TS_pb_d{n_dim}_{varaince_ratio}Variance.tsv"
pc_df.to_csv(pc_tsv_name, sep="\t")

#### For the uncompressed data (code currently disabled)

In [74]:
# Disabled: the code below is wrapped in a string literal so it does not run.
# It repeats the PCA steps on "TabulaSapiens.h5ad" using the "decontXcounts"
# layer. NOTE(review): if re-enabled it reassigns ts_adata, gene_id, ds, pca,
# n_dim and pcs from the pseudobulk section above.
"""
data_path = "TabulaSapiens.h5ad"
ts_adata = sc.read_h5ad(data_path)
gene_id = ts_adata.var.ensemblid.to_list()
gene_id = [i.split(".")[0] for i in gene_id]
ds = ts_adata.layers["decontXcounts"].toarray().T

ds = ds / ds.std()

pca = PCA()
pca.fit(ds)

varaince_ratio = 0.8
n_dim = 0
ratio = 0
while ratio < varaince_ratio:
    ratio += pca.explained_variance_ratio_[n_dim]
    n_dim += 1

scale_pcs = True
pcs = pca.transform(ds)[:, 0:n_dim]
if scale_pcs:
    pcs = pcs / pcs.std(axis = 0)
"""

'\ndata_path = "TabulaSapiens.h5ad"\nts_adata = sc.read_h5ad(data_path)\ngene_id = ts_adata.var.ensemblid.to_list()\ngene_id = [i.split(".")[0] for i in gene_id]\nds = ts_adata.layers["decontXcounts"].toarray().T\n\nds = ds / ds.std()\n\npca = PCA()\npca.fit(ds)\n\nvaraince_ratio = 0.8\nn_dim = 0\nratio = 0\nwhile ratio < varaince_ratio:\n    ratio += pca.explained_variance_ratio_[n_dim]\n    n_dim += 1\n\nscale_pcs = True\npcs = pca.transform(ds)[:, 0:n_dim]\nif scale_pcs:\n    pcs = pcs / pcs.std(axis = 0)\n'

In [75]:
# Disabled: kept as a string literal so it does not run. It would build the
# same gene-indexed PC table from whatever `pcs`, `gene_id` and `n_dim` hold.
"""
pc_df = pd.DataFrame(
    data = pcs,
    index = gene_id,
    columns = [f"PC_{i}" for i in range(n_dim)]
)
pc_df.index.name = "gene_id"
"""

'\npc_df = pd.DataFrame(\n    data = pcs,\n    index = gene_id,\n    columns = [f"PC_{i}" for i in range(n_dim)]\n)\npc_df.index.name = "gene_id"\n'

In [76]:
# Disabled export of the PC table for the uncompressed data (tab-separated).
#pc_df.to_csv(f"TS_uncompressed_d{n_dim}_{varaince_ratio}Variance.tsv", sep="\t")
