# PCs of DepMap Data

### Computing PCs of the DepMap Dataset

#### for the processed data

In [1]:
import polars as pl
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [2]:
# --- Load the processed gene-effect table (tab-separated, keyed by `gene_id`) ---
data_path = "DepMap18Q3_gene_effect_mapped.tsv"
dep_map_data = pl.read_csv(data_path, separator="\t")

# Every column except the identifier forms the numeric matrix; it is divided by
# one global standard deviation (a single scalar, not a per-column scale).
raw_matrix = dep_map_data.drop(["gene_id"]).to_numpy()
ds = raw_matrix / raw_matrix.std()

# Fit a full PCA (no component limit) on the rescaled matrix.
pca = PCA().fit(ds)

# Keep the smallest number of leading components whose cumulative explained
# variance ratio reaches the threshold. The spelling `varaince_ratio` is kept
# on purpose: the export cell further down reads this exact name.
varaince_ratio = 0.92
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
# side="left" -> first position whose cumulative value is >= the threshold.
n_dim = int(np.searchsorted(cumulative_ratio, varaince_ratio, side="left")) + 1
ratio = cumulative_ratio[n_dim - 1]

# Project onto the retained components and, if requested, divide each
# component by its own standard deviation.
scale_pcs = True
projected = pca.transform(ds)[:, :n_dim]
pcs = projected / projected.std(axis=0) if scale_pcs else projected

In [3]:
# Wrap the component scores in a labelled frame: one row per `gene_id`,
# one column per retained component (PC_0 ... PC_{n_dim-1}).
gene_index = pd.Index(
    dep_map_data.get_column("gene_id").to_list(),
    name="gene_id",
)
component_names = [f"PC_{i}" for i in range(n_dim)]
pc_df = pd.DataFrame(pcs, index=gene_index, columns=component_names)

In [4]:
# Show the component table; the rendered output is pandas' truncated preview of the frame.
pc_df

Unnamed: 0_level_0,PC_0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,...,PC_331,PC_332,PC_333,PC_334,PC_335,PC_336,PC_337,PC_338,PC_339,PC_340
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000121410,-0.511851,0.330281,0.841235,0.584836,-2.165912,-1.262496,0.266053,0.144909,0.542952,-0.499562,...,-1.033529,-0.764455,0.143799,0.872111,-0.229301,0.564940,-0.529064,0.782358,-0.185730,-0.069411
ENSG00000148584,-0.545146,-0.241661,1.004511,-0.329272,-0.029425,0.307707,0.192157,1.498489,-1.065310,1.130178,...,-0.558873,-0.318276,-0.057919,0.572438,0.180021,1.045560,-0.322427,-0.678844,0.620332,0.516176
ENSG00000175899,-0.188595,-0.086158,-0.200751,0.414784,0.142606,0.075865,0.004143,0.391795,-0.650046,0.955931,...,0.056687,-0.386815,0.903996,0.751331,0.857267,0.248385,0.793339,0.473372,0.286409,-0.392290
ENSG00000166535,-0.519438,-0.023753,0.193021,0.463889,0.128204,-0.084947,0.471552,0.942754,-0.125510,1.191264,...,0.492238,-0.297486,0.140358,-0.659046,0.032582,-0.284261,1.370598,0.499610,-0.026999,0.290921
ENSG00000184389,-0.021746,-0.889530,0.756182,-0.138007,-0.033664,0.957035,0.237019,-0.129761,1.133918,-0.085786,...,0.349453,1.865425,-0.909941,-0.269987,0.368911,1.059730,1.964862,-1.051030,-0.597303,1.633677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000203995,-0.099277,-0.416120,-0.045875,0.389951,0.132633,-1.425284,0.535588,-0.941342,0.016018,0.620145,...,-0.112575,0.725703,0.268494,0.747039,-1.120022,0.027444,-0.857444,-0.866028,0.063526,0.842058
ENSG00000162378,-0.441978,-1.181456,0.105140,-0.303579,-0.881951,0.904708,0.718306,0.246417,0.681071,0.619010,...,1.117383,1.850211,0.410161,0.349167,1.941876,-2.693762,-0.701905,1.107409,2.282077,-0.346288
ENSG00000159840,-0.077250,-0.547959,-0.659254,-1.075570,-0.854228,1.080064,0.583091,-1.152159,0.575191,0.462129,...,0.326682,0.980060,0.362486,1.056315,0.312541,-0.023276,0.505951,1.073125,-0.846980,-1.992823
ENSG00000074755,-0.176850,0.145956,1.013611,-1.379388,-0.289392,0.472549,0.586611,-1.019789,-0.374680,-1.029663,...,-0.466692,-2.495201,-0.701326,-0.662559,0.572125,0.229004,-0.380549,1.372675,2.329198,0.648354


In [6]:
# Write the component table as TSV; the file name records the number of
# retained components and the variance threshold used to choose them.
processed_out_path = f"DepMap_processed_d{n_dim}_{varaince_ratio}Variance.tsv"
pc_df.to_csv(processed_out_path, sep="\t")

#### for the unprocessed data

In [7]:
# --- Load the unprocessed gene-effect table (tab-separated, keyed by `gene_id`) ---
data_path = "DepMap18Q3_gene_effect_raw.tsv"
dep_map_data = pl.read_csv(data_path, separator="\t")

# Numeric matrix = all columns but the identifier, divided by one global
# standard deviation (a single scalar, not a per-column scale).
raw_matrix = dep_map_data.drop(["gene_id"]).to_numpy()
ds = raw_matrix / raw_matrix.std()

# Fit a full PCA (no component limit) on the rescaled matrix.
pca = PCA().fit(ds)

# Smallest number of leading components whose cumulative explained variance
# ratio reaches the threshold. The spelling `varaince_ratio` is kept on
# purpose: the export cell further down reads this exact name.
varaince_ratio = 0.92
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
# side="left" -> first position whose cumulative value is >= the threshold.
n_dim = int(np.searchsorted(cumulative_ratio, varaince_ratio, side="left")) + 1
ratio = cumulative_ratio[n_dim - 1]

# Project onto the retained components and, if requested, divide each
# component by its own standard deviation.
scale_pcs = True
projected = pca.transform(ds)[:, :n_dim]
pcs = projected / projected.std(axis=0) if scale_pcs else projected

In [8]:
# Wrap the component scores in a labelled frame: one row per `gene_id`,
# one column per retained component (PC_0 ... PC_{n_dim-1}).
gene_index = pd.Index(
    dep_map_data.get_column("gene_id").to_list(),
    name="gene_id",
)
component_names = [f"PC_{i}" for i in range(n_dim)]
pc_df = pd.DataFrame(pcs, index=gene_index, columns=component_names)

In [9]:
# Show the component table; the rendered output is pandas' truncated preview of the frame.
pc_df

Unnamed: 0_level_0,PC_0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,...,PC_71,PC_72,PC_73,PC_74,PC_75,PC_76,PC_77,PC_78,PC_79,PC_80
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000121410,0.662573,1.580746,0.514473,-2.047583,1.142320,-0.928176,-0.493192,-0.795099,-0.624842,1.283762,...,-1.122160,-0.156809,0.869302,0.642558,1.745312,0.980833,0.170328,-0.149983,-0.693770,-0.590241
ENSG00000148584,0.656909,0.160602,-0.343535,-0.239649,0.602858,-0.555150,-0.281376,-0.623728,0.912098,-1.038547,...,0.270334,1.727173,-1.629904,-1.420185,0.644330,-0.639475,0.550903,0.028425,-0.059058,-0.276747
ENSG00000175899,0.351229,-0.796989,-0.308368,0.240197,0.108242,0.967612,0.302968,-0.017770,-0.004008,0.189885,...,0.315137,0.452606,-0.300193,0.759897,0.771527,-0.342819,-0.357698,0.701593,-0.343082,-0.481842
ENSG00000166535,0.973488,0.845236,-0.021039,-0.194928,0.698380,-0.029739,0.218153,0.302417,0.131051,-0.350795,...,0.150545,-0.631313,-0.994629,1.271951,1.243123,-0.353998,0.078321,1.063687,1.052222,-0.059171
ENSG00000184389,0.277949,-0.318211,-0.908178,0.731876,0.610990,-0.608975,0.165754,-0.166532,-0.581511,-0.711019,...,-0.717171,0.422511,-0.110117,-0.472995,-1.189791,-0.192161,1.132779,-0.715001,1.557746,-0.358701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000203995,0.613893,-0.168261,-0.745667,-0.025008,0.470470,1.025984,-0.082433,0.212591,0.935575,1.050750,...,1.487642,1.061240,0.268649,1.092679,-0.060096,0.818856,-0.719620,0.741012,-0.574340,-0.325968
ENSG00000162378,-0.094403,-1.058173,-0.744172,1.180167,-1.183483,-1.528817,0.618991,-1.330132,-0.442867,-0.702732,...,1.935993,-0.709513,-0.164563,0.506605,-0.913298,-0.319401,1.157611,1.641553,0.869964,-0.774325
ENSG00000159840,0.732689,0.398966,-0.850592,-0.252327,0.403787,1.376202,-1.914381,-0.097586,-1.545848,-0.373463,...,0.660607,-1.354713,0.569854,-0.640666,-1.577587,-0.010468,0.466455,0.416132,1.328183,-1.049291
ENSG00000074755,-0.004428,-1.268636,0.087462,-0.268167,0.699450,-0.474823,-1.500967,0.414506,-1.196946,0.015522,...,0.677347,0.574012,-0.151292,-0.506120,-0.525714,0.676932,0.232110,-1.111961,-0.303563,1.635847


In [10]:
# Write the component table as TSV; the file name records the number of
# retained components and the variance threshold used to choose them.
raw_out_path = f"DepMap_raw_d{n_dim}_{varaince_ratio}Variance.tsv"
pc_df.to_csv(raw_out_path, sep="\t")
