In [2]:
import pandas as pd
import numpy as np

This notebook roughly investigates whether Factor Analysis with a single factor (k = 1) can correctly identify the genes on the pathway that **GART** participates in.

# Preparing the data for analysis

Loading main gene dependency data:

In [4]:
# DepMap 20Q4v2 Achilles gene-effect table: rows are keyed by DepMap_ID and
# each remaining column is one gene, labelled "SYMBOL (number)".
df_gene_effect = pd.read_csv(filepath_or_buffer="20Q4v2_Achilles_gene_effect.csv")

# Preview the first five rows to confirm the layout.
df_gene_effect.head(n=5)

Unnamed: 0,DepMap_ID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWILCH (55055),ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009)
0,ACH-000004,0.181332,0.089101,-0.193867,-0.024587,0.038458,-0.181824,0.351672,-0.440073,0.292582,...,-0.124218,-0.469306,,,0.257361,0.244834,-0.408512,0.284734,0.226412,-0.149566
1,ACH-000005,-0.076383,0.24519,0.191238,0.153008,-0.197035,-0.323295,0.252522,-0.571498,-0.068945,...,-0.212442,-0.426151,-0.068295,0.031635,0.205469,-0.068536,-0.092858,0.07464,0.028779,-0.26484
2,ACH-000007,0.102195,0.092449,-0.045926,0.171892,0.140561,0.170971,0.08606,-0.43232,0.010664,...,-0.083183,-0.269196,0.101146,0.27782,0.208814,0.001393,-0.327514,0.048714,-0.372854,-0.433157
3,ACH-000009,0.142342,-0.033126,-0.051224,0.06056,0.116002,-0.010044,0.104725,-0.610481,0.181508,...,-0.277264,-0.307018,0.044741,0.201551,0.083866,0.052208,-0.574719,0.218682,-0.07475,-0.55176
4,ACH-000011,0.280082,0.088898,0.032321,0.446598,-0.037188,-0.228207,0.110942,-0.406541,0.153979,...,-0.385241,-0.476314,-0.000984,0.013225,0.294002,0.137939,-0.245951,0.111173,-0.227417,-0.349564


GART correlation data:

In [5]:
# Pre-processed, semicolon-separated table of correlations between GART and
# every evaluated gene, including a rank and an is_on_pathway flag per gene.
df_correlations = pd.read_csv("corr_GART_preprocessed.csv", sep=";")

# Show the ten first rows (rank 1-10 in the rendered output).
df_correlations.head(10)

Unnamed: 0,gene_selected,gene_evaluated,corr,pvalue,rank,is_on_pathway
0,GART,GART,1.0,0.0,1,1
1,GART,PAICS,0.8357685385944558,1.3994723327207264e-249,2,1
2,GART,MTHFD1,0.8305428402724535,1.0527168142140681e-243,3,1
3,GART,PFAS,0.8200228038610631,1.835328876859264e-232,4,1
4,GART,ATIC,0.8138499189241203,3.3332104314891704e-226,5,1
5,GART,PPAT,0.7912798759100206,3.631225056702279e-205,6,1
6,GART,FPGS,0.7104087326125703,4.2811290201710156e-147,7,1
7,GART,ADSL,0.7005658423205525,2.1982103613464512e-141,8,1
8,GART,DHODH,0.6487303908116515,8.42614814074376e-115,9,0
9,GART,SDHB,0.6396059878366211,1.1939870817567485e-110,10,0


In [6]:
# TODO: Consider making gene selection even easier?

number_of_genes = 20  # Should be an even number
assert number_of_genes % 2 == 0

# Half of the genes come from the top of the correlation table and half from
# the bottom, so the selection mixes strongly and weakly ranked genes.
selected_columns = tuple(
    pd.concat(
        [
            df_correlations["gene_evaluated"].head(number_of_genes // 2),
            df_correlations["gene_evaluated"].tail(number_of_genes // 2),
        ]
    ).tolist()
)

selected_columns, len(selected_columns)

(('GART',
  'PAICS',
  'MTHFD1',
  'PFAS',
  'ATIC',
  'PPAT',
  'FPGS',
  'ADSL',
  'DHODH',
  'SDHB',
  'MMP11',
  'MXRA8',
  'TUBA3D',
  'TMED9',
  'PCNT',
  'RGL1',
  'CCNI',
  'P2RY2',
  'PIH1D1',
  'NEK9'),
 20)

In [7]:
# Gene-effect columns are labelled "SYMBOL (number)"; keep those whose symbol
# (the text before the first space) is one of the selected genes.
columns_of_interest = [
    label
    for label in df_gene_effect.columns
    if label.partition(" ")[0] in selected_columns
]

# Rough automatic check that every selected gene was matched to a column in
# the gene-effect dataset.
assert len(columns_of_interest) == len(selected_columns)

columns_of_interest, len(columns_of_interest)

(['ADSL (158)',
  'ATIC (471)',
  'CCNI (10983)',
  'DHODH (1723)',
  'FPGS (2356)',
  'GART (2618)',
  'MMP11 (4320)',
  'MTHFD1 (4522)',
  'MXRA8 (54587)',
  'NEK9 (91754)',
  'P2RY2 (5029)',
  'PAICS (10606)',
  'PCNT (5116)',
  'PFAS (5198)',
  'PIH1D1 (55011)',
  'PPAT (5471)',
  'RGL1 (23179)',
  'SDHB (6390)',
  'TMED9 (54732)',
  'TUBA3D (113457)'],
 20)

In [8]:
# Gene-effect scores restricted to the selected genes (all rows, selected columns).
df_gene_effect.loc[:, columns_of_interest]

Unnamed: 0,ADSL (158),ATIC (471),CCNI (10983),DHODH (1723),FPGS (2356),GART (2618),MMP11 (4320),MTHFD1 (4522),MXRA8 (54587),NEK9 (91754),P2RY2 (5029),PAICS (10606),PCNT (5116),PFAS (5198),PIH1D1 (55011),PPAT (5471),RGL1 (23179),SDHB (6390),TMED9 (54732),TUBA3D (113457)
0,-1.596759,-0.739520,0.078964,-1.477131,-1.096059,-0.464176,-0.143301,-0.684510,0.267366,0.094555,-0.166046,-0.944674,-0.395436,-0.465501,0.015825,-0.766264,0.275410,-0.894783,0.088113,-0.366829
1,-2.034402,-0.725101,-0.235176,-1.058889,-1.031511,-0.894343,-0.146330,-0.823031,0.040827,-0.102415,-0.371854,-1.094857,-0.315431,-0.795175,-0.103573,-1.277637,-0.204180,-0.949081,-0.174698,-0.168177
2,-1.222208,-0.845033,-0.005609,-0.720187,-0.353715,-0.706659,-0.103231,-0.392101,-0.030163,-0.330800,-0.084193,-0.756281,-0.305884,-0.327465,0.107765,-0.607689,0.036762,-0.916764,-0.311737,-0.445893
3,-1.247635,-0.576957,-0.201326,-0.558100,-0.369323,-0.369104,-0.201202,-0.456149,0.150327,0.087427,0.047484,-0.711113,-0.340810,-0.566892,-0.154417,-0.546790,0.007934,-0.538930,-0.457483,-0.537479
4,-1.146202,-0.496128,-0.188211,-0.725848,-0.872680,-0.274540,0.045184,-0.430733,-0.025089,0.083104,-0.046233,-0.669749,-0.162466,-0.388637,0.053964,-0.809917,0.157805,-0.753028,-0.155264,-0.516754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
803,-0.423947,-0.012357,-0.431108,-0.338099,-0.302045,0.249184,-0.117198,0.150008,0.244649,0.110304,-0.144152,-0.010227,-0.257315,-0.013457,0.039329,-0.169218,-0.125891,0.186013,-0.269412,-0.217691
804,-0.623534,-0.335757,-0.718520,-0.656531,-0.329197,0.052761,-0.022511,-0.035037,0.146852,0.037154,-0.131453,-0.262108,-0.365492,-0.178853,0.178183,-0.383387,0.027884,-0.058769,-0.425417,-0.035615
805,-0.359391,-0.339175,-0.427153,-0.286382,-0.237293,0.301138,0.001616,0.208965,0.096352,-0.195091,-0.072058,-0.076952,-0.480966,0.250792,0.123237,-0.240727,0.087424,0.466871,-0.364988,-0.224268
806,-0.274565,-0.328226,-0.112775,-0.370183,-0.422838,-0.118585,-0.066947,-0.180804,0.168844,-0.057899,-0.069410,-0.246439,-0.442128,-0.305709,-0.282352,-0.386366,0.087367,-0.184681,-0.757949,-0.335418


In [9]:
# Build the data matrix for the factor model: keep only the selected gene
# columns and drop every row that has a missing value in any of them.
# Is dropping rows with NA a good idea? Maybe use the mean value instead?
X = df_gene_effect[columns_of_interest].dropna().to_numpy()

# Compare against the number of rows actually loaded rather than a hard-coded
# row count, so the warning stays correct if the input file changes.
n_dropped = len(df_gene_effect) - X.shape[0]
if n_dropped > 0:
    print(f"Warning: {n_dropped} rows contained at least 1 NA and was dropped.")
X.shape

(808, 20)

# Analysis

In [10]:
import os
import sys

# The project helpers are imported from the directory one level above the
# notebook's working directory, so that directory must be on the import path.
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Prepend so the parent directory is searched first; the membership check
# keeps re-running this cell from adding duplicate entries.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from utils import factor_goodness_of_fit_test, factor_model_solution

In [11]:
# Fit a one-factor (k = 1) model to X using the project helper from utils.
# The first return value is discarded; the second is kept as lambda_hat.
# NOTE(review): lambda_hat is presumably the estimated loading matrix with one
# row per column of X (the output below is 20 x 1) — confirm against utils.
_, lambda_hat = factor_model_solution(X, k = 1)
lambda_hat

array([[-0.77745198],
       [-0.87928987],
       [ 0.0092485 ],
       [-0.68531413],
       [-0.71939807],
       [-0.88141624],
       [-0.01913498],
       [-0.85065101],
       [-0.00652374],
       [ 0.10122948],
       [-0.02201097],
       [-0.91153772],
       [ 0.13005525],
       [-0.82917469],
       [ 0.06688884],
       [-0.88138339],
       [ 0.06663721],
       [-0.64085345],
       [ 0.07870572],
       [-0.01425333]])

In [12]:
# Pathway flags for the same top/bottom selection of genes used above:
# stack the first and last number_of_genes // 2 rows, then keep two columns.
pd.concat(
    [
        df_correlations.head(number_of_genes // 2),
        df_correlations.tail(number_of_genes // 2),
    ]
)[["gene_evaluated", "is_on_pathway"]]

Unnamed: 0,gene_evaluated,is_on_pathway
0,GART,1
1,PAICS,1
2,MTHFD1,1
3,PFAS,1
4,ATIC,1
5,PPAT,1
6,FPGS,1
7,ADSL,1
8,DHODH,0
9,SDHB,0


In [13]:
# Re-display the column labels that X was built from. The next cell pairs
# these labels positionally with the entries of lambda_hat.
# NOTE(review): this assumes the helper returns loadings in X's column order — confirm.
columns_of_interest

['ADSL (158)',
 'ATIC (471)',
 'CCNI (10983)',
 'DHODH (1723)',
 'FPGS (2356)',
 'GART (2618)',
 'MMP11 (4320)',
 'MTHFD1 (4522)',
 'MXRA8 (54587)',
 'NEK9 (91754)',
 'P2RY2 (5029)',
 'PAICS (10606)',
 'PCNT (5116)',
 'PFAS (5198)',
 'PIH1D1 (55011)',
 'PPAT (5471)',
 'RGL1 (23179)',
 'SDHB (6390)',
 'TMED9 (54732)',
 'TUBA3D (113457)']

In [14]:
# lambda_hat has a single column (k = 1); take it as a 1-D array with one
# loading per selected gene.
loadings = lambda_hat[:, 0]

# Column labels look like "SYMBOL (number)"; keep only the symbol so it can
# be matched against the gene_evaluated column of the correlation table.
genes = [label.partition(" ")[0] for label in columns_of_interest]

# One row per selected gene: its symbol and its loading on the single factor.
df_values = pd.DataFrame({'gene_evaluated': genes, 'loadings': loadings})

# Attach the is_on_pathway flag via a left join on 'gene_evaluated', so every
# gene in df_values is kept even if it has no match.
merged_df = pd.merge(
    df_values,
    df_correlations[['gene_evaluated', 'is_on_pathway']],
    on='gene_evaluated',
    how='left',
)

print(merged_df.sort_values(by="loadings", ascending=False))


   gene_evaluated  loadings  is_on_pathway
12           PCNT  0.130055              0
9            NEK9  0.101229              0
18          TMED9  0.078706              0
14         PIH1D1  0.066889              0
16           RGL1  0.066637              0
2            CCNI  0.009248              0
8           MXRA8 -0.006524              0
19         TUBA3D -0.014253              0
6           MMP11 -0.019135              0
10          P2RY2 -0.022011              0
17           SDHB -0.640853              0
3           DHODH -0.685314              0
4            FPGS -0.719398              1
0            ADSL -0.777452              1
13           PFAS -0.829175              1
7          MTHFD1 -0.850651              1
1            ATIC -0.879290              1
15           PPAT -0.881383              1
5            GART -0.881416              1
11          PAICS -0.911538              1


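We get heavier loadings (larger in absolute value) on the genes that actually lie on the pathway! Note, however, that DHODH and SDHB also load heavily (about -0.69 and -0.64) even though they are flagged as not being on the pathway.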

In [15]:
# Goodness-of-fit check of the one-factor (k = 1) model on the same matrix X,
# using the project helper from utils.
# NOTE(review): the returned scalar is presumably a p-value (a small value
# rejecting k = 1 as sufficient) — confirm against the helper's definition.
factor_goodness_of_fit_test(X, k = 1)

2.202762176300214e-36

The test shows that k = 1 **is not** sufficient to describe (**all**) the data. But perhaps it is sufficient to capture **only** the (single) pathway?