In [1]:
import pandas as pd
import numpy as np

This notebook roughly investigates if Factor Analysis can correctly find genes in the k = 1 pathway that **GART** participates in.

# Preparing the data for analysis

Loading main gene dependency data:

In [2]:
df_gene_effect = pd.read_csv("20Q4v2_Achilles_gene_effect.csv")

df_gene_effect.head()

Unnamed: 0,DepMap_ID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWILCH (55055),ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009)
0,ACH-000004,0.181332,0.089101,-0.193867,-0.024587,0.038458,-0.181824,0.351672,-0.440073,0.292582,...,-0.124218,-0.469306,,,0.257361,0.244834,-0.408512,0.284734,0.226412,-0.149566
1,ACH-000005,-0.076383,0.24519,0.191238,0.153008,-0.197035,-0.323295,0.252522,-0.571498,-0.068945,...,-0.212442,-0.426151,-0.068295,0.031635,0.205469,-0.068536,-0.092858,0.07464,0.028779,-0.26484
2,ACH-000007,0.102195,0.092449,-0.045926,0.171892,0.140561,0.170971,0.08606,-0.43232,0.010664,...,-0.083183,-0.269196,0.101146,0.27782,0.208814,0.001393,-0.327514,0.048714,-0.372854,-0.433157
3,ACH-000009,0.142342,-0.033126,-0.051224,0.06056,0.116002,-0.010044,0.104725,-0.610481,0.181508,...,-0.277264,-0.307018,0.044741,0.201551,0.083866,0.052208,-0.574719,0.218682,-0.07475,-0.55176
4,ACH-000011,0.280082,0.088898,0.032321,0.446598,-0.037188,-0.228207,0.110942,-0.406541,0.153979,...,-0.385241,-0.476314,-0.000984,0.013225,0.294002,0.137939,-0.245951,0.111173,-0.227417,-0.349564


GART correlation data:

In [10]:
df_correlations = pd.read_csv("correlations_GART_by_chr.csv", delimiter=";")

print(df_correlations.shape)
df_correlations.head()

(4786, 5)


Unnamed: 0,SELECTED_GENE,EVALUATED_GENE,CORRELATION,PVALUE,is_on_pathway
0,GART,GART,1.0,0.0,1
1,GART,PAICS,0.8357685385944558,1.3994723327207264e-249,1
2,GART,MTHFD1,0.8305428402724535,1.0527168142140681e-243,1
3,GART,PFAS,0.8200228038610631,1.835328876859264e-232,1
4,GART,ATIC,0.8138499189241203,3.3332104314891704e-226,1


Select wanted genes using correlations, top X and bottom X genes.

In [23]:
# TODO: Consider making gene selection even easier?
# TODO: Reformat correlation files so they have the same format. (Column names)

number_of_genes = 10 # Should be an even number
assert(number_of_genes % 2 == 0)
selected_columns = pd.concat([df_correlations["EVALUATED_GENE"][:number_of_genes // 2], df_correlations["EVALUATED_GENE"][-number_of_genes // 2:]]).to_list()
selected_columns = tuple(selected_columns)
selected_columns, len(selected_columns)

(('GART',
  'PAICS',
  'MTHFD1',
  'PFAS',
  'ATIC',
  'RGL1',
  'CCNI',
  'P2RY2',
  'PIH1D1',
  'NEK9'),
 10)

In [24]:
columns_of_interest = [column for column in df_gene_effect.columns if column.split(" ")[0] in selected_columns]
columns_of_interest, len(columns_of_interest)

(['ATIC (471)',
  'CCNI (10983)',
  'GART (2618)',
  'MTHFD1 (4522)',
  'NEK9 (91754)',
  'P2RY2 (5029)',
  'PAICS (10606)',
  'PFAS (5198)',
  'PIH1D1 (55011)',
  'RGL1 (23179)'],
 10)

In [26]:
assert(len(selected_columns) == len(columns_of_interest)) # Rough automatic check for whether we found all corresponding columns in gene_effect dataset that we've selected.

In [27]:
df_gene_effect[columns_of_interest]

Unnamed: 0,ATIC (471),CCNI (10983),GART (2618),MTHFD1 (4522),NEK9 (91754),P2RY2 (5029),PAICS (10606),PFAS (5198),PIH1D1 (55011),RGL1 (23179)
0,-0.739520,0.078964,-0.464176,-0.684510,0.094555,-0.166046,-0.944674,-0.465501,0.015825,0.275410
1,-0.725101,-0.235176,-0.894343,-0.823031,-0.102415,-0.371854,-1.094857,-0.795175,-0.103573,-0.204180
2,-0.845033,-0.005609,-0.706659,-0.392101,-0.330800,-0.084193,-0.756281,-0.327465,0.107765,0.036762
3,-0.576957,-0.201326,-0.369104,-0.456149,0.087427,0.047484,-0.711113,-0.566892,-0.154417,0.007934
4,-0.496128,-0.188211,-0.274540,-0.430733,0.083104,-0.046233,-0.669749,-0.388637,0.053964,0.157805
...,...,...,...,...,...,...,...,...,...,...
803,-0.012357,-0.431108,0.249184,0.150008,0.110304,-0.144152,-0.010227,-0.013457,0.039329,-0.125891
804,-0.335757,-0.718520,0.052761,-0.035037,0.037154,-0.131453,-0.262108,-0.178853,0.178183,0.027884
805,-0.339175,-0.427153,0.301138,0.208965,-0.195091,-0.072058,-0.076952,0.250792,0.123237,0.087424
806,-0.328226,-0.112775,-0.118585,-0.180804,-0.057899,-0.069410,-0.246439,-0.305709,-0.282352,0.087367


In [28]:
X = df_gene_effect[columns_of_interest].dropna() # Is dropping rows with na a good idea? Maybe use mean value instead?
X = X.to_numpy()
if 808 - X.shape[0] > 0:
    print(f"Warning: {808 - X.shape[0]} rows contained at least 1 NA and was dropped.")
X.shape

(808, 10)

# Analysis

In [29]:
import sys
import os

# Get the current working directory (where the notebook is running)
notebook_dir = os.getcwd()

# Go up one level to the parent directory
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))

# Add parent directory to sys.path
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

# Now you can import your module
from utils import factor_model_solution, factor_goodness_of_fit_test

In [30]:
_, lambda_hat = factor_model_solution(X, k = 1)
lambda_hat

array([[-0.87884198],
       [ 0.00813229],
       [-0.87782493],
       [-0.85714176],
       [ 0.1008369 ],
       [-0.02390374],
       [-0.90121192],
       [-0.84214137],
       [ 0.06132715],
       [ 0.06347623]])

In [31]:
pd.concat([df_correlations[["EVALUATED_GENE", "is_on_pathway"]][:number_of_genes // 2], df_correlations[["EVALUATED_GENE", "is_on_pathway"]][-number_of_genes // 2:]])

Unnamed: 0,EVALUATED_GENE,is_on_pathway
0,GART,1
1,PAICS,1
2,MTHFD1,1
3,PFAS,1
4,ATIC,1
4781,RGL1,0
4782,CCNI,0
4783,P2RY2,0
4784,PIH1D1,0
4785,NEK9,0


In [32]:
columns_of_interest

['ATIC (471)',
 'CCNI (10983)',
 'GART (2618)',
 'MTHFD1 (4522)',
 'NEK9 (91754)',
 'P2RY2 (5029)',
 'PAICS (10606)',
 'PFAS (5198)',
 'PIH1D1 (55011)',
 'RGL1 (23179)']

In [34]:
loadings = lambda_hat.T[0] # Extract loadings as a list

# Extract gene names (everything before the space)
genes = [column.split(" ")[0] for column in columns_of_interest]

# Create a DataFrame from genes and values
df_values = pd.DataFrame({
    'EVALUATED_GENE': genes,
    'loadings': loadings
})

# Merge with df_correlations on 'gene'
merged_df = df_values.merge(df_correlations[['EVALUATED_GENE', 'is_on_pathway']], on='EVALUATED_GENE', how='left')

print(merged_df.sort_values("loadings", ascending=False))


  EVALUATED_GENE  loadings  is_on_pathway
4           NEK9  0.100837              0
9           RGL1  0.063476              0
8         PIH1D1  0.061327              0
1           CCNI  0.008132              0
5          P2RY2 -0.023904              0
7           PFAS -0.842141              1
3         MTHFD1 -0.857142              1
2           GART -0.877825              1
0           ATIC -0.878842              1
6          PAICS -0.901212              1


We get heavier loadings on genes that actually lie on the pathway!!

In [35]:
factor_goodness_of_fit_test(X, k = 1)

7.037376840867579e-08

Test shows that k = 1 **is not** sufficient to describe (**all**) the data. But perhaps sufficient to **only** capture the (single) pathway?