In [2]:
import pandas as pd
import numpy as np

import sys
import os

# The project-local `utils` module lives one directory above the notebook,
# so that directory has to be importable.
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Prepend only once so that re-running this cell does not keep growing sys.path.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

# Now you can import your module
from utils import factor_model_solution, factor_goodness_of_fit_test

*This notebook investigates whether factor analysis can still extract the correct pathways when $k$ is not predetermined. Instead, $k$ is chosen using the goodness-of-fit test.*

# More factors $k$ than pathways

We'll be using the DICER1 corr rankings.

In [4]:
# Load the gene-effect matrix and the preprocessed DICER1 correlation ranking.
df_gene_effect = pd.read_csv("preprocessed_data/20Q4v2_Achilles_gene_effect.csv")
df_correlations = {
    "DICER1": pd.read_csv("preprocessed_data/corr_DICER1_preprocessed.csv", delimiter=";")
}

# Number of genes to include from each corr file: the first X // 2 and the last X // 2 rows.
# Note that the same gene may be selected in both corr files. (But will only appear once in the final selected set.)
number_of_genes = 10
assert number_of_genes % 2 == 0, "number_of_genes should be an even number."
# Zero would be meaningless, and a plain `[-0:]` slice would silently select every gene.
assert number_of_genes > 0, "number_of_genes should be positive."

# head/tail make the "first n / last n rows" intent explicit and behave sanely for n == 0.
genes_per_end = number_of_genes // 2
ranked_genes = df_correlations["DICER1"]["gene_evaluated"]
selected_columns = set(
    pd.concat([ranked_genes.head(genes_per_end), ranked_genes.tail(genes_per_end)])
)

# number_of_genes multiplied by how many corr files we have.
number_of_duplicates = (number_of_genes * len(df_correlations)) - len(selected_columns)
if number_of_duplicates > 0:
    print(f"Warning: {number_of_duplicates} duplicates were removed.")

# gene_effect column names start with the gene symbol followed by a space,
# so match on the part before the first space.
selected_columns_gene_effect_format = [
    column for column in df_gene_effect.columns if column.split(" ")[0] in selected_columns
]

# Check that every selected gene maps to exactly one gene_effect column,
# and report which genes are affected if not.
matched_genes = [column.split(" ")[0] for column in selected_columns_gene_effect_format]
missing_genes = sorted(selected_columns - set(matched_genes), key=str)
assert not missing_genes, f"Selected genes without a gene_effect column: {missing_genes}"
assert len(selected_columns) == len(selected_columns_gene_effect_format), (
    "Some selected genes match more than one gene_effect column."
)

# Extract columns from gene_effect dataset and drop incomplete rows.
# TODO: Is dropping rows with NA a good idea? Maybe use the mean value instead?
X = df_gene_effect[selected_columns_gene_effect_format].dropna().to_numpy()
num_dropped_rows = df_gene_effect.shape[0] - X.shape[0]
if num_dropped_rows > 0:
    print(f"Warning: {num_dropped_rows} rows contained at least 1 NA and was dropped.")
X.shape

(808, 10)

In [11]:
# Choose the number of factors k as the smallest k whose goodness-of-fit
# p-value reaches the significance level (i.e. the k-factor model is not rejected).
significance_level = 0.05

print("Goodness of Fit tests on data with shape", X.shape)

# Cache p-values per k so no model is fitted twice.
pvalues = {1: factor_goodness_of_fit_test(X, k = 1)}
print("p-value for k = 1:", pvalues[1])

# Reset k so a stale value from an earlier run can never leak into later cells.
k = None
for i in range(1, X.shape[1]):
    if i not in pvalues:
        pvalues[i] = factor_goodness_of_fit_test(X, k = i)
    pvalue = pvalues[i]
    if pvalue >= significance_level:
        print("Found sufficient factors:")
        print("FAC| PVAL")
        print(f" {i} | {pvalue.round(4)}")
        # Show the previous (rejected) candidate for comparison. When i == 1
        # there is no previous candidate, so avoid calling the test with k = 0.
        if i > 1:
            print(f"({i - 1} | {pvalues[i - 1].round(4)})")

        # Automatically set k
        k = i

        break
else:
    # No candidate passed: fail here with a clear message rather than with a
    # NameError (or a stale k) in a later cell.
    raise ValueError(
        f"No k in 1..{X.shape[1] - 1} reached a goodness-of-fit p-value >= {significance_level}."
    )

Goodness of Fit tests on data with shape (808, 10)
p-value for k = 1: 7.640647361670192e-08
Found sufficient factors:
FAC| PVAL
 3 | 0.405
(2 | 0.0256)


In [None]:
# Show arrays with three decimals and without scientific notation for readability.
np.set_printoptions(precision=3, suppress=True)

# Fit the factor model with the automatically selected k; only the second
# return value (the estimated loadings) is needed here.
_, lambda_hat = factor_model_solution(X, k = k)
lambda_hat

array([[ 0.352, -0.197,  0.238],
       [ 1.   ,  0.021, -0.   ],
       [ 0.017,  0.035, -0.155],
       [ 0.307, -0.419,  0.61 ],
       [ 0.208, -0.19 ,  0.304],
       [ 0.047, -0.043,  0.135],
       [ 0.046,  0.167, -0.097],
       [ 0.022, -0.025,  0.144],
       [ 0.376, -0.926, -0.   ],
       [ 0.004,  0.025,  0.196]])

In [20]:
# lambda_hat has one row per gene and one column per factor; transposing it
# lets us iterate over one loading vector per factor.
loadings = lambda_hat.T

# Extract gene names (everything before the space), in the same column order as X.
genes = [column.split(" ")[0] for column in selected_columns_gene_effect_format]

# One 'gene_evaluated' column plus one 'loadings<idx>' column per factor.
data_frame_structure = {
    'gene_evaluated': genes,
    **{"loadings" + str(idx): loading_vector for idx, loading_vector in enumerate(loadings)},
}

# Create a DataFrame from genes and loadings
df_values = pd.DataFrame(data_frame_structure)

# Attach the pathway label by merging with the DICER1 correlations on 'gene_evaluated'.
# validate="many_to_one" makes the merge fail loudly if the correlation file lists a
# gene more than once, instead of silently duplicating rows in the result.
merged_df = (df_values
             .merge(df_correlations["DICER1"][['gene_evaluated', 'is_on_pathway']],
                    on='gene_evaluated', how='left', validate="many_to_one")
             .rename(columns={"is_on_pathway": "is_on_pathway_DICER1"})
             )

# Display the resulting table as the cell output.
merged_df

Unnamed: 0,gene_evaluated,loadings0,loadings1,loadings2,is_on_pathway_DICER1
0,AGO2,0.352018,-0.19749,0.2377075,1
1,ANKRD52,0.999785,0.020692,-3.068937e-07,1
2,BTG1,0.016692,0.034623,-0.155492,0
3,DICER1,0.307034,-0.418862,0.6095384,1
4,DROSHA,0.208198,-0.190364,0.3037904,1
5,HYOU1,0.047316,-0.042706,0.1347107,0
6,ORMDL2,0.045943,0.167065,-0.09744135,0
7,PCDHB16,0.021798,-0.025033,0.1435007,0
8,XPO5,0.376427,-0.926438,-1.386444e-05,1
9,ZSCAN22,0.00434,0.0255,0.1957379,0


In [31]:
# Make the table more readable
# Render floats with four fixed decimals so the table is easier to scan.
pd.set_option('display.float_format', '{:.4f}'.format)

# Genes ordered by their loading on the first factor, largest first.
merged_df.sort_values(by="loadings0", ascending=False)

Unnamed: 0,gene_evaluated,loadings0,loadings1,loadings2,is_on_pathway_DICER1
1,ANKRD52,0.9998,0.0207,-0.0,1
8,XPO5,0.3764,-0.9264,-0.0,1
0,AGO2,0.352,-0.1975,0.2377,1
3,DICER1,0.307,-0.4189,0.6095,1
4,DROSHA,0.2082,-0.1904,0.3038,1
5,HYOU1,0.0473,-0.0427,0.1347,0
6,ORMDL2,0.0459,0.1671,-0.0974,0
7,PCDHB16,0.0218,-0.025,0.1435,0
2,BTG1,0.0167,0.0346,-0.1555,0
9,ZSCAN22,0.0043,0.0255,0.1957,0


The first factor probably captures the correct pathway here. However, note that XPO5 has a loading of -0.9 on the second factor; ideally, this loading would be 0.