In [9]:
import pandas as pd
import numpy as np

import sys
import os

# Get the current working directory (where the notebook is running)
notebook_dir = os.getcwd()

# Go up one level to the parent directory
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))

# Add parent directory to sys.path
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

# Now you can import your module
from utils import factor_model_solution, factor_goodness_of_fit_test

In [14]:
df_gene_effect = pd.read_csv("preprocessed_data/20Q4v2_Achilles_gene_effect.csv")
df_correlations = {
    "DICER1": pd.read_csv("preprocessed_data/corr_DICER1_preprocessed.csv", delimiter=";")
}

# Number of genes to include from each corr file taking top X // 2 and bottom X // 2.
# Note that the same gene may be selected in both corr files. (But will only appear once in the final selected_genes list.)
number_of_genes = 30
assert number_of_genes % 2 == 0, "number_of_genes should be an even number."

selected_columns = pd.concat(
    [df_correlations["DICER1"]["gene_evaluated"][:number_of_genes // 2], 
     df_correlations["DICER1"]["gene_evaluated"][-number_of_genes // 2:]]
    ).to_list()
selected_columns = set(selected_columns)

# number_of_genes multiplied by how many corr files we have.
number_of_duplicates = (number_of_genes * len(df_correlations)) - len(selected_columns) 
if number_of_duplicates > 0:
    print(f"Warning: {number_of_duplicates} duplicates were removed.")

# Extract gene names according to the format of gene_effect dataset.
selected_columns_gene_effect_format = [column for column in df_gene_effect.columns if column.split(" ")[0] in selected_columns]

# Rough automatic check for whether we found all corresponding columns in gene_effect dataset that we've selected.
assert len(selected_columns) == len(selected_columns_gene_effect_format)

# Extract columns from gene_effect dataset.
X = df_gene_effect[selected_columns_gene_effect_format].dropna() # Is dropping rows with na a good idea? Maybe use mean value instead?
X = X.to_numpy()
num_dropped_rows = df_gene_effect.shape[0] - X.shape[0]
if num_dropped_rows > 0:
    print(f"Warning: {num_dropped_rows} rows contained at least 1 NA and was dropped.")
X.shape



(778, 30)

In [19]:
print("Goodness of Fit tests on data with shape", X.shape)
print("p-value for k = 1:", factor_goodness_of_fit_test(X, k = 1))

found_sufficient_factors = False
for i in range(1, X.shape[1]):
    pvalue = factor_goodness_of_fit_test(X, k = i)
    print(pvalue)
    if pvalue >= 0.05:
        found_sufficient_factors = True
        print("Found sufficient factors:")
        print("FAC| PVAL")
        print(f" {i} | {pvalue.round(4)}")
        print(f"({i - 1} | {factor_goodness_of_fit_test(X, k = i - 1).round(4)})")

        # Automatically set k
        k = i
        
        break

if not found_sufficient_factors:
    print("Did not find sufficient factors")

Goodness of Fit tests on data with shape (778, 30)
p-value for k = 1: 1.3436409920550252e-61
1.3436409920550252e-61
1.845908857531346e-42
1.4276420425558403e-29
2.986254161039473e-20
2.384216974944395e-12
2.3521096017533334e-08
8.075961238967624e-06
0.0003372191296715643
0.008278696950975833
0.0003196929254724228
0.0022809687085748537
0.0
0.0
0.0
1.490073487998439e-12
1.4687404192428726e-14
1.0559650435501681e-14
6.82849228277753e-16
0.0032608443793495563
2.8109261498146515e-11
9.31585441287318e-10
5.694112800993602e-06
nan
nan
nan
nan
nan


KeyboardInterrupt: 

In [40]:
_, lambda_hat = factor_model_solution(X, k = 3)

# Adjust precision and disable scientific notation for better visibility
np.set_printoptions(precision=3)
np.set_printoptions(suppress=True)
lambda_hat

array([[ 0.459,  0.147, -0.01 ],
       [ 0.456,  0.277, -0.113],
       [ 0.168, -0.062,  0.268],
       [-0.094,  0.068,  0.218],
       [ 0.316,  0.213,  0.08 ],
       [ 0.779, -0.008,  0.042],
       [ 0.335, -0.515, -0.167],
       [ 0.428,  0.165,  0.171],
       [ 0.251, -0.299, -0.142],
       [-0.117,  0.042, -0.054],
       [ 0.152, -0.205,  0.066],
       [ 0.123, -0.056,  0.19 ],
       [-0.206,  0.195, -0.347],
       [ 0.227, -0.031, -0.12 ],
       [ 0.249, -0.152, -0.052],
       [-0.144,  0.093,  0.295],
       [ 0.109,  0.044,  0.006],
       [-0.135,  0.119, -0.03 ],
       [ 0.074,  0.013,  0.089],
       [ 0.305, -0.083,  0.115],
       [ 0.15 ,  0.01 , -0.17 ],
       [ 0.1  , -0.017,  0.082],
       [-0.1  ,  0.248, -0.034],
       [-0.13 ,  0.043,  0.01 ],
       [ 0.239,  0.206, -0.127],
       [ 0.219,  0.175, -0.173],
       [-0.099, -0.008,  0.062],
       [ 0.241, -0.12 ,  0.204],
       [ 0.625,  0.132, -0.085],
       [ 0.088,  0.003,  0.244]])

In [41]:
loadings = lambda_hat.T # Extract loadings as a list

# Extract gene names (everything before the space)
genes = [column.split(" ")[0] for column in selected_columns_gene_effect_format]

# Set up the structure of the dataframe
data_frame_structure = {
    'gene_evaluated': genes
}
for idx, loading_vector in enumerate(loadings):
    data_frame_structure["loadings" + str(idx)] = loading_vector

# Create a DataFrame from genes and values
df_values = pd.DataFrame(data_frame_structure)

# Merge with df_correlations on 'gene'
merged_df = (df_values
             .merge(df_correlations["DICER1"][['gene_evaluated', 'is_on_pathway']], on='gene_evaluated', how='left')
             .rename(columns={"is_on_pathway": "is_on_pathway_DICER1"})
             )

In [46]:
# Make the table more readable
pd.set_option('display.float_format', lambda x: '%.4f' % x)

merged_df.sort_values("loadings0", ascending=False)

Unnamed: 0,gene_evaluated,loadings0,loadings1,loadings2,is_on_pathway_DICER1
5,DICER1,0.7787,-0.0081,0.0415,1
28,XPO5,0.6247,0.1321,-0.0848,1
0,AGO2,0.4589,0.1466,-0.0101,1
1,ANKRD52,0.4562,0.2768,-0.1125,1
7,DROSHA,0.4276,0.1648,0.1713,1
6,DOLK,0.3348,-0.5151,-0.1675,0
4,DGCR8,0.3163,0.213,0.0798,1
19,RBSN,0.3052,-0.0828,0.1147,0
8,EMC1,0.2515,-0.2994,-0.1422,0
14,MNAT1,0.2492,-0.152,-0.0517,0
