In [34]:
import pandas as pd
import numpy as np

import sys
import os

# The notebook's working directory is taken as the starting point.
notebook_dir = os.getcwd()

# The project-local `utils` module is imported from one directory above the notebook,
# so resolve that directory to an absolute path.
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Put it at the front of the import search path, but only once, so re-running
# this cell does not keep adding duplicate entries.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

# Now you can import your module
from utils import factor_model_solution, factor_goodness_of_fit_test

*This notebook performs factor analysis with k = 2 factors, trying to identify the two pathways in which DICER1 and GART, respectively, participate.*
___
# $k = 2$ factors | GART & DICER1 pathways

## Loading and preparing data

See the `k1_[GENE]_initial_analysis.ipynb` notebooks for more intermediate steps than this notebook provides.

In [7]:
# Gene-effect table; later cells select its columns by gene name.
df_gene_effect = pd.read_csv("20Q4v2_Achilles_gene_effect.csv")

# One preprocessed, semicolon-separated correlation table per target gene, keyed by gene name.
df_correlations = {
    target_gene: pd.read_csv(f"corr_{target_gene}_preprocessed.csv", sep=";")
    for target_gene in ("DICER1", "GART")
}

In [33]:
# Number of genes to include from each corr file taking top X // 2 and bottom X // 2.
# Note that the same gene may be selected in both corr files. (But will only appear once in the final selected_columns set.)
number_of_genes = 10
assert number_of_genes % 2 == 0, "number_of_genes should be an even number."
genes_per_end = number_of_genes // 2

# Take the first and last `genes_per_end` entries of every corr file.
# .head()/.tail() are used instead of slicing because `series[-0:]` would return
# the whole series when genes_per_end is 0.
selected_columns = set(
    pd.concat(
        [
            part
            for corr in df_correlations.values()
            for part in (corr["gene_evaluated"].head(genes_per_end),
                         corr["gene_evaluated"].tail(genes_per_end))
        ]
    ).to_list()
)

# number_of_genes multiplied by how many corr files we have.
number_of_duplicates = number_of_genes * len(df_correlations) - len(selected_columns)
if number_of_duplicates > 0:
    print(f"Warning: {number_of_duplicates} duplicates were removed.")

# Extract gene names according to the format of gene_effect dataset:
# the gene name is the part of the column label before the first space.
selected_columns_gene_effect_format = [
    column for column in df_gene_effect.columns if column.split(" ")[0] in selected_columns
]

# Rough automatic check for whether we found all corresponding columns in gene_effect dataset that we've selected.
# (It only compares counts; the message lists selected genes that matched no column.)
matched_genes = {column.split(" ")[0] for column in selected_columns_gene_effect_format}
assert len(selected_columns) == len(selected_columns_gene_effect_format), (
    "Selected genes and matched gene_effect columns differ in number; "
    f"unmatched genes: {sorted(map(str, selected_columns - matched_genes))}"
)

# Extract columns from gene_effect dataset.
# TODO: Is dropping rows with NA a good idea? Maybe use the mean value instead?
X_selected = df_gene_effect[selected_columns_gene_effect_format]
X = X_selected.dropna().to_numpy()

# Compare against the actual row count of the dataset rather than a hard-coded 808,
# so the warning stays correct if the input file changes.
number_of_dropped_rows = X_selected.shape[0] - X.shape[0]
if number_of_dropped_rows > 0:
    print(f"Warning: {number_of_dropped_rows} rows contained at least 1 NA and was dropped.")
X.shape

(808, 20)

## Data Analysis

In [35]:
# Fit the factor model with k = 2 and keep only the second returned value, lambda_hat
# (used as the factor loadings in the cells below; presumably one row per column of X).
# The first returned value is not used. It is bound to `_unused` rather than `_`,
# because assigning to `_` at the top level of an IPython session stops the kernel
# from updating its `_`, `__`, `___` "previous output" variables.
_unused, lambda_hat = factor_model_solution(X, k=2)
lambda_hat

array([[ 0.10629842, -0.48670354],
       [ 0.07299983, -0.48118905],
       [ 0.87841332,  0.02848014],
       [-0.03302381,  0.09719705],
       [-0.008655  ,  0.03759163],
       [ 0.01807461, -0.72653769],
       [-0.00096324, -0.41278562],
       [ 0.87796171,  0.03348937],
       [-0.15009573, -0.13800765],
       [ 0.85673267, -0.00245353],
       [-0.09997079, -0.02107274],
       [ 0.0263521 ,  0.13301682],
       [ 0.02352066, -0.03702527],
       [ 0.90161429, -0.00522136],
       [-0.00433523, -0.09536355],
       [ 0.84230693, -0.00579557],
       [-0.05966692, -0.0347877 ],
       [-0.06491632,  0.05625643],
       [ 0.05758763, -0.67447864],
       [-0.03962759, -0.08062107]])

In [59]:
# Transpose so that loadings[j] is the array of loadings of every selected gene on factor j.
loadings = lambda_hat.T

# Gene name = the part of each gene_effect column label before the first space.
genes = [label.split(" ")[0] for label in selected_columns_gene_effect_format]

# One row per selected gene with its loading on each of the two factors.
df_values = pd.DataFrame({
    'gene_evaluated': genes,
    'loadings0': loadings[0],
    'loadings1': loadings[1]
})

# Attach each target gene's `is_on_pathway` flag as its own column.
# A left join keeps every selected gene; genes absent from a corr file get NaN for that flag.
merged_df = df_values
for target_gene in ("DICER1", "GART"):
    pathway_flags = df_correlations[target_gene][['gene_evaluated', 'is_on_pathway']].rename(
        columns={"is_on_pathway": f"is_on_pathway_{target_gene}"}
    )
    merged_df = merged_df.merge(pathway_flags, on='gene_evaluated', how='left')

In [64]:
# Allow pandas to display wider tables before linebreaking.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

Sorted w.r.t. first factor:

In [60]:
# Largest loadings on the first factor at the top.
print(merged_df.sort_values(by="loadings0", ascending=False))

   gene_evaluated  loadings0  loadings1  is_on_pathway_DICER1  is_on_pathway_GART
13          PAICS   0.901614  -0.005221                   NaN                 1.0
2            ATIC   0.878413   0.028480                   NaN                 1.0
7            GART   0.877962   0.033489                   NaN                 1.0
9          MTHFD1   0.856733  -0.002454                   NaN                 1.0
15           PFAS   0.842307  -0.005796                   NaN                 1.0
0            AGO2   0.106298  -0.486704                   1.0                 NaN
1         ANKRD52   0.073000  -0.481189                   1.0                 0.0
18           XPO5   0.057588  -0.674479                   1.0                 0.0
11         ORMDL2   0.026352   0.133017                   0.0                 NaN
12          P2RY2   0.023521  -0.037025                   NaN                 0.0
5          DICER1   0.018075  -0.726538                   1.0                 0.0
6          DROSH

Correctly finds the GART pathway. (Notice the steep drop-off in loading from PFAS to AGO2)

Sorted w.r.t. second factor:

In [62]:
# Largest loadings on the second factor at the top; strongly negative loadings end up at the bottom.
print(merged_df.sort_values(by="loadings1", ascending=False))

   gene_evaluated  loadings0  loadings1  is_on_pathway_DICER1  is_on_pathway_GART
11         ORMDL2   0.026352   0.133017                   0.0                 NaN
3            BTG1  -0.033024   0.097197                   0.0                 NaN
17           RGL1  -0.064916   0.056256                   NaN                 0.0
4            CCNI  -0.008655   0.037592                   NaN                 0.0
7            GART   0.877962   0.033489                   NaN                 1.0
2            ATIC   0.878413   0.028480                   NaN                 1.0
9          MTHFD1   0.856733  -0.002454                   NaN                 1.0
13          PAICS   0.901614  -0.005221                   NaN                 1.0
15           PFAS   0.842307  -0.005796                   NaN                 1.0
10           NEK9  -0.099971  -0.021073                   NaN                 0.0
16         PIH1D1  -0.059667  -0.034788                   NaN                 0.0
12          P2RY

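Correctly finds the DICER1 pathway. The genes on the DICER1 pathway (e.g. DICER1, XPO5, AGO2 and ANKRD52) have the largest-magnitude loadings on this factor, but with a negative sign, so they appear at the bottom of this descending sort. (Notice the steep drop-off in loading magnitude between these genes and the remaining genes, whose loadings are all close to zero.)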

## Goodness of Fit

In [88]:
# Threshold the returned value (treated as a p-value) must reach for a k-factor model to be accepted.
significance_level = 0.05

# Goodness-of-fit result for the k = 2 model used in the analysis above.
print(factor_goodness_of_fit_test(X, k = 2))

# Search for the smallest k whose goodness-of-fit p-value reaches the threshold.
# The p-value of the previous k is remembered rather than recomputed, which also
# avoids calling the test with k = 0 when k = 1 already passes.
previous_pvalue = None
for i in range(1, X.shape[1]):
    pvalue = factor_goodness_of_fit_test(X, k = i)
    if pvalue >= significance_level:
        print("Found sufficient factors:")
        print(f" {i} | {pvalue.round(4)}")
        if previous_pvalue is not None:
            # Show the last rejected k for comparison.
            print(f"({i - 1} | {previous_pvalue.round(4)})")
        break
    previous_pvalue = pvalue
else:
    # The loop ended without a break: no tested k reached the threshold.
    print(f"No k in 1..{X.shape[1] - 1} reached a p-value >= {significance_level}.")

6.067248907679525e-24
Found sufficient factors:
 7 | 0.3449
(6 | 0.0497)
