In [19]:
import pandas as pd
import numpy as np

import sys
import os

# Make the directory one level above the notebook's working directory
# importable, so that modules stored there can be imported further down.
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Prepend only when missing, so re-running this cell never adds a duplicate entry.
already_importable = parent_dir in sys.path
if not already_importable:
    sys.path.insert(0, parent_dir)

# Now import modules from parent directory
from utils import factor_model_solution, factor_goodness_of_fit_test

*This notebook performs a factor analysis with $k = 2$ factors, trying to identify the two pathways in which DICER1 and GART, respectively, participate.*
___
# $k = 2$ factors | GART & DICER1 pathways

## Loading and preparing data

See the `k1_[GENE]_initial_analysis.ipynb` notebooks for more intermediate steps than this notebook provides.

In [20]:
# Gene-effect table; its column names are matched against gene symbols later on.
gene_effect_path = "preprocessed_data/20Q4v2_Achilles_gene_effect.csv"
df_gene_effect = pd.read_csv(gene_effect_path)

# Preprocessed correlation tables, one per pathway gene of interest.
# Unlike the gene-effect file, these are semicolon-separated.
corr_dicer1 = pd.read_csv("preprocessed_data/corr_DICER1_preprocessed.csv", sep=";")
corr_gart = pd.read_csv("preprocessed_data/corr_GART_preprocessed.csv", sep=";")
df_correlations = {"DICER1": corr_dicer1, "GART": corr_gart}

In [21]:
# Number of genes to include from each corr file: the first number_of_genes // 2
# rows and, unless only_top5 is set, also the last number_of_genes // 2 rows.
# Note that the same gene may be selected in both corr files; it will only
# appear once in the final selected_columns set.
# NOTE(review): "top"/"bottom" presumably means most/least correlated, i.e. the
# corr files are assumed to be sorted already — confirm in the preprocessing.
number_of_genes = 10
assert number_of_genes % 2 == 0, "number_of_genes should be an even number."
genes_per_end = number_of_genes // 2

# When True, only the first genes_per_end rows of each corr file are used
# (the name says "5" because number_of_genes is 10 here).
only_top5 = True

# Collect the selections in the order: DICER1 top, (DICER1 bottom), GART top, (GART bottom).
# head()/tail() are used instead of [:n] / [-n:] because [-0:] would return the
# whole column when genes_per_end is 0.
selected_gene_series = []
for pathway_gene in ("DICER1", "GART"):
    ranked_genes = df_correlations[pathway_gene]["gene_evaluated"]
    selected_gene_series.append(ranked_genes.head(genes_per_end))
    if not only_top5:
        selected_gene_series.append(ranked_genes.tail(genes_per_end))

selected_genes_with_duplicates = pd.concat(selected_gene_series).to_list()
len_before_dup_removal = len(selected_genes_with_duplicates)

# A set removes genes that were picked from more than one selection.
selected_columns = set(selected_genes_with_duplicates)
len_after_dup_removal = len(selected_columns)

number_of_duplicates = len_before_dup_removal - len_after_dup_removal
if number_of_duplicates > 0:
    print(f"Warning: {number_of_duplicates} duplicates were removed.")

# Extract gene names according to the format of gene_effect dataset:
# a column belongs to a gene when the text before its first space is the gene name.
selected_columns_gene_effect_format = [column for column in df_gene_effect.columns if column.split(" ")[0] in selected_columns]

# Check that every selected gene was found exactly once in the gene_effect dataset.
# Comparing lengths alone could pass when one gene is missing while another
# matches two columns, so missing genes are checked explicitly first.
matched_genes = [column.split(" ")[0] for column in selected_columns_gene_effect_format]
missing_genes = sorted(selected_columns - set(matched_genes), key=str)
assert not missing_genes, f"Selected genes without a gene_effect column: {missing_genes}"
assert len(matched_genes) == len(selected_columns), "Some selected genes match more than one gene_effect column."

# Extract columns from gene_effect dataset, keeping only complete rows.
# TODO: Is dropping rows with na a good idea? Maybe use mean value instead?
X_complete_rows = df_gene_effect[selected_columns_gene_effect_format].dropna()
X = X_complete_rows.to_numpy()
num_dropped_rows = df_gene_effect.shape[0] - X.shape[0]
if num_dropped_rows > 0:
    print(f"Warning: {num_dropped_rows} rows contained at least 1 NA and was dropped.")
X.shape

(808, 10)

## Data Analysis

In [22]:
# Fit the k = 2 factor model to X and keep only the second return value.
# NOTE(review): lambda_hat appears to be the estimated loading matrix, with one
# row per selected gene column and one column per factor (10 x 2 in the output
# shown below) — confirm against utils.factor_model_solution.
_, lambda_hat = factor_model_solution(X, k = 2)
lambda_hat

array([[-1.06906923e-01,  4.82579560e-01],
       [-7.40326542e-02,  4.97701977e-01],
       [-8.78979381e-01, -3.02807601e-02],
       [-1.96331464e-02,  7.07141756e-01],
       [ 6.92409717e-04,  4.14550271e-01],
       [-8.77315239e-01, -3.38317369e-02],
       [-8.56750502e-01, -3.04736539e-03],
       [-9.01991256e-01,  7.37227205e-03],
       [-8.41656080e-01, -6.24149132e-04],
       [-5.94276402e-02,  6.87247991e-01]])

In [23]:
# Transpose so that loadings[j] holds the loadings of every gene on factor j.
loadings = lambda_hat.T

# Gene name = the part of each gene_effect column name before the first space.
genes = [column.partition(" ")[0] for column in selected_columns_gene_effect_format]

# One row per gene, followed by one loading column per factor.
df_values = pd.DataFrame({'gene_evaluated': genes}).assign(
    loadings0=loadings[0],
    loadings1=loadings[1],
)

# Pathway-membership flags from each corr file, renamed up front so the two
# flag columns stay distinguishable once both are attached.
flag_columns = ['gene_evaluated', 'is_on_pathway']
dicer1_flags = df_correlations["DICER1"][flag_columns].rename(columns={"is_on_pathway": "is_on_pathway_DICER1"})
gart_flags = df_correlations["GART"][flag_columns].rename(columns={"is_on_pathway": "is_on_pathway_GART"})

# Left-merge on 'gene_evaluated' so every gene row is kept; a gene that is
# absent from a corr file gets NaN in that file's flag column.
merged_df = df_values.merge(dicer1_flags, on='gene_evaluated', how='left')
merged_df = merged_df.merge(gart_flags, on='gene_evaluated', how='left')

In [24]:
# Allow pandas to display wider tables before linebreaking.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

Sorted w.r.t. first factor:

In [25]:
# Order the genes from the highest to the lowest loading on the first factor.
ranked_by_loadings0 = merged_df.sort_values(by="loadings0", ascending=False)
print(ranked_by_loadings0)

  gene_evaluated  loadings0  loadings1  is_on_pathway_DICER1  is_on_pathway_GART
4         DROSHA   0.000692   0.414550                   1.0                 0.0
3         DICER1  -0.019633   0.707142                   1.0                 0.0
9           XPO5  -0.059428   0.687248                   1.0                 0.0
1        ANKRD52  -0.074033   0.497702                   1.0                 0.0
0           AGO2  -0.106907   0.482580                   1.0                 NaN
8           PFAS  -0.841656  -0.000624                   NaN                 1.0
6         MTHFD1  -0.856751  -0.003047                   NaN                 1.0
5           GART  -0.877315  -0.033832                   NaN                 1.0
2           ATIC  -0.878979  -0.030281                   NaN                 1.0
7          PAICS  -0.901991   0.007372                   NaN                 1.0


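Correctly finds the GART pathway: all five GART-pathway genes load strongly (negatively) on this factor. (Notice the steep drop-off in absolute loading from PFAS, $-0.84$, to AGO2, $-0.11$.)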

Sorted w.r.t. second factor:

In [26]:
# Order the genes from the highest to the lowest loading on the second factor.
ranked_by_loadings1 = merged_df.sort_values(by="loadings1", ascending=False)
print(ranked_by_loadings1)

  gene_evaluated  loadings0  loadings1  is_on_pathway_DICER1  is_on_pathway_GART
3         DICER1  -0.019633   0.707142                   1.0                 0.0
9           XPO5  -0.059428   0.687248                   1.0                 0.0
1        ANKRD52  -0.074033   0.497702                   1.0                 0.0
0           AGO2  -0.106907   0.482580                   1.0                 NaN
4         DROSHA   0.000692   0.414550                   1.0                 0.0
7          PAICS  -0.901991   0.007372                   NaN                 1.0
8           PFAS  -0.841656  -0.000624                   NaN                 1.0
6         MTHFD1  -0.856751  -0.003047                   NaN                 1.0
2           ATIC  -0.878979  -0.030281                   NaN                 1.0
5           GART  -0.877315  -0.033832                   NaN                 1.0


Correctly finds the DICER1 pathway. (Notice the somewhat significant drop-off in loading from DROSHA, $0.41$, to PAICS, $0.007$.)

## Goodness of Fit

In [27]:
# A p-value at or above this level is treated as "this many factors suffice".
significance_level = 0.05

# p-value of the goodness-of-fit test for the k = 2 model analysed above.
print(factor_goodness_of_fit_test(X, k = 2))

# Search for the smallest number of factors whose test is not rejected,
# trying k = 1, 2, ..., X.shape[1] - 1 in order.
previous_pvalue = None  # p-value of the k = i - 1 model from the previous iteration
for i in range(1, X.shape[1]):
    pvalue = factor_goodness_of_fit_test(X, k = i)
    if pvalue >= significance_level:
        print("Found sufficient factors:")
        print("FAC| PVAL")
        print(f" {i} | {pvalue.round(4)}")
        # Show the next-smaller model for comparison, reusing the p-value that
        # was already computed instead of fitting that model a second time.
        # For i == 1 there is no smaller model inside the search range, so the
        # comparison line is skipped rather than requesting a k = 0 model.
        if previous_pvalue is not None:
            print(f"({i - 1} | {previous_pvalue.round(4)})")
        break
    previous_pvalue = pvalue
else:
    # Reached only when the loop finished without hitting `break`.
    print(f"No number of factors below {X.shape[1]} reached a p-value >= {significance_level}.")

3.4422929504764765e-09
Found sufficient factors:
FAC| PVAL
 5 | 0.685
(4 | 0.0104)
