In [1]:
import pandas as pd
import numpy as np

import sys
import os

# Make the parent of the notebook's working directory importable, so that the
# `utils` module living there can be imported below.
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Prepend only once, so re-running this cell does not grow sys.path.
already_importable = parent_dir in sys.path
if not already_importable:
    sys.path.insert(0, parent_dir)

# Now you can import your module
from utils import factor_model_solution, factor_goodness_of_fit_test

*This notebook performs factor analysis with k = 2 factors, trying to identify the two pathways in which DICER1 and GART respectively participate.*
___
# $k = 2$ factors | GART & DICER1 pathways

## Loading and preparing data

See the `k1_[GENE]_initial_analysis.ipynb` notebooks for more intermediate steps than this notebook provides.

In [2]:
# Gene-effect table; its column names start with the gene symbol followed by a space
# (this is what the column matching further down relies on).
df_gene_effect = pd.read_csv("preprocessed_data/20Q4v2_Achilles_gene_effect.csv")

# One semicolon-separated correlation table per target gene, keyed by that gene.
corr_paths = {
    "DICER1": "preprocessed_data/corr_DICER1_preprocessed.csv",
    "GART": "preprocessed_data/corr_GART_preprocessed.csv",
}
df_correlations = {
    target_gene: pd.read_csv(corr_path, sep=";")
    for target_gene, corr_path in corr_paths.items()
}

In [3]:
# Gene budget: the first number_of_genes // 2 rows of each corr file are selected, so
# with the two files used here that is number_of_genes genes before de-duplication.
# NOTE(review): presumably each corr file is sorted so its first rows are the most
# strongly correlated genes — confirm against the preprocessing step.
# Note that the same gene may be selected in both corr files. (But will only appear once in selected_columns.)
number_of_genes = 10
assert number_of_genes % 2 == 0, "number_of_genes should be an even number."
genes_per_file = number_of_genes // 2

# Keep the raw picks (with possible repeats) so duplicates can be counted exactly.
picked_genes = pd.concat([df_correlations["DICER1"]["gene_evaluated"][:genes_per_file],
                          df_correlations["GART"]["gene_evaluated"][:genes_per_file]]).to_list()
selected_columns = set(picked_genes)

# Duplicates = genes actually picked minus distinct genes. Comparing against
# number_of_genes * (number of corr files) would over-count, because only
# number_of_genes // 2 genes are taken from each file.
number_of_duplicates = len(picked_genes) - len(selected_columns)
if number_of_duplicates > 0:
    print(f"Warning: {number_of_duplicates} duplicates were removed.")

# Extract gene names according to the format of gene_effect dataset
# (the gene symbol is the part of the column name before the first space).
selected_columns_gene_effect_format = [column for column in df_gene_effect.columns if column.split(" ")[0] in selected_columns]

# Rough automatic check for whether we found all corresponding columns in gene_effect dataset that we've selected.
assert len(selected_columns) == len(selected_columns_gene_effect_format), (
    f"Selected {len(selected_columns)} genes but matched "
    f"{len(selected_columns_gene_effect_format)} gene_effect columns."
)

# Extract columns from gene_effect dataset, keeping only rows without missing values.
# TODO: Is dropping rows with NA a good idea? Maybe use mean value instead?
X = df_gene_effect[selected_columns_gene_effect_format].dropna().to_numpy()
num_dropped_rows = df_gene_effect.shape[0] - X.shape[0]
if num_dropped_rows > 0:
    print(f"Warning: {num_dropped_rows} rows contained at least 1 NA and was dropped.")
X.shape



(808, 10)

In [7]:
num_random_cols = 10

# Fixed seed so the noise columns, and every loading computed from them, are the
# same after Restart Kernel -> Run All. Without a seed the results change on every run.
RANDOM_SEED = 0
rng = np.random.default_rng(RANDOM_SEED)

# Append num_random_cols columns of uniform [0, 1) noise to the right of the gene
# columns; these columns are unrelated to any gene.
random_cols = rng.random(size=(X.shape[0], num_random_cols))
X_with_random_noise = np.hstack([X, random_cols])

X_with_random_noise.shape

(808, 20)

## Data Analysis

In [8]:
# Fit a 2-factor model to the gene columns plus the appended noise columns.
# Only the second return value is kept; the cell output shows it as a 20 x 2 array,
# i.e. one row per column of X_with_random_noise and one column per factor
# (presumably the estimated loading matrix — confirm against utils.factor_model_solution).
_, lambda_hat = factor_model_solution(X_with_random_noise, k = 2)
lambda_hat

array([[ 1.06786856e-01,  4.82073211e-01],
       [ 7.45454913e-02,  4.99768614e-01],
       [ 8.78890010e-01, -3.02020457e-02],
       [ 1.96163235e-02,  7.03072259e-01],
       [-9.85224945e-04,  4.16073523e-01],
       [ 8.77324111e-01, -3.31299992e-02],
       [ 8.56976637e-01, -4.24513131e-03],
       [ 9.01837380e-01,  6.95499724e-03],
       [ 8.41705070e-01, -2.31661078e-04],
       [ 5.95322504e-02,  6.89565337e-01],
       [ 2.53879415e-02, -1.93237299e-02],
       [-1.50431709e-02, -1.05667403e-02],
       [-5.83322601e-02,  6.22107860e-04],
       [-7.37902732e-02, -1.46269668e-02],
       [-5.55835238e-03,  1.57500099e-02],
       [-4.64881436e-02,  6.61312756e-02],
       [-2.56087486e-02, -1.19294904e-02],
       [ 1.37208400e-02,  9.60018973e-03],
       [-5.54253090e-03, -8.49314510e-03],
       [ 3.49864414e-02,  8.41883828e-02]])

In [11]:
# Row labels, in the same order as the columns of X_with_random_noise: the gene
# symbol (the text before the first space of each gene_effect column name) for the
# real columns, followed by one label per appended noise column.
gene_symbols = [name.split(" ")[0] for name in selected_columns_gene_effect_format]
noise_labels = [str(idx) + " random col" for idx in range(num_random_cols)]
genes = gene_symbols + noise_labels

# Transposed so that loadings[j] holds the loadings of every variable on factor j.
loadings = lambda_hat.T

# One row per variable with its loading on each of the two factors.
df_values = pd.DataFrame({
    'gene_evaluated': genes,
    'loadings0': loadings[0],
    'loadings1': loadings[1],
})

# Attach each correlation file's is_on_pathway flag under a pathway-specific column
# name. Left joins on 'gene_evaluated' keep every row of df_values; labels that do
# not occur in a correlation file (e.g. the noise columns) get NaN.
pathway_flag_columns = {"DICER1": "is_on_pathway_DICER1", "GART": "is_on_pathway_GART"}
merged_df = df_values
for pathway, flag_column in pathway_flag_columns.items():
    pathway_flags = (df_correlations[pathway][['gene_evaluated', 'is_on_pathway']]
                     .rename(columns={"is_on_pathway": flag_column}))
    merged_df = merged_df.merge(pathway_flags, on='gene_evaluated', how='left')

In [12]:
# Allow pandas to display wider tables before linebreaking.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

Sorted w.r.t. first factor:

In [13]:
# Rows ordered from the largest to the smallest loading on the first factor.
ranked_by_factor0 = merged_df.sort_values(by="loadings0", ascending=False)
print(ranked_by_factor0)

   gene_evaluated  loadings0  loadings1  is_on_pathway_DICER1  is_on_pathway_GART
7           PAICS   0.901837   0.006955                   NaN                 1.0
2            ATIC   0.878890  -0.030202                   NaN                 1.0
5            GART   0.877324  -0.033130                   NaN                 1.0
6          MTHFD1   0.856977  -0.004245                   NaN                 1.0
8            PFAS   0.841705  -0.000232                   NaN                 1.0
0            AGO2   0.106787   0.482073                   1.0                 NaN
1         ANKRD52   0.074545   0.499769                   1.0                 0.0
9            XPO5   0.059532   0.689565                   1.0                 0.0
19   9 random col   0.034986   0.084188                   NaN                 NaN
10   0 random col   0.025388  -0.019324                   NaN                 NaN
3          DICER1   0.019616   0.703072                   1.0                 0.0
17   7 random co

Correctly finds the GART pathway. (Notice the steep drop-off in loading from PFAS to AGO2)

Sorted w.r.t. second factor:

In [14]:
# Rows ordered from the largest to the smallest loading on the second factor.
ranked_by_factor1 = merged_df.sort_values(by="loadings1", ascending=False)
print(ranked_by_factor1)

   gene_evaluated  loadings0  loadings1  is_on_pathway_DICER1  is_on_pathway_GART
3          DICER1   0.019616   0.703072                   1.0                 0.0
9            XPO5   0.059532   0.689565                   1.0                 0.0
1         ANKRD52   0.074545   0.499769                   1.0                 0.0
0            AGO2   0.106787   0.482073                   1.0                 NaN
4          DROSHA  -0.000985   0.416074                   1.0                 0.0
19   9 random col   0.034986   0.084188                   NaN                 NaN
15   5 random col  -0.046488   0.066131                   NaN                 NaN
14   4 random col  -0.005558   0.015750                   NaN                 NaN
17   7 random col   0.013721   0.009600                   NaN                 NaN
7           PAICS   0.901837   0.006955                   NaN                 1.0
12   2 random col  -0.058332   0.000622                   NaN                 NaN
8            PFA

Correctly finds the DICER1 pathway. (Notice the fairly large drop-off in loading from DROSHA to the first random column.)

## Goodness of Fit

In [12]:
# p-value of the goodness-of-fit test for a 2-factor model.
# NOTE(review): the test is run on X (gene columns only), whereas the loadings above
# were fitted on X_with_random_noise — confirm this is intended.
print(factor_goodness_of_fit_test(X, k = 2))

# Scan k = 1, 2, ... and report the first k whose p-value reaches the significance
# level, together with the p-value of the preceding k for comparison.
# NOTE(review): if factor_goodness_of_fit_test is the usual likelihood-ratio test, its
# degrees of freedom ((p - k)^2 - (p + k)) / 2 are <= 0 for k >= 6 when p = 10, so
# p-values for such k may not be meaningful — confirm against utils.
significance_level = 0.05
previous_pvalue = None  # p-value for k = i - 1; reused instead of re-running the test
for i in range(1, X.shape[1]):
    pvalue = factor_goodness_of_fit_test(X, k = i)
    if pvalue >= significance_level:
        print("Found sufficient factors:")
        print("FAC| PVAL")
        print(f" {i} | {pvalue.round(4)}")
        # There is no preceding model when k = 1 already suffices, so do not
        # request a k = 0 fit in that case.
        if previous_pvalue is not None:
            print(f"({i - 1} | {previous_pvalue.round(4)})")
        break
    previous_pvalue = pvalue
else:
    # Make the negative outcome explicit instead of printing nothing.
    print(f"No number of factors below {X.shape[1]} reached p >= {significance_level}.")

6.067248907679525e-24
Found sufficient factors:
FAC| PVAL
 7 | 0.3449
(6 | 0.0497)
