In [None]:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import SparseVector, VectorUDT
from pyspark.ml.feature import PCA

In [None]:
from pyspark.sql import SparkSession


# Attach to the already-running Spark session, or start one with default settings.
spark = SparkSession.builder.getOrCreate()

In [None]:
# List the contents of the kernel's current working directory.
%ls

[0m[01;36mbin[0m@       [01;34mdev[0m/     [01;34mhome[0m/   [01;34mlost+found[0m/  [01;34mopt[0m/   [01;34mrun[0m/   [01;34msys[0m/  [01;34mvar[0m/
[01;34mboot[0m/      [01;34metc[0m/     [01;36mlib[0m@    [01;34mmedia[0m/       [01;34mproc[0m/  [01;36msbin[0m@  [30;42mtmp[0m/
copyright  [01;34mhadoop[0m/  [01;36mlib64[0m@  [01;34mmnt[0m/         [01;34mroot[0m/  [01;34msrv[0m/   [01;34musr[0m/


In [None]:
# Root of the Tahoe-100M mirror in GCS; every table read below lives under this prefix.
DATASET_ROOT = "gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main"
METADATA_ROOT = f"{DATASET_ROOT}/meta_data"

# Expression table (directory of parquet files).
expression_df = spark.read.parquet(f"{DATASET_ROOT}/data/")

# Metadata tables (one parquet file each).
cell_line_df = spark.read.parquet(f"{METADATA_ROOT}/cell_line_metadata.parquet")
drug_df = spark.read.parquet(f"{METADATA_ROOT}/drug_metadata.parquet")
gene_df = spark.read.parquet(f"{METADATA_ROOT}/gene_metadata.parquet")
observation_df = spark.read.parquet(f"{METADATA_ROOT}/obs_metadata.parquet")
sample_df = spark.read.parquet(f"{METADATA_ROOT}/sample_metadata.parquet")


                                                                                

In [None]:
# Print the column names of every loaded table, one list per line, in load order.
for frame in (expression_df, cell_line_df, drug_df, gene_df, observation_df, sample_df):
    print(frame.columns)

['genes', 'expressions', 'drug', 'sample', 'BARCODE_SUB_LIB_ID', 'cell_line_id', 'moa-fine', 'canonical_smiles', 'pubchem_cid', 'plate']
['cell_name', 'Cell_ID_DepMap', 'Cell_ID_Cellosaur', 'Organ', 'Driver_Gene_Symbol', 'Driver_VarZyg', 'Driver_VarType', 'Driver_ProtEffect_or_CdnaEffect', 'Driver_Mech_InferDM', 'Driver_GeneType_DM']
['drug', 'targets', 'moa-broad', 'moa-fine', 'human-approved', 'clinical-trials', 'gpt-notes-approval', 'canonical_smiles', 'pubchem_cid']
['gene_symbol', 'ensembl_id', 'token_id']
['plate', 'BARCODE_SUB_LIB_ID', 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', '__index_level_0__']
['sample', 'plate', 'mean_gene_count', 'mean_tscp_count', 'mean_mread_count', 'mean_pcnt_mito', 'drug', 'drugname_drugconc']


In [None]:
# The metadata tables repeat columns that expression_df already carries
# (`moa-fine`, `canonical_smiles`, `pubchem_cid`, `drug`, `plate`), so joining them
# whole makes later column references fail with AMBIGUOUS_REFERENCE.
# Keep only the join key plus the columns actually needed from each metadata table.
cell_line_cols = cell_line_df.select(
    'Cell_ID_Cellosaur', 'cell_name', 'Organ', 'Driver_Gene_Symbol',
    'Driver_Mech_InferDM', 'Driver_GeneType_DM')
sample_cols = sample_df.select('sample', 'drugname_drugconc', 'mean_pcnt_mito')
drug_cols = drug_df.select('drug', 'moa-broad')

# `moa-fine` and `canonical_smiles` are taken from the expression table itself.
df = expression_df.join(cell_line_cols, expression_df.cell_line_id == cell_line_cols.Cell_ID_Cellosaur)\
    .join(sample_cols, "sample")\
    .join(drug_cols, "drug")\
    .select('genes', 'expressions', 'moa-fine', 'canonical_smiles', 'drug', 'sample', 'drugname_drugconc', 'moa-broad', 'cell_name', 'organ', 'Driver_Gene_Symbol', 'Driver_Mech_InferDM', 'Driver_GeneType_DM', 'mean_pcnt_mito')

AnalysisException: [AMBIGUOUS_REFERENCE] Reference `moa-fine` is ambiguous, could be: [`moa-fine`, `moa-fine`].

In [None]:
from pyspark.sql.functions import col

# Give every input table a short alias so that columns present in more than one
# table can be referenced unambiguously as "<alias>.<column>".
expr_df = expression_df.alias("expr")
cell_df = cell_line_df.alias("cell")
sample_df_ = sample_df.alias("sample")
drug_df_ = drug_df.alias("drug")

# (qualified source column, output column name), listed in output order.
projection = [
    col(source).alias(output)
    for source, output in [
        ("expr.genes", "genes"),
        ("expr.expressions", "expressions"),
        ("drug.`moa-fine`", "moa-fine"),
        ("drug.canonical_smiles", "canonical_smiles"),
        ("expr.drug", "drug"),
        ("expr.sample", "sample"),
        ("drug.`moa-broad`", "moa-broad"),
        ("cell.cell_name", "cell_name"),
        ("cell.organ", "organ"),
        ("cell.Driver_Gene_Symbol", "Driver_Gene_Symbol"),
        ("cell.Driver_Mech_InferDM", "Driver_Mech_InferDM"),
        ("cell.Driver_GeneType_DM", "Driver_GeneType_DM"),
        ("sample.drugname_drugconc", "drugname_drugconc"),
        ("sample.mean_pcnt_mito", "mean_pcnt_mito"),
    ]
]

# Join expression rows to cell-line, sample and drug metadata, then project.
df = (
    expr_df
    .join(cell_df, col("expr.cell_line_id") == col("cell.Cell_ID_Cellosaur"))
    .join(sample_df_, col("expr.sample") == col("sample.sample"))
    .join(drug_df_, col("expr.drug") == col("drug.drug"))
    .select(*projection)
)

In [None]:
# Preview the first 20 (moa-fine, Organ) pairs of the joined table.
df.select("moa-fine", "Organ").show(n=20)

[Stage 24:>                                                         (0 + 1) / 1]

+--------+--------+
|moa-fine|   Organ|
+--------+--------+
| unclear|   Bowel|
| unclear|   Bowel|
| unclear|   Bowel|
| unclear|   Bowel|
| unclear|   Bowel|
| unclear|   Bowel|
| unclear|  Breast|
| unclear|  Breast|
| unclear|  Breast|
| unclear|  Breast|
| unclear|  Breast|
| unclear|  Breast|
| unclear|  Breast|
| unclear|  Breast|
| unclear|  Breast|
| unclear|  Breast|
| unclear|  Breast|
| unclear|  Breast|
| unclear|Pancreas|
| unclear|Pancreas|
+--------+--------+
only showing top 20 rows



                                                                                

In [None]:
# Join expression rows to cell-line metadata on the Cellosaurus id and eyeball
# that both key columns agree on the first 10 rows.
cell_line_match = expression_df.cell_line_id == cell_line_df.Cell_ID_Cellosaur
df = expression_df.join(cell_line_df, on=cell_line_match)
df.select("cell_line_id", "Cell_ID_Cellosaur", "Organ").show(n=10)

                                                                                

+------------+-----------------+------+
|cell_line_id|Cell_ID_Cellosaur| Organ|
+------------+-----------------+------+
|   CVCL_0546|        CVCL_0546| Bowel|
|   CVCL_0546|        CVCL_0546| Bowel|
|   CVCL_0546|        CVCL_0546| Bowel|
|   CVCL_0546|        CVCL_0546| Bowel|
|   CVCL_0546|        CVCL_0546| Bowel|
|   CVCL_0546|        CVCL_0546| Bowel|
|   CVCL_0179|        CVCL_0179|Breast|
|   CVCL_0179|        CVCL_0179|Breast|
|   CVCL_0179|        CVCL_0179|Breast|
|   CVCL_0179|        CVCL_0179|Breast|
+------------+-----------------+------+
only showing top 10 rows



In [None]:
# List each distinct value of the Organ column.
organ_values = df.select("Organ").distinct()
organ_values.show()



+--------------------+
|               Organ|
+--------------------+
|                Skin|
|            Pancreas|
|              Uterus|
|           CNS/Brain|
|              Cervix|
|               Bowel|
|                Lung|
|   Esophagus/Stomach|
|               Liver|
|              Breast|
|Peripheral Nervou...|
|              Kidney|
|Bladder/Urinary T...|
+--------------------+



                                                                                

In [None]:
# Display the column names of `df` as it is currently bound in the kernel.
df.columns

['genes',
 'expressions',
 'drug',
 'sample',
 'BARCODE_SUB_LIB_ID',
 'cell_line_id',
 'moa-fine',
 'canonical_smiles',
 'pubchem_cid',
 'plate']

In [None]:
# Print the column names, types and nullability of the gene metadata table.
gene_df.printSchema()

root
 |-- gene_symbol: string (nullable = true)
 |-- ensembl_id: string (nullable = true)
 |-- token_id: long (nullable = true)



In [None]:
# Print the column names of the observation metadata table.
print(observation_df.columns)

['plate', 'BARCODE_SUB_LIB_ID', 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', '__index_level_0__']


In [None]:
# Count the distinct rows of the gene metadata table (all columns considered).
gene_df.distinct().count()

                                                                                

62710

In [None]:
def make_sparse_vector(genes, expressions, size=63000):
    """Build a SparseVector from parallel index / value sequences.

    Args:
        genes: sequence of integer positions to populate, or None.
        expressions: sequence of values, paired positionally with `genes`, or None.
        size: dimensionality of the resulting vector. Defaults to 63000, the
            value previously hard-coded here.
            NOTE(review): presumably chosen to exceed the largest gene token id
            (gene_df has 62710 distinct rows) -- confirm against the data.

    Returns:
        A SparseVector of length `size`. It is empty when either input is None.
        If the inputs differ in length, `zip` pairs them up to the shorter one;
        a repeated index keeps its last value (dict semantics).
    """
    if genes is None or expressions is None:
        return SparseVector(size, {})
    return SparseVector(size, dict(zip(genes, expressions)))


In [None]:
# Wrap the Python helper as a Spark UDF whose result column has the ML vector type.
make_sparse_vector_udf = udf(make_sparse_vector, returnType=VectorUDT())

In [None]:
# Add a "features" column: one sparse vector per row, built from the Genes
# (indices) and Expressions (values) array columns.
features_col = make_sparse_vector_udf("Genes", "Expressions")
df = df.withColumn("features", features_col)

In [None]:
# PCA estimator: reads "features" and writes its 256 components to "pca_features".
pca = PCA(
    k=256,
    inputCol="features",
    outputCol="pca_features",
)

In [None]:
# Fit PCA on the "features" column of `df`.
# NOTE: in the recorded run Spark warned that 63000 columns require at least
# 31752 MB of memory, and a YARN executor container was then killed (exit code 143).
pca_model = pca.fit(df)

25/04/17 23:16:09 WARN RowMatrix: 63000 columns will require at least 31752 megabytes of memory!
25/04/17 23:27:00 WARN YarnAllocator: Container from a bad node: container_1744839335032_0010_01_000010 on host: big-data-cluster-w-1.us-central1-a.c.excellent-math-456021-s0.internal. Exit status: 143. Diagnostics: [2025-04-17 23:27:00.202]Container killed on request. Exit code is 143
[2025-04-17 23:27:00.202]Container exited with a non-zero exit code 143. 
[2025-04-17 23:27:00.203]Killed by external signal
.
25/04/17 23:27:00 ERROR YarnScheduler: Lost executor 9 on big-data-cluster-w-1.us-central1-a.c.excellent-math-456021-s0.internal: Container from a bad node: container_1744839335032_0010_01_000010 on host: big-data-cluster-w-1.us-central1-a.c.excellent-math-456021-s0.internal. Exit status: 143. Diagnostics: [2025-04-17 23:27:00.202]Container killed on request. Exit code is 143
[2025-04-17 23:27:00.202]Container exited with a non-zero exit code 143. 
[2025-04-17 23:27:00.203]Killed by e

In [None]:
# Apply the fitted PCA model, appending the projected "pca_features" column.
df_pca = pca_model.transform(dataset=df)

In [None]:
# Print the first 10 projected vectors in full (no column truncation).
df_pca.select("pca_features").show(n=10, truncate=False)

In [None]:
# `columns` is a property that returns a list of names; calling it like a method
# raises "TypeError: 'list' object is not callable".
df_pca.columns

In [None]:
from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.mllib.linalg.distributed import RowMatrix

# RowMatrix belongs to the RDD-based pyspark.mllib API, while the "features" column
# holds pyspark.ml vectors (built with pyspark.ml.linalg.SparseVector). Convert each
# row with Vectors.fromML so the matrix receives mllib vectors.
rdd = df.select("features").rdd.map(lambda row: MLLibVectors.fromML(row['features']))

# Create a RowMatrix
mat = RowMatrix(rdd)

# Compute top-k SVD (no U if not needed)
svd = mat.computeSVD(k=50, computeU=False)

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.linalg import VectorUDT

import numpy as np

# Pull the V factor of the SVD to the driver as a NumPy array and ship one
# read-only copy to every executor.
right_singular_vectors = svd.V.toArray()
V = np.array(right_singular_vectors)  # shape: (original_dim, k)
V_broadcast = spark.sparkContext.broadcast(V)


@udf(ArrayType(DoubleType()))
def project_sparse(vec):
    """Project one feature vector onto the broadcast V matrix.

    The vector is densified first; the result is returned as a plain list of floats.
    """
    dense_row = vec.toArray()
    projected = np.dot(dense_row, V_broadcast.value)
    return projected.tolist()

In [None]:
# Append the SVD projection of "features" as an array column.
reduced_col = project_sparse("features")
df_reduced = df.withColumn("reduced_features", reduced_col)

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf

def to_dense_vector(arr):
    """Turn an array of numbers into a dense ML vector."""
    return Vectors.dense(arr)


# Register the helper as a UDF returning the ML vector type.
to_dense_vector = udf(to_dense_vector, VectorUDT())

# Store the reduced features as a vector column next to the array column.
reduced_vector_col = to_dense_vector("reduced_features")
df_reduced = df_reduced.withColumn("reduced_vector", reduced_vector_col)

### Failed: PCA attempts to reduce the 63k-dimensional gene features to 256 components

In [None]:

from pyspark.sql import SparkSession
import pandas as pd
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg import Vectors as MLLibVectors
import numpy as np



# Session for the cluster job.
spark = (
    SparkSession.builder
    .appName("PCA-Cluster-Job")
    .getOrCreate()
)

# Input partitions of up to 800 MiB; 1 MiB GCS input-stream buffer.
spark.conf.set("spark.sql.files.maxPartitionBytes", 800 * 1024 * 1024)
spark.conf.set("spark.hadoop.fs.gs.inputstream.buffer.size", 1048576)

# Load the feature vectors and convert each pyspark.ml vector to its
# pyspark.mllib counterpart, which is what RowMatrix works with.
features = spark.read.parquet("gs://medical-data-for-project/features_subset/")
feature_column = features.select("features")
vector_rdd = feature_column.rdd.map(lambda record: MLLibVectors.fromML(record[0]))

# Top 256 principal components of the row matrix.
mat = RowMatrix(vector_rdd)
pc_matrix = mat.computePrincipalComponents(256)

# Local matrix -> NumPy array, shape (num_features, k)
np_pc = np.array(pc_matrix.toArray())

# NumPy -> pandas -> Spark DataFrame
df_pc = pd.DataFrame(data=np_pc)
spark_df_pc = spark.createDataFrame(df_pc)

# Persist the components to GCS as parquet, replacing any previous output.
components_writer = spark_df_pc.write.mode("overwrite")
components_writer.parquet("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/pca_models/pca_components.parquet")

### Failed: PCA attempt to reduce the 63k-dimensional gene features to 256 components (cluster job)

In [None]:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import SparseVector, VectorUDT
from pyspark.ml.feature import PCA
from pyspark.sql import SparkSession
from pyspark import StorageLevel

# Create (or reuse) the Spark session for this job.
spark = (
    SparkSession.builder
    .appName("PCA-Cluster-Job")
    .getOrCreate()
)

# Cap the size of each file-scan partition at 800 MB.
spark.conf.set("spark.sql.files.maxPartitionBytes", 800 * 1024 * 1024)

# Optional: input-stream buffer size for the GCS connector (1 MiB).
spark.conf.set("spark.hadoop.fs.gs.inputstream.buffer.size", 1048576)

# Python UDF that assembles the sparse feature vector. This is a major
# performance bottleneck; a built-in alternative (possibly F.zip_with) was
# tried but did not work for this case.
def make_sparse_vector(genes, expressions):
    """Return a 63000-dimensional SparseVector mapping each gene index to its value.

    An empty vector is returned when either input is missing or empty.
    """
    if genes and expressions:
        index_to_value = dict(zip(genes, expressions))
        return SparseVector(63000, index_to_value)
    return SparseVector(63000, {})


make_sparse_vector_udf = udf(make_sparse_vector, returnType=VectorUDT())

# Expression table: keep only the join keys and the two array columns, then
# spread the rows over 1500 partitions (tune to the cluster).
expression_df = (
    spark.read.parquet("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/data/")
    .select("sample", "drug", "cell_line_id", "genes", "expressions")
    .repartition(1500)
)

# Metadata tables: only the join key of each is needed; cache them.
cell_line_df = (
    spark.read.parquet("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/meta_data/cell_line_metadata.parquet")
    .select("Cell_ID_Cellosaur")
    .cache()
)

drug_df = (
    spark.read.parquet("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/meta_data/drug_metadata.parquet")
    .select("drug")
    .cache()
)

sample_df = (
    spark.read.parquet("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/meta_data/sample_metadata.parquet")
    .select("sample")
    .cache()
)

# Inner-join expression rows against each metadata key (nothing runs until an
# action is triggered), build the feature vector and keep only that column.
cell_line_match = expression_df.cell_line_id == cell_line_df.Cell_ID_Cellosaur
df = (
    expression_df.alias("expressions")
    .join(cell_line_df.alias("cell_line"), cell_line_match, "inner")
    .join(sample_df.alias("sample"), "sample")
    .join(drug_df.alias("drug"), "drug")
    .withColumn("features", make_sparse_vector_udf("genes", "expressions"))
    .select("features")
)

# Redistribute into 1500 partitions and keep the result on disk only, to avoid
# memory pressure.
df = df.repartition(1500).persist(StorageLevel.DISK_ONLY)

# Fit a 256-component PCA on the persisted feature vectors.
pca = PCA(
    k=256,
    inputCol="features",
    outputCol="pca_features",
)
pca_model = pca.fit(df)

# Write the fitted model to GCS.
model_path = "gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/pca_models/"
pca_model.save(model_path)
