In [None]:
from pyspark.sql.functions import udf,lit,col
from pyspark.ml.linalg import SparseVector, VectorUDT
from pyspark.ml.feature import PCA
from pyspark.sql import SparkSession
# Create the Spark session for this job, or attach to one that is already running.
spark = SparkSession.builder \
    .appName("PCA-Cluster-Job")\
    .getOrCreate()

# spark.sql.files.maxPartitionBytes
#   Upper bound on the number of bytes Spark packs into a single input
#   partition when reading file-based sources such as Parquet. It is raised
#   here to 800 MB so that a partition may grow up to, but not beyond, 800 MB.
spark.conf.set("spark.sql.files.maxPartitionBytes", 800 * 1024 * 1024)

# spark.hadoop.fs.gs.inputstream.buffer.size
#   Hadoop GCS-connector setting: how many bytes are buffered per read when
#   streaming objects from gs:// paths. 1024 * 1024 bytes = 1 MB per fetch.
#   Intent: pull 1 MB at a time from the GCS API so fewer HTTP requests are
#   issued (helps with API rate limits) and overall read throughput improves,
#   especially for large files or high-latency links.
#   NOTE(review): the connector's default for this option differs between
#   connector versions -- confirm that 1 MB is really an increase on this cluster.
#   NOTE(review): this is a Hadoop-level option set at runtime on the session;
#   verify it is actually picked up, or set it when the cluster/session is created.
spark.conf.set("spark.hadoop.fs.gs.inputstream.buffer.size", 1024 * 1024)

In [None]:
# Load the expression table and the three metadata tables from GCS.
expression_df = spark.read.parquet("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/data/")
cell_line_df = spark.read.parquet("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/meta_data/cell_line_metadata.parquet")
drug_df = spark.read.parquet("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/meta_data/drug_metadata.parquet")
sample_df = spark.read.parquet("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/meta_data/sample_metadata.parquet")

# Keep only expression rows that have a matching cell line, sample and drug.
# Every selected column comes from the expression table, so the metadata tables
# act purely as filters. "left_semi" joins express exactly that: a left row is
# kept once if at least one match exists, no right-hand columns are added, and
# a metadata table with repeated keys can no longer multiply expression rows.
# NOTE(review): the 10% subset produced downstream was counted at 90,877,981
# rows, far more than 10% of a "100M" dataset would suggest -- consistent with
# the previous inner joins fanning rows out. Confirm key uniqueness in the
# metadata tables.
df = (
    expression_df.alias("expressions")
    .join(
        cell_line_df.alias("cell_line"),
        expression_df.cell_line_id == cell_line_df.Cell_ID_Cellosaur,
        "left_semi",
    )
    .join(sample_df.alias("sample"), "sample", "left_semi")
    .join(drug_df.alias("drug"), "drug", "left_semi")
    .select('genes', 'expressions', 'expressions.moa-fine')
)

                                                                                

In [None]:
# Map every distinct "moa-fine" value to the same sampling fraction (10%).
fractions = {
    row[0]: 0.1
    for row in df.select("moa-fine").distinct().collect()
}

# Draw the stratified sample (fixed seed) and persist it, replacing any
# previous copy of the subset.
(
    df.sampleBy(col("moa-fine"), fractions, seed=42)
    .write
    .format("parquet")
    .mode("overwrite")
    .save("gs://medical-data-for-project/SUBSET/")
)

                                                                                

In [None]:
# Reload the persisted stratified subset and report how many rows it holds.
stratified = spark.read.format("parquet").load("gs://medical-data-for-project/SUBSET/")
stratified.select("moa-fine").count()

                                                                                

90877981

In [None]:
# Peek at a single row to check the column layout (genes, expressions, moa-fine).
stratified.show(n=1)


[Stage 21:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------------+
|               genes|         expressions|          moa-fine|
+--------------------+--------------------+------------------+
|[1, 5, 11, 19, 25...|[-2.0, 1.0, 4.0, ...|Other TK inhibitor|
+--------------------+--------------------+------------------+
only showing top 1 row




                                                                                

In [None]:
def make_sparse_vector(genes, expressions):
    """Build a 63000-dimensional SparseVector from parallel index/value lists.

    Args:
        genes: iterable of vector indices, or None.
        expressions: iterable of values paired positionally with ``genes``,
            or None.

    Returns:
        A ``SparseVector`` of size 63000 whose non-zero entries are the
        ``genes[i] -> expressions[i]`` pairs; an all-zero vector when either
        argument is None.
    """
    if genes is not None and expressions is not None:
        entries = dict(zip(genes, expressions))
    else:
        # Missing input on either side yields an empty (all-zero) vector.
        entries = {}
    return SparseVector(63000, entries)


# Row-wise Spark UDF wrapper so the function can be applied to DataFrame columns.
make_sparse_vector_udf = udf(make_sparse_vector, returnType=VectorUDT())

In [None]:
# Turn each (genes, expressions) pair into one sparse "features" vector and
# keep only that column.
stratified_features = stratified.select(
    make_sparse_vector_udf("genes", "expressions").alias("features")
)

In [None]:
# Fit a 256-component PCA on the sparse feature vectors.
# Spark logs a RowMatrix warning for this fit: with 63000 columns it needs at
# least 31752 MB of memory, so the cluster must be sized accordingly.
pca = PCA(k=256, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(stratified_features)

# Persist the fitted model. Going through write().overwrite() makes the cell
# re-runnable: a plain save() raises if the target path already exists, while
# every other write in this notebook replaces its previous output.
pca_model.write().overwrite().save("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/pca_models/")

25/04/22 21:16:22 WARN RowMatrix: 63000 columns will require at least 31752 megabytes of memory!
[Stage 29:>                                                      (0 + 20) / 100]

In [None]:
from pyspark.sql.functions import udf,lit,col
from pyspark.ml.linalg import SparseVector, VectorUDT
from pyspark.ml.feature import PCA
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf
import pandas as pd



# Attach to the running session (or create one) under the job's app name.
spark = (
    SparkSession.builder
    .appName("PCA-Cluster-Job")
    .getOrCreate()
)

# Runtime settings: 800 MB cap per input partition, 1 MB GCS read buffer.
spark.conf.set("spark.sql.files.maxPartitionBytes", 800 * 1024 * 1024)
spark.conf.set("spark.hadoop.fs.gs.inputstream.buffer.size", 1048576)

# Reload the persisted stratified subset.
stratified = spark.read.format("parquet").load("gs://medical-data-for-project/SUBSET/")

# A pandas_udf cannot return VectorUDT: the UDT is stored as a struct, and the
# Arrow serializer then demands a pandas.DataFrame instead of a Series
# ("A field of type StructType expects a pandas.DataFrame, but got ... Series").
# A plain row-wise udf supports VectorUDT return values, so use that instead.
@udf(returnType=VectorUDT())
def make_sparse_vector_udf(genes, expressions):
    """Build a 63000-dimensional SparseVector for one row.

    Args:
        genes: list of vector indices for the row, or None.
        expressions: list of values paired positionally with ``genes``, or None.

    Returns:
        ``SparseVector(63000, {gene: expression, ...})``; an all-zero vector
        when either input is None.
    """
    if genes is None or expressions is None:
        # A null on either side yields an empty vector rather than a failed task.
        return SparseVector(63000, {})
    return SparseVector(63000, dict(zip(genes, expressions)))

# Derive the single-column "features" frame from the gene/expression lists.
stratified_features = stratified.select(
    make_sparse_vector_udf("genes", "expressions").alias("features")
)

# Persist the vectors so later cells can start from them; replaces any
# previous output at this path.
(
    stratified_features.write
    .format("parquet")
    .mode("overwrite")
    .save("gs://medical-data-for-project/features_subset/")
)

25/04/23 02:14:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/04/23 02:15:29 WARN TaskSetManager: Lost task 1.0 in stage 1.0 (TID 2) (cluster-c6a6-w-1.us-central1-a.c.excellent-math-456021-s0.internal executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 470, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 100, in dump_stream
    for batch in itera

25/04/23 02:15:40 WARN TaskSetManager: Lost task 7.0 in stage 1.0 (TID 8) (cluster-c6a6-w-1.us-central1-a.c.excellent-math-456021-s0.internal executor 4): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 470, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 100, in dump_stream
    for batch in iterator:
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 464, in init_stream_yield_ba

25/04/23 02:16:10 WARN TaskSetManager: Lost task 0.1 in stage 1.0 (TID 25) (cluster-c6a6-w-2.us-central1-a.c.excellent-math-456021-s0.internal executor 3): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to gs://medical-data-for-project/features_subset.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:775)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:493)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.sp

25/04/23 02:16:39 WARN TaskSetManager: Lost task 8.2 in stage 1.0 (TID 56) (cluster-c6a6-w-4.us-central1-a.c.excellent-math-456021-s0.internal executor 6): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 470, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 100, in dump_stream
    for batch in iterator:
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 464, in init_stream_yield_b

25/04/23 02:16:45 WARN TaskSetManager: Lost task 14.2 in stage 1.0 (TID 62) (cluster-c6a6-w-1.us-central1-a.c.excellent-math-456021-s0.internal executor 1): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 15 in stage 1.0 failed 4 times, most recent failure: Lost task 15.3 in stage 1.0 (TID 64) (cluster-c6a6-w-2.us-central1-a.c.excellent-math-456021-s0.internal executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 470, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 470, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 100, in dump_stream
    for batch in iterator:
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 464, in init_stream_yield_batches
    batch = self._create_batch(series)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 444, in _create_batch
    raise PySparkValueError(
pyspark.errors.exceptions.base.PySparkValueError: A field of type StructType expects a pandas.DataFrame, but got: <class 'pandas.core.series.Series'>


25/04/23 02:16:47 WARN TaskSetManager: Lost task 17.3 in stage 1.0 (TID 78) (cluster-c6a6-w-0.us-central1-a.c.excellent-math-456021-s0.internal executor 5): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 15 in stage 1.0 failed 4 times, most recent failure: Lost task 15.3 in stage 1.0 (TID 64) (cluster-c6a6-w-2.us-central1-a.c.excellent-math-456021-s0.internal executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 470, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "

25/04/23 02:16:47 WARN TaskSetManager: Lost task 11.3 in stage 1.0 (TID 79) (cluster-c6a6-w-0.us-central1-a.c.excellent-math-456021-s0.internal executor 5): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 15 in stage 1.0 failed 4 times, most recent failure: Lost task 15.3 in stage 1.0 (TID 64) (cluster-c6a6-w-2.us-central1-a.c.excellent-math-456021-s0.internal executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 470, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "

25/04/23 02:16:48 WARN TaskSetManager: Lost task 5.3 in stage 1.0 (TID 66) (cluster-c6a6-w-2.us-central1-a.c.excellent-math-456021-s0.internal executor 3): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 15 in stage 1.0 failed 4 times, most recent failure: Lost task 15.3 in stage 1.0 (TID 64) (cluster-c6a6-w-2.us-central1-a.c.excellent-math-456021-s0.internal executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 470, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/

25/04/23 02:16:48 WARN TaskSetManager: Lost task 1.3 in stage 1.0 (TID 67) (cluster-c6a6-w-2.us-central1-a.c.excellent-math-456021-s0.internal executor 2): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 15 in stage 1.0 failed 4 times, most recent failure: Lost task 15.3 in stage 1.0 (TID 64) (cluster-c6a6-w-2.us-central1-a.c.excellent-math-456021-s0.internal executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 470, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/

25/04/23 02:16:49 WARN TaskSetManager: Lost task 18.3 in stage 1.0 (TID 75) (cluster-c6a6-w-4.us-central1-a.c.excellent-math-456021-s0.internal executor 6): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 15 in stage 1.0 failed 4 times, most recent failure: Lost task 15.3 in stage 1.0 (TID 64) (cluster-c6a6-w-2.us-central1-a.c.excellent-math-456021-s0.internal executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 470, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "

In [None]:
# pyspark.ml.feature does not provide a TruncatedSVD class (the import raises
# ImportError). In Spark, a truncated SVD is available on the distributed
# RowMatrix via RowMatrix.computeSVD(k).
from pyspark.mllib.linalg.distributed import RowMatrix

ImportError: cannot import name 'TruncatedSVD' from 'pyspark.ml.feature' (/usr/lib/spark/python/pyspark/ml/feature.py)

In [None]:
# Load the previously written sparse feature vectors.
features = spark.read.format("parquet").load("gs://medical-data-for-project/features_subset/")


                                                                                

+--------------------+
|            features|
+--------------------+
|(63000,[1,11,20,5...|
+--------------------+
only showing top 1 row



### PCA: another failed attempt with RowMatrix (driver and worker memory issues)

In [None]:
# Fit a 256-component PCA on the reloaded feature vectors.
# Recorded outcome of this cell: Spark warned that 63000 columns need at least
# 31752 MB of memory, and the fit then failed because the aggregated object
# (15876252000 bytes) exceeded maxResultSize (1073741824 bytes).
pca = PCA(
    inputCol="features",
    outputCol="pca_features",
    k=256,
)
pca_model = pca.fit(features)

25/04/23 13:58:19 WARN RowMatrix: 63000 columns will require at least 31752 megabytes of memory!
25/04/23 14:25:48 WARN RowMatrix: 63000 columns will require at least 31752 megabytes of memory!


IllegalArgumentException: requirement failed: Cannot aggregate object of size 15876252000 Bytes, as it's bigger than maxResultSize (1073741824 Bytes)

In [None]:
# Persist the fitted PCA model (requires that a preceding fit succeeded and
# defined `pca_model`). write().overwrite() keeps the cell re-runnable: a
# plain save() raises when the target path already exists.
pca_model.write().overwrite().save("gs://medical-data-for-project/huggingface.co/datasets/vevotx/Tahoe-100M/resolve/main/pca_models/")


In [None]:
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.linalg import Vectors
from pyspark.mllib.linalg import Vectors as MLLibVectors

# RowMatrix lives in the older mllib API, so every ml.linalg vector in the
# "features" column is first converted to its mllib counterpart.
vector_rdd = (
    features.select("features")
    .rdd
    .map(lambda row: MLLibVectors.fromML(row.features))
)

# Ask the distributed matrix for the top 256 principal components.
# Recorded outcome of this cell: Spark again warned that 63000 columns need at
# least 31752 MB of memory, and the run was interrupted before it finished.
mat = RowMatrix(vector_rdd)
pc_matrix = mat.computePrincipalComponents(256)

25/04/23 14:46:53 WARN RowMatrix: 63000 columns will require at least 31752 megabytes of memory!
ERROR:root:KeyboardInterrupt while sending command.             (19 + 20) / 445]
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/miniconda3/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 