In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import pyspark.sql.functions as F
# Keep the number of input partitions low. The data was initially read as
# ~3300 partitions of ~70 MB each; the configuration below reduces that to 417.
# Working with 3300 partitions took about 5x more compute time because of
# heavy network I/O.
# NOTE(review): the original note also said this "led to less workers being
# shut down" -- the intended meaning is unclear; confirm.
spark = (SparkSession.builder
    .appName("ControlPartitionSize")
    # 734003200 bytes = 700 MiB per input partition when reading files.
    .config("spark.sql.files.maxPartitionBytes", 734003200)
    # Number of partitions used for shuffles.
    # NOTE(review): the earlier comment on this line claimed the data is read
    # in 1 MB chunks because of a rate limit on GCP buckets for trial accounts;
    # neither setting in this builder is 1 MB -- confirm or drop that claim.
    .config("spark.sql.shuffle.partitions", 100)
    .getOrCreate())

25/05/02 14:00:54 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
# Load the feature table (Parquet) from the GCS bucket.
df = spark.read.parquet("gs://bigdata_27/features/")

                                                                                

In [None]:
# Inspect the column names and types of the loaded table.
df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- canonical_smiles: string (nullable = true)
 |-- drug: string (nullable = true)
 |-- moa-fine: string (nullable = true)



In [None]:
# Look up the driver memory setting of the running Spark context.
spark.sparkContext.getConf().get("spark.driver.memory")

'4096m'

25/04/30 22:41:17 WARN YarnAllocatorNodeHealthTracker: No available nodes reported, please check Resource Manager.
25/04/30 22:41:20 WARN YarnAllocatorNodeHealthTracker: No available nodes reported, please check Resource Manager.
25/04/30 22:41:23 WARN YarnAllocatorNodeHealthTracker: No available nodes reported, please check Resource Manager.
25/04/30 22:41:26 WARN YarnAllocatorNodeHealthTracker: No available nodes reported, please check Resource Manager.
25/04/30 22:41:29 WARN YarnAllocatorNodeHealthTracker: No available nodes reported, please check Resource Manager.
25/04/30 22:41:32 WARN YarnAllocatorNodeHealthTracker: No available nodes reported, please check Resource Manager.
25/04/30 22:41:35 WARN YarnAllocatorNodeHealthTracker: No available nodes reported, please check Resource Manager.
25/04/30 22:41:38 WARN YarnAllocatorNodeHealthTracker: No available nodes reported, please check Resource Manager.
25/04/30 22:41:41 WARN YarnAllocatorNodeHealthTracker: No available nodes report

In [None]:
from pyspark.ml.feature import VarianceThresholdSelector

# Remove low-variance entries from the "features" vector column and store the
# surviving ones in "selected_features".
selector = VarianceThresholdSelector(
    featuresCol="features",         # input vector column
    outputCol="selected_features",
    varianceThreshold=0.1       # keep only features with variance > 0.1
)
# fit() learns which features to keep from df; transform() adds the reduced column.
selected_df = selector.fit(df).transform(df)

                                                                                

In [None]:
# Persist the reduced table (without the original "features" column) to GCS,
# replacing whatever the target directory held before.
(
    selected_df
    .drop("features")
    .write
    .mode("overwrite")
    .parquet("gs://bigdata_27/scaled_features/")
)

                                                                                

In [None]:
# Read the reduced table back from GCS and preview a single row.
# NOTE(review): the directory is called "scaled_features" although the visible
# code only applies variance-based selection, not scaling -- confirm the naming.
scaled_df = spark.read.parquet("gs://bigdata_27/scaled_features/")
scaled_df.show(1)

                                                                                

+--------------------+-----------+--------+--------------------+
|    canonical_smiles|       drug|moa-fine|   selected_features|
+--------------------+-----------+--------+--------------------+
|CC(C)NC(=O)COC1=C...|Belumosudil| unclear|(6256,[26,35,36,4...|
+--------------------+-----------+--------+--------------------+
only showing top 1 row



In [None]:
# Count the rows of the reduced table.
scaled_df.count()

                                                                                

95624334

In [None]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

#df_100 = spark.read.parquet("gs://bigdata_27/scaled_features/part-02493-aae67da7-d1d8-4b34-9535-493843b5f419-c000.snappy.parquet")

In [None]:
# NOTE(review): df_100 is only assigned by the commented-out read in the
# previous cell, so on a fresh kernel this line raises NameError -- restore
# that read or remove this cell.
df_100.count()

                                                                                

29956

In [None]:
# Fit a PCA that projects "selected_features" onto 256 components (tune k as
# needed), trained on the full reduced table.
pca = PCA(inputCol="selected_features", outputCol="pca_features", k=256)
pca_model = pca.fit(scaled_df)

# Store the fitted model on GCS, replacing any previously saved one.
pca_model.write().overwrite().save("gs://bigdata_27/pca_models/")

ERROR:root:KeyboardInterrupt while sending command.                 (0 + 1) / 1]
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/miniconda3/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Stratified sample: every distinct "moa-fine" value gets the same 1% fraction.
fractions = {
    row["moa-fine"]: 0.01
    for row in scaled_df.select("moa-fine").distinct().collect()
}
# Fixed seed for a repeatable sample; coalesce to 20 partitions afterwards.
sampled_df = scaled_df.sampleBy("moa-fine", fractions, seed=42).coalesce(20)

                                                                                

In [None]:
# Count the rows of the sampled subset.
sampled_df.count()

                                                                                

954884

In [None]:
# Persist the sampled subset to GCS, replacing any previous contents.
(
    sampled_df
    .write
    .mode("overwrite")
    .parquet("gs://bigdata_27/one_percent_subset/")
)

                                                                                

In [None]:
# Reload the persisted subset from GCS; sampled_df now refers to the stored
# files instead of the sampling plan built earlier.
sampled_df = spark.read.parquet("gs://bigdata_27/one_percent_subset/")

[Stage 9:>                                                          (0 + 1) / 1]                                                                                

In [None]:
# Mark the subset for caching so that repeated actions on it can reuse it.
sampled_df.cache()

DataFrame[canonical_smiles: string, drug: string, moa-fine: string, selected_features: vector]

In [None]:
# Preview the first ten rows of the subset.
sampled_df.show(10)

[Stage 12:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+
|    canonical_smiles|                drug|            moa-fine|   selected_features|
+--------------------+--------------------+--------------------+--------------------+
|CC1C(C(CC(O1)OC2C...|Epirubicin (hydro...|DNA synthesis/rep...|(6256,[0,34,69,78...|
|CC1C(C(CC(O1)OC2C...|Epirubicin (hydro...|DNA synthesis/rep...|(6256,[2,9,12,21,...|
|CC1C(C(CC(O1)OC2C...|Epirubicin (hydro...|DNA synthesis/rep...|(6256,[0,3,4,8,10...|
|CC1C(C(CC(O1)OC2C...|Epirubicin (hydro...|DNA synthesis/rep...|(6256,[1,4,8,19,2...|
|CC1C(C(CC(O1)OC2C...|Epirubicin (hydro...|DNA synthesis/rep...|(6256,[6,9,25,42,...|
|C1=CC=C(C=C1)C2(C...|  Phenytoin (sodium)|             unclear|(6256,[5,7,8,24,2...|
|C1=CC=C(C=C1)C2(C...|  Phenytoin (sodium)|             unclear|(6256,[9,32,45,50...|
|C1=CC=C(C=C1)C2(C...|  Phenytoin (sodium)|             unclear|(6256,[5,12,25,34...|
|CCCCCCCCCCCC(CC1C...|            Orlistat|           

                                                                                

In [None]:
# NOTE(review): this repeats the earlier write of selected_df to the same
# directory. scaled_df is read from that directory, so re-running this cell
# rewrites the files it depends on -- confirm whether the cell is still needed.
selected_df.drop("features").write \
    .mode("overwrite") \
    .parquet("gs://bigdata_27/scaled_features/")

In [None]:
# Fit a 256-component PCA on "selected_features" (tune k as needed), this time
# trained on the sampled subset only.
pca = PCA(inputCol="selected_features", outputCol="pca_features", k=256)
pca_model = pca.fit(sampled_df)

# Store the fitted model on GCS, replacing any previously saved one.
pca_model.write().overwrite().save("gs://bigdata_27/pca_models/")

25/05/01 15:52:17 WARN TaskSetManager: Stage 19 contains a task of very large size (12780 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [None]:
import pyspark.sql.functions as F

In [None]:
from pyspark.ml.feature import PCAModel


# Reload the fitted PCA model that was saved to GCS.
pca_model = PCAModel.load("gs://bigdata_27/pca_models/")

                                                                                

In [None]:
# Project the subset with the fitted PCA. Dropping "pca_features" first is a
# no-op on the first run and keeps the cell re-runnable: without it, a second
# run fails because the output column already exists on sampled_df.
sampled_df = pca_model.transform(sampled_df.drop("pca_features"))

In [None]:
# Preview one projected vector.
sampled_df.select("pca_features").show(1)

25/05/01 16:04:16 WARN DAGScheduler: Broadcasting large task binary with size 12.3 MiB
[Stage 28:>                                                         (0 + 1) / 1]

+--------------------+
|        pca_features|
+--------------------+
|[-37.014480892292...|
+--------------------+
only showing top 1 row



                                                                                

In [None]:
# Report the dimensionality of the PCA output by inspecting a single row.
first_row = sampled_df.select("pca_features").head()
# len() yields the number of components; the previous code used type() and
# therefore printed the vector's class instead of its length.
vector_length = len(first_row["pca_features"])
print(f"PCA vector length: {vector_length}")

25/05/01 16:06:59 WARN DAGScheduler: Broadcasting large task binary with size 12.3 MiB
[Stage 30:>                                                         (0 + 1) / 1]

PCA vector length: <class 'pyspark.ml.linalg.DenseVector'>


                                                                                

In [None]:
# Apply the PCA to the full reduced table, discard the pre-PCA vector column
# and persist the result to GCS (replacing previous contents).
(
    pca_model
    .transform(scaled_df)
    .drop("selected_features")
    .write
    .mode("overwrite")
    .parquet("gs://bigdata_27/transformed_data/")
)

25/05/01 16:22:39 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
                                                                                

In [None]:
# Read the PCA-transformed table back from GCS and preview a single row.
transformed_df = spark.read.parquet("gs://bigdata_27/transformed_data/")
transformed_df.show(1)

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+
|    canonical_smiles|                drug|            moa-fine|        pca_features|
+--------------------+--------------------+--------------------+--------------------+
|CNCC(C1=CC(=CC=C1...|Phenylephrine (hy...|Adrenoceptor agonist|[-30.062736918028...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



                                                                                

In [None]:
# Per-class sampling fractions: every distinct "moa-fine" value maps to 0.95.
fractions = {
    row["moa-fine"]: 0.95
    for row in transformed_df.select("moa-fine").distinct().collect()
}

25/05/01 19:22:53 WARN TaskSetManager: Lost task 0.0 in stage 38.0 (TID 3754) (cluster-f229-w-0.us-central1-a.c.big-data-attempt-2-457220.internal executor 63): org.apache.spark.SparkFileNotFoundException: File not found: gs://bigdata_27/transformed_data/part-00015-ce27b2b4-6888-415f-957e-3836700542d2-c000.snappy.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:781)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:220)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:279)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD

[Stage 38:>                                                       (0 + 2) / 370]25/05/01 19:22:53 ERROR TaskSetManager: Task 1 in stage 38.0 failed 4 times; aborting job
25/05/01 19:22:53 WARN TaskSetManager: Lost task 0.2 in stage 38.0 (TID 3761) (cluster-f229-w-0.us-central1-a.c.big-data-attempt-2-457220.internal executor 63): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 1 in stage 38.0 failed 4 times, most recent failure: Lost task 1.3 in stage 38.0 (TID 3762) (cluster-f229-w-0.us-central1-a.c.big-data-attempt-2-457220.internal executor 63): org.apache.spark.SparkFileNotFoundException: File not found: gs://bigdata_27/transformed_data/part-00020-ce27b2b4-6888-415f-957e-3836700542d2-c000.snappy.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.rea

Py4JJavaError: An error occurred while calling o248.javaToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 38.0 failed 4 times, most recent failure: Lost task 1.3 in stage 38.0 (TID 3762) (cluster-f229-w-0.us-central1-a.c.big-data-attempt-2-457220.internal executor 63): org.apache.spark.SparkFileNotFoundException: File not found: gs://bigdata_27/transformed_data/part-00020-ce27b2b4-6888-415f-957e-3836700542d2-c000.snappy.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:781)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:220)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:279)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:656)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.hashAgg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:142)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:96)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkFileNotFoundException: File not found: gs://bigdata_27/transformed_data/part-00020-ce27b2b4-6888-415f-957e-3836700542d2-c000.snappy.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:781)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:220)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:279)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:656)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.hashAgg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:142)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:96)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
# Write the stratified sample to its own directory. transformed_df is read
# lazily from gs://bigdata_27/transformed_data/, so overwriting that same
# directory removes the input files while they are still needed, which
# surfaces as SparkFileNotFoundException and loses the source data.
# NOTE(review): pick the final target name to match whatever reads this sample.
(
    transformed_df
    .sampleBy("moa-fine", fractions, seed=42)
    .write
    .mode("overwrite")
    .parquet("gs://bigdata_27/transformed_data_sampled/")
)

                                                                                

In [None]:
# OneHotEncoder is imported here as well: the encoders list below needs it and
# no earlier cell brings it into scope.
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline

# String columns that need a numeric representation.
string_cols = ["canonical_smiles","drug","moa-fine"]

# One StringIndexer per column, writing to "<column>_indexed".
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_indexed")
    for name in string_cols
]
# One OneHotEncoder per indexed column, writing to "<column>_encoded".
encoders = [
    OneHotEncoder(inputCol=name + "_indexed", outputCol=name + "_encoded")
    for name in string_cols
]


# Stage lists prepared for two model families; only the indexers are run below.
logistic_stages = indexers + encoders
tree_stages = indexers

# Fit the indexers on df and add the indexed columns to it.
pipeline = Pipeline(stages=indexers)
model = pipeline.fit(df)
indexed_df = model.transform(df)


In [None]:
# Load the training subset and the test set from GCS.
# NOTE(review): neither directory is written by any cell visible in this
# notebook -- confirm how they were produced.
train_df =spark.read.parquet("gs://bigdata_27/ten_percent_subset/")
test_df = spark.read.parquet("gs://bigdata_27/test_data/")

                                                                                

In [None]:
from pyspark.sql.functions import col, sum as _sum, when

# Count the nulls in each column of train_df: a row contributes 1 when the
# value is null and 0 otherwise, and the per-column sum keeps the column name.
null_counts = train_df.select(*[
    _sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in train_df.columns
])

null_counts.show()

                                                                                

+----------------+----+--------+------------+
|canonical_smiles|drug|moa-fine|pca_features|
+----------------+----+--------+------------+
|               0|   0|       0|           0|
+----------------+----+--------+------------+



In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier


# Identifier columns that are index-encoded and fed to the forest together
# with the PCA vector.
# NOTE(review): these columns identify the compound; see the "Data Leakage"
# section further down before trusting scores from models that use them.
categorical_cols = ["canonical_smiles","drug"]
indexed_cols = [f"{name}_indexed" for name in categorical_cols]

# One StringIndexer per identifier column, writing to "<column>_indexed".
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_indexed")
    for name in categorical_cols
]

# Combine the indexed identifiers and the PCA vector into one "features" column.
feature_cols = indexed_cols + ["pca_features"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Encode the string label "moa-fine" as the numeric "label_index".
label_indexer = StringIndexer(inputCol="moa-fine", outputCol="label_index")

# An explicit seed makes the trained forest (and the metrics reported from it)
# repeatable across kernel restarts; 42 matches the seed used for sampling.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label_index",
    maxBins=512,
    numTrees=100,
    seed=42,
)

pipeline = Pipeline(stages=indexers + [assembler, label_indexer, rf])

# Fit every stage on the training subset.
model = pipeline.fit(train_df)


25/05/01 19:46:42 WARN DAGScheduler: Broadcasting large task binary with size 1210.5 KiB
25/05/01 19:48:51 WARN DAGScheduler: Broadcasting large task binary with size 1275.5 KiB
25/05/01 19:49:32 WARN DAGScheduler: Broadcasting large task binary with size 1309.8 KiB
25/05/01 19:49:58 WARN DAGScheduler: Broadcasting large task binary with size 1331.7 KiB
25/05/01 19:50:15 WARN DAGScheduler: Broadcasting large task binary with size 1345.7 KiB
25/05/01 19:50:25 WARN DAGScheduler: Broadcasting large task binary with size 1346.4 KiB
25/05/01 19:50:36 WARN DAGScheduler: Broadcasting large task binary with size 1329.6 KiB
25/05/01 19:50:51 WARN DAGScheduler: Broadcasting large task binary with size 1387.4 KiB
25/05/01 19:51:03 WARN DAGScheduler: Broadcasting large task binary with size 1345.0 KiB
25/05/01 19:51:18 WARN DAGScheduler: Broadcasting large task binary with size 1351.8 KiB
25/05/01 19:51:39 WARN DAGScheduler: Broadcasting large task binary with size 1359.7 KiB
25/05/01 19:51:50 WAR

In [None]:
# Run the fitted pipeline on the held-out test set.
predictions = model.transform(test_df) # use test_df here

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Score the predictions: accuracy first, then F1 and weighted precision/recall.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label_index",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.4f}")

# The same evaluator is re-pointed at each remaining metric in turn.
f1, precision, recall = [
    evaluator.setMetricName(metric).evaluate(predictions)
    for metric in ("f1", "weightedPrecision", "weightedRecall")
]

print(f"F1 Score = {f1:.4f}")
print(f"Precision = {precision:.4f}")
print(f"Recall = {recall:.4f}")


25/05/01 19:55:43 WARN DAGScheduler: Broadcasting large task binary with size 1369.3 KiB
25/05/01 19:55:55 WARN DAGScheduler: Broadcasting large task binary with size 1369.3 KiB


Test Accuracy = 0.5307


25/05/01 19:56:05 WARN DAGScheduler: Broadcasting large task binary with size 1369.3 KiB
25/05/01 19:56:14 WARN DAGScheduler: Broadcasting large task binary with size 1369.3 KiB

F1 Score = 0.3764
Precision = 0.3568
Recall = 0.5307


                                                                                

In [None]:
# Reload the training subset from GCS and preview a single row.
train_df =spark.read.parquet("gs://bigdata_27/ten_percent_subset/")
train_df.show(1)

[Stage 13:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+
|    canonical_smiles|                drug|            moa-fine|        pca_features|
+--------------------+--------------------+--------------------+--------------------+
|CNCC(C1=CC(=CC=C1...|Phenylephrine (hy...|Adrenoceptor agonist|[-111.78087590760...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



                                                                                

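## Data Leakage
### If we know a drug and its concentration, we cannot treat them as independent features: the drug identity already determines the MoA label, so a model that is given it scores near-perfectly without relying on `pca_features`.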

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import pyspark.sql.functions as F

# Reuse (or create) the Spark session; 734003200 bytes = 700 MiB per input
# partition, and shuffles use 100 partitions.
spark = (SparkSession.builder
    .appName("ControlPartitionSize")
    .config("spark.sql.files.maxPartitionBytes", 734003200)
    .config("spark.sql.shuffle.partitions", 100)
    .getOrCreate())



# Load the training subset and the test set from GCS.
train_df =spark.read.parquet("gs://bigdata_27/ten_percent_subset/")
test_df = spark.read.parquet("gs://bigdata_27/test_data/")

# Keep only rows whose identifier columns are non-empty strings.
# F.col is used because this cell imports only `lit` and `F`; a bare `col`
# resolves only if some other cell happened to import it first.
train_df = train_df.filter(
    (F.col("canonical_smiles") != "") & (F.col("drug") != "")
)
test_df = test_df.filter(
    (F.col("canonical_smiles") != "") & (F.col("drug") != "")
)


from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier, LogisticRegression


# Identifier columns and the names of their indexed / one-hot encoded versions.
categorical_cols = ["canonical_smiles","drug"]
indexed_cols = [f"{name}_indexed" for name in categorical_cols]
coded_cols = [f"{name}_coded" for name in categorical_cols]

# One StringIndexer and one OneHotEncoder per identifier column.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_indexed")
    for name in categorical_cols
]
# Author's note: this causes problems because there are some None or empty
# values in the categorical columns.
encoders = [
    OneHotEncoder(inputCol=f"{name}_indexed", outputCol=f"{name}_coded")
    for name in categorical_cols
]

# Two alternative feature sets: indexed identifiers + PCA, or one-hot + PCA.
feature_cols = indexed_cols + ["pca_features"]
one_hot_features = coded_cols + ["pca_features"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
one_hot_assembler = VectorAssembler(inputCols=one_hot_features, outputCol="features")

# Encode the string label "moa-fine" as the numeric "label_index".
label_indexer = StringIndexer(inputCol="moa-fine", outputCol="label_index")

# Random forest definition (not part of the pipelines built below).
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label_index",
    maxBins=512,
    numTrees=100
)

# Logistic regression with pure L2 regularization (elasticNetParam=0.0).
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label_index",
    predictionCol="prediction",
    maxIter=100,
    regParam=0.01,
    elasticNetParam=0.0
)

# mlp = MultilayerPerceptronClassifier(
#     featuresCol="features",
#     labelCol="label_idx",
#     predictionCol="prediction",
#     maxIter=100,
#     layers=[input_dim, 64, 32, 26],  # Define this based on your data
#     blockSize=128,
#     seed=42
# )

# Both pipelines end in the logistic regression; only the one-hot variant is fitted.
pipeline = Pipeline(stages=[*indexers, assembler, label_indexer, lr])
onehot_pipeline = Pipeline(
    stages=[*indexers, *encoders, one_hot_assembler, label_indexer, lr]
)

#model = pipeline.fit(train_df)

model = onehot_pipeline.fit(train_df)

# Score the held-out test set with the fitted pipeline.
predictions = model.transform(test_df)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Score the predictions: accuracy first, then F1 and weighted precision/recall.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label_index",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.4f}")

# Re-point the evaluator at each remaining metric, one evaluation per metric.
f1, precision, recall = [
    evaluator.setMetricName(metric).evaluate(predictions)
    for metric in ("f1", "weightedPrecision", "weightedRecall")
]

print(f"F1 Score = {f1:.4f}")
print(f"Precision = {precision:.4f}")
print(f"Recall = {recall:.4f}")

                                                                                

Test Accuracy = 0.9989


[Stage 171:>                                                      (0 + 20) / 20]

F1 Score = 0.9989
Precision = 0.9989
Recall = 0.9989




In [None]:
# Preview one row of the pipeline output.
predictions.show(1)

[Stage 54:>                                                         (0 + 1) / 1]

+--------------------+-----------+--------+--------------------+------------------------+------------+
|    canonical_smiles|       drug|moa-fine|        pca_features|canonical_smiles_indexed|drug_indexed|
+--------------------+-----------+--------+--------------------+------------------------+------------+
|C1=CC=C2C(=C1)N=C...|HI-TOPK-032| unclear|[-124.15845761817...|                   348.0|       350.0|
+--------------------+-----------+--------+--------------------+------------------------+------------+
only showing top 1 row



                                                                                

In [None]:
# Sanity check: count how many distinct SMILES values are the empty string.
predictions.select("canonical_smiles").distinct().filter("canonical_smiles = ''").count()

0

In [None]:
# Fit a stand-alone one-hot encoder on the indexed SMILES column of the
# predictions and preview the encoded column for one row.
onehot =OneHotEncoder(inputCol="canonical_smiles_indexed", outputCol="canonical_smiles_coded").fit(predictions)

onehot.transform(predictions).show(1)

[Stage 57:>                                                         (0 + 1) / 1]

+--------------------+-----------+--------+--------------------+------------------------+------------+----------------------+
|    canonical_smiles|       drug|moa-fine|        pca_features|canonical_smiles_indexed|drug_indexed|canonical_smiles_coded|
+--------------------+-----------+--------+--------------------+------------------------+------------+----------------------+
|C1=CC=C2C(=C1)N=C...|HI-TOPK-032| unclear|[-124.15845761817...|                   348.0|       350.0|     (374,[348],[1.0])|
+--------------------+-----------+--------+--------------------+------------------------+------------+----------------------+
only showing top 1 row



                                                                                

In [None]:
# List the distinct index values assigned to the SMILES column.
predictions.select("canonical_smiles_indexed").distinct().show()



+------------------------+
|canonical_smiles_indexed|
+------------------------+
|                   299.0|
|                   323.0|
|                   118.0|
|                   305.0|
|                   170.0|
|                   147.0|
|                   184.0|
|                    71.0|
|                   186.0|
|                   160.0|
|                   169.0|
|                    67.0|
|                   156.0|
|                    70.0|
|                   311.0|
|                     8.0|
|                   173.0|
|                   143.0|
|                   361.0|
|                   320.0|
+------------------------+
only showing top 20 rows



                                                                                

In [None]:
from pyspark.sql.functions import col, sum as _sum, when

# Count nulls in the predictions, restricted to the columns that train_df has
# (i.e. the derived pipeline columns are not checked).
null_counts = predictions.select(*[
    _sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in train_df.columns
])

null_counts.show()



+----------------+----+--------+------------+
|canonical_smiles|drug|moa-fine|pca_features|
+----------------+----+--------+------------+
|               0|   0|       0|           0|
+----------------+----+--------+------------+



                                                                                

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Score the current predictions: accuracy, then F1 and weighted precision/recall.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label_index",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.4f}")

# One evaluation per remaining metric, reusing the same evaluator object.
f1, precision, recall = [
    evaluator.setMetricName(metric).evaluate(predictions)
    for metric in ("f1", "weightedPrecision", "weightedRecall")
]

print(f"F1 Score = {f1:.4f}")
print(f"Precision = {precision:.4f}")
print(f"Recall = {recall:.4f}")

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# --- Spark session -----------------------------------------------------------
# 700 MiB input splits and 100 shuffle partitions. getOrCreate() returns the
# running session if one already exists.
spark = (
    SparkSession.builder
    .appName("ControlPartitionSize")
    .config("spark.sql.shuffle.partitions", 100)
    .config("spark.sql.files.maxPartitionBytes", 700 * 1024 * 1024)
    .getOrCreate()
)

# --- Data --------------------------------------------------------------------
# Keep only rows whose categorical columns are non-empty; the original author
# notes that missing/empty values break the indexing + one-hot encoding stages.
train_df = (
    spark.read.parquet("gs://bigdata_27/ten_percent_subset/")
    .where((F.col("canonical_smiles") != "") & (F.col("drug") != ""))
)
test_df = (
    spark.read.parquet("gs://bigdata_27/test_data/")
    .where((F.col("canonical_smiles") != "") & (F.col("drug") != ""))
)

# --- Feature stages ----------------------------------------------------------
categorical_cols = ["canonical_smiles", "drug"]
indexed_cols = [f"{name}_indexed" for name in categorical_cols]
coded_cols = [f"{name}_coded" for name in categorical_cols]

# string -> index -> one-hot vector, one pair of stages per categorical column
indexers = [
    StringIndexer(inputCol=raw_name, outputCol=indexed_name)
    for raw_name, indexed_name in zip(categorical_cols, indexed_cols)
]
encoders = [
    OneHotEncoder(inputCol=indexed_name, outputCol=coded_name)
    for indexed_name, coded_name in zip(indexed_cols, coded_cols)
]

# Two alternative feature sets: the PCA vector alone, or one-hot columns + PCA vector.
feature_cols = ["pca_features"]
one_hot_features = [*coded_cols, "pca_features"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
one_hot_assembler = VectorAssembler(outputCol="features", inputCols=one_hot_features)

# The target class name is converted to a numeric label.
label_indexer = StringIndexer(outputCol="label_index", inputCol="moa-fine")

# --- Candidate classifiers ---------------------------------------------------
rf = RandomForestClassifier(
    labelCol="label_index",
    featuresCol="features",
    numTrees=100,
    maxBins=512,
)

lr = LogisticRegression(
    labelCol="label_index",
    featuresCol="features",
    predictionCol="prediction",
    regParam=0.01,
    elasticNetParam=0.0,  # L2 regularization
    maxIter=100,
)

# Disabled MLP variant, kept for reference (input_dim is not defined in this cell):
# mlp = MultilayerPerceptronClassifier(
#     featuresCol="features",
#     labelCol="label_idx",
#     predictionCol="prediction",
#     maxIter=100,
#     layers=[input_dim, 64, 32, 26],  # Define this based on your data
#     blockSize=128,
#     seed=42
# )

# --- Train and predict -------------------------------------------------------
# Only `pipeline` (PCA features + logistic regression) is fitted below;
# `onehot_pipeline` is built but not used in this cell.
pipeline = Pipeline(stages=[assembler, label_indexer, lr])
onehot_pipeline = Pipeline(
    stages=[*indexers, *encoders, one_hot_assembler, label_indexer, lr]
)

model = pipeline.fit(train_df)
predictions = model.transform(test_df)

# --- Evaluate ----------------------------------------------------------------
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label_index",
).setMetricName("accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.4f}")

evaluator.setMetricName("f1")
f1 = evaluator.evaluate(predictions)

evaluator.setMetricName("weightedPrecision")
precision = evaluator.evaluate(predictions)

evaluator.setMetricName("weightedRecall")
recall = evaluator.evaluate(predictions)

print(f"F1 Score = {f1:.4f}")
print(f"Precision = {precision:.4f}")
print(f"Recall = {recall:.4f}")

25/05/02 14:39:44 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_50_23 !
25/05/02 14:39:44 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_50_10 !
25/05/02 14:39:44 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_50_27 !
25/05/02 14:39:44 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_50_1 !
25/05/02 14:39:45 WARN YarnAllocator: Container from a bad node: container_1746193490402_0001_01_000008 on host: cluster-bb8b-w-4.us-central1-a.c.big-data-attempt-2-457220.internal. Exit status: 143. Diagnostics: [2025-05-02 14:39:45.378]Container killed on request. Exit code is 143
[2025-05-02 14:39:45.380]Container exited with a non-zero exit code 143. 
[2025-05-02 14:39:45.381]Killed by external signal
.
25/05/02 14:39:45 ERROR YarnScheduler: Lost executor 7 on cluster-bb8b-w-4.us-central1-a.c.big-data-attempt-2-457220.internal: Container from a bad node: container_1746193490402_0001_01_000008 on host: cluster-bb8b

                                                                                

Test Accuracy = 0.5341


[Stage 208:>                                                      (0 + 20) / 20]

F1 Score = 0.3777
Precision = 0.3680
Recall = 0.5341


                                                                                

In [None]:
import time
# Start the wall-clock timer first so the reported execution time covers
# session setup, data loading, training and evaluation.
start_time = time.time()


from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import pyspark.sql.functions as F
# Features: expression data reduced by PCA from 63k to 256 dimensions; target: moa-fine
spark = (SparkSession.builder
    .appName("ControlPartitionSize")
    .config("spark.sql.files.maxPartitionBytes", 734003200)
    .config("spark.sql.shuffle.partitions", 100)
    .getOrCreate())


# Training subset for this run; the other subset sizes are kept for quick switching.
train_df = spark.read.parquet("gs://bigdata_27/ten_percent_subset/") #10 m , 12.5% of dataset
# train_df = spark.read.parquet("gs://bigdata_27/twenty_percent_subset/")
# train_df = spark.read.parquet("gs://bigdata_27/forty_percent_subset/")
# train_df = spark.read.parquet("gs://bigdata_27/eighty_percent_subset/")

test_df = spark.read.parquet("gs://bigdata_27/test_data/")



from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier, LogisticRegression


# The PCA vector is the only model input.
feature_cols = ["pca_features"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# The target class name is converted to a numeric label.
label_indexer = StringIndexer(inputCol="moa-fine", outputCol="label_index")


rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label_index",
    maxBins=512,
    numTrees=100
)

# Defined for convenience when switching models; not used by the pipeline below.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label_index",
    predictionCol="prediction",
    maxIter=100,
    regParam=0.01,
    elasticNetParam=0.0  # L2 regularization
)


# This run trains the random forest; keep the "Model:" line in the report
# below in sync with the classifier placed here.
pipeline = Pipeline(stages=[assembler, label_indexer, rf])

model = pipeline.fit(train_df)

predictions = model.transform(test_df)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator


evaluator = MulticlassClassificationEvaluator(
    labelCol="label_index",
    predictionCol="prediction",
    metricName="accuracy"
)
accuracy = evaluator.evaluate(predictions)

# Re-use the same evaluator for the support-weighted metrics.
f1 = evaluator.setMetricName("f1").evaluate(predictions)
precision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
recall = evaluator.setMetricName("weightedRecall").evaluate(predictions)

print("No. of Workers:", 5)
# Fixed: the fitted pipeline uses `rf`, so the report must say Random Forest
# (it previously printed "Linear Regression").
print("Model:", "Random Forest")
print(f"Test Accuracy = {accuracy:.4f}")
print(f"F1 Score = {f1:.4f}")
print(f"Precision = {precision:.4f}")
print(f"Recall = {recall:.4f}")

end_time = time.time()
print(f"Execution time: {end_time - start_time:.2f} seconds")

25/05/02 21:38:34 WARN DAGScheduler: Broadcasting large task binary with size 1144.2 KiB
25/05/02 21:41:02 WARN DAGScheduler: Broadcasting large task binary with size 1213.3 KiB
25/05/02 21:41:56 WARN DAGScheduler: Broadcasting large task binary with size 1242.1 KiB
25/05/02 21:42:29 WARN DAGScheduler: Broadcasting large task binary with size 1256.6 KiB
25/05/02 21:42:49 WARN DAGScheduler: Broadcasting large task binary with size 1269.9 KiB
25/05/02 21:43:02 WARN DAGScheduler: Broadcasting large task binary with size 1274.9 KiB
25/05/02 21:43:16 WARN DAGScheduler: Broadcasting large task binary with size 1278.9 KiB
25/05/02 21:43:37 WARN DAGScheduler: Broadcasting large task binary with size 1293.9 KiB
25/05/02 21:43:51 WARN DAGScheduler: Broadcasting large task binary with size 1271.6 KiB
25/05/02 21:44:04 WARN DAGScheduler: Broadcasting large task binary with size 1261.1 KiB
25/05/02 21:44:38 WARN DAGScheduler: Broadcasting large task binary with size 1277.3 KiB
25/05/02 21:44:58 WAR

No. of Workers: 5
Model: Linear Regression
Test Accuracy = 0.5211
F1 Score = 0.3570
Precision = 0.2715
Recall = 0.5211
Execution time: 760.87 seconds


                                                                                

## Logistic Regression & Random Forest 20% Data

In [None]:
import time

# The timer starts before the imports so the whole cell is measured.
start_time = time.time()

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Features: expression data reduced by PCA from 63k to 256 dimensions; target: moa-fine
spark = (
    SparkSession.builder
    .appName("ControlPartitionSize")
    .config("spark.sql.shuffle.partitions", 100)
    .config("spark.sql.files.maxPartitionBytes", 700 * 1024 * 1024)
    .getOrCreate()
)

# --- Data: the 20% training subset is active; other sizes kept for switching ---
# train_df = spark.read.parquet("gs://bigdata_27/ten_percent_subset/") #10 m , 12.5% of dataset
train_df = spark.read.parquet("gs://bigdata_27/twenty_percent_subset/")
# train_df = spark.read.parquet("gs://bigdata_27/forty_percent_subset/")
# train_df = spark.read.parquet("gs://bigdata_27/eighty_percent_subset/")
test_df = spark.read.parquet("gs://bigdata_27/test_data/")

# --- Pipeline stages ---------------------------------------------------------
feature_cols = ["pca_features"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
label_indexer = StringIndexer(outputCol="label_index", inputCol="moa-fine")

rf = RandomForestClassifier(
    labelCol="label_index",
    featuresCol="features",
    numTrees=100,
    maxBins=512,
)

# Defined alongside rf for switching models; not part of the pipeline below.
lr = LogisticRegression(
    labelCol="label_index",
    featuresCol="features",
    predictionCol="prediction",
    regParam=0.01,
    elasticNetParam=0.0,  # L2 regularization
    maxIter=100,
)

# --- Train on the subset, predict on the held-out test data ------------------
pipeline = Pipeline(stages=[assembler, label_indexer, rf])
model = pipeline.fit(train_df)
predictions = model.transform(test_df)

# --- Metrics: one evaluator, re-pointed at each metric in turn ---------------
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label_index",
).setMetricName("accuracy")
accuracy = evaluator.evaluate(predictions)

evaluator.setMetricName("f1")
f1 = evaluator.evaluate(predictions)

evaluator.setMetricName("weightedPrecision")
precision = evaluator.evaluate(predictions)

evaluator.setMetricName("weightedRecall")
recall = evaluator.evaluate(predictions)

# --- Report ------------------------------------------------------------------
print("20% Dataset")
print("No. of Workers:", 5)
print("Model:", "Random Forest")
print(f"Test Accuracy = {accuracy:.4f}")
print(f"F1 Score = {f1:.4f}")
print(f"Precision = {precision:.4f}")
print(f"Recall = {recall:.4f}")

end_time = time.time()
print(f"Execution time: {end_time - start_time:.2f} seconds")

25/05/02 22:10:34 WARN DAGScheduler: Broadcasting large task binary with size 1144.2 KiB
25/05/02 22:14:08 WARN DAGScheduler: Broadcasting large task binary with size 1213.3 KiB
25/05/02 22:15:35 WARN DAGScheduler: Broadcasting large task binary with size 1242.1 KiB
25/05/02 22:16:26 WARN DAGScheduler: Broadcasting large task binary with size 1256.6 KiB
25/05/02 22:16:58 WARN DAGScheduler: Broadcasting large task binary with size 1269.9 KiB
25/05/02 22:17:20 WARN DAGScheduler: Broadcasting large task binary with size 1274.9 KiB
25/05/02 22:17:42 WARN DAGScheduler: Broadcasting large task binary with size 1278.9 KiB
25/05/02 22:18:15 WARN DAGScheduler: Broadcasting large task binary with size 1293.9 KiB
25/05/02 22:18:38 WARN DAGScheduler: Broadcasting large task binary with size 1271.6 KiB
25/05/02 22:19:00 WARN DAGScheduler: Broadcasting large task binary with size 1261.1 KiB
25/05/02 22:19:52 WARN DAGScheduler: Broadcasting large task binary with size 1277.3 KiB
25/05/02 22:20:25 WAR

20% Dataset
No. of Workers: 5
Model: Random Forest
Test Accuracy = 0.5211
F1 Score = 0.3570
Precision = 0.2715
Recall = 0.5211
Execution time: 1103.79 seconds




## Logistic Regression & Random Forest 40% Data

In [None]:
import time

# The timer starts before the imports so the whole cell is measured.
start_time = time.time()

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Features: expression data reduced by PCA from 63k to 256 dimensions; target: moa-fine
spark = (
    SparkSession.builder
    .appName("ControlPartitionSize")
    .config("spark.sql.shuffle.partitions", 100)
    .config("spark.sql.files.maxPartitionBytes", 700 * 1024 * 1024)
    .getOrCreate()
)

# --- Data: the 40% training subset is active; other sizes kept for switching ---
# train_df = spark.read.parquet("gs://bigdata_27/ten_percent_subset/") #10 m , 12.5% of dataset
# train_df = spark.read.parquet("gs://bigdata_27/twenty_percent_subset/")
train_df = spark.read.parquet("gs://bigdata_27/forty_percent_subset/")
# train_df = spark.read.parquet("gs://bigdata_27/eighty_percent_subset/")
test_df = spark.read.parquet("gs://bigdata_27/test_data/")

# --- Pipeline stages ---------------------------------------------------------
feature_cols = ["pca_features"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
label_indexer = StringIndexer(outputCol="label_index", inputCol="moa-fine")

rf = RandomForestClassifier(
    labelCol="label_index",
    featuresCol="features",
    numTrees=100,
    maxBins=512,
)

# Defined alongside rf for switching models; not part of the pipeline below.
lr = LogisticRegression(
    labelCol="label_index",
    featuresCol="features",
    predictionCol="prediction",
    regParam=0.01,
    elasticNetParam=0.0,  # L2 regularization
    maxIter=100,
)

# --- Train on the subset, predict on the held-out test data ------------------
pipeline = Pipeline(stages=[assembler, label_indexer, rf])
model = pipeline.fit(train_df)
predictions = model.transform(test_df)

# --- Metrics: one evaluator, re-pointed at each metric in turn ---------------
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label_index",
).setMetricName("accuracy")
accuracy = evaluator.evaluate(predictions)

evaluator.setMetricName("f1")
f1 = evaluator.evaluate(predictions)

evaluator.setMetricName("weightedPrecision")
precision = evaluator.evaluate(predictions)

evaluator.setMetricName("weightedRecall")
recall = evaluator.evaluate(predictions)

# --- Report ------------------------------------------------------------------
print("40% Dataset")
print("No. of Workers:", 5)
print("Model:", "Random Forest")
print(f"Test Accuracy = {accuracy:.4f}")
print(f"F1 Score = {f1:.4f}")
print(f"Precision = {precision:.4f}")
print(f"Recall = {recall:.4f}")

end_time = time.time()
print(f"Execution time: {end_time - start_time:.2f} seconds")

25/05/02 22:37:26 WARN DAGScheduler: Broadcasting large task binary with size 1144.2 KiB
25/05/02 22:50:50 WARN DAGScheduler: Broadcasting large task binary with size 1213.3 KiB
25/05/02 22:56:56 WARN DAGScheduler: Broadcasting large task binary with size 1242.1 KiB
25/05/02 23:01:30 WARN DAGScheduler: Broadcasting large task binary with size 1256.6 KiB
25/05/02 23:04:31 WARN DAGScheduler: Broadcasting large task binary with size 1269.9 KiB
25/05/02 23:07:15 WARN DAGScheduler: Broadcasting large task binary with size 1274.9 KiB
25/05/02 23:10:08 WARN DAGScheduler: Broadcasting large task binary with size 1278.9 KiB
25/05/02 23:13:49 WARN DAGScheduler: Broadcasting large task binary with size 1293.9 KiB
25/05/02 23:17:05 WARN DAGScheduler: Broadcasting large task binary with size 1271.6 KiB
25/05/02 23:19:56 WARN DAGScheduler: Broadcasting large task binary with size 1261.1 KiB
25/05/02 23:24:26 WARN DAGScheduler: Broadcasting large task binary with size 1277.3 KiB
25/05/02 23:27:40 WAR

40% Dataset
No. of Workers: 5
Model: Logistic Regression
Test Accuracy = 0.5245
F1 Score = 0.3666
Precision = 0.3508
Recall = 0.5245
Execution time: 7100.90 seconds


[Stage 567:=====>                                                 (2 + 18) / 20]                                                                                

## MLP 12.5% Dataset
### Observations: This was a simple single-layer setup that took about 50 minutes to train, so we skipped the scale-up runs to stay within our free credits while still meeting the project's minimum requirements.

In [None]:
import time

# Start the wall-clock timer before anything else in the cell runs.
start_time = time.time()

from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Training subset for this run (relies on the `spark` session created in an
# earlier cell); the other subset sizes are kept for quick switching.
train_df = spark.read.parquet("gs://bigdata_27/ten_percent_subset/")  # 10 m, 12.5% of dataset
# train_df = spark.read.parquet("gs://bigdata_27/twenty_percent_subset/")
# train_df = spark.read.parquet("gs://bigdata_27/forty_percent_subset/")
# train_df = spark.read.parquet("gs://bigdata_27/eighty_percent_subset/")

# Held-out data used for the reported metrics.
test_df = spark.read.parquet("gs://bigdata_27/test_data/")

# === Assemble Features ===
def get_feature_pipeline(feature_cols, label_col="moa-fine"):
    """Create the two preprocessing stages used ahead of the classifier.

    Args:
        feature_cols: Names of the input columns to combine into "features".
        label_col: Name of the column holding the class label.

    Returns:
        A ``(assembler, label_indexer)`` tuple: a VectorAssembler writing the
        "features" column and a StringIndexer writing the "label_index" column.
    """
    return (
        VectorAssembler(outputCol="features", inputCols=feature_cols),
        StringIndexer(outputCol="label_index", inputCol=label_col),
    )

# === Define MLPC Model ===
def get_mlp_model(input_size, output_size, hidden_layers=(256, 128),
                  max_iter=100, block_size=128, seed=47):
    """Create an unfitted multilayer perceptron classifier.

    The estimator reads the "features" and "label_index" columns and writes
    its output to "prediction".

    Args:
        input_size: Size of the first (input) layer.
        output_size: Size of the last (output) layer, i.e. the number of classes.
        hidden_layers: Sizes of the layers between input and output, in order.
            Defaults to the original architecture (256, 128); pass an empty
            sequence for a network without hidden layers.
        max_iter: Value passed as the classifier's ``maxIter``.
        block_size: Value passed as the classifier's ``blockSize``.
        seed: Random seed passed to the classifier, for reproducible runs.

    Returns:
        A configured ``MultilayerPerceptronClassifier``.
    """
    # Layer sizes are listed from the input layer through to the output layer.
    layers = [input_size, *hidden_layers, output_size]
    mlp = MultilayerPerceptronClassifier(
        featuresCol="features",
        labelCol="label_index",
        predictionCol="prediction",
        maxIter=max_iter,
        layers=layers,
        blockSize=block_size,
        seed=seed
    )
    return mlp

# === Evaluation Metrics ===
def evaluate_model(predictions):
    """Score a predictions DataFrame on four multiclass metrics.

    Args:
        predictions: DataFrame containing "label_index" and "prediction" columns.

    Returns:
        A tuple ``(accuracy, f1, weighted_precision, weighted_recall)``.
    """
    scorer = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index")
    scores = []
    # The same evaluator is re-pointed at each metric; the order fixes the
    # order of the returned tuple.
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall"):
        scorer.setMetricName(metric)
        scores.append(scorer.evaluate(predictions))
    return tuple(scores)

# === Main Pipeline Execution ===
# The PCA vector is the only model input.
feature_cols = ["pca_features"]
assembler, label_indexer = get_feature_pipeline(feature_cols)

# Output layer size = number of distinct labels in the training data.
num_classes = train_df.select("moa-fine").distinct().count()
print(num_classes)
# Input layer size; assumes pca_features is a single 256-dimensional vector
# column (the dimension is not checked against the data here).
input_size = 256

mlp = get_mlp_model(input_size=input_size, output_size=num_classes)

# Fit on the training subset, then predict on the held-out test data.
pipeline = Pipeline(stages=[assembler, label_indexer, mlp])
model = pipeline.fit(train_df)
predictions = model.transform(test_df)

accuracy, f1, precision, recall = evaluate_model(predictions)
# Fixed: this used to call a leftover `evaluator` from an earlier cell, which
# fails on a fresh kernel, costs an extra evaluation pass, and returns whatever
# metric that evaluator was last set to. The name is kept for compatibility.
test_accuracy = accuracy

print("Model: Multilayer Perceptron Classifier")
print(f"Test Accuracy = {accuracy:.4f}")
print(f"F1 Score = {f1:.4f}")
print(f"Precision = {precision:.4f}")
print(f"Recall = {recall:.4f}")
end_time = time.time()
print(f"Execution time: {end_time - start_time:.2f} seconds")

                                                                                

27


25/05/03 02:04:04 WARN BlockManager: Asked to remove block broadcast_1040_piece0, which does not exist
[Stage 800:>                                                      (0 + 20) / 20]

Model: Multilayer Perceptron Classifier
Test Accuracy = 0.5223
F1 Score = 0.3601
Precision = 0.2874
Recall = 0.5223
Execution time: 2909.39 seconds


                                                                                

# The setup above is to be re-run for the scale-out experiments
# We did not go beyond 12.5% of the dataset with Random Forest because we were short on credits

In [None]:
from pyspark.sql.functions import col

# Ground truth: number of test rows per moa-fine class, largest first
# (up to 30 rows shown).
(
    test_df
    .groupBy("moa-fine")
    .count()
    .orderBy(col("count").desc())
    .show(30)
)

# Model output: number of rows assigned to each predicted label index,
# largest first.
(
    predictions
    .groupBy("prediction")
    .count()
    .orderBy(col("count").desc())
    .show()
)

                                                                                

+--------------------+-----+
|            moa-fine|count|
+--------------------+-----+
|             unclear| 4899|
|DNA synthesis/rep...|  642|
|Cyclooxygenase in...|  375|
| EGFR/ERBB inhibitor|  372|
|  Other TK inhibitor|  354|
|  Multi-TK inhibitor|  244|
|Adrenoceptor agonist|  219|
|       RAS inhibitor|  208|
|                    |  197|
|Other MAPK inhibitor|  162|
|      MTOR inhibitor|  155|
|Microtubule inhib...|  149|
|  JAK/STAT inhibitor|  138|
|       MEK inhibitor|  137|
|Protein synthesis...|  126|
|  PI3K/AKT inhibitor|  125|
|      HDAC inhibitor|  104|
|Androgen receptor...|  102|
|Glucocorticoid re...|  101|
|       CDK inhibitor|   94|
|Retinoic receptor...|   91|
|Glucose transport...|   91|
|       RAF inhibitor|   74|
|DNA methyltransfe...|   66|
|     Sonic inhibitor|   62|
|Proteasome inhibitor|   59|
|      GSK3 inhibitor|   56|
+--------------------+-----+





+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 9378|
|      14.0|   20|
|      12.0|    4|
+----------+-----+



                                                                                