In [1]:
from pyspark.sql import SparkSession, DataFrame, types
from pyspark.sql import functions as F
import os
from dotenv import load_dotenv
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline


In [2]:
# Pull PROJECT_ID (and any other settings) from a local .env file into the environment.
load_dotenv()

# Fail fast when the GCP project is not configured: os.getenv() returns None for a
# missing variable, and that value would otherwise be handed to .config() unchecked.
project_id = os.getenv('PROJECT_ID')
if not project_id:
    raise RuntimeError("PROJECT_ID is not set; define it in the environment or in the .env file")

# NOTE(review): classifier.fit() further down fails on the executors with a
# SerializedLambda ClassCastException. That presumably points at a driver/executor
# classpath or Spark/Scala version mismatch for the jars configured here — confirm
# against the standalone cluster.
spark = (
    SparkSession.builder
    .master('spark://localhost:7077')
    # GCS connector jar plus the CatBoost Spark package (resolved through Ivy).
    .config("spark.jars", "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar")
    .config("spark.jars.packages", "ai.catboost:catboost-spark_3.5_2.12:1.2.6")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # Adaptive query execution is switched off explicitly.
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "false")
    # Register the gs:// filesystem implementations and service-account authentication.
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.type", "SERVICE_ACCOUNT_JSON_KEYFILE")
    .config("spark.hadoop.fs.gs.project.id", project_id)
    .appName("transformation_spark")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/home/Bagas/spark/spark-3.5.5-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/Bagas/.ivy2/cache
The jars for the packages stored in: /home/Bagas/.ivy2/jars
ai.catboost#catboost-spark_3.5_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9209b7a2-5fe1-43fc-b42e-303826b5ef3f;1.0
	confs: [default]
	found ai.catboost#catboost-spark_3.5_2.12;1.2.6 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.6.0 in central
	found com.google.guava#guava;32.0.0-jre in central
	found com.google.guava#failureaccess;1.0.1 in central
	found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found org.checkerframework#checker-qual;3.33.0 in central
	found com.google.errorprone#error_prone_annotations;2.18.0 in central
	found com.google.j2objc#j2objc-annotations;2.8 in central
	found commons-io#commons-io;2.7 in central
	found org.apache.commons#commons-lang3;3.11 in central
	found org.apache.commons

In [3]:
# Display the session object as the cell output to confirm it was created.
spark

In [4]:
base_dir = "gs://project-abd/notebook-data/"

# Build the object URI with an explicit '/' separator: os.path.join uses the local
# OS path separator, which is not guaranteed to be '/' for a gs:// URI.
data_path = f"{base_dir.rstrip('/')}/fusion_data.csv"

# Load the fused dataset; the first row is the header and column types are inferred.
try:
    df = spark.read.csv(data_path,
                        header=True,
                        inferSchema=True)
except Exception as e:
    print(f'Error while reading data : {e}')
    # Re-raise instead of swallowing the error: otherwise `df` stays undefined and
    # every later cell fails with an unrelated NameError.
    raise

                                                                                

In [5]:
# Preview the first five rows of the loaded frame.
df.show(5)

[Stage 2:>                                                          (0 + 1) / 1]

+--------+----+----------+-----------+---+-------------------+-----------------+------------------+-------------------+------------------+--------------------+----------------+
|class_id|room|student_id|gender_code|age|          timestamp|          hr_mean|         temp_mean|           eda_mean|          ibi_mean|            bvp_mean|engagement_level|
+--------+----+----------+-----------+---+-------------------+-----------------+------------------+-------------------+------------------+--------------------+----------------+
|       7|  R3|        21|          1| 15|2025-06-04 10:55:00|96.89173352559408|29.075799973805747| 0.2409935087157646|0.7601556011424158| 0.14733649224615267|  Highly Engaged|
|     202|  R1|        20|          1| 16|2025-06-04 14:15:00|84.53866225022536|   32.529366131322|0.15323040743138613|0.7591810502657076|0.024287718439838422|  Highly Engaged|
|      78|  R3|         1|          1| 16|2025-06-04 11:25:00|71.10822733748319| 32.39248572278758|0.16132125217433

                                                                                

In [6]:
# Print summary statistics for the columns of the loaded frame.
df.describe().show()

25/06/04 09:07:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

+-------+-----------------+----+------------------+-----------+------------------+------------------+------------------+------------------+-------------------+--------------------+----------------+
|summary|         class_id|room|        student_id|gender_code|               age|           hr_mean|         temp_mean|          eda_mean|           ibi_mean|            bvp_mean|engagement_level|
+-------+-----------------+----+------------------+-----------+------------------+------------------+------------------+------------------+-------------------+--------------------+----------------+
|  count|              248| 248|               248|        248|               248|               248|               248|               248|                248|                 248|             248|
|   mean|132.1048387096774|NULL|13.060483870967742|        1.0|            15.875| 87.06891107648762|31.650660187285208|0.9649842082417228| 0.7606887402130599|0.003096441220778...|            NULL|
| stddev|6

                                                                                

In [7]:
# Encode the two string columns as numeric indices: `room` -> `room_index` and
# `engagement_level` -> `label_index` (the latter is used as the label further down).
indexers = [
    StringIndexer(inputCol="room", outputCol="room_index"),
    StringIndexer(inputCol="engagement_level", outputCol="label_index"),
]
pipeline = Pipeline(stages=indexers)

# `df` is overwritten below, so re-running this cell would feed an already indexed
# frame back into the pipeline, and StringIndexer rejects an output column that
# already exists. Dropping any index columns left by a previous run keeps the cell
# re-runnable; drop() ignores column names that are not present.
df_unindexed = df.drop("room_index", "label_index")
df = pipeline.fit(df_unindexed).transform(df_unindexed)

                                                                                

In [8]:
# Preview the frame again, now including the room_index and label_index columns.
df.show(5)

[Stage 9:>                                                          (0 + 1) / 1]

+--------+----+----------+-----------+---+-------------------+-----------------+------------------+-------------------+------------------+--------------------+----------------+----------+-----------+
|class_id|room|student_id|gender_code|age|          timestamp|          hr_mean|         temp_mean|           eda_mean|          ibi_mean|            bvp_mean|engagement_level|room_index|label_index|
+--------+----+----------+-----------+---+-------------------+-----------------+------------------+-------------------+------------------+--------------------+----------------+----------+-----------+
|       7|  R3|        21|          1| 15|2025-06-04 10:55:00|96.89173352559408|29.075799973805747| 0.2409935087157646|0.7601556011424158| 0.14733649224615267|  Highly Engaged|       1.0|        0.0|
|     202|  R1|        20|          1| 16|2025-06-04 14:15:00|84.53866225022536|   32.529366131322|0.15323040743138613|0.7591810502657076|0.024287718439838422|  Highly Engaged|       0.0|        0.0|


                                                                                

In [9]:
# Input columns packed into the single `features` vector column: the indexed room
# plus the remaining numeric columns.
feature_cols = [
    'room_index',
    'gender_code',
    'age',
    'hr_mean',
    'temp_mean',
    'eda_mean',
    'ibi_mean',
    'bvp_mean',
]
assembler = VectorAssembler(outputCol='features', inputCols=feature_cols)

# Append the assembled vector column to the indexed frame.
df_prep = assembler.transform(df)

In [10]:
# Preview the frame with the assembled `features` column.
df_prep.show(5)

[Stage 10:>                                                         (0 + 1) / 1]

+--------+----+----------+-----------+---+-------------------+-----------------+------------------+-------------------+------------------+--------------------+----------------+----------+-----------+--------------------+
|class_id|room|student_id|gender_code|age|          timestamp|          hr_mean|         temp_mean|           eda_mean|          ibi_mean|            bvp_mean|engagement_level|room_index|label_index|            features|
+--------+----+----------+-----------+---+-------------------+-----------------+------------------+-------------------+------------------+--------------------+----------------+----------+-----------+--------------------+
|       7|  R3|        21|          1| 15|2025-06-04 10:55:00|96.89173352559408|29.075799973805747| 0.2409935087157646|0.7601556011424158| 0.14733649224615267|  Highly Engaged|       1.0|        0.0|[1.0,1.0,15.0,96....|
|     202|  R1|        20|          1| 16|2025-06-04 14:15:00|84.53866225022536|   32.529366131322|0.153230407431386

                                                                                

In [11]:
# 80/20 train/test split. The seed is fixed so that a re-run does not silently
# change the row counts and metrics reported below.
train_df, test_df = df_prep.randomSplit([0.8, 0.2], seed=42)

In [12]:
# Number of rows that landed in the training split.
train_df.count()

                                                                                

199

In [13]:
# Number of rows that landed in the test split.
test_df.count()

                                                                                

49

In [14]:
# F1 evaluator that scores the `prediction` column against the `label_index` column.
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    predictionCol="prediction",
    labelCol='label_index',
)

In [15]:
# NOTE(review): presumably imported here, after the SparkSession exists, because the
# catboost_spark Python package is provided by the spark.jars.packages dependency — confirm.
import catboost_spark
# NOTE(review): CatBoostClassifier is not referenced by its bare name in the visible cells
# (they use catboost_spark.CatBoostClassifier), so this import looks redundant — confirm.
from catboost_spark import CatBoostClassifier

In [16]:
# Wrap only the columns the model needs (feature vector and label) in a CatBoost Pool.
pool_frame = train_df.select('features', 'label_index')
train_pool = catboost_spark.Pool(pool_frame)

# Tell the pool which column holds the label and which holds the feature vector.
train_pool.setLabelCol('label_index')
train_pool.setFeaturesCol('features')

Pool_f609cdb1e18d

In [17]:
# CatBoost classifier reading the assembled feature vector and the indexed label.
classifier = catboost_spark.CatBoostClassifier(
    labelCol='label_index',
    featuresCol='features',
)

In [18]:
# Boosting hyper-parameters: tree depth, number of iterations and learning rate.
classifier.setDepth(10)
classifier.setIterations(1000)
classifier.setLearningRate(0.001)

CatBoostClassifier_ed8add82c51b

In [19]:
# Train on the pool, score the held-out rows, then report the F1 of the predictions.
model = classifier.fit(train_pool)
predict = model.transform(test_df)

f1_score = evaluator.evaluate(predict)
print(f'Model F1 = {f1_score}')

25/06/04 09:08:03 WARN TaskSetManager: Lost task 0.0 in stage 22.0 (TID 22) (172.18.0.6 executor 1): java.lang.ClassCastException: cannot assign instance of java.lang.invoke.SerializedLambda to field org.apache.spark.sql.execution.MapPartitionsExec.func of type scala.Function1 in instance of org.apache.spark.sql.execution.MapPartitionsExec
	at java.base/java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2076)
	at java.base/java.io.ObjectStreamClass$FieldReflector.checkObjectFieldValueTypes(ObjectStreamClass.java:2039)
	at java.base/java.io.ObjectStreamClass.checkObjFieldValueTypes(ObjectStreamClass.java:1293)
	at java.base/java.io.ObjectInputStream.defaultCheckFieldValues(ObjectInputStream.java:2512)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2419)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at ja

Py4JJavaError: An error occurred while calling o184.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 22.0 failed 4 times, most recent failure: Lost task 0.3 in stage 22.0 (TID 25) (172.18.0.5 executor 2): java.lang.ClassCastException: cannot assign instance of java.lang.invoke.SerializedLambda to field org.apache.spark.sql.execution.MapPartitionsExec.func of type scala.Function1 in instance of org.apache.spark.sql.execution.MapPartitionsExec
	at java.base/java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2076)
	at java.base/java.io.ObjectStreamClass$FieldReflector.checkObjectFieldValueTypes(ObjectStreamClass.java:2039)
	at java.base/java.io.ObjectStreamClass.checkObjFieldValueTypes(ObjectStreamClass.java:1293)
	at java.base/java.io.ObjectInputStream.defaultCheckFieldValues(ObjectInputStream.java:2512)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2419)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2134)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1675)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2134)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1675)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:489)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:447)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:527)
	at jdk.internal.reflect.GeneratedMethodAccessor5.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at java.base/java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1046)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2357)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:489)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:447)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:527)
	at jdk.internal.reflect.GeneratedMethodAccessor5.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at java.base/java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1046)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2357)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:489)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:447)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:87)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:129)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:90)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.rdd.RDD.collectPartition$1(RDD.scala:1064)
	at org.apache.spark.rdd.RDD.$anonfun$toLocalIterator$3(RDD.scala:1066)
	at org.apache.spark.rdd.RDD.$anonfun$toLocalIterator$3$adapted(RDD.scala:1066)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.toStream(Iterator.scala:1417)
	at scala.collection.Iterator.toStream$(Iterator.scala:1416)
	at scala.collection.AbstractIterator.toStream(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toSeq(TraversableOnce.scala:354)
	at scala.collection.TraversableOnce.toSeq$(TraversableOnce.scala:354)
	at scala.collection.AbstractIterator.toSeq(Iterator.scala:1431)
	at ai.catboost.spark.DataHelpers$.getDistinctFloatLabelValues(DataHelpers.scala:815)
	at ai.catboost.spark.CatBoostClassifier.preprocessBeforeTraining(CatBoostClassifier.scala:399)
	at ai.catboost.spark.CatBoostPredictorTrait.fit(CatBoostPredictor.scala:167)
	at ai.catboost.spark.CatBoostPredictorTrait.fit$(CatBoostPredictor.scala:125)
	at ai.catboost.spark.CatBoostClassifier.fit(CatBoostClassifier.scala:372)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.ClassCastException: cannot assign instance of java.lang.invoke.SerializedLambda to field org.apache.spark.sql.execution.MapPartitionsExec.func of type scala.Function1 in instance of org.apache.spark.sql.execution.MapPartitionsExec
	at java.base/java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2076)
	at java.base/java.io.ObjectStreamClass$FieldReflector.checkObjectFieldValueTypes(ObjectStreamClass.java:2039)
	at java.base/java.io.ObjectStreamClass.checkObjFieldValueTypes(ObjectStreamClass.java:1293)
	at java.base/java.io.ObjectInputStream.defaultCheckFieldValues(ObjectInputStream.java:2512)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2419)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2134)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1675)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2134)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1675)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:489)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:447)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:527)
	at jdk.internal.reflect.GeneratedMethodAccessor5.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at java.base/java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1046)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2357)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:489)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:447)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:527)
	at jdk.internal.reflect.GeneratedMethodAccessor5.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at java.base/java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1046)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2357)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:489)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:447)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:87)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:129)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:90)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
# Hyper-parameter search with TrainValidationSplit, currently disabled.
# The output captured below this cell shows its last run aborting with
# "ai.catboost.CatBoostError: An active CatBoost worker is already present in the
# current process".
# NOTE(review): decide whether to repair this search or remove the cell.
#
# from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# param_grid =(ParamGridBuilder()
#             .addGrid(classifier.iterations, [2000,5000,10000])
#             .addGrid(classifier.depth, [2,4,6,8,10])
#             .addGrid(classifier.learningRate, [0.001,0.01, 0.05, 0.1])
#             .addGrid(classifier.l2LeafReg, [1.0,3.0,5.0])
#             .build())

# tvs = TrainValidationSplit(
#     estimator=classifier,
#     estimatorParamMaps=param_grid,
#     evaluator=evaluator,
#     trainRatio=0.8,  
#     parallelism=1   
# )

# tvs_model = tvs.fit(train_df)
# best_model = tvs_model.bestModel
# prediction = best_model.transform(test_df)
# print(f'Model F1 = {evaluator.evaluate(prediction)}')

25/06/04 08:59:04 WARN CacheManager: Asked to cache already cached data.
25/06/04 08:59:04 WARN CacheManager: Asked to cache already cached data.


25/06/04 08:59:07 ERROR Executor: Exception in task 0.0 in stage 56921.0 (TID 33870)
ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafe

	learn: 0.8481283	total: 4m 11s	remaining: 5m 42s
847:	learn: 0.8478723	total: 4m 12s	remaining: 5m 42s
848:	learn: 0.8477398	total: 4m 12s	remaining: 5m 42s
849:	learn: 0.8474459	total: 4m 12s	remaining: 5m 42s
850:	learn: 0.8471360	total: 4m 13s	remaining: 5m 41s
851:	learn: 0.8469288	total: 4m 13s	remaining: 5m 41s
852:	learn: 0.8466051	total: 4m 14s	remaining: 5m 41s
853:	learn: 0.8463092	total: 4m 14s	remaining: 5m 41s
854:	learn: 0.8461194	total: 4m 14s	remaining: 5m 41s
855:	learn: 0.8457546	total: 4m 15s	remaining: 5m 41s
856:	learn: 0.8455120	total: 4m 15s	remaining: 5m 41s
857:	learn: 0.8452289	total: 4m 16s	remaining: 5m 41s
858:	learn: 0.8450670	total: 4m 16s	remaining: 5m 40s
859:	learn: 0.8449134	total: 4m 16s	remaining: 5m 40s
860:	learn: 0.8446342	total: 4m 17s	remaining: 5m 40s
861:	learn: 0.8443739	total: 4m 17s	remaining: 5m 40s
862:	learn: 0.8441379	total: 4m 17s	remaining: 5m 39s
863:	learn: 0.8440175	total: 4m 18s	remaining: 5m 39s
864:	learn: 0.8437892	total: 4m 

25/06/04 08:59:21 ERROR Executor: Exception in task 0.0 in stage 58662.0 (TID 34841)
ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafe

Py4JJavaError: An error occurred while calling o15475.fit.
: java.util.concurrent.ExecutionException: Error while executing workers
	at ai.catboost.spark.impl.Helpers$.checkOneFutureAndWaitForOther(Helpers.scala:33)
	at ai.catboost.spark.impl.Helpers$.waitForTwoFutures(Helpers.scala:61)
	at ai.catboost.spark.CatBoostPredictorTrait.$anonfun$fit$12(CatBoostPredictor.scala:260)
	at scala.util.control.Breaks.breakable(Breaks.scala:42)
	at ai.catboost.spark.CatBoostPredictorTrait.fit(CatBoostPredictor.scala:230)
	at ai.catboost.spark.CatBoostPredictorTrait.fit$(CatBoostPredictor.scala:125)
	at ai.catboost.spark.CatBoostClassifier.fit(CatBoostClassifier.scala:372)
	at ai.catboost.spark.CatBoostPredictorTrait.train(CatBoostPredictor.scala:111)
	at ai.catboost.spark.CatBoostPredictorTrait.train$(CatBoostPredictor.scala:108)
	at ai.catboost.spark.CatBoostClassifier.train(CatBoostClassifier.scala:372)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:78)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 59746.0 failed 1 times, most recent failure: Lost task 0.0 in stage 59746.0 (TID 35529) (abd-vm.us-central1-c.c.project-big-data-461104.internal executor driver): ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$1(RDD.scala:1039)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:1037)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$9(Workers.scala:327)
	at ai.catboost.spark.impl.CatBoostWorkers.run(Workers.scala:178)
	at ai.catboost.spark.CatBoostPredictorTrait$$anon$1.run(CatBoostPredictor.scala:252)
	at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	... 3 more


25/06/04 08:59:37 ERROR Executor: Exception in task 0.0 in stage 59955.0 (TID 35674)
ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafe

27s
922:	learn: 0.8307688	total: 4m 40s	remaining: 5m 26s
923:	learn: 0.8305868	total: 4m 40s	remaining: 5m 26s
924:	learn: 0.8303424	total: 4m 40s	remaining: 5m 26s
925:	learn: 0.8300747	total: 4m 41s	remaining: 5m 26s
926:	learn: 0.8298102	total: 4m 41s	remaining: 5m 26s
927:	learn: 0.8295685	total: 4m 42s	remaining: 5m 25s
928:	learn: 0.8294258	total: 4m 42s	remaining: 5m 25s
929:	learn: 0.8293313	total: 4m 42s	remaining: 5m 24s
930:	learn: 0.8291552	total: 4m 42s	remaining: 5m 24s
931:	learn: 0.8290803	total: 4m 42s	remaining: 5m 24s
932:	learn: 0.8287933	total: 4m 43s	remaining: 5m 23s
933:	learn: 0.8284699	total: 4m 43s	remaining: 5m 23s
934:	learn: 0.8282846	total: 4m 44s	remaining: 5m 23s
935:	learn: 0.8281607	total: 4m 44s	remaining: 5m 23s
936:	learn: 0.8279240	total: 4m 44s	remaining: 5m 22s
937:	learn: 0.8278335	total: 4m 44s	remaining: 5m 22s
938:	learn: 0.8277350	total: 4m 45s	remaining: 5m 22s
939:	learn: 0.8275708	total: 4m 45s	remaining: 5m 21s
940:	learn: 0.8272906	to

25/06/04 08:59:52 ERROR Executor: Exception in task 0.0 in stage 61628.0 (TID 36609)
ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafe

39601	total: 5m 11s	remaining: 5m 11s
999:	learn: 0.8138792	total: 5m 11s	remaining: 5m 11s
1000:	learn: 0.8136150	total: 5m 12s	remaining: 5m 11s
1001:	learn: 0.8133538	total: 5m 12s	remaining: 5m 11s
1002:	learn: 0.8130909	total: 5m 13s	remaining: 5m 11s
1003:	learn: 0.8129396	total: 5m 13s	remaining: 5m 10s
1004:	learn: 0.8126796	total: 5m 14s	remaining: 5m 10s
1005:	learn: 0.8123885	total: 5m 14s	remaining: 5m 10s
1006:	learn: 0.8121544	total: 5m 15s	remaining: 5m 10s
1007:	learn: 0.8118333	total: 5m 15s	remaining: 5m 10s
1008:	learn: 0.8117655	total: 5m 15s	remaining: 5m 10s
1009:	learn: 0.8116638	total: 5m 15s	remaining: 5m 9s
1010:	learn: 0.8113661	total: 5m 16s	remaining: 5m 9s
1011:	learn: 0.8111793	total: 5m 16s	remaining: 5m 9s
1012:	learn: 0.8109385	total: 5m 17s	remaining: 5m 9s
1013:	learn: 0.8107985	total: 5m 17s	remaining: 5m 9s
1014:	learn: 0.8107266	total: 5m 17s	remaining: 5m 8s
1015:	learn: 0.8106287	total: 5m 18s	remaining: 5m 8s
1016:	learn: 0.8103686	total: 5m 18

25/06/04 09:00:24 ERROR Executor: Exception in task 0.0 in stage 64580.0 (TID 38357)
ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafe

5s
1074:	learn: 0.7981857	total: 5m 43s	remaining: 4m 55s
1075:	learn: 0.7979155	total: 5m 43s	remaining: 4m 55s
1076:	learn: 0.7976666	total: 5m 44s	remaining: 4m 55s
1077:	learn: 0.7974585	total: 5m 44s	remaining: 4m 54s
1078:	learn: 0.7973640	total: 5m 44s	remaining: 4m 54s
1079:	learn: 0.7972429	total: 5m 45s	remaining: 4m 54s
1080:	learn: 0.7969338	total: 5m 45s	remaining: 4m 54s
1081:	learn: 0.7966903	total: 5m 46s	remaining: 4m 53s
1082:	learn: 0.7964227	total: 5m 46s	remaining: 4m 53s
1083:	learn: 0.7961578	total: 5m 47s	remaining: 4m 53s
1084:	learn: 0.7958627	total: 5m 48s	remaining: 4m 53s
1085:	learn: 0.7956215	total: 5m 48s	remaining: 4m 53s
1086:	learn: 0.7953362	total: 5m 49s	remaining: 4m 53s
1087:	learn: 0.7950547	total: 5m 49s	remaining: 4m 53s
1088:	learn: 0.7947947	total: 5m 50s	remaining: 4m 53s
1089:	learn: 0.7945505	total: 5m 50s	remaining: 4m 52s
1090:	learn: 0.7942917	total: 5m 51s	remaining: 4m 52s
1091:	learn: 0.7940609	total: 5m 52s	remaining: 4m 52s
1092:	l

25/06/04 09:01:06 ERROR Executor: Exception in task 0.0 in stage 66807.0 (TID 39715)
ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafe

ning: 4m 47s
1149:	learn: 0.7826604	total: 6m 29s	remaining: 4m 47s
1150:	learn: 0.7825764	total: 6m 29s	remaining: 4m 47s
1151:	learn: 0.7824013	total: 6m 30s	remaining: 4m 47s
1152:	learn: 0.7822017	total: 6m 30s	remaining: 4m 47s
1153:	learn: 0.7821290	total: 6m 31s	remaining: 4m 46s
1154:	learn: 0.7820089	total: 6m 31s	remaining: 4m 46s
1155:	learn: 0.7819463	total: 6m 31s	remaining: 4m 46s
1156:	learn: 0.7817138	total: 6m 32s	remaining: 4m 46s
1157:	learn: 0.7815515	total: 6m 33s	remaining: 4m 45s
1158:	learn: 0.7813028	total: 6m 34s	remaining: 4m 45s
1159:	learn: 0.7811745	total: 6m 34s	remaining: 4m 45s
1160:	learn: 0.7809477	total: 6m 35s	remaining: 4m 45s
1161:	learn: 0.7807304	total: 6m 36s	remaining: 4m 45s
1162:	learn: 0.7804564	total: 6m 37s	remaining: 4m 45s
1163:	learn: 0.7802565	total: 6m 37s	remaining: 4m 45s
1164:	learn: 0.7799618	total: 6m 38s	remaining: 4m 45s
1165:	learn: 0.7796885	total: 6m 39s	remaining: 4m 45s
1166:	learn: 0.7794798	total: 6m 40s	remaining: 4m 4

25/06/04 09:01:49 ERROR Executor: Exception in task 0.0 in stage 69773.0 (TID 41485)
ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafe

ng: 4m 31s
1224:	learn: 0.7684865	total: 7m 9s	remaining: 4m 31s
1225:	learn: 0.7683389	total: 7m 9s	remaining: 4m 31s
1226:	learn: 0.7682081	total: 7m 10s	remaining: 4m 30s
1227:	learn: 0.7679577	total: 7m 10s	remaining: 4m 30s
1228:	learn: 0.7677370	total: 7m 11s	remaining: 4m 30s
1229:	learn: 0.7676439	total: 7m 11s	remaining: 4m 30s
1230:	learn: 0.7674446	total: 7m 12s	remaining: 4m 30s
1231:	learn: 0.7672564	total: 7m 13s	remaining: 4m 30s
1232:	learn: 0.7669900	total: 7m 13s	remaining: 4m 29s
1233:	learn: 0.7668423	total: 7m 14s	remaining: 4m 29s
1234:	learn: 0.7666537	total: 7m 14s	remaining: 4m 29s
1235:	learn: 0.7664725	total: 7m 15s	remaining: 4m 29s
1236:	learn: 0.7663112	total: 7m 15s	remaining: 4m 28s
1237:	learn: 0.7661053	total: 7m 16s	remaining: 4m 28s
1238:	learn: 0.7660357	total: 7m 16s	remaining: 4m 28s
1239:	learn: 0.7657897	total: 7m 17s	remaining: 4m 28s
1240:	learn: 0.7655203	total: 7m 17s	remaining: 4m 27s
1241:	learn: 0.7652623	total: 7m 18s	remaining: 4m 27s
1

25/06/04 09:02:26 ERROR Executor: Exception in task 0.0 in stage 72729.0 (TID 43250)
ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafe

49173	total: 7m 45s	remaining: 4m 11s
1299:	learn: 0.7546806	total: 7m 46s	remaining: 4m 11s
1300:	learn: 0.7544529	total: 7m 46s	remaining: 4m 10s
1301:	learn: 0.7542907	total: 7m 47s	remaining: 4m 10s
1302:	learn: 0.7541887	total: 7m 47s	remaining: 4m 10s
1303:	learn: 0.7539719	total: 7m 48s	remaining: 4m 9s
1304:	learn: 0.7537318	total: 7m 48s	remaining: 4m 9s
1305:	learn: 0.7534557	total: 7m 49s	remaining: 4m 9s
1306:	learn: 0.7532695	total: 7m 49s	remaining: 4m 8s
1307:	learn: 0.7531910	total: 7m 49s	remaining: 4m 8s
1308:	learn: 0.7529638	total: 7m 50s	remaining: 4m 8s
1309:	learn: 0.7526860	total: 7m 50s	remaining: 4m 8s
1310:	learn: 0.7525405	total: 7m 51s	remaining: 4m 7s
1311:	learn: 0.7523503	total: 7m 51s	remaining: 4m 7s
1312:	learn: 0.7521015	total: 7m 52s	remaining: 4m 7s
1313:	learn: 0.7519499	total: 7m 52s	remaining: 4m 6s
1314:	learn: 0.7517644	total: 7m 52s	remaining: 4m 6s
1315:	learn: 0.7515011	total: 7m 53s	remaining: 4m 6s
1316:	learn: 0.7514115	total: 7m 53s	rem

25/06/04 09:02:58 ERROR Executor: Exception in task 0.0 in stage 75749.0 (TID 45047)
ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafe

74:	learn: 0.7405459	total: 8m 19s	remaining: 3m 47s
1375:	learn: 0.7403957	total: 8m 20s	remaining: 3m 46s
1376:	learn: 0.7401961	total: 8m 20s	remaining: 3m 46s
1377:	learn: 0.7399966	total: 8m 21s	remaining: 3m 46s
1378:	learn: 0.7399499	total: 8m 21s	remaining: 3m 45s
1379:	learn: 0.7397561	total: 8m 21s	remaining: 3m 45s
1380:	learn: 0.7395443	total: 8m 22s	remaining: 3m 45s
1381:	learn: 0.7394977	total: 8m 22s	remaining: 3m 44s
1382:	learn: 0.7392781	total: 8m 23s	remaining: 3m 44s
1383:	learn: 0.7390470	total: 8m 23s	remaining: 3m 44s
1384:	learn: 0.7387898	total: 8m 24s	remaining: 3m 43s
1385:	learn: 0.7385925	total: 8m 24s	remaining: 3m 43s
1386:	learn: 0.7383241	total: 8m 25s	remaining: 3m 43s
1387:	learn: 0.7381012	total: 8m 26s	remaining: 3m 43s
1388:	learn: 0.7378792	total: 8m 26s	remaining: 3m 42s
1389:	learn: 0.7376695	total: 8m 27s	remaining: 3m 42s
1390:	learn: 0.7374773	total: 8m 27s	remaining: 3m 42s
1391:	learn: 0.7374131	total: 8m 27s	remaining: 3m 41s
1392:	learn:

25/06/04 09:03:44 ERROR Executor: Exception in task 0.0 in stage 79586.0 (TID 47295)
ai.catboost.CatBoostError: An active CatBoost worker is already present in the current process
	at ai.catboost.spark.impl.CatBoostWorker.processPartition(Workers.scala:57)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10(Workers.scala:338)
	at ai.catboost.spark.impl.CatBoostWorkers$.$anonfun$apply$10$adapted(Workers.scala:327)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafe