## Income Prediction with Spark Random Forest

### Install findspark and init

In [11]:
!pip install findspark



In [12]:
# Initialise findspark before any pyspark import (the pyspark imports follow
# in the next cell). Presumably this is what makes the local Spark
# installation importable from this kernel -- confirm against the environment.
import findspark
findspark.init()

### Get Spark and H2O sessions

In [13]:
from pyspark.sql import SparkSession
from pysparkling import H2OContext
import h2o

# Build (or reuse) the Spark session for this notebook. Dynamic allocation is
# switched off explicitly -- presumably a Sparkling Water requirement; confirm.
spark = (
    SparkSession.builder.appName('cognetix-spark-nb')
    .config('spark.dynamicAllocation.enabled', 'false')
    .getOrCreate()
)
sc = spark.sparkContext

# NOTE(review): the captured cell output still shows INFO-level Spark/H2O log
# lines after this call, so the level appears to be overridden once the H2O
# context starts. Confirm whether it needs to be set again afterwards.
sc.setLogLevel("ERROR")

# Start (or attach to) the H2O cluster running alongside Spark and keep a
# handle to it for later inspection.
hc = H2OContext.getOrCreate()
h2o_cluster = h2o.cluster()

23/10/20 20:31:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


10-20 20:31:35.423 172.17.0.2:54321      18300    Thread-4  INFO water.default: ----- H2O started  -----
10-20 20:31:35.425 172.17.0.2:54321      18300    Thread-4  INFO water.default: Build git branch: rel-zz_kurka
10-20 20:31:35.425 172.17.0.2:54321      18300    Thread-4  INFO water.default: Build git hash: 5ff8870f912c6110d7b6988f577c020de10496ec
10-20 20:31:35.425 172.17.0.2:54321      18300    Thread-4  INFO water.default: Build git describe: jenkins-3.40.0.3-122-g5ff8870
10-20 20:31:35.425 172.17.0.2:54321      18300    Thread-4  INFO water.default: Build project version: 3.40.0.4
10-20 20:31:35.425 172.17.0.2:54321      18300    Thread-4  INFO water.default: Build age: 5 months and 22 days
10-20 20:31:35.425 172.17.0.2:54321      18300    Thread-4  INFO water.default: Built by: 'jenkins'
10-20 20:31:35.426 172.17.0.2:54321      18300    Thread-4  INFO water.default: Built on: '2023-04-28 12:08:23'
10-20 20:31:35.426 172.17.0.2:54321      18300    Thread-4  WARN water.default: 


0,1
H2O_cluster_uptime:,09 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,5 months and 22 days
H2O_cluster_name:,sparkling-water-root_local-1697833892184
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4



Sparkling Water Context:
 * Sparkling Water Version: 3.40.0.4-1-3.1
 * H2O name: sparkling-water-root_local-1697833892184
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (0,172.17.0.2,54321)
  ------------------------

  Open H2O Flow in browser: http://5b5a8eb7561c:54323 (CMD + click in Mac OSX)

    


### Global params

In [17]:
# Reproducibility / split configuration: `seed` drives the train/validation
# split and `train_rate` is the fraction of rows kept for training.
seed = 42
train_rate = 0.8

# Input files live under a shared data directory.
_data_dir = '../data'
train_path = f'{_data_dir}/census-train.csv'
test_path = f'{_data_dir}/census-test.csv'

# Output locations share a common prefix (model and predictions respectively,
# judging by the names; they are not used in the cells visible here).
_output_prefix = 'outputs/income_rf_spark'
model_path = _output_prefix
pred_path = f'{_output_prefix}_pred'

### Data Prep

In [18]:
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType

# Column layout of the header-less census CSV files, in file order.
schema = StructType([
    StructField("age", IntegerType(), True),
    StructField("workclass", StringType(), True),
    StructField("fnlwgt", DoubleType(), True),
    StructField("education", StringType(), True),
    StructField("education_num", IntegerType(), True),
    StructField("marital_status", StringType(), True),
    StructField("occupation", StringType(), True),
    StructField("relationship", StringType(), True),
    StructField("race", StringType(), True),
    StructField("sex", StringType(), True),
    StructField("capital_gain", DoubleType(), True),
    StructField("capital_loss", DoubleType(), True),
    StructField("hours_per_week", DoubleType(), True),
    StructField("native_country", StringType(), True),
    StructField("income_level", StringType(), True),
])

# Categorical columns whose ' ?' placeholder value is rewritten to 'NA'.
_placeholder_cols = ('workclass', 'occupation', 'native_country')


def _load_census(path):
    """Read one census CSV file and return the cleaned DataFrame.

    The same steps are applied to the training and the test file so the two
    frames cannot drift apart:

    * parse the header-less, comma-delimited file with ``schema``;
    * add an integer ``label`` column: 1 when ``income_level`` contains
      ``'>50K'``, otherwise 0;
    * drop ``education_num`` and ``income_level``;
    * replace the ``' ?'`` placeholder with ``'NA'`` in the columns listed in
      ``_placeholder_cols``.

    Args:
        path: location of the CSV file to load.

    Returns:
        A Spark DataFrame with the remaining feature columns plus ``label``.
    """
    df = (
        spark.read
        .format('csv')
        .option('header', 'false')
        .option('delimiter', ',')
        .schema(schema)
        .load(path)
        .withColumn('label', when(col('income_level').contains('>50K'), lit(1)).otherwise(lit(0)))
        .drop('education_num', 'income_level')
    )
    # withColumn on an existing name replaces it in place, so column order is kept.
    for name in _placeholder_cols:
        df = df.withColumn(name, when(col(name) == ' ?', lit('NA')).otherwise(col(name)))
    return df


full_train_df = _load_census(train_path)
test_df = _load_census(test_path)

# Hold out part of the training file for validation; seeded for repeatability.
train_df, val_df = full_train_df.randomSplit([train_rate, 1 - train_rate], seed=seed)
print(f'Train split size: {train_df.count()}')
print(f'Validation split size: {val_df.count()}')


10-20 20:32:00.591 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.InMemoryFileIndex: It took 4 ms to list leaf files for 1 paths.
10-20 20:32:00.778 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.InMemoryFileIndex: It took 5 ms to list leaf files for 1 paths.
10-20 20:32:01.229 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Pushed Filters: 
10-20 20:32:01.230 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Post-Scan Filters: 
10-20 20:32:01.231 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Output Data Schema: struct<age: int, workclass: string, fnlwgt: double, education: string, marital_status: string ... 12 more fields>
10-20 20:32:01.811 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.catalyst.expres

[Stage 0:>                                                          (0 + 1) / 1]

10-20 20:32:03.265 172.17.0.2:54321      18300  .0 (TID 0)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 0.0 (TID 0). 2775 bytes result sent to driver
10-20 20:32:03.300 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 926 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:32:03.311 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ShuffleMapStage 0 (count at NativeMethodAccessorImpl.java:0) finished in 1.008 s
10-20 20:32:03.311 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: looking for newly runnable stages
10-20 20:32:03.312 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: running: Set()
10-20 20:32:03.312 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: waiting: Set(ResultStage 1)
10-20 20:32:03.313 172.17.0.2:54321      18300 

                                                                                

10-20 20:32:03.519 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Pushed Filters: 
10-20 20:32:03.520 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Post-Scan Filters: 
10-20 20:32:03.521 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Output Data Schema: struct<age: int, workclass: string, fnlwgt: double, education: string, marital_status: string ... 12 more fields>
10-20 20:32:03.607 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 45.842719 ms
10-20 20:32:03.612 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_3 stored as values in memory (estimated size 176.1 KiB, free 434.0 MiB)
10-20 20:32:03.667 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.stor

### ML Pipeline

#### Preprocessing

In [19]:
from pyspark.ml.feature import (
    StringIndexer,
    VectorAssembler,
    OneHotEncoder,
    Imputer,
)
# Numeric columns: missing values are filled with the column mean and written
# to new '<name>_IMPUTED' columns.
cols_to_impute = ['fnlwgt', 'age', 'capital_gain', 'capital_loss', 'hours_per_week']
imputed_cols = [f'{name}_IMPUTED' for name in cols_to_impute]
imputer = Imputer(strategy='mean', inputCols=cols_to_impute, outputCols=imputed_cols)

# Categorical columns: each one is indexed ('<name>_idx') and then one-hot
# encoded ('<name>_vec'). handleInvalid='keep' is set on every indexer.
cat_cols = [
    "workclass", "education", "marital_status", "occupation",
    "relationship", "race", "sex", "native_country",
]
string_indexers = [
    StringIndexer(inputCol=name, outputCol=f'{name}_idx').setHandleInvalid('keep')
    for name in cat_cols
]
ohe_indexers = [
    OneHotEncoder(inputCols=[indexer.getOutputCol()], outputCols=[f'{name}_vec'])
    for name, indexer in zip(cat_cols, string_indexers)
]

# Final feature vector: encoded categoricals first, then the imputed numerics.
assembler_cols = [f'{name}_vec' for name in cat_cols] + imputed_cols
vector_assembler = VectorAssembler(inputCols=assembler_cols, outputCol='features')

#### LinearSVC

In [20]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Preprocessing stages followed by a linear SVM, fitted on the training split.
svc = LinearSVC(featuresCol='features', labelCol='label')
svc_stages = [imputer, *string_indexers, *ohe_indexers, vector_assembler, svc]
pipeline = Pipeline(stages=svc_stages)
svc_model = pipeline.fit(train_df)

# Score the held-out validation split and the separate test file.
val_df_pred = svc_model.transform(val_df)
test_df_pred = svc_model.transform(test_df)

# Evaluator with its default settings; each metric is computed right before
# it is printed so the output order matches the evaluation order.
evaluator = BinaryClassificationEvaluator()
metric_name = evaluator.getMetricName()
print(f'Metric name: {metric_name}')
val_metric = evaluator.evaluate(val_df_pred)
print(f'CV Metric value: {val_metric}')
test_metric = evaluator.evaluate(test_df_pred)
print(f'Test Metric value: {test_metric}')

10-20 20:32:17.111 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Pushed Filters: 
10-20 20:32:17.112 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Post-Scan Filters: 
10-20 20:32:17.113 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Output Data Schema: struct<age: int, workclass: string, fnlwgt: double, education: string, marital_status: string ... 12 more fields>
10-20 20:32:17.197 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 24.564393 ms
10-20 20:32:17.264 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 36.934815 ms
10-20 20:32:17.267 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadca

[Stage 24:>                                                         (0 + 1) / 1]

10-20 20:32:26.114 172.17.0.2:54321      18300  0 (TID 26)  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 4.718918 ms
10-20 20:32:26.132 172.17.0.2:54321      18300  0 (TID 26)  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 7.961625 ms
10-20 20:32:27.018 172.17.0.2:54321      18300  0 (TID 26)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 24.0 (TID 26). 5776 bytes result sent to driver
10-20 20:32:27.019 172.17.0.2:54321      18300  t-getter-2  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 24.0 (TID 26) in 1693 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:32:27.020 172.17.0.2:54321      18300  t-getter-2  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 24.0, whose tasks have all completed, from pool 
10-20 20:32:27.020 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ResultStage 24 (treeAg

                                                                                

10-20 20:32:27.416 172.17.0.2:54321      18300  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Removed broadcast_37_piece0 on 5b5a8eb7561c:44751 in memory (size: 41.9 KiB, free: 434.3 MiB)
10-20 20:32:27.456 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_38 stored as values in memory (estimated size 888.0 B, free 434.0 MiB)
10-20 20:32:27.472 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_38_piece0 stored as bytes in memory (estimated size 860.0 B, free 434.0 MiB)
10-20 20:32:27.472 172.17.0.2:54321      18300  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Added broadcast_38_piece0 in memory on 5b5a8eb7561c:44751 (size: 860.0 B, free: 434.3 MiB)
10-20 20:32:27.474 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.SparkContext: Created broadcast 38 from broadcast at LinearSVC.scala:285
10-20 20:32:27.516 172.17.0.2:54321      183

[Stage 25:>                                                         (0 + 1) / 1]

10-20 20:32:28.594 172.17.0.2:54321      18300  0 (TID 27)  INFO org.apache.spark.storage.memory.MemoryStore: Block rdd_104_0 stored as values in memory (estimated size 3.6 MiB, free 430.2 MiB)
10-20 20:32:28.595 172.17.0.2:54321      18300  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Added rdd_104_0 in memory on 5b5a8eb7561c:44751 (size: 3.6 MiB, free: 430.7 MiB)
10-20 20:32:28.631 172.17.0.2:54321      18300  0 (TID 27)  WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
10-20 20:32:28.631 172.17.0.2:54321      18300  0 (TID 27)  WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
10-20 20:32:28.652 172.17.0.2:54321      18300  0 (TID 27)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 25.0 (TID 27). 3542 bytes result sent to driver
10-20 20:32:28.653 172.17.0.2:54321      18300  t-getter-3  INFO org.apache.spark.scheduler.Task

                                                                                

10-20 20:32:28.801 172.17.0.2:54321      18300  0 (TID 28)  INFO org.apache.spark.storage.BlockManager: Found block rdd_104_0 locally
10-20 20:32:28.823 172.17.0.2:54321      18300  0 (TID 28)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 26.0 (TID 28). 3456 bytes result sent to driver
10-20 20:32:28.825 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 26.0 (TID 28) in 55 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:32:28.825 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 26.0, whose tasks have all completed, from pool 
10-20 20:32:28.825 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ResultStage 26 (treeAggregate at RDDLossFunction.scala:61) finished in 0.068 s
10-20 20:32:28.826 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Job 15 is finished. Cance

[Stage 233:>                                                        (0 + 1) / 1]

10-20 20:32:42.159 172.17.0.2:54321      18300   (TID 235)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 233.0 (TID 235). 2133 bytes result sent to driver
10-20 20:32:42.160 172.17.0.2:54321      18300  t-getter-3  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 233.0 (TID 235) in 1070 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:32:42.160 172.17.0.2:54321      18300  t-getter-3  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 233.0, whose tasks have all completed, from pool 
10-20 20:32:42.161 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ShuffleMapStage 233 (map at BinaryClassificationMetrics.scala:48) finished in 1.136 s
10-20 20:32:42.161 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: looking for newly runnable stages
10-20 20:32:42.161 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: runni

                                                                                

10-20 20:32:42.699 172.17.0.2:54321      18300   (TID 239)  INFO org.apache.spark.storage.ShuffleBlockFetcherIterator: Getting 1 (90.5 KiB) non-empty blocks including 1 (90.5 KiB) local and 0 (0.0 B) host-local and 0 (0.0 B) remote blocks
10-20 20:32:42.699 172.17.0.2:54321      18300   (TID 239)  INFO org.apache.spark.storage.ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
10-20 20:32:42.843 172.17.0.2:54321      18300   (TID 239)  INFO org.apache.spark.storage.memory.MemoryStore: Block rdd_341_0 stored as values in memory (estimated size 82.4 KiB, free 433.7 MiB)
10-20 20:32:42.844 172.17.0.2:54321      18300  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Added rdd_341_0 in memory on 5b5a8eb7561c:44751 (size: 82.4 KiB, free: 434.2 MiB)
10-20 20:32:42.860 172.17.0.2:54321      18300   (TID 239)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 241.0 (TID 239). 1524 bytes result sent to driver
10-20 20:32:42.861 172.17.0.2:54321      18300

[Stage 242:>                                                        (0 + 1) / 1]

10-20 20:32:44.506 172.17.0.2:54321      18300   (TID 240)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 242.0 (TID 240). 1861 bytes result sent to driver
10-20 20:32:44.507 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 242.0 (TID 240) in 1024 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:32:44.507 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 242.0, whose tasks have all completed, from pool 
10-20 20:32:44.508 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ShuffleMapStage 242 (map at BinaryClassificationMetrics.scala:48) finished in 1.204 s
10-20 20:32:44.508 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: looking for newly runnable stages
10-20 20:32:44.508 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: runni

                                                                                

10-20 20:32:45.037 172.17.0.2:54321      18300   (TID 244)  INFO org.apache.spark.storage.memory.MemoryStore: Block rdd_359_0 stored as values in memory (estimated size 82.1 KiB, free 433.7 MiB)
10-20 20:32:45.039 172.17.0.2:54321      18300  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Added rdd_359_0 in memory on 5b5a8eb7561c:44751 (size: 82.1 KiB, free: 434.2 MiB)
10-20 20:32:45.044 172.17.0.2:54321      18300   (TID 244)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 250.0 (TID 244). 1524 bytes result sent to driver
10-20 20:32:45.046 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 250.0 (TID 244) in 116 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:32:45.046 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 250.0, whose tasks have all completed, from pool 
10-20 20:32:45.047 172.17.0.2:54321      18300  event-loop 

#### LogisticRegression

In [21]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Preprocessing stages followed by logistic regression, fitted on the
# training split.
lr = LogisticRegression(featuresCol='features', labelCol='label')
lr_stages = [imputer, *string_indexers, *ohe_indexers, vector_assembler, lr]
pipeline = Pipeline(stages=lr_stages)
lr_model = pipeline.fit(train_df)

# Score the held-out validation split and the separate test file.
val_df_pred = lr_model.transform(val_df)
test_df_pred = lr_model.transform(test_df)

# Evaluator with its default settings; each metric is computed right before
# it is printed so the output order matches the evaluation order.
evaluator = BinaryClassificationEvaluator()
metric_name = evaluator.getMetricName()
print(f'Metric name: {metric_name}')
val_metric = evaluator.evaluate(val_df_pred)
print(f'CV Metric value: {val_metric}')
test_metric = evaluator.evaluate(test_df_pred)
print(f'Test Metric value: {test_metric}')

10-20 20:33:30.601 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Pushed Filters: 
10-20 20:33:30.602 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Post-Scan Filters: 
10-20 20:33:30.602 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Output Data Schema: struct<age: int, workclass: string, fnlwgt: double, education: string, marital_status: string ... 12 more fields>
10-20 20:33:30.627 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_469 stored as values in memory (estimated size 176.1 KiB, free 433.6 MiB)
10-20 20:33:30.643 172.17.0.2:54321      18300  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Removed broadcast_463_piece0 on 5b5a8eb7561c:44751 in memory (size: 28.0 KiB, free: 434.3 MiB)
10-20 20:33:30.645 172.17.0.2:54321      1

                                                                                

10-20 20:33:36.186 172.17.0.2:54321      18300   (TID 267)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 271.0 (TID 267). 5776 bytes result sent to driver
10-20 20:33:36.187 172.17.0.2:54321      18300  t-getter-3  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 271.0 (TID 267) in 744 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:33:36.187 172.17.0.2:54321      18300  t-getter-3  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 271.0, whose tasks have all completed, from pool 
10-20 20:33:36.187 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ResultStage 271 (treeAggregate at Summarizer.scala:232) finished in 0.756 s
10-20 20:33:36.187 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Job 239 is finished. Cancelling potential speculative or zombie tasks for this job
10-20 20:33:36.188 172.17.0.2:54321      18300  event-loop  INFO org.apac

                                                                                

10-20 20:33:36.945 172.17.0.2:54321      18300   (TID 268)  INFO org.apache.spark.storage.memory.MemoryStore: Block rdd_454_0 stored as values in memory (estimated size 3.6 MiB, free 429.8 MiB)
10-20 20:33:36.945 172.17.0.2:54321      18300  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Added rdd_454_0 in memory on 5b5a8eb7561c:44751 (size: 3.6 MiB, free: 430.6 MiB)
10-20 20:33:36.970 172.17.0.2:54321      18300   (TID 268)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 272.0 (TID 268). 3630 bytes result sent to driver
10-20 20:33:36.970 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 272.0 (TID 268) in 720 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:33:36.971 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 272.0, whose tasks have all completed, from pool 
10-20 20:33:36.971 172.17.0.2:54321      18300  event-loop  I

                                                                                

10-20 20:33:43.474 172.17.0.2:54321      18300   (TID 355)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 363.0 (TID 355). 1861 bytes result sent to driver
10-20 20:33:43.475 172.17.0.2:54321      18300  t-getter-3  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 363.0 (TID 355) in 535 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:33:43.475 172.17.0.2:54321      18300  t-getter-3  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 363.0, whose tasks have all completed, from pool 
10-20 20:33:43.476 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ShuffleMapStage 363 (map at BinaryClassificationMetrics.scala:48) finished in 0.571 s
10-20 20:33:43.476 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: looking for newly runnable stages
10-20 20:33:43.476 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: runnin

#### DecisionTree

In [22]:
# Import the classifier this cell actually uses, so the cell does not depend
# on the name being supplied by some other cell.
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Preprocessing stages followed by a decision tree, fitted on the training
# split.
dt = DecisionTreeClassifier(labelCol='label', featuresCol='features')
dt_stages = [imputer] + string_indexers + ohe_indexers + [vector_assembler] + [dt]
pipeline = Pipeline().setStages(dt_stages)
dt_model = pipeline.fit(train_df)

# Score the held-out validation split and the separate test file, then report
# the evaluator's default metric for both.
val_df_pred = dt_model.transform(val_df)
test_df_pred = dt_model.transform(test_df)
evaluator = BinaryClassificationEvaluator()
print(f'Metric name: {evaluator.getMetricName()}')
print(f'CV Metric value: {evaluator.evaluate(val_df_pred)}')
print(f'Test Metric value: {evaluator.evaluate(test_df_pred)}')

10-20 20:35:01.097 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Pushed Filters: 
10-20 20:35:01.097 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Post-Scan Filters: 
10-20 20:35:01.097 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Output Data Schema: struct<age: int, workclass: string, fnlwgt: double, education: string, marital_status: string ... 12 more fields>
10-20 20:35:01.121 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_680 stored as values in memory (estimated size 176.1 KiB, free 433.0 MiB)
10-20 20:35:01.139 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_680_piece0 stored as bytes in memory (estimated size 28.0 KiB, free 433.0 MiB)
10-20 20:35:01.139 172.17.0.2:54321     

[Stage 396:>                                                        (0 + 1) / 1]

10-20 20:35:06.903 172.17.0.2:54321      18300   (TID 386)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 396.0 (TID 386). 2133 bytes result sent to driver
10-20 20:35:06.904 172.17.0.2:54321      18300  t-getter-2  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 396.0 (TID 386) in 856 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:35:06.904 172.17.0.2:54321      18300  t-getter-2  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 396.0, whose tasks have all completed, from pool 
10-20 20:35:06.905 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ShuffleMapStage 396 (flatMap at RandomForest.scala:1039) finished in 0.866 s
10-20 20:35:06.905 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: looking for newly runnable stages
10-20 20:35:06.905 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: running: Set()


                                                                                

10-20 20:35:07.639 172.17.0.2:54321      18300   (TID 388)  INFO org.apache.spark.storage.memory.MemoryStore: Block rdd_692_0 stored as values in memory (estimated size 12.6 MiB, free 420.4 MiB)
10-20 20:35:07.639 172.17.0.2:54321      18300  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Added rdd_692_0 in memory on 5b5a8eb7561c:44751 (size: 12.6 MiB, free: 421.6 MiB)
10-20 20:35:07.678 172.17.0.2:54321      18300   (TID 388)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 398.0 (TID 388). 2133 bytes result sent to driver
10-20 20:35:07.679 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 398.0 (TID 388) in 649 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:35:07.679 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 398.0, whose tasks have all completed, from pool 
10-20 20:35:07.679 172.17.0.2:54321      18300  event-loop 

                                                                                

10-20 20:35:07.840 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_723_piece0 stored as bytes in memory (estimated size 45.4 KiB, free 420.2 MiB)
10-20 20:35:07.841 172.17.0.2:54321      18300  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Added broadcast_723_piece0 in memory on 5b5a8eb7561c:44751 (size: 45.4 KiB, free: 421.5 MiB)
10-20 20:35:07.841 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.SparkContext: Created broadcast 723 from broadcast at DAGScheduler.scala:1388
10-20 20:35:07.841 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Submitting 1 missing tasks from ShuffleMapStage 400 (MapPartitionsRDD[696] at mapPartitions at RandomForest.scala:644) (first 15 tasks are for partitions Vector(0))
10-20 20:35:07.842 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Adding task set 400.0 with 1 tasks resource profile 

#### NaiveBayes

In [23]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Preprocessing stages followed by naive Bayes, fitted on the training split.
nb = NaiveBayes(featuresCol='features', labelCol='label')
nb_stages = [imputer, *string_indexers, *ohe_indexers, vector_assembler, nb]
pipeline = Pipeline(stages=nb_stages)
nb_model = pipeline.fit(train_df)

# Score the held-out validation split and the separate test file.
val_df_pred = nb_model.transform(val_df)
test_df_pred = nb_model.transform(test_df)

# Evaluator with its default settings (the captured output reports
# areaUnderROC); each metric is computed right before it is printed so the
# output order matches the evaluation order.
evaluator = BinaryClassificationEvaluator()
metric_name = evaluator.getMetricName()
print(f'Metric name: {metric_name}')
val_metric = evaluator.evaluate(val_df_pred)
print(f'CV Metric value: {val_metric}')
test_metric = evaluator.evaluate(test_df_pred)
print(f'Test Metric value: {test_metric}')

10-20 20:36:57.196 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Pushed Filters: 
10-20 20:36:57.196 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Post-Scan Filters: 
10-20 20:36:57.196 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Output Data Schema: struct<age: int, workclass: string, fnlwgt: double, education: string, marital_status: string ... 12 more fields>
10-20 20:36:57.217 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_746 stored as values in memory (estimated size 176.1 KiB, free 433.2 MiB)
10-20 20:36:57.230 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_746_piece0 stored as bytes in memory (estimated size 28.0 KiB, free 433.2 MiB)
10-20 20:36:57.231 172.17.0.2:54321     

[Stage 446:>                                                        (0 + 1) / 1]

10-20 20:37:01.603 172.17.0.2:54321      18300   (TID 430)  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 5.667343 ms
10-20 20:37:01.612 172.17.0.2:54321      18300   (TID 430)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 446.0 (TID 430). 2713 bytes result sent to driver
10-20 20:37:01.613 172.17.0.2:54321      18300  t-getter-2  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 446.0 (TID 430) in 795 ms on 5b5a8eb7561c (executor driver) (1/1)
10-20 20:37:01.613 172.17.0.2:54321      18300  t-getter-2  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 446.0, whose tasks have all completed, from pool 
10-20 20:37:01.613 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ShuffleMapStage 446 (collect at NaiveBayes.scala:193) finished in 0.802 s
10-20 20:37:01.613 172.17.0.2:54321      18300  event-loop  INFO org.apache.spark.scheduler.DAGScheduler



10-20 20:37:02.214 172.17.0.2:54321      18300   (TID 562)  INFO org.apache.spark.storage.ShuffleBlockFetcherIterator: Getting 0 (0.0 B) non-empty blocks including 0 (0.0 B) local and 0 (0.0 B) host-local and 0 (0.0 B) remote blocks
10-20 20:37:02.214 172.17.0.2:54321      18300   (TID 562)  INFO org.apache.spark.storage.ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
10-20 20:37:02.214 172.17.0.2:54321      18300  ent-loop-2  INFO org.apache.spark.scheduler.TaskSetManager: Starting task 136.0 in stage 447.0 (TID 567) (5b5a8eb7561c, executor driver, partition 136, PROCESS_LOCAL, 4453 bytes) taskResourceAssignments Map()
10-20 20:37:02.214 172.17.0.2:54321      18300  t-getter-0  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 133.0 in stage 447.0 (TID 564) in 16 ms on 5b5a8eb7561c (executor driver) (133/200)
10-20 20:37:02.214 172.17.0.2:54321      18300   (TID 563)  INFO org.apache.spark.executor.Executor: Finished task 132.0 in stage 447.0 (TID 563). 3277 

                                                                                

Metric name: areaUnderROC
10-20 20:37:03.140 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Pushed Filters: 
10-20 20:37:03.140 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Post-Scan Filters: 
10-20 20:37:03.140 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Output Data Schema: struct<age: int, workclass: string, fnlwgt: double, education: string, marital_status: string ... 12 more fields>
10-20 20:37:03.158 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_779 stored as values in memory (estimated size 176.1 KiB, free 432.9 MiB)
10-20 20:37:03.164 172.17.0.2:54321      18300    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_779_piece0 stored as bytes in memory (estimated size 28.0 KiB, free 432.9 MiB)
10-20 20:37:03