## Income Prediction with Spark Random Forest

### Install findspark and init

In [1]:
!pip install findspark



In [2]:
import findspark
# Must run before the first pyspark import, which happens in the next cell.
# NOTE(review): presumably this is what makes the local Spark installation importable - confirm.
findspark.init()

### Get Spark and H2O sessions

In [3]:
from pyspark.sql import SparkSession
from pysparkling import H2OContext
import h2o

# Build (or reuse) the Spark session for this notebook with dynamic
# allocation switched off. SparkSession is imported at the top of this cell.
session_builder = SparkSession.builder.appName('cognetix-spark-nb')
session_builder = session_builder.config('spark.dynamicAllocation.enabled', 'false')
spark = session_builder.getOrCreate()

# Quieten Spark's own logging, then keep a handle on the SparkContext.
spark.sparkContext.setLogLevel("ERROR")
sc = spark.sparkContext

# Start (or attach to) the H2O context on top of Spark and grab the cluster handle.
hc = H2OContext.getOrCreate()
h2o_cluster = h2o.cluster()

23/10/20 14:40:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


10-20 14:40:07.224 172.17.0.2:54321      6000     Thread-4  INFO water.default: ----- H2O started  -----
10-20 14:40:07.225 172.17.0.2:54321      6000     Thread-4  INFO water.default: Build git branch: rel-zz_kurka
10-20 14:40:07.225 172.17.0.2:54321      6000     Thread-4  INFO water.default: Build git hash: 5ff8870f912c6110d7b6988f577c020de10496ec
10-20 14:40:07.226 172.17.0.2:54321      6000     Thread-4  INFO water.default: Build git describe: jenkins-3.40.0.3-122-g5ff8870
10-20 14:40:07.226 172.17.0.2:54321      6000     Thread-4  INFO water.default: Build project version: 3.40.0.4
10-20 14:40:07.226 172.17.0.2:54321      6000     Thread-4  INFO water.default: Build age: 5 months and 22 days
10-20 14:40:07.226 172.17.0.2:54321      6000     Thread-4  INFO water.default: Built by: 'jenkins'
10-20 14:40:07.226 172.17.0.2:54321      6000     Thread-4  INFO water.default: Built on: '2023-04-28 12:08:23'
10-20 14:40:07.227 172.17.0.2:54321      6000     Thread-4  WARN water.default: 


0,1
H2O_cluster_uptime:,11 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,5 months and 22 days
H2O_cluster_name:,sparkling-water-root_local-1697812804151
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4



Sparkling Water Context:
 * Sparkling Water Version: 3.40.0.4-1-3.1
 * H2O name: sparkling-water-root_local-1697812804151
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (0,172.17.0.2,54321)
  ------------------------

  Open H2O Flow in browser: http://95675304fa2d:54323 (CMD + click in Mac OSX)

    


### Global params

In [4]:
# --- Model / split settings shared by the cells below ---
# Intended maximum tree depth for the random forest.
max_depth = 5
# NOTE(review): learning_rate is not referenced by any cell visible in this
# notebook - confirm whether it is still needed.
learning_rate = 0.01
# Weight of the training part in the random train/validation split;
# the validation part gets 1 - train_rate.
train_rate = 0.8
# Seed passed to the random train/validation split.
seed = 42

# --- Input files (read as headerless CSV) ---
train_path = '../data/census-train.csv'
test_path = '../data/census-test.csv'
# --- Output locations: saved pipeline model and parquet predictions ---
model_path = 'outputs/income_rf_spark'
pred_path = 'outputs/income_rf_spark_pred'

### Load data and basic transformations

In [5]:
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType

# Column layout of the headerless census CSV, in file order.
census_fields = [
    ("age", IntegerType()),
    ("workclass", StringType()),
    ("fnlwgt", DoubleType()),
    ("education", StringType()),
    ("education_num", IntegerType()),
    ("marital_status", StringType()),
    ("occupation", StringType()),
    ("relationship", StringType()),
    ("race", StringType()),
    ("sex", StringType()),
    ("capital_gain", DoubleType()),
    ("capital_loss", DoubleType()),
    ("hours_per_week", DoubleType()),
    ("native_country", StringType()),
    ("income_level", StringType()),
]
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in census_fields
])

# Binary target: 1 when the raw income_level text contains '>50K', else 0.
is_high_income = col('income_level').contains('>50K')

train_df = (
    spark.read
    .format('csv')
    .options(header='false', delimiter=',')
    .schema(schema)
    .load(train_path)
    .drop('education_num')
    .withColumn('label', when(is_high_income, lit(1)).otherwise(lit(0)))
    .drop('income_level')
)

# Rewrite the ' ?' placeholder to 'NA' in the three columns that carry it.
for placeholder_col in ('workclass', 'occupation', 'native_country'):
    train_df = train_df.withColumn(
        placeholder_col,
        when(col(placeholder_col) == ' ?', lit('NA')).otherwise(col(placeholder_col)),
    )

10-20 14:40:18.586 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.InMemoryFileIndex: It took 30 ms to list leaf files for 1 paths.


### Explore train data

In [6]:
# Row count of the full training file (before the train/validation split further down).
train_df.count()

10-20 14:40:24.555 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Pushed Filters: 
10-20 14:40:24.556 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Post-Scan Filters: 
10-20 14:40:24.558 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Output Data Schema: struct<>
10-20 14:40:25.139 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 173.773322 ms
10-20 14:40:25.162 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 12.195505 ms
10-20 14:40:25.203 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_0 stored as values in memory (estimated size 176.1 KiB, free 434.2 MiB)
10-20 14:40:25.263 172.17.0.2

[Stage 0:>                                                          (0 + 1) / 1]

10-20 14:40:26.194 172.17.0.2:54321      6000   .0 (TID 0)  INFO org.apache.spark.sql.execution.datasources.FileScanRDD: Reading File path: file:///home/jovyan/data/census-train.csv, range: 0-3974305, partition values: [empty row]
10-20 14:40:26.248 172.17.0.2:54321      6000   .0 (TID 0)  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 46.793064 ms
10-20 14:40:26.601 172.17.0.2:54321      6000   .0 (TID 0)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 0.0 (TID 0). 1965 bytes result sent to driver
10-20 14:40:26.627 172.17.0.2:54321      6000   t-getter-0  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 892 ms on 95675304fa2d (executor driver) (1/1)
10-20 14:40:26.636 172.17.0.2:54321      6000   t-getter-0  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
10-20 14:40:26.650 172.17.0.2:54321      6000   event-loop  IN

                                                                                

32561

In [7]:
# Columns left after dropping education_num / income_level and adding label.
train_df.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'label']

In [8]:
# Show the column types that result from the explicit read schema.
train_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: double (nullable = true)
 |-- capital_loss: double (nullable = true)
 |-- hours_per_week: double (nullable = true)
 |-- native_country: string (nullable = true)
 |-- label: integer (nullable = false)



### Split train data

In [9]:
# Seeded random split into training and validation parts.
# NOTE(review): train_df is rebound to its own subset here, so re-running this
# cell on its own splits the already-split data again.
split_weights = [train_rate, 1-train_rate]
train_df, val_df = train_df.randomSplit(weights=split_weights, seed=seed)

train_rows = train_df.count()
print(f'Train split size: {train_rows}')
val_rows = val_df.count()
print(f'Validation split size: {val_rows}')

10-20 14:40:30.727 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Pushed Filters: 
10-20 14:40:30.727 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Post-Scan Filters: 
10-20 14:40:30.728 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Output Data Schema: struct<age: int, workclass: string, fnlwgt: double, education: string, marital_status: string ... 12 more fields>
10-20 14:40:30.826 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 51.186079 ms
10-20 14:40:30.834 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_3 stored as values in memory (estimated size 176.1 KiB, free 434.0 MiB)
10-20 14:40:30.870 172.17.0.2:54321      6000   agerMaster  INFO org.apache.spark.stor

                                                                                

10-20 14:40:31.701 172.17.0.2:54321      6000   .0 (TID 2)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 2.0 (TID 2). 2732 bytes result sent to driver
10-20 14:40:31.705 172.17.0.2:54321      6000   t-getter-2  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 2.0 (TID 2) in 720 ms on 95675304fa2d (executor driver) (1/1)
10-20 14:40:31.706 172.17.0.2:54321      6000   t-getter-2  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool 
10-20 14:40:31.706 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ShuffleMapStage 2 (count at NativeMethodAccessorImpl.java:0) finished in 0.729 s
10-20 14:40:31.706 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler.DAGScheduler: looking for newly runnable stages
10-20 14:40:31.707 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler.DAGScheduler: running: Set()
10-20 

### Class distribution

In [10]:
# Rows per class in the training split; the aggregate has one row per label
# value, so it is pulled to pandas for display.
train_df.groupBy('label').count().toPandas()

10-20 14:40:34.059 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Pushed Filters: 
10-20 14:40:34.060 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Post-Scan Filters: 
10-20 14:40:34.061 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy: Output Data Schema: struct<age: int, workclass: string, fnlwgt: double, education: string, marital_status: string ... 12 more fields>
10-20 14:40:34.132 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 31.058113 ms
10-20 14:40:34.217 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 56.432359 ms
10-20 14:40:34.222 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadca



10-20 14:40:35.389 172.17.0.2:54321      6000   0 (TID 91)  INFO org.apache.spark.storage.ShuffleBlockFetcherIterator: Getting 0 (0.0 B) non-empty blocks including 0 (0.0 B) local and 0 (0.0 B) host-local and 0 (0.0 B) remote blocks
10-20 14:40:35.389 172.17.0.2:54321      6000   0 (TID 91)  INFO org.apache.spark.storage.ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
10-20 14:40:35.395 172.17.0.2:54321      6000   ent-loop-3  INFO org.apache.spark.scheduler.TaskSetManager: Starting task 84.0 in stage 7.0 (TID 92) (95675304fa2d, executor driver, partition 84, PROCESS_LOCAL, 4453 bytes) taskResourceAssignments Map()
10-20 14:40:35.395 172.17.0.2:54321      6000   t-getter-3  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 80.0 in stage 7.0 (TID 88) in 32 ms on 95675304fa2d (executor driver) (82/200)
10-20 14:40:35.397 172.17.0.2:54321      6000   0 (TID 92)  INFO org.apache.spark.executor.Executor: Running task 84.0 in stage 7.0 (TID 92)
10-20 14:40:35.404 17



10-20 14:40:35.799 172.17.0.2:54321      6000    (TID 169)  INFO org.apache.spark.storage.ShuffleBlockFetcherIterator: Getting 0 (0.0 B) non-empty blocks including 0 (0.0 B) local and 0 (0.0 B) host-local and 0 (0.0 B) remote blocks
10-20 14:40:35.799 172.17.0.2:54321      6000    (TID 169)  INFO org.apache.spark.storage.ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
10-20 14:40:35.801 172.17.0.2:54321      6000    (TID 166)  INFO org.apache.spark.executor.Executor: Finished task 158.0 in stage 7.0 (TID 166). 3832 bytes result sent to driver
10-20 14:40:35.802 172.17.0.2:54321      6000   ent-loop-0  INFO org.apache.spark.scheduler.TaskSetManager: Starting task 162.0 in stage 7.0 (TID 170) (95675304fa2d, executor driver, partition 162, PROCESS_LOCAL, 4453 bytes) taskResourceAssignments Map()
10-20 14:40:35.802 172.17.0.2:54321      6000   t-getter-1  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 158.0 in stage 7.0 (TID 166) in 19 ms on 95675304fa2d (execu

                                                                                

Unnamed: 0,label,count
0,1,6289
1,0,19787


### ML Pipeline

In [None]:
from pyspark.ml.feature import (
    StringIndexer,
    VectorAssembler,
    OneHotEncoder,
    Imputer,
)
from pyspark.ml import (
    Pipeline
)
from pyspark.ml.classification import(
    RandomForestClassifier,
)

# Column groups that drive the feature pipeline.
cols_to_impute = ['fnlwgt', 'age', 'capital_gain', 'capital_loss', 'hours_per_week']
cat_cols = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
imputed_cols = [f'{x}_IMPUTED' for x in cols_to_impute]

# Numeric columns: mean imputation into new *_IMPUTED columns.
imputer = Imputer(strategy='mean', inputCols=cols_to_impute, outputCols=imputed_cols)

# Categorical columns: index each one (handleInvalid='keep', so values not
# seen during fitting do not raise at transform time), then one-hot encode
# the index into a *_vec column.
string_indexers = []
ohe_indexers = []
for cat_col in cat_cols:
    si = StringIndexer(inputCol=cat_col, outputCol=f'{cat_col}_idx').setHandleInvalid('keep')
    enc = OneHotEncoder(inputCols=[si.getOutputCol()], outputCols=[f'{cat_col}_vec'])
    string_indexers.append(si)
    ohe_indexers.append(enc)

# Single feature vector: encoded categoricals first, imputed numerics last.
assembler_cols = [f'{c}_vec' for c in cat_cols] + imputed_cols
vector_assembler = VectorAssembler(inputCols=assembler_cols, outputCol='features')

# Pass the notebook-level max_depth and seed to the estimator so the configured
# values actually take effect and repeated fits use the same seed.
rf = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    maxDepth=max_depth,
    seed=seed,
)

# Stage order: impute -> index -> encode -> assemble -> classify.
rf_stages = [imputer] + string_indexers + ohe_indexers + [vector_assembler] + [rf]
pipeline = Pipeline().setStages(rf_stages)
rf_model = pipeline.fit(train_df)

# Persist the fitted pipeline; an existing model at model_path is replaced.
rf_model.write().overwrite().save(model_path)

### Evaluation

In [12]:
from pyspark.ml import (
    PipelineModel
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Reload the persisted pipeline and score both splits with it.
pipeline_model = PipelineModel.load(model_path)
train_df_pred = pipeline_model.transform(train_df)
val_df_pred = pipeline_model.transform(val_df)

# Evaluator with its default settings; report its metric on the validation split.
evaluator = BinaryClassificationEvaluator()
metric_name = evaluator.getMetricName()
print(f'Metric name: {metric_name}')
metric_value = evaluator.evaluate(val_df_pred)
print(f'Metric value: {metric_value}')


10-20 14:40:55.289 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_68 stored as values in memory (estimated size 176.1 KiB, free 433.4 MiB)
10-20 14:40:55.314 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_68_piece0 stored as bytes in memory (estimated size 27.2 KiB, free 433.4 MiB)
10-20 14:40:55.315 172.17.0.2:54321      6000   agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Added broadcast_68_piece0 in memory on 95675304fa2d:39707 (size: 27.2 KiB, free: 434.3 MiB)
10-20 14:40:55.317 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.SparkContext: Created broadcast 68 from textFile at NativeMethodAccessorImpl.java:0
10-20 14:40:55.417 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.SparkContext: Starting job: runJob at PythonRDD.scala:166
10-20 14:40:55.420 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler

[Stage 128:>                                                        (0 + 1) / 1]

10-20 14:41:03.962 172.17.0.2:54321      6000    (TID 349)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 128.0 (TID 349). 2133 bytes result sent to driver
10-20 14:41:03.963 172.17.0.2:54321      6000   t-getter-1  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 128.0 (TID 349) in 827 ms on 95675304fa2d (executor driver) (1/1)
10-20 14:41:03.963 172.17.0.2:54321      6000   t-getter-1  INFO org.apache.spark.scheduler.TaskSchedulerImpl: Removed TaskSet 128.0, whose tasks have all completed, from pool 
10-20 14:41:03.964 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler.DAGScheduler: ShuffleMapStage 128 (map at BinaryClassificationMetrics.scala:48) finished in 0.844 s
10-20 14:41:03.964 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler.DAGScheduler: looking for newly runnable stages
10-20 14:41:03.964 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler.DAGScheduler: runnin

                                                                                

10-20 14:41:04.213 172.17.0.2:54321      6000   ad-pool-15  INFO org.apache.spark.storage.BlockManager: Removing RDD 369


### Feature Importance

In [13]:
import pandas as pd

# The classifier is the last stage of the fitted pipeline.
fitted_model = pipeline_model.stages[-1]
feature_importances = fitted_model.featureImportances

# Attribute metadata of the assembled 'features' column: one entry per vector
# slot, carrying the slot index ('idx') and a readable 'name'.
feature_attrs = train_df_pred.schema['features'].metadata['ml_attr']['attrs']
binary_features = feature_attrs['binary']
numerical_features = feature_attrs['numeric']

# Look up each slot's importance by index and key the result by feature name.
features_map = pd.DataFrame(binary_features + numerical_features)
features_map['score'] = features_map['idx'].map(lambda slot: feature_importances[slot])
features_map = features_map.set_index('name')[['score']]
print(f'Feature importance: {features_map}')

Feature importance:                                     score
name                                     
workclass_vec_ Private           0.000400
workclass_vec_ Self-emp-not-inc  0.000021
workclass_vec_ Local-gov         0.000000
workclass_vec_NA                 0.000000
workclass_vec_ State-gov         0.000000
...                                   ...
fnlwgt_IMPUTED                   0.000876
age_IMPUTED                      0.013504
capital_gain_IMPUTED             0.172791
capital_loss_IMPUTED             0.032275
hours_per_week_IMPUTED           0.031168

[106 rows x 1 columns]


### Inference

In [14]:
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType
from pyspark.sql.types import FloatType
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from pyspark.sql.functions import when
from pyspark.ml import (
    PipelineModel
)

 

# Built-in vector -> array conversion; replaces the per-row Python UDF that was
# used only to pick one element out of the probability vector.
from pyspark.ml.functions import vector_to_array

# Same headerless column layout as the training file.
schema = StructType([
    StructField("age", IntegerType(), True),
    StructField("workclass", StringType(), True),
    StructField("fnlwgt", DoubleType(), True),
    StructField("education", StringType(), True),
    StructField("education_num", IntegerType(), True),
    StructField("marital_status", StringType(), True),
    StructField("occupation", StringType(), True),
    StructField("relationship", StringType(), True),
    StructField("race", StringType(), True),
    StructField("sex", StringType(), True),
    StructField("capital_gain", DoubleType(), True),
    StructField("capital_loss", DoubleType(), True),
    StructField("hours_per_week", DoubleType(), True),
    StructField("native_country", StringType(), True),
    StructField("income_level", StringType(), True),
])

# Scoring needs only the feature columns, so education_num and income_level
# are dropped directly; no label column is derived here.
test_df = (
    spark.read
    .format('csv')
    .option('header', 'false')
    .option('delimiter', ',')
    .schema(schema)
    .load(test_path)
    .drop('education_num', 'income_level')
)

# Rewrite the ' ?' placeholder to 'NA', the same rewrite the training data gets.
for placeholder_col in ('workclass', 'occupation', 'native_country'):
    test_df = test_df.withColumn(
        placeholder_col,
        when(col(placeholder_col) == ' ?', lit('NA')).otherwise(col(placeholder_col)),
    )

# Score with the persisted pipeline.
pipeline_model = PipelineModel.load(model_path)
test_df_pred = pipeline_model.transform(test_df)

# 'prob' = element 1 of the probability vector, cast to float so the written
# column keeps the FloatType it had before.
test_df_pred = test_df_pred.withColumn(
    'prob',
    vector_to_array(col('probability'))[1].cast('float'),
)

# Only the model outputs are persisted; an existing dataset at pred_path is replaced.
cols = ['rawPrediction', 'probability', 'prediction', 'prob']
test_df_pred.select(*cols).write.parquet(pred_path, mode='overwrite')

10-20 14:41:37.758 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.InMemoryFileIndex: It took 1 ms to list leaf files for 1 paths.
10-20 14:41:37.845 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_216 stored as values in memory (estimated size 176.1 KiB, free 433.3 MiB)
10-20 14:41:37.862 172.17.0.2:54321      6000   agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Removed broadcast_211_piece0 on 95675304fa2d:39707 in memory (size: 85.7 KiB, free: 434.3 MiB)
10-20 14:41:37.863 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_216_piece0 stored as bytes in memory (estimated size 27.2 KiB, free 433.6 MiB)
10-20 14:41:37.864 172.17.0.2:54321      6000   agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Added broadcast_216_piece0 in memory on 95675304fa2d:39707 (size: 27.2 KiB, free: 434.3 MiB)
10-20 14:41:37.866 

[Stage 221:>                                                        (0 + 1) / 1]

10-20 14:41:44.372 172.17.0.2:54321      6000    (TID 458)  INFO org.apache.spark.sql.execution.python.PythonUDFRunner: Times: total = 995, boot = 6, init = 256, finish = 733
10-20 14:41:44.378 172.17.0.2:54321      6000    (TID 458)  INFO org.apache.parquet.hadoop.InternalParquetRecordWriter: Flushing mem columnStore to file. allocated memory: 644784
10-20 14:41:44.427 172.17.0.2:54321      6000    (TID 458)  INFO org.apache.spark.mapred.SparkHadoopMapRedUtil: attempt_202310201441434029071695467480550_0221_m_000000_458: Committed
10-20 14:41:44.430 172.17.0.2:54321      6000    (TID 458)  INFO org.apache.spark.executor.Executor: Finished task 0.0 in stage 221.0 (TID 458). 3254 bytes result sent to driver
10-20 14:41:44.431 172.17.0.2:54321      6000   t-getter-3  INFO org.apache.spark.scheduler.TaskSetManager: Finished task 0.0 in stage 221.0 (TID 458) in 1390 ms on 95675304fa2d (executor driver) (1/1)
10-20 14:41:44.432 172.17.0.2:54321      6000   t-getter-3  INFO org.apache.spark.s

                                                                                

In [15]:
# Preview the first rows of the predictions written by the inference cell.
# Reads from pred_path (the same variable the writer uses) and names the
# parquet format explicitly rather than relying on the default data source.
spark.read.parquet(pred_path).limit(10).toPandas()

10-20 14:42:25.309 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.sql.execution.datasources.InMemoryFileIndex: It took 4 ms to list leaf files for 1 paths.
10-20 14:42:25.352 172.17.0.2:54321      6000     Thread-4  INFO org.apache.spark.SparkContext: Starting job: load at NativeMethodAccessorImpl.java:0
10-20 14:42:25.353 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Got job 188 (load at NativeMethodAccessorImpl.java:0) with 1 output partitions
10-20 14:42:25.353 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Final stage: ResultStage 222 (load at NativeMethodAccessorImpl.java:0)
10-20 14:42:25.353 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Parents of final stage: List()
10-20 14:42:25.353 172.17.0.2:54321      6000   event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Missing parents: List()
10-20 14:42:25.354 172.17.0.2:54321      6000   e

Unnamed: 0,rawPrediction,probability,prediction,prob
0,"[16.37736275283466, 3.622637247165343]","[0.8188681376417328, 0.18113186235826712]",0.0,0.181132
1,"[19.010115101836554, 0.9898848981634504]","[0.9505057550918276, 0.04949424490817251]",0.0,0.049494
2,"[13.375161764175118, 6.624838235824885]","[0.6687580882087558, 0.33124191179124424]",0.0,0.331242
3,"[13.326906456552484, 6.673093543447515]","[0.6663453228276242, 0.33365467717237574]",0.0,0.333655
4,"[8.207082112497687, 11.792917887502313]","[0.4103541056248844, 0.5896458943751156]",1.0,0.589646
5,"[19.021352795491893, 0.9786472045081104]","[0.9510676397745945, 0.04893236022540551]",0.0,0.048932
6,"[18.497068651104033, 1.5029313488959692]","[0.9248534325552015, 0.07514656744479845]",0.0,0.075147
7,"[18.23298354762287, 1.7670164523771308]","[0.9116491773811435, 0.08835082261885654]",0.0,0.088351
8,"[10.331123697187872, 9.668876302812127]","[0.5165561848593936, 0.4834438151406063]",0.0,0.483444
9,"[18.87149111861569, 1.1285088813843114]","[0.9435745559307845, 0.05642544406921557]",0.0,0.056425


10-20 14:42:25.673 172.17.0.2:54321      6000   agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Removed broadcast_358_piece0 on 95675304fa2d:39707 in memory (size: 28.0 KiB, free: 434.1 MiB)
10-20 14:42:25.677 172.17.0.2:54321      6000   agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Removed broadcast_359_piece0 on 95675304fa2d:39707 in memory (size: 143.1 KiB, free: 434.3 MiB)
10-20 14:44:58.935 172.17.0.2:54321      6000   agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Removed broadcast_360_piece0 on 95675304fa2d:39707 in memory (size: 30.1 KiB, free: 434.3 MiB)
10-20 14:44:58.970 172.17.0.2:54321      6000   agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Removed broadcast_362_piece0 on 95675304fa2d:39707 in memory (size: 5.3 KiB, free: 434.3 MiB)
