# Dataproc scaling experiment

In this notebook will test Dataproc scaling across a number of tasks in the context of fitting a machine learning model.

In [38]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import DenseVector
from pyspark.sql.functions import isnan, when, count, col
import datetime as dt

In [39]:
spark = SparkSession.builder.master('yarn').appName('spark-test').getOrCreate()

# Use the Cloud Storage bucket for temporary BigQuery export data used
# by the connector.
bucket = "pyspark_temp_20201006_eu"
spark.conf.set('temporaryGcsBucket', bucket)

In [40]:
def count_data(df):
    
    start = dt.datetime.now()
    print("Counting data")
    data_count = df.count()
    print("Data count", data_count)
    
    end = dt.datetime.now()
    delta = end - start
    print("COUNT: {}".format(delta))
    print()
    
def group_data(df):
    
    start = dt.datetime.now()
    print("Grouping data")
    
    data = df.groupBy("year").count().sort("year",ascending=False).toPandas()
    
    end = dt.datetime.now()
    delta = end - start
    print("GROUP: {}".format(delta))
    print()
    return data

def describe_data(df):
    
    start = dt.datetime.now()
    print("Describing data")
        
    describe = df.describe().toPandas()
    
    end = dt.datetime.now()
    delta = end - start
    print("Describe: {}".format(delta))
    print()
    return describe
    


def prep_model_data(df):

    start = dt.datetime.now()    
    print("Preparing model data")
    
    model_data = df.select("total_amount", "journey_length", "year", "month", 
                         "dayofweek", "pu_hour", "do_hour", 
                         "passenger_count", "trip_distance", 
                         "pu_m_center", 
                         "do_m_center", "pu_m_jfk", "do_m_jfk")
    
    model_data_map = model_data.rdd.map(lambda x: (x[0], DenseVector(x[1:])))    
    train_test = spark.createDataFrame(model_data_map, ["label", "features"])
    train, test = train_test.randomSplit([.8,.2],seed=1234)
    
    end = dt.datetime.now()
    delta = end - start
    print("Prep: {}".format(delta))
    print()
    
    return train, test


def fit_model(train):
    
    start = dt.datetime.now()    
    print("Fitting Mllib model")
    
    rf = RandomForestRegressor(featuresCol="features")
    model = rf.fit(train)
    
    end = dt.datetime.now()
    delta = end - start
    print("Fit: {}".format(delta))
    print()
    
    return model


def make_predictions(model, test):
    
    start = dt.datetime.now()
    print("Making predicitons with Mllib model")
    predictions = model.transform(test)
    
    end = dt.datetime.now()
    delta = end - start
    print("Predictions: {}".format(delta))
    print()
    
    return predictions
    
def evaluate_predictions(predictions):
    start = dt.datetime.now()
    
    print("Evaluating model")
    
    print("Calculating RMSE")
    evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    
    print("Calculating R2")
    evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="r2")
    r2 = evaluator.evaluate(predictions)
    
    print("RMSE: {:.2f}, R2 {:.2f}".format(rmse, r2))
    
    end = dt.datetime.now()
    delta = end - start
    print("Evaluation: {}".format(delta))
    print()
    
    
def run_speed_test(df):
    
    count_data(df)
    group_data(df)
    describe_data(df)
    train, test = prep_model_data(df)
    model = fit_model(train)
    predictions = make_predictions(model,test)
    evaluate_predictions(predictions)

# 1000 rows, 2 primary workers, 3 secondary

67GB Yarn memory



In [26]:
# Load data from BigQuery.
taxi_1k = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_1k').load()
taxi_1k.createOrReplaceTempView('taxi')

In [27]:
run_speed_test(taxi_1k)

Counting data
Data count 1000
COUNT: 0:00:00.359642

Grouping data
GROUP: 0:00:06.197512

Describing data
Describe: 0:00:04.627968

Preparing model data
Prep: 0:00:00.965123

Fitting Mllib model
Fit: 0:00:03.358198

Making predicitons with Mllib model
Predictions: 0:00:00.091519

Evaluating model
Calculating RMSE
Calculating R2
RMSE: 3.54, R2 0.75
Evaluation: 0:00:05.756070



# 10K rows

In [None]:
# Load data from BigQuery.
taxi_10k = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_10k').load()
taxi_10k.createOrReplaceTempView('taxi')

In [None]:
run_speed_test(taxi_10k)

100K rows

In [None]:
# Load data from BigQuery.
taxi_100k = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_100k').load()
taxi_100k.createOrReplaceTempView('taxi')

In [None]:
run_speed_test(taxi_100k)

1M rows


In [None]:
# Load data from BigQuery.
taxi_1m = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_1m').load()
taxi_1m.createOrReplaceTempView('taxi')

In [None]:
run_speed_test(taxi_1m)

10M rows

In [None]:
# Load data from BigQuery.
taxi_10m = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_10m').load()
taxi_10m.createOrReplaceTempView('taxi')

In [None]:
run_speed_test(taxi_10m)

# 2 primary workers 8 secondary workers 

98.72 GB memory

In [None]:
# 1K

In [None]:
# Load data from BigQuery.
taxi_1k = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_1k').load()
taxi_1k.createOrReplaceTempView('taxi')

In [None]:
run_speed_test(taxi_1k)

In [None]:
# 10K

In [None]:
# Load data from BigQuery.
taxi_10k = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_10k').load()
taxi_10k.createOrReplaceTempView('taxi')

In [None]:
run_speed_test(taxi_10k)

In [None]:
# 100K

In [None]:
# Load data from BigQuery.
taxi_100k = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_100k').load()
taxi_100k.createOrReplaceTempView('taxi')

In [None]:
run_speed_test(taxi_100k)

In [None]:
# 1M 

In [None]:
# Load data from BigQuery.
taxi_1m = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_1m').load()
taxi_1m.createOrReplaceTempView('taxi')

In [None]:
run_speed_test(taxi_1m)

In [None]:
# 10M 

In [None]:
# Load data from BigQuery.
taxi_10m = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_10m').load()
taxi_10m.createOrReplaceTempView('taxi')

In [None]:
run_speed_test(taxi_10m)

In [None]:
# 100M

In [None]:
# Load data from BigQuery.
taxi_100m = spark.read.format('bigquery').option('table', 'sap-ds-demo:big_data_demo_ew2.ny_taxi_100m').load()
taxi_100m.createOrReplaceTempView('taxi')

In [None]:
run_speed_test(taxi_100m)

In [None]:
# Load data from HDFS

In [12]:
# Load data from BigQuery.

taxi_1m = spark.read.csv('ny_taxi_1m')
taxi_1m.show()

+---------+--------------+----+-----+---------+-------+-------+---------------+-------------+-------------------+------------------+------------------+-------------------+------------------+------------+------------+------------------+------------------+------------------+------------------+
|      _c0|           _c1| _c2|  _c3|      _c4|    _c5|    _c6|            _c7|          _c8|                _c9|              _c10|              _c11|               _c12|              _c13|        _c14|        _c15|              _c16|              _c17|              _c18|              _c19|
+---------+--------------+----+-----+---------+-------+-------+---------------+-------------+-------------------+------------------+------------------+-------------------+------------------+------------+------------+------------------+------------------+------------------+------------------+
|vendor_id|journey_length|year|month|dayofweek|pu_hour|do_hour|passenger_count|trip_distance|   pickup_longitude|        

# Load data locally

In [16]:
taxi_1m = spark.read.load('ny_taxi_1m',
                    format='com.databricks.spark.csv', 
                    header='true', 
                    inferSchema='true').cache()

In [17]:
taxi_1m.show()

+---------+--------------+----+-----+---------+-------+-------+---------------+-------------+------------------+------------------+------------------+------------------+------------------+------------+------------+------------------+------------------+------------------+------------------+
|vendor_id|journey_length|year|month|dayofweek|pu_hour|do_hour|passenger_count|trip_distance|  pickup_longitude|         rate_code|store_and_fwd_flag| dropoff_longitude|  dropoff_latitude|payment_type|total_amount|       pu_m_center|       do_m_center|          pu_m_jfk|          do_m_jfk|
+---------+--------------+----+-----+---------+-------+-------+---------------+-------------+------------------+------------------+------------------+------------------+------------------+------------+------------+------------------+------------------+------------------+------------------+
|        1|            24|2015|    3|        2|      0|      0|              1|         17.3|-73.78961181640625| 40.64690017700

In [18]:
run_speed_test(taxi_1m)

Counting data
Data count 1000000
COUNT: 0:00:07.438796

Grouping data
GROUP: 0:00:02.772813

Describing data
Describe: 0:00:08.969309

Preparing model data
Prep: 0:00:01.381856

Fitting Mllib model
Fit: 0:01:00.381875

Making predicitons with Mllib model
Predictions: 0:00:00.185529

Evaluating model
Calculating RMSE
Calculating R2
RMSE: 5.70, R2 0.77
Evaluation: 0:00:27.620122



In [19]:
# 10 million rows local

In [22]:
taxi_10m = spark.read.load('ny_taxi_10m',
                    format='com.databricks.spark.csv', 
                    header='true', 
                    inferSchema='true').cache()

In [23]:
run_speed_test(taxi_10m)

Counting data
Data count 10000000
COUNT: 0:00:19.074863

Grouping data
GROUP: 0:00:02.437312

Describing data
Describe: 0:00:34.151717

Preparing model data
Prep: 0:00:01.307285

Fitting Mllib model
Fit: 0:03:25.537359

Making predicitons with Mllib model
Predictions: 0:00:00.036590

Evaluating model
Calculating RMSE
Calculating R2
RMSE: 5.50, R2 0.77
Evaluation: 0:01:35.566430



# GCS storage

# 1M

In [34]:
taxi_1m = spark.read.load('gs://pyspark_temp_20201006_eu/ny_taxi_1m',
                    format='com.databricks.spark.csv', 
                    header='true', 
                    inferSchema='true').cache()

In [35]:
run_speed_test(taxi_1m)

Counting data
Data count 1000000
COUNT: 0:00:00.081962

Grouping data
GROUP: 0:00:00.415750

Describing data
Describe: 0:00:04.898204

Preparing model data
Prep: 0:00:00.306620

Fitting Mllib model
Fit: 0:00:31.139412

Making predicitons with Mllib model
Predictions: 0:00:00.023607

Evaluating model
Calculating RMSE
Calculating R2
RMSE: 5.17, R2 0.81
Evaluation: 0:00:13.660784



In [30]:
run_speed_test(taxi_1m)

Counting data
Data count 1000000
COUNT: 0:00:00.101730

Grouping data
GROUP: 0:00:00.721013

Describing data
Describe: 0:00:05.284211

Preparing model data
Prep: 0:00:00.110020

Fitting Mllib model
Fit: 0:00:35.709474

Making predicitons with Mllib model
Predictions: 0:00:00.061698

Evaluating model
Calculating RMSE
Calculating R2
RMSE: 5.17, R2 0.81
Evaluation: 0:00:14.225229



# 10m

10 million row file is the same data which was uploaded to HDFS. This file took the 3 files exported from BQ and manually combined using bash cat commands.

In [32]:
taxi_10m = spark.read.load('gs://pyspark_temp_20201006_eu/ny_taxi_10m',
                    format='com.databricks.spark.csv', 
                    header='true', 
                    inferSchema='true').cache()

In [33]:
run_speed_test(taxi_10m)

Counting data
Data count 10000000
COUNT: 0:00:14.354154

Grouping data
GROUP: 0:00:02.460545

Describing data
Describe: 0:00:19.528686

Preparing model data
Prep: 0:00:01.391287

Fitting Mllib model
Fit: 0:02:24.247229

Making predicitons with Mllib model
Predictions: 0:00:00.077017

Evaluating model
Calculating RMSE
Calculating R2
RMSE: 5.71, R2 0.75
Evaluation: 0:01:06.466939



# 10 million multifile

In [36]:
taxi_10m = spark.read.load('gs://pyspark_temp_20201006_eu/ny_taxi_10m000*',
                    format='com.databricks.spark.csv', 
                    header='true', 
                    inferSchema='true').cache()

In [37]:
run_speed_test(taxi_10m)

Counting data
Data count 10000000
COUNT: 0:00:09.581532

Grouping data
GROUP: 0:00:00.748562

Describing data
Describe: 0:00:19.096682

Preparing model data
Prep: 0:00:00.291812

Fitting Mllib model
Fit: 0:02:20.994508

Making predicitons with Mllib model
Predictions: 0:00:00.030538

Evaluating model
Calculating RMSE
Calculating R2
RMSE: 5.27, R2 0.78
Evaluation: 0:01:04.689766



# 100 million multifile

2 pw, 5 secondary

In [41]:
taxi_100m = spark.read.load('gs://pyspark_temp_20201006_eu/ny_taxi_100m_000*',
                    format='com.databricks.spark.csv', 
                    header='true', 
                    inferSchema='true').cache()

In [None]:
run_speed_test(taxi_100m)

Counting data
Data count 100000000
COUNT: 0:01:15.780924

Grouping data
GROUP: 0:00:01.561427

Describing data
Describe: 0:02:33.502025

Preparing model data
Prep: 0:00:00.382757

Fitting Mllib model
Fit: 0:23:38.931776

Making predicitons with Mllib model
Predictions: 0:00:00.122678

Evaluating model
Calculating RMSE
Calculating R2
RMSE: 4801.70, R2 0.00
Evaluation: 0:10:32.800169

