In [1]:
import pandas as pd
from spark_session import LocalSparkSession
from dataset import Dataset
from mr_id3 import MapReduceIDR3
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from crossvalidation_pyspark import CrossValidationPySpark
from crossvalidation_sklearn import CrossValidationSkLearn
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.classification import DecisionTreeClassifier

In [2]:
%time
num_fields = [
    'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
    'hours_per_week', ]

categorical_fields = [
    'workclass', 'education',
    'marital_status', 'occupation', 'relationship',
    'race', 'sex', 'native_country', ]

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.44 µs


In [3]:
%time
target = 'label'
filename = 'dataset/adult.data'

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.44 µs


In [4]:
%time
number_of_cores = 4
dataset_size = 1000
dataset_sizes = list(range(1,200, 20))
metrics = []

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.44 µs


In [5]:
%time
spark = LocalSparkSession(number_of_cores)
spark.start()

2022-10-15 20:31:47,448 [INFO] LocalSparkSession : Starting with 4 clusters


CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/15 20:31:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
%time
dataset = Dataset(spark.spark, filename, num_fields, categorical_fields, target)
dataset.load()
dataset.one_hot_encode_categorical_fields()

2022-10-15 20:31:51,785 [INFO] Dataset : Starting
2022-10-15 20:31:51,790 [INFO] Dataset : Loading Dataset dataset/adult.data


CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 12.4 µs


2022-10-15 20:31:55,469 [INFO] Dataset : One Hot Encode Categorical Fields
                                                                                

In [7]:
metrics = []

In [8]:
dt = DecisionTreeClassifier()

In [9]:
for dataset_size in dataset_sizes:
    df = dataset.multiply_dataset(dataset_size)
    df_pandas = df.toPandas()
    metric_dict = {'dataset_size': dataset_size, }
    
    # PySpark
    crossvalidation_pyspark = CrossValidationPySpark(df)
    parameters = ParamGridBuilder() \
        .addGrid(dt.maxDepth, [10, 20, 30, 40, 50, 60, 70]).build()
    crossvalidation_pyspark.train(parameters)
    m = crossvalidation_pyspark.get_metrics()
    metric_dict['pyspark'] = m['time']
    
    # SKLearn
    crossvalidation_sklearn = CrossValidationSkLearn(df_pandas)
    crossvalidation_sklearn.set_x_y()
    parameters = dict(
        #df__criterion=['entropy', ],
        dt__max_depth=[10, 20, 30, 40, 50, 60, 70],
        #dt__min_samples_split=[1, 2, 3],
        #dt__max_features=[16, 32, 64]
    )
    crossvalidation_sklearn.train(parameters)
    m = crossvalidation_sklearn.get_metrics()
    metric_dict['sklearn'] = m['time']
    
    metrics.append(metric_dict)

2022-10-15 20:32:04,033 [INFO] Dataset : Multiplying Dataset by 1x
2022-10-15 20:32:04,217 [INFO] MapReduceIDR3 : Starting
2022-10-15 20:32:04,220 [INFO] CrossValidationSkLearn : Setting X and y
2022-10-15 20:32:07,375 [INFO] CrossValidationPySpark : Training                
2022-10-15 20:32:07,676 [INFO] CrossValidationPySpark : Training time 0.299632 seconds
2022-10-15 20:32:07,677 [INFO] CrossValidationPySpark : Getting metrics
2022-10-15 20:32:07,680 [INFO] Dataset : Multiplying Dataset by 21x
2022-10-15 20:32:09,631 [INFO] MapReduceIDR3 : Starting
2022-10-15 20:32:09,633 [INFO] CrossValidationSkLearn : Setting X and y
2022-10-15 20:32:21,848 [INFO] CrossValidationPySpark : Training                
2022-10-15 20:32:22,659 [INFO] CrossValidationPySpark : Training time 0.809146 seconds
2022-10-15 20:32:22,660 [INFO] CrossValidationPySpark : Getting metrics
2022-10-15 20:32:22,662 [INFO] Dataset : Multiplying Dataset by 41x
2022-10-15 20:32:27,477 [INFO] MapReduceIDR3 : Starting
2022-

KeyboardInterrupt: 

# CrossValidationPySpark

In [None]:
# %time
# crossvalidation_pyspark = CrossValidationPySpark(df)

In [None]:
# parameters = ParamGridBuilder() \
#     .addGrid(dt.maxDepth, [10, 20, 30, 40, 50, 60, 70]).build()

In [None]:
# %time
# crossvalidation_pyspark.train(parameters)

In [None]:
# m = crossvalidation_pyspark.get_metrics()
# m['Algorithm'] = 'PySpark'
# metrics.append(m)

# CrossValidationSkLearn

In [None]:
# %time
# crossvalidation_sklearn = CrossValidationSkLearn(df)
# crossvalidation_sklearn.set_x_y()

In [None]:
# parameters = dict(
#     #df__criterion=['entropy', ],
#     dt__max_depth=[10, 20, 30, 40, 50, 60, 70],
#     #dt__min_samples_split=[1, 2, 3],
#     #dt__max_features=[16, 32, 64]
# )

In [None]:
# %time
# crossvalidation_sklearn.train(parameters)

In [None]:
# m = crossvalidation_sklearn.get_metrics()
# m['Algorithm'] = 'SkLearn'
# metrics.append(m)

# Results

In [None]:
%time
df = pd.DataFrame.from_dict(metrics)

In [None]:
df