In [1]:
import sys
import pandas as pd
from spark_session import LocalSparkSession
from dataset import Dataset
from mr_id3 import MapReduceIDR3
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from decisiontree_pyspark import DecisionTreePySpark
from decisiontree_sklearn import DecisionTreeSklearn
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.mllib.tree import DecisionTree

In [2]:
%time
num_fields = [
    'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
    'hours_per_week', ]

categorical_fields = [
    'workclass', 'education',
    'marital_status', 'occupation', 'relationship',
    'race', 'sex', 'native_country', ]

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs


In [3]:
%time
target = 'label'
filename = 'dataset/adult.data'

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.82 µs


In [4]:
%time
number_of_cores = 4
dataset_size = 1000
dataset_sizes = list(range(1, 2))
metrics = []

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [5]:
%time
spark = LocalSparkSession(number_of_cores)
spark.start()

2022-10-16 11:06:08,662 [INFO] LocalSparkSession : Starting with 4 clusters


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
22/10/16 11:06:10 WARN Utils: Your hostname, Mac-Pro-de-MARCELO.local resolves to a loopback address: 127.0.0.1; using 192.168.0.62 instead (on interface en2)
22/10/16 11:06:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/16 11:06:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/16 11:06:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/16 11:06:11 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [6]:
%time
dataset = Dataset(spark.spark, filename, num_fields, categorical_fields, target)
dataset.load()
dataset.one_hot_encode_categorical_fields()

2022-10-16 11:06:12,680 [INFO] Dataset : Starting
2022-10-16 11:06:12,681 [INFO] Dataset : Loading Dataset dataset/adult.data


CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 8.11 µs


2022-10-16 11:06:15,936 [INFO] Dataset : One Hot Encode Categorical Features
                                                                                

In [7]:
metrics = []

In [8]:
dt = DecisionTree()

In [11]:
for dataset_size in dataset_sizes:
    df = dataset.multiply_dataset(dataset_size)
    df_pandas = df.toPandas()
    metric_dict = {'dataset_size_num': dataset_size, 'dataset_size': sys.getsizeof(df_pandas)}
    
    # PySpark
    cvp = DecisionTreePySpark(df)
    parameters = ParamGridBuilder() \
        .addGrid(dt.trainClassifier.maxDepth, [10, 20, 30, 40, 50, 60, 70]).build()
    cvp.crossvalidation_train(parameters)
    m = cvp.get_metrics()
    metric_dict['pyspark'] = m['time']
    
    # SKLearn
    cvs = DecisionTreeSklearn(df_pandas)
    cvs.set_x_y()
    parameters = dict(
        #df__criterion=['entropy', ],
        dt__max_depth=[10, 20, 30, 40, 50, 60, 70],
        dt__min_samples_split=[1, 2, 3],
        #dt__max_features=[16, 32, 64]
    )
    cvs.crossvalidation_train(parameters)
    m = cvs.get_metrics()
    metric_dict['sklearn'] = m['time']
    
    metrics.append(metric_dict)

2022-10-16 11:07:54,922 [INFO] Dataset : Multiplying Dataset by 1x
2022-10-16 11:07:55,629 [INFO] DecisionTreePySpark : Starting


AttributeError: 'function' object has no attribute 'maxDepth'

# CrossValidationPySpark

In [None]:
# %time
# crossvalidation_pyspark = CrossValidationPySpark(df)

In [None]:
# parameters = ParamGridBuilder() \
#     .addGrid(dt.maxDepth, [10, 20, 30, 40, 50, 60, 70]).build()

In [None]:
# %time
# crossvalidation_pyspark.train(parameters)

In [None]:
# m = crossvalidation_pyspark.get_metrics()
# m['Algorithm'] = 'PySpark'
# metrics.append(m)

# CrossValidationSkLearn

In [None]:
# %time
# crossvalidation_sklearn = CrossValidationSkLearn(df)
# crossvalidation_sklearn.set_x_y()

In [None]:
# parameters = dict(
#     #df__criterion=['entropy', ],
#     dt__max_depth=[10, 20, 30, 40, 50, 60, 70],
#     #dt__min_samples_split=[1, 2, 3],
#     #dt__max_features=[16, 32, 64]
# )

In [None]:
# %time
# crossvalidation_sklearn.train(parameters)

In [None]:
# m = crossvalidation_sklearn.get_metrics()
# m['Algorithm'] = 'SkLearn'
# metrics.append(m)

# Results

In [None]:
%time
df = pd.DataFrame.from_dict(metrics)

In [None]:
df