In [1]:
import sys
import pandas as pd
from spark_session import LocalSparkSession
from dataset import Dataset
from mr_id3 import MapReduceIDR3
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from decisiontree_pyspark import DecisionTreePySpark
from decisiontree_sklearn import DecisionTreeSklearn
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.mllib.tree import DecisionTree

In [2]:
%time
num_fields = [
    'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
    'hours_per_week', ]

categorical_fields = [
    'workclass', 'education',
    'marital_status', 'occupation', 'relationship',
    'race', 'sex', 'native_country', ]

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs


In [3]:
%time
target = 'label'
filename = 'dataset/adult.data'

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.72 µs


In [4]:
%time
number_of_cores = 8
multiplication_factors = [100, 200, 300, 400, 500]

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


In [5]:
%time
spark = LocalSparkSession(number_of_cores)
spark.start()

2022-10-16 12:38:31,686 [INFO] LocalSparkSession : Starting with 8 clusters


CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs
22/10/16 12:38:33 WARN Utils: Your hostname, Mac-Pro-de-MARCELO.local resolves to a loopback address: 127.0.0.1; using 192.168.0.62 instead (on interface en2)
22/10/16 12:38:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/16 12:38:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/16 12:38:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race',
    'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country','label', ]

2022-10-16 12:38:36,193 [INFO] Dataset : Starting
2022-10-16 12:38:36,195 [INFO] Dataset : Loading Dataset dataset/adult.data


CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.06 µs


2022-10-16 12:38:39,316 [INFO] Dataset : Select Only Numerical Features


In [7]:
metrics = []

In [8]:
for f in multiplication_factors:

    dataset = Dataset(spark.spark, f'dataset/adult_{f}x.data', num_fields, categorical_fields, target)
    dataset.load()
    dataset.select_only_numerical_features()

    df = dataset.df
    df_pandas = dataset.df_pandas
    # df_pandas = pd.read_csv(f'dataset/adult_{f}x.data', header=0, names = columns)
    # df_pandas = df_pandas[['label']+num_fields ]

    metric_dict = {'dataset_size_num': f, 'dataset_size': sys.getsizeof(df_pandas)}

    # PySpark
    dt_pyspark = DecisionTreePySpark(df)
    dt_pyspark.train()
    m = dt_pyspark.get_metrics()
    metric_dict['pyspark'] = m['time']

    # SKLearn
    dt_sklearn = DecisionTreeSklearn(df_pandas)
    dt_sklearn.train()
    m = dt_sklearn.get_metrics()
    metric_dict['sklearn'] = m['time']

    metrics.append(metric_dict)

    print(f'dataset/adult_{f}x.data OK!')

2022-10-16 12:38:39,397 [INFO] Dataset : Multiplying Dataset by 0x
2022-10-16 12:38:41,513 [INFO] DecisionTreePySpark : Starting                   
2022-10-16 12:38:41,514 [INFO] DecisionTreePySpark : Training
2022-10-16 12:38:41,515 [INFO] DecisionTreePySpark : Setting Labeled Point
2022-10-16 12:38:41,587 [INFO] DecisionTreePySpark : Splitting
2022-10-16 12:38:41,594 [INFO] DecisionTreePySpark : Assembling
2022-10-16 12:38:46,806 [INFO] DecisionTreePySpark : Training time 4.860382 seconds
2022-10-16 12:38:46,808 [INFO] DecisionTreePySpark : Get metrics
2022-10-16 12:38:46,809 [INFO] DecisionTreeSklearn : Starting
2022-10-16 12:38:46,810 [INFO] DecisionTreeSklearn : Training
2022-10-16 12:38:46,811 [INFO] DecisionTreeSklearn : Setting X and y
2022-10-16 12:38:46,820 [INFO] DecisionTreeSklearn : Splitting
2022-10-16 12:38:46,925 [INFO] DecisionTreeSklearn : Training time 0.075515 seconds
2022-10-16 12:38:46,926 [INFO] DecisionTreeSklearn : Getting metrics


# CrossValidationPySpark

In [9]:
# %time
# crossvalidation_pyspark = CrossValidationPySpark(df)

In [10]:
# parameters = ParamGridBuilder() \
#     .addGrid(dt.maxDepth, [10, 20, 30, 40, 50, 60, 70]).build()

In [11]:
# %time
# crossvalidation_pyspark.train(parameters)

In [12]:
# m = crossvalidation_pyspark.get_metrics()
# m['Algorithm'] = 'PySpark'
# metrics.append(m)

# CrossValidationSkLearn

In [13]:
# %time
# crossvalidation_sklearn = CrossValidationSkLearn(df)
# crossvalidation_sklearn.set_x_y()

In [14]:
# parameters = dict(
#     #df__criterion=['entropy', ],
#     dt__max_depth=[10, 20, 30, 40, 50, 60, 70],
#     #dt__min_samples_split=[1, 2, 3],
#     #dt__max_features=[16, 32, 64]
# )

In [15]:
# %time
# crossvalidation_sklearn.train(parameters)

In [16]:
# m = crossvalidation_sklearn.get_metrics()
# m['Algorithm'] = 'SkLearn'
# metrics.append(m)

# Results

In [17]:
%time
df = pd.DataFrame.from_dict(metrics)

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 9.3 µs


In [None]:
df.to_csv('results/decisiontree_compare.csv')

In [18]:
print(df)

Unnamed: 0,dataset_size_num,dataset_size,pyspark,sklearn
0,1000,911852,4.860382,0.075515
