In [1]:
spark

In [2]:
import pandas as pd

from catboost import CatBoostClassifier

from itertools import product

from pyspark.sql import DataFrame
from pyspark.sql import functions as sf
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import (
    DoubleType, FloatType, IntegerType, StringType, StructField, StructType
)

from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
N_FEATURES = 20
N_CLASSES = 10

In [4]:
def train_and_evaluate_model(X_train, y_train, X_test, y_test, kwargs={}):

    # split data
    X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    # create model
    model = CatBoostClassifier(
        nan_mode='Min',
        random_seed=42,
        boosting_type='Plain',
        bootstrap_type='Bernoulli',
        rsm=0.1,
        loss_function='Logloss',
        use_best_model=True,
        early_stopping_rounds=100,
        **kwargs
    )

    # fit model
    model.fit(X_train.values, y_train.values, eval_set=(X_eval, y_eval))
    
    # evaluate model
    accuracy = accuracy_score(model.predict(X_test), y_test)

    return accuracy

# 產生data

In [5]:
X, y = make_multilabel_classification(
    n_samples=1000,
    n_features=N_FEATURES,
    n_classes=N_CLASSES,
    random_state=42
)

In [6]:
pdf = pd.DataFrame(X)
for i in range(N_CLASSES):
    pdf[f'y_{i}'] = y[:, i]
df = spark.createDataFrame(pdf)

In [7]:
print(f'number of rows in the dataset: {df.count()}')

22/02/18 17:03:01 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 0:>                                                          (0 + 2) / 2]

number of rows in the dataset: 1000


                                                                                

In [8]:
# 查看數據
df.show()

+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+----+----+---+---+---+---+---+---+---+---+---+---+
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9| 10| 11|  12| 13| 14| 15| 16| 17|  18|  19|y_0|y_1|y_2|y_3|y_4|y_5|y_6|y_7|y_8|y_9|
+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+----+----+---+---+---+---+---+---+---+---+---+---+
|2.0|2.0|0.0|1.0|3.0|5.0|0.0|3.0|4.0|1.0|2.0|5.0| 2.0|1.0|4.0|1.0|3.0|4.0|10.0| 2.0|  0|  1|  1|  0|  0|  0|  0|  1|  0|  0|
|4.0|3.0|2.0|2.0|0.0|4.0|1.0|2.0|0.0|3.0|1.0|7.0| 4.0|2.0|3.0|1.0|2.0|2.0| 2.0| 1.0|  0|  0|  0|  0|  0|  0|  0|  1|  1|  1|
|2.0|2.0|3.0|0.0|0.0|0.0|0.0|6.0|0.0|3.0|4.0|0.0| 5.0|1.0|0.0|0.0|1.0|2.0| 4.0| 0.0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  0|
|0.0|1.0|4.0|4.0|2.0|0.0|2.0|1.0|3.0|2.0|1.0|1.0| 3.0|0.0|0.0|2.0|6.0|3.0| 3.0| 1.0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|0.0|0.0|7.0|2.0|1.0|0.0|1.0|2.0|1.0|2.0|2.0|1.0| 4.0|0.0|5.0|5.0|0.0|0.0| 4.0| 2.0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|


In [9]:
spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")  # Keep its default value.

'false'

# Distributed Grid Search

In [12]:
values_range = list(
    product(
        [200, 250],
        [3, 5, 7],
        [0.02, 0.1, 0.2],
        ['MinEntropy', 'Uniform', 'UniformAndQuantiles', 'GreedyLogSum'],
        [1.0],
        [0.5],
    )
)

schema = StructType(
    [
        StructField('iterations', IntegerType(), True),
        StructField('depth', IntegerType(), True),
        StructField('learning_rate', DoubleType(), True),
        StructField('feature_border_type', StringType(), True),
        StructField('l2_leaf_reg', FloatType(), True),
        StructField('subsample', FloatType(), True)
    ]
)

df_grid = spark.createDataFrame(data=values_range, schema=schema)
df_grid = df_grid.withColumn('replication_id', sf.monotonically_increasing_id())

In [13]:
df_grid.show()

+----------+-----+-------------+-------------------+-----------+---------+--------------+
|iterations|depth|learning_rate|feature_border_type|l2_leaf_reg|subsample|replication_id|
+----------+-----+-------------+-------------------+-----------+---------+--------------+
|       200|    3|         0.02|         MinEntropy|        1.0|      0.5|             0|
|       200|    3|         0.02|            Uniform|        1.0|      0.5|             1|
|       200|    3|         0.02|UniformAndQuantiles|        1.0|      0.5|             2|
|       200|    3|         0.02|       GreedyLogSum|        1.0|      0.5|             3|
|       200|    3|          0.1|         MinEntropy|        1.0|      0.5|             4|
|       200|    3|          0.1|            Uniform|        1.0|      0.5|             5|
|       200|    3|          0.1|UniformAndQuantiles|        1.0|      0.5|             6|
|       200|    3|          0.1|       GreedyLogSum|        1.0|      0.5|             7|
|       20

In [14]:
print(f'number of different hyperparameter combinations: {df_grid.count()}')

number of different hyperparameter combinations: 72


In [15]:
# 將 data 和 參數表 結合
df_replicated = df.crossJoin(df_grid)

In [16]:
print(f'number of rows in the replicated dataset: {df_replicated.count()}')

number of rows in the replicated dataset: 72000


# Pandas_udf

In [17]:
# declare the schema for the output of our function
schema = StructType(
    [
        StructField('replication_id', IntegerType(),True),
        StructField('accuracy', FloatType(),True),
        StructField("iterations", IntegerType(), True),
        StructField("depth", IntegerType(), True),
        StructField("learning_rate", DoubleType(), True),
        StructField("feature_border_type", StringType(), True),
        StructField("l2_leaf_reg", FloatType(), True),
        StructField("subsample", FloatType(), True)
     ]
)

# decorate our function with pandas_udf decorator
@pandas_udf(schema, sf.PandasUDFType.GROUPED_MAP)
def hyperparameter_search(pdf):

    # get hyperparameter values
    kwargs = {
        'iterations': pdf.iterations.values[0],
        'depth': pdf.depth.values[0],
        'learning_rate': pdf.learning_rate.values[0],
        'feature_border_type': pdf.feature_border_type.values[0],
        'l2_leaf_reg': pdf.l2_leaf_reg.values[0],
        'subsample': pdf.subsample.values[0]
    }
    
    # get data and label
    X = pdf[[str(i) for i in range(N_FEATURES)]]
    y = pdf['y_0']

    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # get accuracy
    accuracy = train_and_evaluate_model(X_train, y_train, X_test, y_test, kwargs)

    # return results as pandas DF
    kwargs.update({
        'replication_id': pdf.replication_id.values[0],
        'accuracy': accuracy
    })
    results = pd.DataFrame([kwargs])

    return results

In [18]:
results = df_replicated.groupby('replication_id').apply(hyperparameter_search)




In [19]:
results5 = results.sort('accuracy', ascending=False).limit(5).show()

22/02/18 17:06:01 WARN TaskSetManager: Lost task 1.0 in stage 14.0 (TID 19) (172.22.33.143 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 603, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 423, in read_udfs
    arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 251, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 71, in read_command
    command = serializer._read_with_length(file)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "/usr/local/spark/python/lib/

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 603, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 423, in read_udfs
    arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 251, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 71, in read_command
    command = serializer._read_with_length(file)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'sklearn'


In [64]:
results5

Unnamed: 0,replication_id,accuracy,iterations,depth,learning_rate,feature_border_type,l2_leaf_reg,subsample
0,10,0.905,200,3,0.2,UniformAndQuantiles,1.0,0.5
1,11,0.905,250,3,0.2,GreedyLogSum,1.0,0.5
2,8,0.905,200,3,0.2,MinEntropy,1.0,0.5
3,11,0.905,200,3,0.2,GreedyLogSum,1.0,0.5
4,8,0.905,250,3,0.2,MinEntropy,1.0,0.5


# Distributed K-Fold Cross-Validation

In [66]:
N_FOLDS = 8

In [67]:
proportion = 1 / N_FOLDS
splits = df.randomSplit([proportion] * N_FOLDS, 42)
df_folds = splits[0].withColumn('fold', sf.lit(0))
for i in range(1, N_FOLDS):
    df_folds = df_folds.union(
        splits[i].withColumn('fold', sf.lit(i))
    )

In [68]:
df_numbers = spark.createDataFrame(
    pd.DataFrame(list(range(N_FOLDS)),columns=['replication_id'])
)

In [69]:
df_numbers.toPandas()

Unnamed: 0,replication_id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7


In [70]:
df_replicated = df_folds.crossJoin(df_numbers)


In [71]:
print(f'number of rows in the replicated dataset: {df_replicated.count()}')


number of rows in the replicated dataset: 8000


In [72]:
# declare the schema for the output of our function
schema = StructType(
    [
        StructField('replication_id', IntegerType(), True),
        StructField('accuracy', FloatType(), True)
    ]
)

# decorate our function with pandas_udf decorator
@pandas_udf(schema, sf.PandasUDFType.GROUPED_MAP)
def cross_validation(pdf):
    
    # get repliaction id
    replication_id = pdf.replication_id.values[0]
    
    # get data and label
    columns = [str(i) for i in range(N_FEATURES)]
    X_train = pdf[pdf.fold != replication_id][columns]
    X_test = pdf[pdf.fold == replication_id][columns]
    y_train = pdf[pdf.fold != replication_id]['y_0']
    y_test = pdf[pdf.fold == replication_id]['y_0']

    # get accuracy
    accuracy = train_and_evaluate_model(X_train, y_train, X_test, y_test)

    # return results as pandas DF
    results = pd.DataFrame([{
        'replication_id': replication_id,
        'accuracy': accuracy
    }])

    # save the model (if you want to retrieve it later)

    return results

In [73]:
results = df_replicated.groupby('replication_id').apply(cross_validation)



In [75]:
%%time

results8 = results.sort('accuracy', ascending=False).toPandas()

Learning rate set to 0.029098
0:	learn: 0.6745432	test: 0.6745236	best: 0.6745236 (0)	total: 50.8ms	remaining: 50.7s
1:	learn: 0.6573091	test: 0.6580609	best: 0.6580609 (1)	total: 51.2ms	remaining: 25.5s
2:	learn: 0.6429421	test: 0.6437792	best: 0.6437792 (2)	total: 51.3ms	remaining: 17.1s
3:	learn: 0.6284098	test: 0.6301011	best: 0.6301011 (3)	total: 51.5ms	remaining: 12.8s
4:	learn: 0.6143420	test: 0.6170370	best: 0.6170370 (4)	total: 51.7ms	remaining: 10.3s
5:	learn: 0.6030299	test: 0.6058227	best: 0.6058227 (5)	total: 51.8ms	remaining: 8.58s
6:	learn: 0.5861650	test: 0.5895569	best: 0.5895569 (6)	total: 52ms	remaining: 7.38s
7:	learn: 0.5708797	test: 0.5751147	best: 0.5751147 (7)	total: 52.2ms	remaining: 6.47s
8:	learn: 0.5605356	test: 0.5648275	best: 0.5648275 (8)	total: 52.3ms	remaining: 5.76s
9:	learn: 0.5513746	test: 0.5559366	best: 0.5559366 (9)	total: 52.5ms	remaining: 5.2s
10:	learn: 0.5411499	test: 0.5464210	best: 0.5464210 (10)	total: 52.8ms	remaining: 4.75s
11:	learn: 0.5

CPU times: user 34.1 ms, sys: 8.84 s, total: 8.88 s
Wall time: 10.5 s


	learn: 0.1091943	test: 0.2490049	best: 0.2483748 (474)	total: 170ms	remaining: 183ms
482:	learn: 0.1091157	test: 0.2488279	best: 0.2483748 (474)	total: 170ms	remaining: 182ms
483:	learn: 0.1087038	test: 0.2489085	best: 0.2483748 (474)	total: 170ms	remaining: 182ms
484:	learn: 0.1087002	test: 0.2488611	best: 0.2483748 (474)	total: 171ms	remaining: 181ms
485:	learn: 0.1085340	test: 0.2491300	best: 0.2483748 (474)	total: 171ms	remaining: 181ms
486:	learn: 0.1085244	test: 0.2491914	best: 0.2483748 (474)	total: 171ms	remaining: 180ms
487:	learn: 0.1084837	test: 0.2493124	best: 0.2483748 (474)	total: 171ms	remaining: 179ms
488:	learn: 0.1083767	test: 0.2489956	best: 0.2483748 (474)	total: 171ms	remaining: 179ms
489:	learn: 0.1083765	test: 0.2489961	best: 0.2483748 (474)	total: 171ms	remaining: 178ms
490:	learn: 0.1079686	test: 0.2489249	best: 0.2483748 (474)	total: 171ms	remaining: 178ms
491:	learn: 0.1079340	test: 0.2490347	best: 0.2483748 (474)	total: 172ms	remaining: 177ms
492:	learn: 0.

In [76]:
results8

Unnamed: 0,replication_id,accuracy
0,2,0.937984
1,1,0.901515
2,3,0.899225
3,0,0.86087
4,6,0.858268
5,4,0.842105
6,5,0.840278
7,7,0.827273
