In [5]:
spark

In [6]:
import pandas as pd

from catboost import CatBoostClassifier

from itertools import product

from pyspark.sql import DataFrame
from pyspark.sql import functions as sf
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import (
    DoubleType, FloatType, IntegerType, StringType, StructField, StructType
)

from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [7]:
N_FEATURES = 20
N_CLASSES = 10

In [8]:
def train_and_evaluate_model(X_train, y_train, X_test, y_test, kwargs={}):

    # split data
    X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    # create model
    model = CatBoostClassifier(
        nan_mode='Min',
        random_seed=42,
        boosting_type='Plain',
        bootstrap_type='Bernoulli',
        rsm=0.1,
        loss_function='Logloss',
        use_best_model=True,
        early_stopping_rounds=100,
        **kwargs
    )

    # fit model
    model.fit(X_train.values, y_train.values, eval_set=(X_eval, y_eval))
    
    # evaluate model
    accuracy = accuracy_score(model.predict(X_test), y_test)

    return accuracy

# 產生data

In [9]:
X, y = make_multilabel_classification(
    n_samples=1000,
    n_features=N_FEATURES,
    n_classes=N_CLASSES,
    random_state=42
)

In [10]:
pdf = pd.DataFrame(X)
for i in range(N_CLASSES):
    pdf[f'y_{i}'] = y[:, i]
df = spark.createDataFrame(pdf)

In [11]:
print(f'number of rows in the dataset: {df.count()}')

2022-03-30 15:57:45,753 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

number of rows in the dataset: 1000


                                                                                

In [12]:
# 查看數據
df.show()

+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+----+----+---+---+---+---+---+---+---+---+---+---+
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9| 10| 11|  12| 13| 14| 15| 16| 17|  18|  19|y_0|y_1|y_2|y_3|y_4|y_5|y_6|y_7|y_8|y_9|
+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+----+----+---+---+---+---+---+---+---+---+---+---+
|2.0|2.0|0.0|1.0|3.0|5.0|0.0|3.0|4.0|1.0|2.0|5.0| 2.0|1.0|4.0|1.0|3.0|4.0|10.0| 2.0|  0|  1|  1|  0|  0|  0|  0|  1|  0|  0|
|4.0|3.0|2.0|2.0|0.0|4.0|1.0|2.0|0.0|3.0|1.0|7.0| 4.0|2.0|3.0|1.0|2.0|2.0| 2.0| 1.0|  0|  0|  0|  0|  0|  0|  0|  1|  1|  1|
|2.0|2.0|3.0|0.0|0.0|0.0|0.0|6.0|0.0|3.0|4.0|0.0| 5.0|1.0|0.0|0.0|1.0|2.0| 4.0| 0.0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  0|
|0.0|1.0|4.0|4.0|2.0|0.0|2.0|1.0|3.0|2.0|1.0|1.0| 3.0|0.0|0.0|2.0|6.0|3.0| 3.0| 1.0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|0.0|0.0|7.0|2.0|1.0|0.0|1.0|2.0|1.0|2.0|2.0|1.0| 4.0|0.0|5.0|5.0|0.0|0.0| 4.0| 2.0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|


In [13]:
spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")  # Keep its default value.

'false'

# Distributed Grid Search

In [14]:
values_range = list(
    product(
        [200, 250],
        [3, 5, 7],
        [0.02, 0.1, 0.2],
        ['MinEntropy', 'Uniform', 'UniformAndQuantiles', 'GreedyLogSum'],
        [1.0],
        [0.5],
    )
)

schema = StructType(
    [
        StructField('iterations', IntegerType(), True),
        StructField('depth', IntegerType(), True),
        StructField('learning_rate', DoubleType(), True),
        StructField('feature_border_type', StringType(), True),
        StructField('l2_leaf_reg', FloatType(), True),
        StructField('subsample', FloatType(), True)
    ]
)

df_grid = spark.createDataFrame(data=values_range, schema=schema)
df_grid = df_grid.withColumn('replication_id', sf.monotonically_increasing_id())

In [15]:
df_grid.show()

+----------+-----+-------------+-------------------+-----------+---------+--------------+
|iterations|depth|learning_rate|feature_border_type|l2_leaf_reg|subsample|replication_id|
+----------+-----+-------------+-------------------+-----------+---------+--------------+
|       200|    3|         0.02|         MinEntropy|        1.0|      0.5|             0|
|       200|    3|         0.02|            Uniform|        1.0|      0.5|             1|
|       200|    3|         0.02|UniformAndQuantiles|        1.0|      0.5|             2|
|       200|    3|         0.02|       GreedyLogSum|        1.0|      0.5|    8589934592|
|       200|    3|          0.1|         MinEntropy|        1.0|      0.5|    8589934593|
|       200|    3|          0.1|            Uniform|        1.0|      0.5|    8589934594|
|       200|    3|          0.1|UniformAndQuantiles|        1.0|      0.5|   17179869184|
|       200|    3|          0.1|       GreedyLogSum|        1.0|      0.5|   17179869185|
|       20

In [16]:
print(f'number of different hyperparameter combinations: {df_grid.count()}')

number of different hyperparameter combinations: 72


In [17]:
# 將 data 和 參數表 結合
df_replicated = df.crossJoin(df_grid)

In [18]:
print(f'number of rows in the replicated dataset: {df_replicated.count()}')



number of rows in the replicated dataset: 72000


                                                                                

# Pandas_udf

In [19]:
# declare the schema for the output of our function
schema = StructType(
    [
        StructField('replication_id', IntegerType(),True),
        StructField('accuracy', FloatType(),True),
        StructField("iterations", IntegerType(), True),
        StructField("depth", IntegerType(), True),
        StructField("learning_rate", DoubleType(), True),
        StructField("feature_border_type", StringType(), True),
        StructField("l2_leaf_reg", FloatType(), True),
        StructField("subsample", FloatType(), True)
     ]
)

# decorate our function with pandas_udf decorator
# @pandas_udf(schema, sf.PandasUDFType.GROUPED_MAP)
def hyperparameter_search(pdf):

    # get hyperparameter values
    kwargs = {
        'iterations': pdf.iterations.values[0],
        'depth': pdf.depth.values[0],
        'learning_rate': pdf.learning_rate.values[0],
        'feature_border_type': pdf.feature_border_type.values[0],
        'l2_leaf_reg': pdf.l2_leaf_reg.values[0],
        'subsample': pdf.subsample.values[0]
    }
    
    # get data and label
    X = pdf[[str(i) for i in range(N_FEATURES)]]
    y = pdf['y_0']

    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # get accuracy
    accuracy = train_and_evaluate_model(X_train, y_train, X_test, y_test, kwargs)

    # return results as pandas DF
    kwargs.update({
        'replication_id': pdf.replication_id.values[0],
        'accuracy': accuracy
    })
    results = pd.DataFrame([kwargs])

    return results

In [20]:
results = df_replicated.groupby('replication_id').applyInPandas(hyperparameter_search,schema=schema)


In [21]:
results5 = results.sort('accuracy', ascending=False).limit(5)

In [22]:
results5.show()



+--------------+--------+----------+-----+-------------+-------------------+-----------+---------+
|replication_id|accuracy|iterations|depth|learning_rate|feature_border_type|l2_leaf_reg|subsample|
+--------------+--------+----------+-----+-------------+-------------------+-----------+---------+
|             2|   0.915|       250|    7|          0.1|            Uniform|        1.0|      0.5|
|             2|    0.91|       200|    7|          0.1|            Uniform|        1.0|      0.5|
|             1|    0.91|       250|    3|          0.2|UniformAndQuantiles|        1.0|      0.5|
|             0|     0.9|       250|    3|          0.1|UniformAndQuantiles|        1.0|      0.5|
|             1|     0.9|       250|    3|          0.1|         MinEntropy|        1.0|      0.5|
+--------------+--------+----------+-----+-------------+-------------------+-----------+---------+



                                                                                

# Distributed K-Fold Cross-Validation

In [23]:
N_FOLDS = 8

In [24]:
proportion = 1 / N_FOLDS
splits = df.randomSplit([proportion] * N_FOLDS, 42)

In [45]:
df_folds = splits[0].withColumn('fold', sf.lit(0))
for i in range(1, N_FOLDS):
    df_folds = df_folds.union(
        splits[i].withColumn('fold', sf.lit(i))
    )

In [54]:
df.count()

1000

In [49]:
df_numbers = spark.createDataFrame(
    pd.DataFrame(list(range(N_FOLDS)),columns=['replication_id'])
)

In [50]:
df_numbers.toPandas()

Unnamed: 0,replication_id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7


In [52]:
df_replicated = df_folds.crossJoin(df_numbers)


In [53]:
print(f'number of rows in the replicated dataset: {df_replicated.count()}')




number of rows in the replicated dataset: 8000


                                                                                

In [55]:
# declare the schema for the output of our function
schema = StructType(
    [
        StructField('replication_id', IntegerType(), True),
        StructField('accuracy', FloatType(), True)
    ]
)

# decorate our function with pandas_udf decorator
@pandas_udf(schema, sf.PandasUDFType.GROUPED_MAP)
def cross_validation(pdf):
    
    # get repliaction id
    replication_id = pdf.replication_id.values[0]
    
    # get data and label
    columns = [str(i) for i in range(N_FEATURES)]
    X_train = pdf[pdf.fold != replication_id][columns]
    X_test = pdf[pdf.fold == replication_id][columns]
    y_train = pdf[pdf.fold != replication_id]['y_0']
    y_test = pdf[pdf.fold == replication_id]['y_0']

    # get accuracy
    accuracy = train_and_evaluate_model(X_train, y_train, X_test, y_test)

    # return results as pandas DF
    results = pd.DataFrame([{
        'replication_id': replication_id,
        'accuracy': accuracy
    }])

    # save the model (if you want to retrieve it later)

    return results

In [56]:
results = df_replicated.groupby('replication_id').apply(cross_validation)



In [59]:
results8 = results.sort('accuracy', ascending=False)

In [62]:
results8.count()

                                                                                

8

In [63]:
results8.show()

[Stage 58:>                                                         (0 + 1) / 1]

+--------------+----------+
|replication_id|  accuracy|
+--------------+----------+
|             2| 0.9310345|
|             6| 0.8939394|
|             1| 0.8914729|
|             3|0.88429755|
|             5| 0.8602941|
|             7| 0.8429752|
|             4|0.84166664|
|             0|     0.824|
+--------------+----------+



                                                                                