<a href="https://colab.research.google.com/github/morrowbord/Spark/blob/main/Hyperparameters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark spark_sklearn -q

In [None]:
from pyspark.sql import SparkSession

# Local-mode session for the notebook. "local[*]" starts one worker thread per
# available core; plain "local" uses a single thread, so the cross-validation
# tasks submitted later in the notebook would run one after another.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Colab_pyspark")
    .config('spark.ui.port', '4050')
    # NOTE(review): in local mode the executor lives inside the driver JVM, so
    # this setting presumably has little effect — confirm whether driver memory
    # is the setting that matters here.
    .config('spark.executor.memory', '3g')
    .getOrCreate()
)

In [None]:
# Shortcut handle to the SparkContext that backs the session.
sc = spark.sparkContext
# Bare expression on the last line: the notebook renders the session object.
spark

In [None]:
# Dataset: https://www.kaggle.com/shivam2503/diamonds
import pandas as pd
# Load the CSV into a pandas DataFrame; header=0 takes column names from the first row.
pdf = pd.read_csv("diamonds.csv", header=0)
pdf.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## Feature engineering

In [None]:
# Encode the three categorical quality grades as ordinal integers (0 = worst).
pdf['cut'] = pdf['cut'].replace({'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4})
pdf['color'] = pdf['color'].replace({'J': 0, 'I': 1, 'H': 2, 'G': 3, 'F': 4, 'E': 5, 'D': 6})
# Clarity from worst to best is I1, SI2, SI1, VS2, VS1, VVS2, VVS1, IF:
# within each pair the "1" grade is clearer than the "2" grade, so it gets
# the higher rank.
pdf['clarity'] = pdf['clarity'].replace({'I1': 0, 'SI2': 1, 'SI1': 2, 'VS2': 3, 'VS1': 4, 'VVS2': 5, 'VVS1': 6, 'IF': 7})
pdf.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,4,5,2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,5,1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,5,3,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,1,4,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,0,2,63.3,58.0,335,4.34,4.35,2.75


In [None]:
pdf.dtypes

carat      float64
cut          int64
color        int64
clarity      int64
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [None]:
# Regression target: the price column as a NumPy array.
labels = pdf['price'].values
# Predictor columns, in the order they appear in the feature matrix.
featureNames = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']
# Feature matrix: one row per diamond, one column per name in featureNames.
features = pdf[featureNames].values

In [None]:
from sklearn.preprocessing import normalize
# axis=0 rescales each feature column (not each sample row) to unit norm.
# NOTE(review): the scaling is computed over all rows, before the train/test
# split made later in the notebook — confirm this is intended.
features = normalize(features, axis=0)
features

array([[0.00106702, 0.00553547, 0.005655  , ..., 0.0029123 , 0.00293078,
        0.00289958],
       [0.00097424, 0.0041516 , 0.005655  , ..., 0.00286806, 0.00282769,
        0.00275639],
       [0.00106702, 0.00138387, 0.005655  , ..., 0.00298603, 0.00299705,
        0.00275639],
       ...,
       [0.00324745, 0.00276773, 0.006786  , ..., 0.00417307, 0.00418262,
        0.00424794],
       [0.00398973, 0.0041516 , 0.002262  , ..., 0.00453434, 0.00450662,
        0.00446272],
       [0.00347941, 0.00553547, 0.006786  , ..., 0.0042984 , 0.00432253,
        0.0043434 ]])

In [None]:
from sklearn import linear_model

# Baseline: Ridge with its default settings, fit on the whole normalized data set.
model = linear_model.Ridge().fit(features, labels)

In [None]:
model.coef_

array([317631.26407957,  -5570.00639578, -35361.60027062, -12561.03271112,
         -391.85286415,   2893.356447  , 113393.7440294 , 112724.63907073,
       112036.32664852])

## Parameter tuning со Spark

Parameter tuning - это задача тьюнинга (гипер) параметров ML алгоритма с целью повысить качество модели. Тренируются различные модели (каждая со своим набором параметров) на одном и том же наборе данных и далее все полученные модели тестируются на одном и том же отложенном наборе данных, что снижает риск переобучения.

k-fold cross validation:


 - Случайным образом разбиваем данные на k равных частей ("folds")
     -  i = 1, 2, ..., k, откладываем набор данных i как validation set.
     -  training set - все кроме i

     -  для каждого набора параметров тренируем модель, подсчитываем ошибку на k различных validation set, усредняем, находим лучший набор параметров

 - Тренируем модель с лучшим набором параметров на всех данных 


Для каждой пары (fold, parameter set) можно обучать модель независимо от всех остальных. Мы распараллелим эти задания: scikit-learn будет обучать модель на каждом executor'е. Эта параллелизация очень эффективна, так как обучение моделей - самая вычислительно сложная часть ML workflow.

Если используются k фолдов и P различных наборов параметров, то во сколько раз можно ускорить вычисление?


### Отложим random test set


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set. A fixed random_state makes the split,
# and every score derived from it, repeatable across kernel restarts.
trainingLabels, testLabels, trainingFeatures, testFeatures = train_test_split(
    labels, features, test_size=0.3, random_state=42)
ntrain, ntest = len(trainingLabels), len(testLabels)
print('Split data randomly into 2 sets: %d training and %d test instances.' % (ntrain, ntest))

Split data randomly into 2 sets: 37758 training and 16182 test instances.


### Разобьем данные и определим таски, которые будем параллелизировать
Каждое распределенное задание — это пара (fold, parameter set).

In [None]:
from sklearn.model_selection import KFold
numFolds = 3 # more (10 or so) in practice
# Fold generator shared by every task; only the number of splits is set,
# all other KFold options stay at their defaults.
kf = KFold(n_splits=numFolds)

In [None]:
# Candidate values for Ridge's alpha: 0.0 plus powers of ten from 1e-4 to 1e3.
alphas = [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
len(alphas)

9

In [None]:
# One (fold index, alpha) pair per model to train; folds vary fastest.
tasks = [
    (foldIndex, candidateAlpha)
    for candidateAlpha in alphas
    for foldIndex in range(numFolds)
]

In [None]:
tasks

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (0, 0.0001),
 (1, 0.0001),
 (2, 0.0001),
 (0, 0.001),
 (1, 0.001),
 (2, 0.001),
 (0, 0.01),
 (1, 0.01),
 (2, 0.01),
 (0, 0.1),
 (1, 0.1),
 (2, 0.1),
 (0, 1.0),
 (1, 1.0),
 (2, 1.0),
 (0, 10.0),
 (1, 10.0),
 (2, 10.0),
 (0, 100.0),
 (1, 100.0),
 (2, 100.0),
 (0, 1000.0),
 (1, 1000.0),
 (2, 1000.0)]

In [None]:
len(tasks)

27

In [None]:
# Distribute the task list with one partition per task, so every
# (fold, alpha) pair can be scheduled on its own.
tasksRDD = spark.sparkContext.parallelize(tasks, numSlices = len(tasks))
tasksRDD.getNumPartitions()

27

In [None]:
tasksRDD.take(10)

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (0, 0.0001),
 (1, 0.0001),
 (2, 0.0001),
 (0, 0.001),
 (1, 0.001),
 (2, 0.001),
 (0, 0.01)]

### Broadcast dataset

In [None]:
# Broadcast the training arrays so tasks read them through the broadcast
# handles (.value) instead of capturing the arrays in each task closure.
trainingFeaturesBroadcast = spark.sparkContext.broadcast(trainingFeatures)
trainingLabelsBroadcast = spark.sparkContext.broadcast(trainingLabels)

### Запустим параллельную cross-validation

Определим функцию, которая будет запускаться на каждом worker'e: эта функция берет одну пару (1 hyperparameter alpha value + 1 fold index) и тренирует соответствующую модель. Используем RDD.map для этого.

In [None]:
from sklearn import linear_model

def trainOneModel(fold, alpha):
    """
    Fit and score one Ridge model for a single cross-validation task.

    Parameters:
        fold: zero-based position of the KFold split to use.
        alpha: value passed to Ridge as its alpha argument.

    Returns:
        Tuple (fitted model, score on the validation part of the split, alpha, fold).
    """
    # Training data comes from the broadcast variables, not from the closure.
    allFeatures = trainingFeaturesBroadcast.value
    allLabels = trainingLabelsBroadcast.value

    # Walk the splits until the requested one is reached; both index lists
    # stay empty when `fold` matches none of them.
    trainRows, valRows = [], []
    for position, (candidateTrain, candidateVal) in enumerate(kf.split(allFeatures)):
        if position == fold:
            trainRows, valRows = candidateTrain, candidateVal
            break

    ridge = linear_model.Ridge(alpha=alpha)
    ridge.fit(allFeatures[trainRows], allLabels[trainRows])
    valScore = ridge.score(allFeatures[valRows], allLabels[valRows])
    return ridge, valScore, alpha, fold

In [None]:
# Each task tuple is (fold, alpha), which is the argument order of trainOneModel.
trainedModelAndScores = tasksRDD.map(lambda task: trainOneModel(*task))
# Mark the results for caching, then run an action so the models are trained.
trainedModelAndScores.cache()
trainedModelAndScores.count()

27

In [None]:
result = trainedModelAndScores.collect()

In [None]:
# Training is done: drop the cached copies of both broadcast variables.
trainingFeaturesBroadcast.unpersist()
trainingLabelsBroadcast.unpersist()

### Соберем результаты для лучшей hyperparameter alpha

In [None]:
# Bring (score, alpha, fold) for every task to the driver, leaving the models behind.
allScores = trainedModelAndScores.map(lambda trained: trained[1:4]).collect()
# Accumulator for the per-alpha score totals, starting at zero.
avgScores = {candidateAlpha: 0.0 for candidateAlpha in alphas}

In [None]:
# Average the per-fold scores for every alpha. The totals are rebuilt from
# scratch each time, so re-running this cell yields the same averages instead
# of accumulating and dividing the previous result again.
scoreSums = {alpha: 0.0 for alpha in alphas}
for score, alpha, fold in allScores:
    scoreSums[alpha] += score
avgScores = {alpha: scoreSums[alpha] / numFolds for alpha in alphas}
avgScores

{0.0: 0.8924062503088138,
 0.0001: 0.8923667650788563,
 0.001: 0.8901866361320506,
 0.01: 0.8771425031691389,
 0.1: 0.7305941591418175,
 1.0: 0.22909967607846105,
 10.0: 0.028522454941469058,
 100.0: 0.002810089361864998,
 1000.0: 0.00016733252461006062}

Теперь у нас есть список alpha values с соответствующими средними scores, найдем среди них лучший.

In [None]:
# Pick the alpha with the highest mean score. max() returns the first maximal
# entry, so ties resolve to the earliest value in `alphas`, as before.
# No sentinel starting score is used: R^2 has no lower bound, so a fixed floor
# such as -1 could end the search with an invalid alpha.
bestAlpha = max(alphas, key=lambda candidate: avgScores[candidate])
bestScore = avgScores[bestAlpha]
print('Found best alpha: %g, which gives score: %g' % (bestAlpha, bestScore))

Found best alpha: 0, which gives score: 0.892406


In [None]:
spark.stop()

### Обучим финальную модель с лучшим набором гиперпараметров

Так как это только 1 таск, то запустим его на драйвере.

In [None]:
# Refit a single Ridge model on the whole training split with the selected alpha.
tunedClf = linear_model.Ridge(alpha=bestAlpha)
tunedClf.fit(trainingFeatures, trainingLabels)

Ridge(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

THE END!

### Spark-sklearn (joblib)
https://github.com/databricks/spark-sklearn

https://github.com/joblib/joblib-spark

In [None]:
import sys
from spark_sklearn import GridSearchCV

In [None]:
parameters = {"alpha": alphas}
parameters

{'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]}

In [None]:
est = linear_model.Ridge()

In [None]:
from pyspark.sql import SparkSession

# Fresh local session (the previous one was stopped). "local[*]" starts one
# worker thread per available core; plain "local" uses a single thread, so the
# grid search would evaluate its candidates one after another.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Colab_pyspark")
    .config('spark.ui.port', '4050')
    .config('spark.executor.memory', '3g')
    .getOrCreate()
)

# spark_sklearn's GridSearchCV takes the SparkContext as its first argument,
# followed by the estimator and the parameter grid.
clf = GridSearchCV(spark.sparkContext, est, parameters, n_jobs=4)

In [None]:
clf.fit(trainingFeatures, trainingLabels)

GridSearchCV(cv=3, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       sc=<SparkContext master=local appName=Colab_pyspark>, scoring=None,
       verbose=0)

In [None]:
clf.best_estimator_

Ridge(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [None]:
clf.cv_results_

{'mean_fit_time': array([0.00801373, 0.00703637, 0.00785232, 0.00746322, 0.00601006,
        0.00608913, 0.00602055, 0.00637905, 0.00629767]),
 'mean_score_time': array([0.00078472, 0.00082739, 0.00114671, 0.00088716, 0.00075134,
        0.00073504, 0.00075062, 0.0007813 , 0.00075992]),
 'mean_test_score': array([8.92406250e-01, 8.92366765e-01, 8.90186636e-01, 8.77142503e-01,
        7.30594159e-01, 2.29099676e-01, 2.85224549e-02, 2.81008936e-03,
        1.67332525e-04]),
 'mean_train_score': array([8.92736902e-01, 8.92515000e-01, 8.90256183e-01, 8.77246587e-01,
        7.30643262e-01, 2.29206931e-01, 2.86472561e-02, 2.93698313e-03,
        2.94439671e-04]),
 'param_alpha': masked_array(data=[0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                    1000.0],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'params': ({'alpha': 0.0},
  {'alpha': 0.0001},
  {'alpha': 0.0

In [None]:
spark.stop()

## Model conversion

In [None]:
from spark_sklearn import Converter

from pyspark.sql import SparkSession

# Fresh local session for the model-conversion section (the previous one was
# stopped). "local[*]" uses one worker thread per available core instead of
# the single thread that plain "local" provides.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Colab_pyspark")
    .config('spark.ui.port', '4050')
    .config('spark.executor.memory', '3g')
    .getOrCreate()
)

In [None]:
converter = Converter(spark.sparkContext)

In [None]:
est = linear_model.LinearRegression()

In [None]:
type(est.fit(trainingFeatures, trainingLabels))

sklearn.linear_model.base.LinearRegression

In [None]:
est.coef_

array([ 2289700.84920483,    98695.39117225,   253268.96476034,
         301769.11580973, -1279554.2612814 ,  -472211.06396154,
       -1237094.62644464,    46990.92729461,   -47042.49745931])

In [None]:
spark_est = converter.toSpark(est)

In [None]:
type(spark_est)

pyspark.ml.regression.LinearRegressionModel

In [None]:
spark_est.coefficients, spark_est.intercept

(DenseVector([2289700.8492, 98695.3912, 253268.9648, 301769.1158, -1279554.2613, -472211.064, -1237094.6264, 46990.9273, -47042.4975]),
 5731.129571560336)

In [None]:
from pyspark.sql.types import *
from pyspark.ml.linalg import DenseVector, VectorUDT

In [None]:
# Two-column layout for the test DataFrame: an ML vector plus an integer label.
featuresField = StructField("features", VectorUDT())
labelsField = StructField("labels", IntegerType())
schema = StructType([featuresField, labelsField])

In [None]:
# Pair every test row (as a DenseVector) with its label (as a plain int).
testRows = [
    (DenseVector(rowValues), int(rowLabel))
    for rowValues, rowLabel in zip(testFeatures, testLabels)
]
test_df = spark.createDataFrame(testRows, schema=schema)

In [None]:
test_df.show(truncate=100)

+----------------------------------------------------------------------------------------------------+------+
|                                                                                            features|labels|
+----------------------------------------------------------------------------------------------------+------+
|[0.004175293803486826,0.0,0.0022619991621070324,0.0012118150143577032,0.0045729741076692785,0.004...|  3024|
|[0.006773254392323074,0.00415160199655832,0.0022619991621070324,0.0024236300287154063,0.004280192...|  7604|
|[0.0024587841287200198,0.005535469328744426,0.006785997486321097,0.006059075071788516,0.004322018...|  2732|
|[0.005984587784997784,0.005535469328744426,0.0,0.0012118150143577032,0.004294134223055298,0.00426...|  5237|
|[0.0033402350427894608,0.002767734664372213,0.0022619991621070324,0.0036354450430731095,0.0042244...|  2782|
|[0.0032474507360453087,0.0,0.004523998324214065,0.0012118150143577032,0.004517206130746482,0.0044...|  2167|
|[0.009742

In [None]:
# The decorator needs `F` at definition time; import it in this cell so the
# definition works on a fresh kernel.
from pyspark.sql import functions as F

@F.udf(returnType=FloatType())
def predict(vector):
    """
    Spark UDF: score one feature vector with the scikit-learn estimator `est`.

    Parameters:
        vector: one value of the "features" column; assumed to be a
            pyspark.ml.linalg vector exposing toArray() — TODO confirm if the
            UDF is applied to a different column type.

    Returns:
        The prediction as a plain Python float, as FloatType requires.
    """
    # scikit-learn expects a 2-D input (rows x features): wrap the single row
    # in a list, then unwrap the single prediction.
    return float(est.predict([vector.toArray()])[0])

In [None]:
spark_est.transform(test_df).show()

+--------------------+------+-------------------+
|            features|labels|         prediction|
+--------------------+------+-------------------+
|[0.00417529380348...|  3024|  2840.180417736866|
|[0.00677325439232...|  7604|  8732.896867774449|
|[0.00245878412872...|  2732|  3202.022125307526|
|[0.00598458778499...|  5237|  6452.532408273042|
|[0.00334023504278...|  2782| 2374.1039090742975|
|[0.00324745073604...|  2167| 1685.9400933486058|
|[0.00974235220813...| 16479|  14605.40707262528|
|[0.00334023504278...|  2795|  3359.091140069051|
|[0.00510313687092...|  4071|  6732.380188676643|
|[0.00565984271139...|  8975|  8205.185799031118|
|[0.00273713704895...|  1257|  552.3702859675877|
|[0.00245878412872...|  2290|  3026.299827802154|
|[0.00621654855185...| 17663| 10280.186162858761|
|[0.00250517628209...|  2090| 2877.6977991238705|
|[0.00463921533720...|  6389|  5444.739387330309|
|[0.00139176460116...|   552|-100.64061607262374|
|[0.00134537244779...|   607|  371.0307142245383|


In [None]:
spark.stop()