Здравствуйте.

Spark умеет валидировать модели. Попробуем это сделать. Evaluation ипортируется следующим образом:


```
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator
```

В частности [RegressionEvaluator](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.RegressionEvaluator.html#pyspark.ml.evaluation.RegressionEvaluator.metricName)

# Задание
Ниже обучается и оцениваться модель. 

Нужно перевести этот в Pipeline (вам понадобится VectorAssembler), а затем оценить MAE с помощью spark.


In [1]:
# https://scikit-learn.org/stable/datasets/toy_dataset.html#boston-dataset

from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator

In [2]:
import pandas as pd
from sklearn.datasets import load_diabetes, load_iris, load_boston
from sklearn.metrics import mean_absolute_error

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.regression import RandomForestRegressor

spark = SparkSession.builder\
    .master("local[2]")\
    .appName("Lesson_2")\
    .config("spark.executor.instances",2)\
    .config("spark.executor.memory",'2g')\
    .config("spark.executor.cores",1)\
    .getOrCreate()
sc = spark.sparkContext

In [3]:
data = load_boston()
dataset = pd.DataFrame(data['data'], columns=data['feature_names'])
dataset['target'] = data['target']

cols_to_vector = F.udf(lambda l: Vectors.dense(l), VectorUDT())

spark_dataset = spark.createDataFrame(dataset).select(cols_to_vector(F.array(*data['feature_names'])).alias('features'), 'target').cache()

#посмотрим данные

In [35]:
spark_dataset.columns

['features', 'target']

In [37]:
spark_dataset.show()

+--------------------+------+
|            features|target|
+--------------------+------+
|[0.00632,18.0,2.3...|  24.0|
|[0.02731,0.0,7.07...|  21.6|
|[0.02729,0.0,7.07...|  34.7|
|[0.03237,0.0,2.18...|  33.4|
|[0.06905,0.0,2.18...|  36.2|
|[0.02985,0.0,2.18...|  28.7|
|[0.08829,12.5,7.8...|  22.9|
|[0.14455,12.5,7.8...|  27.1|
|[0.21124,12.5,7.8...|  16.5|
|[0.17004,12.5,7.8...|  18.9|
|[0.22489,12.5,7.8...|  15.0|
|[0.11747,12.5,7.8...|  18.9|
|[0.09378,12.5,7.8...|  21.7|
|[0.62976,0.0,8.14...|  20.4|
|[0.63796,0.0,8.14...|  18.2|
|[0.62739,0.0,8.14...|  19.9|
|[1.05393,0.0,8.14...|  23.1|
|[0.7842,0.0,8.14,...|  17.5|
|[0.80271,0.0,8.14...|  20.2|
|[0.7258,0.0,8.14,...|  18.2|
+--------------------+------+
only showing top 20 rows



In [92]:
train, test = spark_dataset.randomSplit([0.7, 0.3])
rfr = RandomForestRegressor(featuresCol='features',labelCol='target')
rfr = rfr.fit(train)
train_predictions = rfr.transform(train)
test_predictions = rfr.transform(test)

In [113]:
train_predictions.show()

+--------------------+------+------------------+
|            features|target|        prediction|
+--------------------+------+------------------+
|[0.00632,18.0,2.3...|  24.0| 26.43339806943908|
|[0.01311,90.0,1.2...|  35.4| 33.50688736263736|
|[0.01381,80.0,0.4...|  50.0| 45.76861155812786|
|[0.01778,95.0,1.4...|  32.9|  32.2933906063743|
|[0.02009,95.0,2.6...|  50.0| 47.18409165401236|
|[0.02187,60.0,2.9...|  31.1| 29.84822910494832|
|[0.02729,0.0,7.07...|  34.7| 35.40151893939394|
|[0.02763,75.0,2.9...|  30.8|29.560764873904226|
|[0.02875,28.0,15....|  25.0| 24.72409075260569|
|[0.0315,95.0,1.47...|  34.9|31.104085549166836|
|[0.03237,0.0,2.18...|  33.4| 36.47886487776929|
|[0.03359,75.0,2.9...|  34.9| 33.78507954545454|
|[0.0351,95.0,2.68...|  48.5| 46.84239720956792|
|[0.03584,80.0,3.3...|  23.5|25.550332204206207|
|[0.03659,25.0,4.8...|  24.8| 24.03220558375607|
|[0.04011,80.0,1.5...|  33.3|33.214039415898114|
|[0.04203,28.0,15....|  22.9|23.583519908174782|
|[0.04294,28.0,15...

In [142]:
# Заменить нужно эту часть

# pandas_train_predictions = train_predictions.toPandas()
# pandas_test_predictions = test_predictions.toPandas()

# print(f'''
#     Scores:: 
#         train: {mean_absolute_error(
#             pandas_train_predictions.target, 
#             pandas_train_predictions.prediction)}, 
#         test: {mean_absolute_error(
#             pandas_test_predictions.target, 
#             pandas_test_predictions.prediction)}
#     ''')


In [None]:
# from pyspark.ml.feature import VectorAssembler

In [194]:
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='target',
    metricName='mae')

In [196]:
round(evaluator.evaluate(train_predictions),2)

1.92

ответ: средняя абсолютная ошибка - 1.92