In [3]:
import pyspark
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Create (or reuse) the Spark session with the 'local[*]' master.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)
# Handle on the session's underlying SparkContext.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/21 01:45:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/21 01:45:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Обучите модель классификации для цветков Iris.
Примерная последовательность действий:

* 1) - Взять данные.
* 2) - Загрузить в PySpark.
* 3) - При помощи VectorAssembler преобразовать все колонки с признаками в одну (использовать Pipeline — опционально).
* 4) - Разбить данные на train и test.
* 5) - Создать модель логистической регрессии или модель дерева и обучить её.
* 6) - Воспользоваться MulticlassClassificationEvaluator для оценки качества на train- и test-множествах.

## Загрузка данных

In [33]:
# Read the Iris CSV: the first row is the header, column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('./Data/iris.CSV')
)

In [34]:
# Preview the first five rows of the raw data.
df.show(n=5)

+------------+-----------+------------+-----------+-------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_num|
+------------+-----------+------------+-----------+-------+-----------+
|         5.1|        3.5|         1.4|        0.2| Setosa|          0|
|         4.9|        3.0|         1.4|        0.2| Setosa|          0|
|         4.7|        3.2|         1.3|        0.2| Setosa|          0|
|         4.6|        3.1|         1.5|        0.2| Setosa|          0|
|         5.0|        3.6|         1.4|        0.2| Setosa|          0|
+------------+-----------+------------+-----------+-------+-----------+
only showing top 5 rows



## VectorAssembler

In [35]:
# Feature column names, in order:
# sepal_length, sepal_width, petal_length, petal_width.
vector_assembler_columns = [
    f'{flower_part}_{dimension}'
    for flower_part in ('sepal', 'petal')
    for dimension in ('length', 'width')
]

In [36]:
# Assembler that packs the feature columns into a single 'Features' vector column.
vector_assembler = VectorAssembler(
    outputCol='Features',
    inputCols=vector_assembler_columns,
)

In [37]:
# Append the 'Features' vector column to the loaded data.
df_vector_assembler = vector_assembler.transform(dataset=df)

In [38]:
# Preview the first five rows, now including the 'Features' column.
df_vector_assembler.show(n=5)

+------------+-----------+------------+-----------+-------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_num|         Features|
+------------+-----------+------------+-----------+-------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|          0|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| Setosa|          0|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| Setosa|          0|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| Setosa|          0|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| Setosa|          0|[5.0,3.6,1.4,0.2]|
+------------+-----------+------------+-----------+-------+-----------+-----------------+
only showing top 5 rows



## Pipeline

In [39]:
# Same feature assembly expressed as a one-stage Pipeline (optional part of the task).
pipeline = Pipeline(
    stages=[
        VectorAssembler(
            outputCol='Features',
            inputCols=vector_assembler_columns,
        ),
    ]
)

In [40]:
# Fit the pipeline on the loaded data to obtain a reusable pipeline model.
pipeline_train = pipeline.fit(dataset=df)

In [41]:
# Apply the fitted pipeline and preview the first five transformed rows.
pipeline_train.transform(dataset=df).show(n=5)

+------------+-----------+------------+-----------+-------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_num|         Features|
+------------+-----------+------------+-----------+-------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|          0|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| Setosa|          0|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| Setosa|          0|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| Setosa|          0|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| Setosa|          0|[5.0,3.6,1.4,0.2]|
+------------+-----------+------------+-----------+-------+-----------+-----------------+
only showing top 5 rows



## Train Test Split

In [42]:
# 80/20 random split of the assembled data; the fixed seed keeps it repeatable.
train, test = df_vector_assembler.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

## Построение модели

In [43]:
# Logistic regression reading the 'Features' vector and the 'variety_num' label.
log_reg = LogisticRegression(
    labelCol='variety_num',
    featuresCol='Features',
)

In [44]:
# Train the classifier on the training split only.
log_reg_model = log_reg.fit(dataset=train)

23/02/16 01:55:54 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/02/16 01:55:54 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


In [45]:
# Score both splits with the fitted model (train first, then test).
train_result, test_result = (
    log_reg_model.transform(split)
    for split in (train, test)
)

In [47]:
# Preview the first five scored training rows.
train_result.show(n=5)

+------------+-----------+------------+-----------+-------+-----------+-----------------+--------------------+--------------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_num|         Features|       rawPrediction|         probability|prediction|
+------------+-----------+------------+-----------+-------+-----------+-----------------+--------------------+--------------------+----------+
|         4.3|        3.0|         1.1|        0.1| Setosa|          0|[4.3,3.0,1.1,0.1]|[60.0954414735541...|[1.0,1.0615924393...|       0.0|
|         4.4|        2.9|         1.4|        0.2| Setosa|          0|[4.4,2.9,1.4,0.2]|[50.4387895503684...|[1.0,1.9406791125...|       0.0|
|         4.4|        3.2|         1.3|        0.2| Setosa|          0|[4.4,3.2,1.3,0.2]|[61.4018436371999...|[1.0,6.0030709215...|       0.0|
|         4.5|        2.3|         1.3|        0.3| Setosa|          0|[4.5,2.3,1.3,0.3]|[28.7879051398375...|[0.99999999755928...|       0.0|

## Оценка качества

In [48]:
# Evaluator comparing 'variety_num' with the model's 'prediction' column.
# metricName='f1' and predictionCol='prediction' are the evaluator's defaults,
# written out so it is clear the reported score is F1 rather than accuracy.
evaluation = MulticlassClassificationEvaluator(
    labelCol='variety_num',
    predictionCol='prediction',
    metricName='f1',
)

In [49]:
# (train score, test score) from the evaluator, in that order.
tuple(
    evaluation.evaluate(scored_split)
    for scored_split in (train_result, test_result)
)

(0.9841269841269842, 1.0)