Для произвольно выбранного датасета провести обработку данных и построить предсказательную модель с использованием функционала pySpark.

In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import when
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline


In [2]:
# read table from csv-file
csv_path = 'winequality-white.csv'

spark = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
wine_df = spark.read.csv(csv_path, sep=';', header=True, inferSchema=True)
wine_df.show()


+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
|          7.2|            0.23|       0.32|           8.5|    0.058|               47.0|           

In [34]:
# show table columns types
wine_df.printSchema()


root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [35]:
# check nans in columns
nulls_in_columns = {column: wine_df.filter(wine_df[column].isNull()).count() for column in wine_df.columns}
nulls_in_columns


{'fixed acidity': 0,
 'volatile acidity': 0,
 'citric acid': 0,
 'residual sugar': 0,
 'chlorides': 0,
 'free sulfur dioxide': 0,
 'total sulfur dioxide': 0,
 'density': 0,
 'pH': 0,
 'sulphates': 0,
 'alcohol': 0,
 'quality': 0}

In [3]:
# set target and features columns
TARGET = 'quality'
features = wine_df.columns
features.remove(TARGET)
print(features)


['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


In [37]:
# calculate unique TARGET values
wine_df.groupby(TARGET).count().orderBy('count', ascending=False).show()


+-------+-----+
|quality|count|
+-------+-----+
|      6| 2198|
|      5| 1457|
|      7|  880|
|      8|  175|
|      4|  163|
|      3|   20|
|      9|    5|
+-------+-----+



В таблице нету пропусков и все столбцы числовые. Из распределения значений целевой переменной видно, что вина качества 9 и 3 меньше всего, поэтому объединим некоторые классы: отнесем вино класса 9 к классу 8, а вино класса 3 к классу 4.

In [4]:
wine_df = wine_df.withColumn(TARGET, when(wine_df[TARGET] == 9, 8).when(wine_df[TARGET] == 3, 4).otherwise(wine_df[TARGET]))
wine_df.groupby(TARGET).count().orderBy('count', ascending=False).show()


+-------+-----+
|quality|count|
+-------+-----+
|      6| 2198|
|      5| 1457|
|      7|  880|
|      4|  183|
|      8|  180|
+-------+-----+



Разделяем выборку на обучающую и тестовую, собираем все свойства с помощью VectorAssembler, подбирем параметры для модели RandomForestClassifier, основываясь на метрике точности.

In [11]:
# split dataset into train and test
train_df, test_df = wine_df.randomSplit([0.8, 0.2])
print(train_df.count(), test_df.count())


3905 993


In [14]:
# create pipeline with features assembler and RandomForestClassifier model 
random_forest = RandomForestClassifier(labelCol=TARGET, featuresCol='features')
model_pipeline = Pipeline(stages=[VectorAssembler(inputCols=features, outputCol='features'), random_forest])


In [29]:
# create grid builder for find best params
params = ParamGridBuilder().addGrid(random_forest.maxDepth, list(range(10, 40, 10))) \
                           .addGrid(random_forest.minInstancesPerNode, list(range(1, 6))) \
                           .addGrid(random_fores.numTrees, list(range(10, 40, 10))).build()

# create cross validation object for find best model
evaluator=MulticlassClassificationEvaluator(labelCol=TARGET, metricName='accuracy')
validator = CrossValidator(estimator=model_pipeline,
                          estimatorParamMaps=params,
                          evaluator=evaluator,
                          numFolds=5)

# search for best model
validator_model = validator.fit(train_df)
best_pipe_model = validator_model.bestModel


In [51]:
# run test, calculate model accuracy
preds = best_pipe_model.transform(test_df)
accuracy = evaluator.evaluate(preds)
print('Test accuracy is', accuracy)


Test accuracy is 0.6847935548841894


На тестовом наборе данных модель показала точность 68.48%.<br>
В отличие от моделей библиотеки sklearn диапазоны варьирования некоторых параметров моделей из pyspark ограничены, и работа самого модуля в целом оказалась медленнее.