# Eksperyment polegający na przwidywaniu szczepu winogron na podstawie danych o winie

## Import sparka i wczytanie danych

In [1]:

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Predicting the grape variety from wine characteristics") \
    .getOrCreate()
        
rawData = spark.read \
            .format('csv')\
            .option('header','false') \
            .load('../datasets/wine.data')

In [2]:
rawData

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string]

In [3]:
rawData.show(5)

+---+-----+----+----+----+---+----+----+---+----+----+----+----+----+
|_c0|  _c1| _c2| _c3| _c4|_c5| _c6| _c7|_c8| _c9|_c10|_c11|_c12|_c13|
+---+-----+----+----+----+---+----+----+---+----+----+----+----+----+
|  1|14.23|1.71|2.43|15.6|127| 2.8|3.06|.28|2.29|5.64|1.04|3.92|1065|
|  1| 13.2|1.78|2.14|11.2|100|2.65|2.76|.26|1.28|4.38|1.05| 3.4|1050|
|  1|13.16|2.36|2.67|18.6|101| 2.8|3.24| .3|2.81|5.68|1.03|3.17|1185|
|  1|14.37|1.95| 2.5|16.8|113|3.85|3.49|.24|2.18| 7.8| .86|3.45|1480|
|  1|13.24|2.59|2.87|  21|118| 2.8|2.69|.39|1.82|4.32|1.04|2.93| 735|
+---+-----+----+----+----+---+----+----+---+----+----+----+----+----+
only showing top 5 rows



### Przypisanie nazw kolumnom i zapisanie ich w zmiennej 'dataset'

In [4]:
dataset = rawData.toDF('Label',
                      'Alcohol',
                      'MalicAcid',
                      'Ash',
                      'AshAlalinity',
                      'Magnesium',
                       'TotalPhenols',
                       'Favanoids',
                       'NonflavanoidPhenols',
                       'Proanthocyanins',
                       'ColorIntensity',
                       'Hue',
                       'OD',
                       'Proline'
                      )

In [5]:
dataset

DataFrame[Label: string, Alcohol: string, MalicAcid: string, Ash: string, AshAlalinity: string, Magnesium: string, TotalPhenols: string, Favanoids: string, NonflavanoidPhenols: string, Proanthocyanins: string, ColorIntensity: string, Hue: string, OD: string, Proline: string]

In [6]:
dataset.show(5)

+-----+-------+---------+----+------------+---------+------------+---------+-------------------+---------------+--------------+----+----+-------+
|Label|Alcohol|MalicAcid| Ash|AshAlalinity|Magnesium|TotalPhenols|Favanoids|NonflavanoidPhenols|Proanthocyanins|ColorIntensity| Hue|  OD|Proline|
+-----+-------+---------+----+------------+---------+------------+---------+-------------------+---------------+--------------+----+----+-------+
|    1|  14.23|     1.71|2.43|        15.6|      127|         2.8|     3.06|                .28|           2.29|          5.64|1.04|3.92|   1065|
|    1|   13.2|     1.78|2.14|        11.2|      100|        2.65|     2.76|                .26|           1.28|          4.38|1.05| 3.4|   1050|
|    1|  13.16|     2.36|2.67|        18.6|      101|         2.8|     3.24|                 .3|           2.81|          5.68|1.03|3.17|   1185|
|    1|  14.37|     1.95| 2.5|        16.8|      113|        3.85|     3.49|                .24|           2.18|           7

### Na potrzeby SparkML reorganizujemy dane w wektory (wektoryzowanie???)

In [7]:
from pyspark.ml.linalg import Vectors

def vectorize(data):
    return data.rdd.map(lambda r: [r[0], Vectors.dense(r[1:])]).toDF(['label','features'])

In [8]:
vectorizedData = vectorize(dataset)
vectorizedData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|[14.23,1.71,2.43,...|
|    1|[13.2,1.78,2.14,1...|
|    1|[13.16,2.36,2.67,...|
|    1|[14.37,1.95,2.5,1...|
|    1|[13.24,2.59,2.87,...|
+-----+--------------------+
only showing top 5 rows



In [9]:
vectorizedData.take(5)

[Row(label='1', features=DenseVector([14.23, 1.71, 2.43, 15.6, 127.0, 2.8, 3.06, 0.28, 2.29, 5.64, 1.04, 3.92, 1065.0])),
 Row(label='1', features=DenseVector([13.2, 1.78, 2.14, 11.2, 100.0, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.4, 1050.0])),
 Row(label='1', features=DenseVector([13.16, 2.36, 2.67, 18.6, 101.0, 2.8, 3.24, 0.3, 2.81, 5.68, 1.03, 3.17, 1185.0])),
 Row(label='1', features=DenseVector([14.37, 1.95, 2.5, 16.8, 113.0, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480.0])),
 Row(label='1', features=DenseVector([13.24, 2.59, 2.87, 21.0, 118.0, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735.0]))]

### import funkcji StringIndexer używanej do reprezentowania danych kategorycznych w formie numerycznej (orginalnie zmienna 'label' mimo wartosci 1,2,3 jest typu string, zamieniamy ją na typ float

In [10]:
from pyspark.ml.feature import StringIndexer

labelIndexer = StringIndexer(inputCol = 'label',
                             outputCol = 'indexedLabel')

In [11]:
indexedData = labelIndexer.fit(vectorizedData).transform(vectorizedData)
indexedData.take(5)

[Row(label='1', features=DenseVector([14.23, 1.71, 2.43, 15.6, 127.0, 2.8, 3.06, 0.28, 2.29, 5.64, 1.04, 3.92, 1065.0]), indexedLabel=1.0),
 Row(label='1', features=DenseVector([13.2, 1.78, 2.14, 11.2, 100.0, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.4, 1050.0]), indexedLabel=1.0),
 Row(label='1', features=DenseVector([13.16, 2.36, 2.67, 18.6, 101.0, 2.8, 3.24, 0.3, 2.81, 5.68, 1.03, 3.17, 1185.0]), indexedLabel=1.0),
 Row(label='1', features=DenseVector([14.37, 1.95, 2.5, 16.8, 113.0, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480.0]), indexedLabel=1.0),
 Row(label='1', features=DenseVector([13.24, 2.59, 2.87, 21.0, 118.0, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735.0]), indexedLabel=1.0)]

In [12]:
indexedData

DataFrame[label: string, features: vector, indexedLabel: double]

In [13]:
indexedData.select('label').distinct().show()

+-----+
|label|
+-----+
|    3|
|    1|
|    2|
+-----+



In [14]:
indexedData.select('indexedLabel').distinct().show()

+------------+
|indexedLabel|
+------------+
|         0.0|
|         1.0|
|         2.0|
+------------+



## Budowa  modelu

In [15]:
(trainingData, testData) = indexedData.randomSplit([0.75,0.25])

In [16]:
from pyspark.ml.classification import DecisionTreeClassifier

dtree = DecisionTreeClassifier(
    labelCol = 'indexedLabel',
    featuresCol = 'features',
    maxDepth = 4,
    impurity = 'gini'
)


In [17]:
model = dtree.fit(trainingData)

In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol = 'indexedLabel',
                                             predictionCol = 'prediction',
                                             metricName = 'f1')

transformed_data = model.transform(testData)
transformed_data.show(5)

+-----+--------------------+------------+--------------+-------------+----------+
|label|            features|indexedLabel| rawPrediction|  probability|prediction|
+-----+--------------------+------------+--------------+-------------+----------+
|    1|[12.93,3.8,2.65,1...|         1.0|[0.0,44.0,0.0]|[0.0,1.0,0.0]|       1.0|
|    1|[13.05,1.65,2.55,...|         1.0|[0.0,44.0,0.0]|[0.0,1.0,0.0]|       1.0|
|    1|[13.16,2.36,2.67,...|         1.0|[0.0,44.0,0.0]|[0.0,1.0,0.0]|       1.0|
|    1|[13.24,2.59,2.87,...|         1.0| [0.0,1.0,0.0]|[0.0,1.0,0.0]|       1.0|
|    1|[13.5,1.81,2.61,2...|         1.0|[0.0,44.0,0.0]|[0.0,1.0,0.0]|       1.0|
+-----+--------------------+------------+--------------+-------------+----------+
only showing top 5 rows



In [19]:
print(evaluator.getMetricName(),
     'accuracy: ',
     evaluator.evaluate(transformed_data))

f1 accuracy:  0.9052775499042931


In [20]:
type(transformed_data)

pyspark.sql.dataframe.DataFrame

In [21]:
print(transformed_data)

DataFrame[label: string, features: vector, indexedLabel: double, rawPrediction: vector, probability: vector, prediction: double]
