In [1]:
# !pip install pyspark
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.appName('my_app').master("local[*]").getOrCreate()

23/08/13 09:04:17 WARN Utils: Your hostname, md-ASUS resolves to a loopback address: 127.0.1.1; using 192.168.30.211 instead (on interface eth0)
23/08/13 09:04:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/13 09:04:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Wektory

In [2]:
from pyspark.ml.linalg import Vectors
from pyspark.ml import feature  # obsługa zmiennych

**Dwa typy wektorów:**
- `sparse` - większość wartości to zera, w celu optymalizacji zajmowanej pamięci podawane są tylko indeksy (wraz z wartościami) gdzie `wartość != 0`
- `dense` - podane są wszystkie wartości

In [3]:
data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), "A", 1),  # długość wektora, a potem pary niezerowych (indeks, wartość)
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]), "B", 6),       # standardowy wektor o długości 4 z wartościami elementów
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]), "A", 3),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), "B", 2), 
        (Vectors.sparse(4, [(1, 1.0), (2, 2.0)]), "C", 4)]

dummy_df = spark.createDataFrame(data, ["features", "categ", "num"])
dummy_df.show()

                                                                                

+--------------------+-----+---+
|            features|categ|num|
+--------------------+-----+---+
|(4,[0,3],[1.0,-2.0])|    A|  1|
|   [4.0,5.0,0.0,3.0]|    B|  6|
|   [6.0,7.0,0.0,8.0]|    A|  3|
| (4,[0,3],[9.0,1.0])|    B|  2|
| (4,[1,2],[1.0,2.0])|    C|  4|
+--------------------+-----+---+



In [4]:
dummy_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- categ: string (nullable = true)
 |-- num: long (nullable = true)



**Przechodzenie od kolumny kategorycznej do wektora**

In [7]:
indexer = feature.StringIndexer(inputCol="categ", outputCol="categIndex")  # inputCol, outputCol

In [8]:
IDXmodel = indexer.fit(dummy_df)

                                                                                

In [9]:
dummy_df1 = IDXmodel.transform(dummy_df)
dummy_df1.show()

+--------------------+-----+---+----------+
|            features|categ|num|categIndex|
+--------------------+-----+---+----------+
|(4,[0,3],[1.0,-2.0])|    A|  1|       0.0|
|   [4.0,5.0,0.0,3.0]|    B|  6|       1.0|
|   [6.0,7.0,0.0,8.0]|    A|  3|       0.0|
| (4,[0,3],[9.0,1.0])|    B|  2|       1.0|
| (4,[1,2],[1.0,2.0])|    C|  4|       2.0|
+--------------------+-----+---+----------+



In [10]:
# OneHotEncoderEstimator
OHencoder = feature.OneHotEncoder(inputCols=["categIndex"], outputCols=["categVect"])  # inputCols, outputCols

In [11]:
OHmodel = OHencoder.fit(dummy_df1)

In [12]:
dummy_df2 = OHmodel.transform(dummy_df1)
dummy_df2.show()

+--------------------+-----+---+----------+-------------+
|            features|categ|num|categIndex|    categVect|
+--------------------+-----+---+----------+-------------+
|(4,[0,3],[1.0,-2.0])|    A|  1|       0.0|(2,[0],[1.0])|
|   [4.0,5.0,0.0,3.0]|    B|  6|       1.0|(2,[1],[1.0])|
|   [6.0,7.0,0.0,8.0]|    A|  3|       0.0|(2,[0],[1.0])|
| (4,[0,3],[9.0,1.0])|    B|  2|       1.0|(2,[1],[1.0])|
| (4,[1,2],[1.0,2.0])|    C|  4|       2.0|    (2,[],[])|
+--------------------+-----+---+----------+-------------+



**Łączenie zmiennych w wektory**

In [13]:
# inputCols, ale: outputCol
vectAssembler = feature.VectorAssembler(inputCols=["features", "categVect", "num"], outputCol="featuresFull")

dummy_df3 = vectAssembler.transform(dummy_df2)
dummy_df3.show(truncate=False)

+--------------------+-----+---+----------+-------------+------------------------------+
|features            |categ|num|categIndex|categVect    |featuresFull                  |
+--------------------+-----+---+----------+-------------+------------------------------+
|(4,[0,3],[1.0,-2.0])|A    |1  |0.0       |(2,[0],[1.0])|[1.0,0.0,0.0,-2.0,1.0,0.0,1.0]|
|[4.0,5.0,0.0,3.0]   |B    |6  |1.0       |(2,[1],[1.0])|[4.0,5.0,0.0,3.0,0.0,1.0,6.0] |
|[6.0,7.0,0.0,8.0]   |A    |3  |0.0       |(2,[0],[1.0])|[6.0,7.0,0.0,8.0,1.0,0.0,3.0] |
|(4,[0,3],[9.0,1.0]) |B    |2  |1.0       |(2,[1],[1.0])|[9.0,0.0,0.0,1.0,0.0,1.0,2.0] |
|(4,[1,2],[1.0,2.0]) |C    |4  |2.0       |(2,[],[])    |(7,[1,2,6],[1.0,2.0,4.0])     |
+--------------------+-----+---+----------+-------------+------------------------------+



**Skalowanie zmiennych**

In [14]:
scaler = feature.StandardScaler(inputCol="featuresFull", outputCol="featuresScal")

In [15]:
scalerModel = scaler.fit(dummy_df3)

In [16]:
dummy_df3.select("featuresFull").show(truncate=False)

scalerModel.transform(dummy_df3).select("featuresScal").show(truncate=False)

+------------------------------+
|featuresFull                  |
+------------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0,1.0]|
|[4.0,5.0,0.0,3.0,0.0,1.0,6.0] |
|[6.0,7.0,0.0,8.0,1.0,0.0,3.0] |
|[9.0,0.0,0.0,1.0,0.0,1.0,2.0] |
|(7,[1,2,6],[1.0,2.0,4.0])     |
+------------------------------+

+-------------------------------------------------------------------------------------------------------+
|featuresScal                                                                                           |
+-------------------------------------------------------------------------------------------------------+
|[0.2721655269759087,0.0,0.0,-0.5252257314388902,1.8257418583505538,0.0,0.5198752449100363]             |
|[1.0886621079036347,1.5579423821243896,0.0,0.7878385971583353,0.0,1.8257418583505538,3.119251469460218]|
|[1.632993161855452,2.1811193349741456,0.0,2.1009029257555607,1.8257418583505538,0.0,1.559625734730109] |
|[2.449489742783178,0.0,0.0,0.2626128657194451,0.0,1.8257418583505

**Skalowanie `min-max` _(w wyniku transformacji powstaje `DenseVector`)_**

In [17]:
# X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
# X_scaled = X_std * (X_std.max() - X_std.min()) + X_std.min()

MMscaler = feature.MinMaxScaler(inputCol="featuresFull", outputCol="featuresScal")

In [18]:
MMscalerModel = MMscaler.fit(dummy_df3)

In [19]:
dummy_df3.select("featuresFull").show(truncate=False)

MMscalerModel.transform(dummy_df3).select("featuresScal").show(truncate=False)

+------------------------------+
|featuresFull                  |
+------------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0,1.0]|
|[4.0,5.0,0.0,3.0,0.0,1.0,6.0] |
|[6.0,7.0,0.0,8.0,1.0,0.0,3.0] |
|[9.0,0.0,0.0,1.0,0.0,1.0,2.0] |
|(7,[1,2,6],[1.0,2.0,4.0])     |
+------------------------------+

+------------------------------------------------------------+
|featuresScal                                                |
+------------------------------------------------------------+
|(7,[0,4],[0.1111111111111111,1.0])                          |
|[0.4444444444444444,0.7142857142857142,0.0,0.5,0.0,1.0,1.0] |
|[0.6666666666666666,1.0,0.0,1.0,1.0,0.0,0.4]                |
|[1.0,0.0,0.0,0.30000000000000004,0.0,1.0,0.2]               |
|[0.0,0.14285714285714285,1.0,0.2,0.0,0.0,0.6000000000000001]|
+------------------------------------------------------------+



**PCA**

In [20]:
pca = feature.PCA(k=3, inputCol="featuresFull", outputCol="featuresPCA")

In [21]:
PCAmodel = pca.fit(dummy_df3)

                                                                                

In [22]:
PCAmodel.transform(dummy_df3).select("featuresFull", "featuresPCA").show(truncate=False)

+------------------------------+--------------------------------------------------------------+
|featuresFull                  |featuresPCA                                                   |
+------------------------------+--------------------------------------------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0,1.0]|[0.83086263635747,0.8263675323478634,-1.31104932792516]       |
|[4.0,5.0,0.0,3.0,0.0,1.0,6.0] |[-7.36978449442349,-0.8524399357794814,-5.302300671552504]    |
|[6.0,7.0,0.0,8.0,1.0,0.0,3.0] |[-12.433855979366033,0.006784898917439053,-1.2633329470730525]|
|[9.0,0.0,0.0,1.0,0.0,1.0,2.0] |[-4.867266422112711,6.86274574617497,-3.658108599245715]      |
|(7,[1,2,6],[1.0,2.0,4.0])     |[-0.9319103583071204,-1.8364015478262758,-3.5681300808770082] |
+------------------------------+--------------------------------------------------------------+



In [23]:
PCAmodel.explainedVariance

DenseVector([0.6419, 0.27, 0.0687])

## Pipeline

In [24]:
from pyspark.ml import Pipeline

In [None]:
# przygotowanie estymatorów/transformerów

indexer = feature.StringIndexer(inputCol="categ", outputCol="categIndex")
OHencoder = feature.OneHotEncoder(inputCols=["categIndex"], outputCols=["categVect"])
vectAssembler = feature.VectorAssembler(inputCols=["features", "num", "categVect"], outputCol="featuresFull")
scaler = feature.StandardScaler(inputCol="featuresFull", outputCol="featuresScal")

In [25]:
# utworzenie estymatora
pipeline = Pipeline(stages=[indexer, OHencoder, vectAssembler, scaler])

In [26]:
# utworzenie transformera
pipelineModel = pipeline.fit(dummy_df)

In [27]:
# transformacja
pipelineModel.transform(dummy_df).select("featuresScal").show(truncate=False)

+-------------------------------------------------------------------------------------------------------+
|featuresScal                                                                                           |
+-------------------------------------------------------------------------------------------------------+
|[0.2721655269759087,0.0,0.0,-0.5252257314388902,1.8257418583505538,0.0,0.5198752449100363]             |
|[1.0886621079036347,1.5579423821243896,0.0,0.7878385971583353,0.0,1.8257418583505538,3.119251469460218]|
|[1.632993161855452,2.1811193349741456,0.0,2.1009029257555607,1.8257418583505538,0.0,1.559625734730109] |
|[2.449489742783178,0.0,0.0,0.2626128657194451,0.0,1.8257418583505538,1.0397504898200727]               |
|(7,[1,2,6],[0.3115884764248779,2.23606797749979,2.0795009796401454])                                   |
+-------------------------------------------------------------------------------------------------------+



In [28]:
# zapisanie potoku (pipeline)
pipelineModel.save("pipe")

                                                                                

In [29]:
# wczytanie modelu
from pyspark.ml import PipelineModel

pip = PipelineModel.load("pipe")

In [30]:
pip.transform(dummy_df).select("featuresScal").show(truncate=False)

+-------------------------------------------------------------------------------------------------------+
|featuresScal                                                                                           |
+-------------------------------------------------------------------------------------------------------+
|[0.2721655269759087,0.0,0.0,-0.5252257314388902,1.8257418583505538,0.0,0.5198752449100363]             |
|[1.0886621079036347,1.5579423821243896,0.0,0.7878385971583353,0.0,1.8257418583505538,3.119251469460218]|
|[1.632993161855452,2.1811193349741456,0.0,2.1009029257555607,1.8257418583505538,0.0,1.559625734730109] |
|[2.449489742783178,0.0,0.0,0.2626128657194451,0.0,1.8257418583505538,1.0397504898200727]               |
|(7,[1,2,6],[0.3115884764248779,2.23606797749979,2.0795009796401454])                                   |
+-------------------------------------------------------------------------------------------------------+



### **ZADANIE 1**: przygotowanie danych e-commerce

1. Wczytaj dane `online_retail.csv`

In [62]:
df = spark.read.csv("./online_retail.csv", header=True, inferSchema=True)

df.show(5)
df.printSchema()

+---------+---------+--------------------+--------+------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity| InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/10 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/10 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+------------+---------+----------+--------------+
only showing top 5 rows

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: stri

#### Eksploracja danych 

2. Sprawdź ilość wierszy

In [40]:
df.count()  # relatywnie duży zbiór danych (pół miliona rekordów)

541909

3. Sprawdź ilość klientów (unikalną wartość dla `CustomerID`)

In [41]:
df.select('CustomerID').distinct().count()  # bardzo mało unikalnych klientów

                                                                                

4373

4. Z którego kraju pochodzi najwięcej klientów? (`f.countDistinct`)

In [47]:
from pyspark.sql import functions as f

df.groupby('Country') \
    .agg(f.count_distinct('CustomerID') \
    .alias('#clients')) \
    .sort('#clients', ascending=False) \
    .show(5)

+--------------+--------+
|       Country|#clients|
+--------------+--------+
|United Kingdom|    3950|
|       Germany|      95|
|        France|      87|
|         Spain|      31|
|       Belgium|      25|
+--------------+--------+
only showing top 5 rows



5. Kiedy dokonany był pierwszy a kiedy ostatni zakup ?

In [63]:
df = df.withColumn("Invoice_date_2", f.to_date("InvoiceDate", "M/d/yy H:m"))
df.show(5)

df.printSchema()

+---------+---------+--------------------+--------+------------+---------+----------+--------------+--------------+
|InvoiceNo|StockCode|         Description|Quantity| InvoiceDate|UnitPrice|CustomerID|       Country|Invoice_date_2|
+---------+---------+--------------------+--------+------------+---------+----------+--------------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/10 8:26|     2.55|     17850|United Kingdom|    2010-12-01|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|    2010-12-01|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/10 8:26|     2.75|     17850|United Kingdom|    2010-12-01|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|    2010-12-01|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|    2010-12-01|
+---------+---------+--------------------+--------+------------+--------

In [52]:
df.orderBy('Invoice_date_2').show(1)
df.orderBy('Invoice_date_2', ascending=False).show(1)



+---------+---------+--------------------+--------+------------+---------+----------+--------------+--------------+
|InvoiceNo|StockCode|         Description|Quantity| InvoiceDate|UnitPrice|CustomerID|       Country|Invoice_date_2|
+---------+---------+--------------------+--------+------------+---------+----------+--------------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/10 8:26|     2.55|     17850|United Kingdom|    2010-12-01|
+---------+---------+--------------------+--------+------------+---------+----------+--------------+--------------+
only showing top 1 row





+---------+---------+--------------------+--------+------------+---------+----------+--------------+--------------+
|InvoiceNo|StockCode|         Description|Quantity| InvoiceDate|UnitPrice|CustomerID|       Country|Invoice_date_2|
+---------+---------+--------------------+--------+------------+---------+----------+--------------+--------------+
|   581475|    22596|CHRISTMAS STAR WI...|      36|12/9/11 8:39|     0.39|     13069|United Kingdom|    2011-12-09|
+---------+---------+--------------------+--------+------------+---------+----------+--------------+--------------+
only showing top 1 row



                                                                                

#### Przygotowanie danych do uczenia (features engineering)

- `Nowość w czasie`: Kiedy ostatnio każdy klient dokonał zakupu?

- `Częstotliwość`: Jak często coś kupowali?

- `Wartość pieniężna`: Ile pieniędzy średnio wydają na zakupy?

6. Dla każdej transakcji oblicz ile czasu upłynęło od pierwszej transakcji (policzonej w poprzednim punkcie) i dodaj wynik do reszty tabeli. Następnie dla każdego klienta policz jego max czas (`recency` = nowosc w czasie)

In [64]:
df = df.withColumn("from_date", f.lit("12/1/10"))  # wstawianie kolumny o jednakowej wartości dla wszystkich rekordów
df = df.withColumn("from_date", f.to_date("from_date", "M/d/yy"))

df.show(5)

+---------+---------+--------------------+--------+------------+---------+----------+--------------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity| InvoiceDate|UnitPrice|CustomerID|       Country|Invoice_date_2| from_date|
+---------+---------+--------------------+--------+------------+---------+----------+--------------+--------------+----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/10 8:26|     2.55|     17850|United Kingdom|    2010-12-01|2010-12-01|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|    2010-12-01|2010-12-01|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/10 8:26|     2.75|     17850|United Kingdom|    2010-12-01|2010-12-01|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|    2010-12-01|2010-12-01|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|    2010-12

In [65]:
df = df.withColumn("time_diff", f.col('Invoice_date_2') - f.col('from_date'))
df.toPandas()



Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Invoice_date_2,from_date,time_diff
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom,2010-12-01,2010-12-01,0 days
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom,2010-12-01,2010-12-01,0 days
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom,2010-12-01,2010-12-01,0 days
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom,2010-12-01,2010-12-01,0 days
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom,2010-12-01,2010-12-01,0 days
...,...,...,...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/11 12:50,0.85,12680.0,France,2011-12-09,2010-12-01,373 days
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/11 12:50,2.10,12680.0,France,2011-12-09,2010-12-01,373 days
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/11 12:50,4.15,12680.0,France,2011-12-09,2010-12-01,373 days
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/11 12:50,4.15,12680.0,France,2011-12-09,2010-12-01,373 days


In [66]:
df.groupBy('CustomerID').agg(f.max('time_diff').alias('recency')).show()

+----------+------------------+
|CustomerID|           recency|
+----------+------------------+
|     17420|INTERVAL '323' DAY|
|     16861|INTERVAL '314' DAY|
|     16503|INTERVAL '267' DAY|
|     15727|INTERVAL '357' DAY|
|     17389|INTERVAL '373' DAY|
|     15447| INTERVAL '43' DAY|
|     14450|INTERVAL '193' DAY|
|     15100| INTERVAL '43' DAY|
|     12471|INTERVAL '371' DAY|
|     16916|INTERVAL '350' DAY|
|     14514|INTERVAL '312' DAY|
|     16500|INTERVAL '369' DAY|
|     12626|INTERVAL '350' DAY|
|     18161|INTERVAL '363' DAY|
|     15967|INTERVAL '350' DAY|
|     17809|INTERVAL '357' DAY|
|     15738|INTERVAL '355' DAY|
|     17044|INTERVAL '365' DAY|
|     12393|INTERVAL '301' DAY|
|     16791|INTERVAL '344' DAY|
+----------+------------------+
only showing top 20 rows



In [67]:
df = df.join(df.groupBy('CustomerID').agg(f.max('time_diff').alias('recency')), on='CustomerID')
df.toPandas()

                                                                                

Unnamed: 0,CustomerID,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,Country,Invoice_date_2,from_date,time_diff,recency
0,17850,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,United Kingdom,2010-12-01,2010-12-01,0 days,71 days
1,17850,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,United Kingdom,2010-12-01,2010-12-01,0 days,71 days
2,17850,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,United Kingdom,2010-12-01,2010-12-01,0 days,71 days
3,17850,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,United Kingdom,2010-12-01,2010-12-01,0 days,71 days
4,17850,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,United Kingdom,2010-12-01,2010-12-01,0 days,71 days
...,...,...,...,...,...,...,...,...,...,...,...,...
406824,12680,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/11 12:50,0.85,France,2011-12-09,2010-12-01,373 days,373 days
406825,12680,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/11 12:50,2.10,France,2011-12-09,2010-12-01,373 days,373 days
406826,12680,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/11 12:50,4.15,France,2011-12-09,2010-12-01,373 days,373 days
406827,12680,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/11 12:50,4.15,France,2011-12-09,2010-12-01,373 days,373 days


7. Oblicz ilość transakcji dla każdego klienta i połącz wynik z resztą danych

In [68]:
df.groupBy('CustomerID').agg(f.count_distinct('InvoiceNo').alias('frequency')).show()

+----------+---------+
|CustomerID|frequency|
+----------+---------+
|     15727|        7|
|     17389|       43|
|     16503|        5|
|     17420|        3|
|     15447|        1|
|     16861|        3|
|     14450|        3|
|     13623|        7|
|     13285|        4|
|     14570|        2|
|     16339|        1|
|     16386|        2|
|     18024|        3|
|     12940|        4|
|     16574|        1|
|     17679|        2|
|     15619|        1|
|     15790|        1|
|     13832|        2|
|     15957|        1|
+----------+---------+
only showing top 20 rows



In [69]:
df = df.join(df.groupBy('CustomerID').agg(f.count_distinct('InvoiceNo').alias('frequency')), on='CustomerID')
df.toPandas()

                                                                                

Unnamed: 0,CustomerID,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,Country,Invoice_date_2,from_date,time_diff,recency,frequency
0,17850,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,United Kingdom,2010-12-01,2010-12-01,0 days,71 days,35
1,17850,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,United Kingdom,2010-12-01,2010-12-01,0 days,71 days,35
2,17850,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,United Kingdom,2010-12-01,2010-12-01,0 days,71 days,35
3,17850,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,United Kingdom,2010-12-01,2010-12-01,0 days,71 days,35
4,17850,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,United Kingdom,2010-12-01,2010-12-01,0 days,71 days,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...
406824,12680,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/11 12:50,0.85,France,2011-12-09,2010-12-01,373 days,373 days,4
406825,12680,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/11 12:50,2.10,France,2011-12-09,2010-12-01,373 days,373 days,4
406826,12680,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/11 12:50,4.15,France,2011-12-09,2010-12-01,373 days,373 days,4
406827,12680,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/11 12:50,4.15,France,2011-12-09,2010-12-01,373 days,373 days,4


8. Oblicz ile każdy klient wydał w sklepie (stworz najpierw kolumnę z wartością `Quantity x UnitPrice`)

In [70]:
df = df.withColumn("value", f.col('Quantity') * f.col('UnitPrice'))
df.toPandas()

                                                                                

Unnamed: 0,CustomerID,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,Country,Invoice_date_2,from_date,time_diff,recency,frequency,value
0,17850,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,United Kingdom,2010-12-01,2010-12-01,0 days,71 days,35,15.30
1,17850,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,United Kingdom,2010-12-01,2010-12-01,0 days,71 days,35,20.34
2,17850,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,United Kingdom,2010-12-01,2010-12-01,0 days,71 days,35,22.00
3,17850,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,United Kingdom,2010-12-01,2010-12-01,0 days,71 days,35,20.34
4,17850,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,United Kingdom,2010-12-01,2010-12-01,0 days,71 days,35,20.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406824,12680,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/11 12:50,0.85,France,2011-12-09,2010-12-01,373 days,373 days,4,10.20
406825,12680,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/11 12:50,2.10,France,2011-12-09,2010-12-01,373 days,373 days,4,12.60
406826,12680,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/11 12:50,4.15,France,2011-12-09,2010-12-01,373 days,373 days,4,16.60
406827,12680,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/11 12:50,4.15,France,2011-12-09,2010-12-01,373 days,373 days,4,16.60


In [72]:
df.groupBy('CustomerID').agg(f.round(f.sum('value'), 2).alias('monetary_value')).show()



+----------+--------------+
|CustomerID|monetary_value|
+----------+--------------+
|     17420|        598.83|
|     16861|        151.65|
|     16503|       1421.43|
|     15727|       5178.96|
|     17389|      31300.08|
|     15447|        155.17|
|     14450|        483.25|
|     15100|         635.1|
|     12471|      18740.92|
|     16916|        576.26|
|     14514|       1055.35|
|     16500|        400.86|
|     12626|       6388.35|
|     18161|       1612.79|
|     15967|        418.83|
|     17809|       4627.62|
|     15738|       4788.77|
|     17044|        897.43|
|     12393|        1582.6|
|     16791|       1706.28|
+----------+--------------+
only showing top 20 rows





In [73]:
df = df.join(df.groupBy('CustomerID').agg(f.round(f.sum('value'), 2).alias('monetary_value')), on='CustomerID')
df.toPandas()

                                                                                

Unnamed: 0,CustomerID,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,Country,Invoice_date_2,from_date,time_diff,recency,frequency,value,monetary_value
0,12347,537626,85116,BLACK CANDELABRA T-LIGHT HOLDER,12,12/7/10 14:57,2.10,Iceland,2010-12-07,2010-12-01,6 days,371 days,7,25.2,4310.0
1,12347,537626,22375,AIRLINE BAG VINTAGE JET SET BROWN,4,12/7/10 14:57,4.25,Iceland,2010-12-07,2010-12-01,6 days,371 days,7,17.0,4310.0
2,12347,537626,71477,COLOUR GLASS. STAR T-LIGHT HOLDER,12,12/7/10 14:57,3.25,Iceland,2010-12-07,2010-12-01,6 days,371 days,7,39.0,4310.0
3,12347,537626,22492,MINI PAINT SET VINTAGE,36,12/7/10 14:57,0.65,Iceland,2010-12-07,2010-12-01,6 days,371 days,7,23.4,4310.0
4,12347,537626,22771,CLEAR DRAWER KNOB ACRYLIC EDWARDIAN,12,12/7/10 14:57,1.25,Iceland,2010-12-07,2010-12-01,6 days,371 days,7,15.0,4310.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406824,18269,537817,22418,10 COLOUR SPACEBOY PEN,24,12/8/10 13:53,0.85,United Kingdom,2010-12-08,2010-12-01,7 days,15 days,2,20.4,138.9
406825,18269,537817,22451,SILK PURSE BABUSHKA RED,6,12/8/10 13:53,3.35,United Kingdom,2010-12-08,2010-12-01,7 days,15 days,2,20.1,138.9
406826,18269,537817,22450,SILK PURSE BABUSHKA BLUE,6,12/8/10 13:53,3.35,United Kingdom,2010-12-08,2010-12-01,7 days,15 days,2,20.1,138.9
406827,18269,537817,22449,SILK PURSE BABUSHKA PINK,6,12/8/10 13:53,3.35,United Kingdom,2010-12-08,2010-12-01,7 days,15 days,2,20.1,138.9


9. Wybierz z tabeli unikalne wartości dla kolumn `recency`, `frequency`, `monetary_value`, `CustomerID`

In [89]:
final_df = df.select(['CustomerID', 'recency', 'frequency', 'monetary_value']).distinct()
final_df.show(5)

                                                                                

+----------+------------------+---------+--------------+
|CustomerID|           recency|frequency|monetary_value|
+----------+------------------+---------+--------------+
|     12346| INTERVAL '48' DAY|        2|           0.0|
|     12347|INTERVAL '371' DAY|        7|        4310.0|
|     12349|INTERVAL '355' DAY|        1|       1757.55|
|     12355|INTERVAL '159' DAY|        1|         459.4|
|     12356|INTERVAL '351' DAY|        3|       2811.43|
+----------+------------------+---------+--------------+
only showing top 5 rows



#### Machine learning - przygotowanie (scale & transform)

10. Za pomocą `VectorAssembler` utwórz jedną kolumnę z kolumn `recency`,`frequency`,`monetary_value`

In [90]:
final_df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- recency: interval day (nullable = true)
 |-- frequency: long (nullable = false)
 |-- monetary_value: double (nullable = true)



In [91]:
final_df = final_df.withColumn("recency", final_df.recency.cast('long'))  # zmiana typu kolumny z 'timeDelta' na 'long'
final_df.show(5)

final_df.printSchema()

                                                                                

+----------+-------+---------+--------------+
|CustomerID|recency|frequency|monetary_value|
+----------+-------+---------+--------------+
|     12346|     48|        2|           0.0|
|     12347|    371|        7|        4310.0|
|     12349|    355|        1|       1757.55|
|     12355|    159|        1|         459.4|
|     12356|    351|        3|       2811.43|
+----------+-------+---------+--------------+
only showing top 5 rows

root
 |-- CustomerID: integer (nullable = true)
 |-- recency: long (nullable = true)
 |-- frequency: long (nullable = false)
 |-- monetary_value: double (nullable = true)



In [92]:
vectAssembler = feature.VectorAssembler(inputCols=["recency", "frequency", "monetary_value"], outputCol="featuresFull")

final_df = vectAssembler.transform(final_df)
final_df.show(5, truncate=False)

                                                                                

+----------+-------+---------+--------------+-------------------+
|CustomerID|recency|frequency|monetary_value|featuresFull       |
+----------+-------+---------+--------------+-------------------+
|12346     |48     |2        |0.0           |[48.0,2.0,0.0]     |
|12347     |371    |7        |4310.0        |[371.0,7.0,4310.0] |
|12349     |355    |1        |1757.55       |[355.0,1.0,1757.55]|
|12355     |159    |1        |459.4         |[159.0,1.0,459.4]  |
|12356     |351    |3        |2811.43       |[351.0,3.0,2811.43]|
+----------+-------+---------+--------------+-------------------+
only showing top 5 rows



11. Za pomocą `StandardScaler` wystandarduj wartości w nowo utworzonej kolumnie

In [93]:
scaler = feature.StandardScaler(inputCol="featuresFull", outputCol="featuresScal")

scalerModel = scaler.fit(final_df)

final_df = scalerModel.transform(final_df)
final_df.show(5, truncate=False)

                                                                                

+----------+-------+---------+--------------+-------------------+------------------------------------------------------------+
|CustomerID|recency|frequency|monetary_value|featuresFull       |featuresScal                                                |
+----------+-------+---------+--------------+-------------------+------------------------------------------------------------+
|12346     |48     |2        |0.0           |[48.0,2.0,0.0]     |[0.4763221295769767,0.21416132869209512,0.0]                |
|12347     |371    |7        |4310.0        |[371.0,7.0,4310.0] |[3.6815731265220495,0.7495646504223329,0.5243726751937274]  |
|12349     |355    |1        |1757.55       |[355.0,1.0,1757.55]|[3.5227990833297236,0.10708066434604756,0.21383090377882497]|
|12355     |159    |1        |459.4         |[159.0,1.0,459.4]  |[1.5778170542237353,0.10708066434604756,0.05589253062273744]|
|12356     |351    |3        |2811.43       |[351.0,3.0,2811.43]|[3.483105572531642,0.32124199303814266,0.34205

## Klasyfikacja - przygotowanie danych

In [94]:
from pyspark.ml import classification

https://archive.ics.uci.edu/ml/datasets/adult

In [150]:
col_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship",
             "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "earnings"]

df = spark.read.csv("../spark materiały_dzien_1/adult.data", header=False, inferSchema=True, ignoreLeadingWhiteSpace=True)

df.toDF(*col_names).toPandas().head()  # zmiana nazw kolumn (inny sposób poniżej)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,earnings
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [151]:
df = df.select(*[f.col(old).alias(new) for old, new in zip(df.columns, col_names)]).drop("fnlwgt").dropna("any")
df.show(3, vertical=True)

-RECORD 0----------------------------
 age            | 39                 
 workclass      | State-gov          
 education      | Bachelors          
 education-num  | 13                 
 marital-status | Never-married      
 occupation     | Adm-clerical       
 relationship   | Not-in-family      
 race           | White              
 sex            | Male               
 capital-gain   | 2174               
 capital-loss   | 0                  
 hours-per-week | 40                 
 native-country | United-States      
 earnings       | <=50K              
-RECORD 1----------------------------
 age            | 50                 
 workclass      | Self-emp-not-inc   
 education      | Bachelors          
 education-num  | 13                 
 marital-status | Married-civ-spouse 
 occupation     | Exec-managerial    
 relationship   | Husband            
 race           | White              
 sex            | Male               
 capital-gain   | 0                  
 capital-los

### **ZADANIE 2:**
1 podziel `df` na zbiór treningowy i ewaluacyjny

2 na podstawie kolumny `earnings` stwórz zmienną celu `label` z wartościami zakodowanymi jako 0 i 1

3 stwórz (przeskalowaną) kolumnę `features` zawierającą wektory powstałe na podstawie pozostałych kolumn

4 wynikowe DFy nazwij `df_train` i `df_eval`

In [158]:
# 1 
df_train, df_test = df.randomSplit([0.7, 0.3], 42)

In [153]:
# 2
indexer = feature.StringIndexer(inputCol="earnings", outputCol="label")

In [154]:
# 3
df.printSchema()

categ_col = [el[0] for el in df.dtypes if el[1] == 'string' and el[0] != "earnings"]
num_col = [el[0] for el in df.dtypes if el[1] != 'string']

print(f'category columns: {categ_col} \nnumerical columns: {num_col}')

categ_col_idx = [col+"_idx" for col in categ_col]
categ_col_ohe = [col+"_ohe" for col in categ_col]

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- earnings: string (nullable = true)

category columns: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'] 
numerical columns: ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [155]:
#przygotowanie estymatorów/transformerów

indexer_features = feature.StringIndexer(inputCols=categ_col, outputCols=categ_col_idx)
OHencoder = feature.OneHotEncoder(inputCols=categ_col_idx, outputCols=categ_col_ohe)
vectAssembler = feature.VectorAssembler(inputCols=categ_col_ohe+num_col, outputCol="featuresFull")
scaler = feature.StandardScaler(inputCol="featuresFull", outputCol="features")

pipeline = Pipeline(stages=[indexer, indexer_features, OHencoder, vectAssembler, scaler])

In [159]:
# 4
pipe_model = pipeline.fit(df_train)

df_train = pipe_model.transform(df_train)
df_eval = pipe_model.transform(df_test)

df_train.toPandas().head()

[Stage 649:>                                                        (0 + 1) / 1]                                                                                

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,...,workclass_ohe,education_ohe,marital-status_ohe,occupation_ohe,relationship_ohe,race_ohe,sex_ohe,native-country_ohe,featuresFull,features
0,17,?,10th,6,Never-married,?,Other-relative,White,Male,0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 4.336497144146086, 0.0, 0.0, 0..."
1,17,?,10th,6,Never-married,?,Other-relative,White,Male,0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 4.336497144146086, 0.0, 0.0, 0..."
2,17,?,10th,6,Never-married,?,Own-child,Black,Male,0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 4.336497144146086, 0.0, 0.0, 0..."
3,17,?,10th,6,Never-married,?,Own-child,Other,Female,0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0)",(0.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 4.336497144146086, 0.0, 0.0, 0..."
4,17,?,10th,6,Never-married,?,Own-child,White,Female,0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)",(0.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 4.336497144146086, 0.0, 0.0, 0..."


**Ostatnie przygotowania**

In [168]:
df_train = df_train.select("label", "features")
df_eval = df_eval.select("label", "features")

df_train.show(10)  # można użyć 'truncate=False' żeby zobaczyć, że wynikowe 'features' są różne

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(99,[3,15,24,36,4...|
|  0.0|(99,[3,15,24,36,4...|
|  0.0|(99,[3,15,24,36,4...|
|  0.0|(99,[3,15,24,36,4...|
|  0.0|(99,[3,15,24,36,4...|
|  0.0|(99,[3,15,24,36,4...|
|  0.0|(99,[3,15,24,36,4...|
|  0.0|(99,[3,15,24,36,4...|
|  0.0|(99,[3,15,24,36,4...|
|  0.0|(99,[3,15,24,36,4...|
+-----+--------------------+
only showing top 10 rows



In [165]:
# zachowanie przetransformowanych DataFrame w pamięci podręcznej (RAM)
df_train.cache()
df_eval.cache()

DataFrame[label: double, features: vector]

In [166]:
# sprawdzenie zbalansowania klas

print("Train:")
df_train.groupBy("label").count().show()

print("Eval:")
df_eval.groupBy("label").count().show()

Train:


[Stage 653:>                                                        (0 + 1) / 1]                                                                                

+-----+-----+
|label|count|
+-----+-----+
|  0.0|17414|
|  1.0| 5461|
+-----+-----+

Eval:
+-----+-----+
|label|count|
+-----+-----+
|  0.0| 7306|
|  1.0| 2380|
+-----+-----+



## Klasyfikacja - różne modele

### Regresja logistyczna

In [169]:
lr = classification.LogisticRegression(maxIter=100)

In [170]:
lrModel = lr.fit(df_train)

In [171]:
lrModel.coefficients

DenseVector([0.1195, -0.0554, 0.0161, 0.3467, -0.0077, 0.0838, 0.1235, -0.2308, 1.2144, 0.7871, -0.2696, -0.3507, 0.1791, 0.7714, -0.0424, 0.8735, 0.9751, -0.287, 0.7776, 0.3989, -0.3501, 0.8862, 0.688, -0.2852, -1.5105, -0.9485, -0.5117, -0.483, -0.3337, 0.1965, 0.4094, 0.3198, 0.1618, 0.2199, -0.1254, 0.0667, -0.3217, 0.0584, -0.0726, -0.1017, 0.1704, 0.1236, -0.824, 0.2883, 0.5033, -0.0505, 0.3162, 0.394, 0.216, 0.1411, 0.1273, -0.004, 0.4099, 0.1362, -0.044, -0.0098, 0.0576, 0.0238, 0.0316, -0.0027, 0.0296, -0.011, 0.0187, -0.0069, -0.056, 0.0247, -0.0219, -0.0431, -0.0402, 0.0378, 0.0417, -0.0668, -0.0634, 0.0185, 0.0126, 0.0193, 0.0083, -0.0238, 0.0026, 0.0231, -0.0094, -0.3282, 0.0311, 0.0184, -0.0224, 0.0168, 0.0047, -0.0194, -0.0125, 0.006, 0.0096, -0.2245, 0.0003, -0.196, 0.3383, 2.8936, 2.3354, 0.2631, 0.3949])

In [172]:
lrModel.intercept

-18.890890917184684

In [173]:
trainingSummary = lrModel.summary
type(trainingSummary)

pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary

In [174]:
trainingSummary.roc.show(120)

+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|                 0.0|0.005493499359091741|
|                 0.0| 0.01025453213697125|
|                 0.0|0.014466214978941586|
|                 0.0|0.018311664530305805|
|5.742506029631331E-5|0.021790880791063907|
|1.148501205926266...| 0.02527009705182201|
|1.148501205926266...| 0.02893242995788317|
|1.148501205926266...| 0.03259476286394433|
|1.148501205926266...| 0.03625709577000549|
|1.148501205926266...| 0.04010254532136971|
|1.148501205926266...| 0.04376487822743087|
|1.148501205926266...| 0.04761032777879509|
|1.148501205926266...| 0.05127266068485625|
|1.148501205926266...| 0.05511811023622047|
|1.148501205926266...| 0.05878044314228163|
|1.148501205926266...|0.062442776048342793|
|2.297002411852532...| 0.06573887566379784|
|2.871253014815665...| 0.06921809192455594|
|3.445503617778798...| 0.0726973

In [182]:
trainingSummary.roc.toPandas()

Unnamed: 0,FPR,TPR
0,0.000000,0.000000
1,0.000000,0.005493
2,0.000000,0.010255
3,0.000000,0.014466
4,0.000000,0.018312
...,...,...
1024,0.997014,1.000000
1025,0.998162,1.000000
1026,0.999311,1.000000
1027,1.000000,1.000000


In [176]:
trainingSummary.pr.show(120)

+--------------------+------------------+
|              recall|         precision|
+--------------------+------------------+
|                 0.0|               1.0|
|0.005493499359091741|               1.0|
| 0.01025453213697125|               1.0|
|0.014466214978941586|               1.0|
|0.018311664530305805|               1.0|
|0.021790880791063907|0.9916666666666667|
| 0.02527009705182201|0.9857142857142858|
| 0.02893242995788317|            0.9875|
| 0.03259476286394433|0.9888888888888889|
| 0.03625709577000549|              0.99|
| 0.04010254532136971|0.9909502262443439|
| 0.04376487822743087| 0.991701244813278|
| 0.04761032777879509|0.9923664122137404|
| 0.05127266068485625|0.9929078014184397|
| 0.05511811023622047|0.9933993399339934|
| 0.05878044314228163|0.9938080495356038|
|0.062442776048342793|0.9941690962099126|
| 0.06573887566379784|0.9889807162534435|
| 0.06921809192455594|0.9869451697127938|
| 0.07269730818531404|0.9851116625310173|
| 0.07617652444607215| 0.983451536

In [177]:
trainingSummary.areaUnderROC

0.9084036480991468

In [178]:
trainingSummary.accuracy

0.8521092896174863

In [179]:
trainingSummary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[3,15,24,36,4...|[7.46914970741281...|[0.99942991207685...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[7.27705351689928...|[0.99930925822207...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[7.29325073524101...|[0.99932034865393...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.64035056744107...|[0.99982320635919...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.54061890992579...|[0.99980466885499...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.18844256065098...|[0.99972223090094...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.02836240188971...|[0.99967402453184...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.02836240188971...|[0.99967402453184...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[7.38804176684461...|[0.99938177629371...|       0.0|
|  0.0|(99,[3,15

In [180]:
# predykcje
lrModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[3,15,24,36,4...|[7.52206892746420...|[0.99945928044343...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.18844256065098...|[0.99972223090094...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.12441049714647...|[0.99970386841916...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.02836240188971...|[0.99967402453184...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[7.38804176684461...|[0.99938177629371...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[-2.8936533533836...|[0.05246819369720...|       1.0|
|  0.0|(99,[3,15,24,36,4...|[7.54341595564847...|[0.99947069482440...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[6.83906325709886...|[0.99893003976008...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[6.83906325709886...|[0.99893003976008...|       0.0|
|  0.0|(99,[3,15

### SVM

In [183]:
svm = classification.LinearSVC(maxIter=100)

In [184]:
svmModel = svm.fit(df_train)

In [185]:
svmModel.coefficients

DenseVector([0.0334, -0.0591, -0.0041, -0.0333, -0.0179, 0.049, 0.0604, -0.0223, -0.1256, -0.0539, 0.1835, 0.1166, -0.0387, -0.0466, -0.0163, -0.0504, -0.042, 0.0774, -0.0102, -0.0225, 0.0664, 0.0133, 0.0104, 0.2382, -0.2641, -0.1016, -0.0563, -0.0451, -0.049, -0.0458, 0.1824, 0.1211, -0.0339, 0.0102, -0.1452, -0.0516, -0.0339, -0.0571, -0.0785, -0.1114, 0.0707, 0.0064, -0.1552, 0.3136, 0.0394, -0.0377, 0.0186, 0.2608, 0.0888, 0.0735, 0.0611, -0.0094, 0.1975, 0.0401, -0.0298, -0.0362, 0.0221, 0.0074, 0.0148, -0.0006, 0.0022, -0.0199, 0.0064, -0.013, -0.0681, 0.0059, -0.0126, -0.0226, -0.0507, 0.0091, 0.0128, -0.0644, -0.0068, 0.0044, 0.0026, 0.0041, 0.0058, -0.0221, -0.0004, 0.0095, -0.0198, -0.034, 0.0268, 0.0032, -0.0133, -0.0099, -0.0021, -0.009, -0.0169, 0.0024, -0.0021, -0.0174, -0.0011, 0.0046, 0.1437, 0.2451, 1.7857, 0.1961, 0.2022])

In [186]:
svmModel.intercept

-4.556880210922177

In [187]:
# predykcje
svmModel.transform(df_eval).show()

+-----+--------------------+--------------------+----------+
|label|            features|       rawPrediction|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|(99,[3,15,24,36,4...|[4.02634101387953...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[4.43370424351513...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[4.40091618696056...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[4.35173410212869...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[4.02385353658290...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[-3.9023068350281...|       1.0|
|  0.0|(99,[3,15,24,36,4...|[4.12965682107067...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[3.76898819897030...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[3.76898819897030...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[3.60504791619741...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[3.60504791619741...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[3.60504791619741...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[3.60504791619741...|       0.0|
|  0.0|(99,[3,15,24,36,4

### Drzewo decyzyjne

In [188]:
tree = classification.DecisionTreeClassifier()

In [189]:
treeModel = tree.fit(df_train)

In [190]:
treeModel.depth

5

In [191]:
treeModel.numNodes

35

In [192]:
print(treeModel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_886399278c18, depth=5, numNodes=35, numClasses=2, numFeatures=99
  If (feature 23 <= 1.0035900941414533)
   If (feature 96 <= 1.0012372970722383)
    If (feature 95 <= 4.873830618810881)
     If (feature 98 <= 3.6074818888073166)
      If (feature 43 <= 1.0192656067064028)
       Predict: 0.0
      Else (feature 43 > 1.0192656067064028)
       Predict: 1.0
     Else (feature 98 > 3.6074818888073166)
      Predict: 0.0
    Else (feature 95 > 4.873830618810881)
     Predict: 0.0
   Else (feature 96 > 1.0012372970722383)
    If (feature 94 <= 1.5689317394691287)
     Predict: 0.0
    Else (feature 94 > 1.5689317394691287)
     Predict: 1.0
  Else (feature 23 > 1.0035900941414533)
   If (feature 95 <= 4.873830618810881)
    If (feature 96 <= 1.0012372970722383)
     If (feature 95 <= 3.7041112702962695)
      If (feature 97 <= 4.405840064461252)
       Predict: 0.0
      Else (feature 97 > 4.405840064461252)
       Predict: 1.0
  

In [193]:
# predykcje
treeModel.transform(df_eval).show()

+-----+--------------------+--------------+--------------------+----------+
|label|            features| rawPrediction|         probability|prediction|
+-----+--------------------+--------------+--------------------+----------+
|  0.0|(99,[3,15,24,36,4...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[3,15,24,36,4...|     [3.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8135.0,119.0]|[0.98558274775866...|       0.0|
|  0.0|(99,[

### Las losowy

In [194]:
forest = classification.RandomForestClassifier()

In [195]:
forestModel = forest.fit(df_train)

In [196]:
forestModel.featureImportances

SparseVector(99, {0: 0.0002, 1: 0.0001, 2: 0.0002, 3: 0.0005, 4: 0.0001, 5: 0.002, 6: 0.0006, 8: 0.0111, 10: 0.0144, 11: 0.0309, 12: 0.0004, 14: 0.0001, 15: 0.001, 17: 0.0099, 20: 0.0062, 22: 0.0004, 23: 0.2683, 24: 0.0586, 25: 0.0041, 26: 0.003, 27: 0.0005, 29: 0.0013, 30: 0.0237, 31: 0.0213, 32: 0.0, 33: 0.0002, 34: 0.0086, 35: 0.0011, 36: 0.0005, 38: 0.0002, 39: 0.0013, 40: 0.0002, 41: 0.0, 43: 0.1619, 44: 0.0251, 45: 0.0069, 46: 0.0085, 47: 0.005, 48: 0.0001, 49: 0.0001, 50: 0.0001, 51: 0.0001, 52: 0.0061, 53: 0.0001, 54: 0.0022, 55: 0.0002, 57: 0.0001, 61: 0.0001, 63: 0.0003, 65: 0.0001, 66: 0.0001, 68: 0.0, 69: 0.0001, 75: 0.0, 77: 0.0001, 78: 0.0002, 79: 0.0001, 80: 0.0, 81: 0.0, 87: 0.0, 94: 0.0317, 95: 0.0901, 96: 0.1318, 97: 0.018, 98: 0.0398})

In [197]:
print(forestModel.toDebugString)

RandomForestClassificationModel: uid=RandomForestClassifier_009ec1af8ddf, numTrees=20, numClasses=2, numFeatures=99
  Tree 0 (weight 1.0):
    If (feature 31 <= 1.5160103368044824)
     If (feature 95 <= 4.873830618810881)
      If (feature 25 <= 1.4599784308832806)
       If (feature 34 <= 1.6508711694001317)
        If (feature 96 <= 0.42415776050710774)
         Predict: 0.0
        Else (feature 96 > 0.42415776050710774)
         Predict: 1.0
       Else (feature 34 > 1.6508711694001317)
        If (feature 79 <= 15.444127847290664)
         Predict: 0.0
        Else (feature 79 > 15.444127847290664)
         Predict: 1.0
      Else (feature 25 > 1.4599784308832806)
       If (feature 98 <= 3.4453478713328303)
        If (feature 96 <= 1.0012372970722383)
         Predict: 0.0
        Else (feature 96 > 1.0012372970722383)
         Predict: 1.0
       Else (feature 98 > 3.4453478713328303)
        Predict: 0.0
     Else (feature 95 > 4.873830618810881)
      If (feature 17 <= 3.881

In [198]:
# predykcje
forestModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[3,15,24,36,4...|[18.7199027359335...|[0.93599513679667...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[18.6663953604359...|[0.93331976802179...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[18.6663953604359...|[0.93331976802179...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[18.6663953604359...|[0.93331976802179...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[18.6663953604359...|[0.93331976802179...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[11.4623472227092...|[0.57311736113546...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[18.6663953604359...|[0.93331976802179...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[18.6663953604359...|[0.93331976802179...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[18.6663953604359...|[0.93331976802179...|       0.0|
|  0.0|(99,[3,15

### Gradient-Boosted Trees

In [199]:
gbt = classification.GBTClassifier()

In [200]:
gbtModel = gbt.fit(df_train)

In [201]:
gbtModel.featureImportances

SparseVector(99, {0: 0.0009, 1: 0.0178, 2: 0.0018, 3: 0.0007, 4: 0.0004, 5: 0.0044, 6: 0.0054, 8: 0.0079, 9: 0.0062, 10: 0.0035, 11: 0.0032, 12: 0.0019, 15: 0.0004, 16: 0.0003, 19: 0.0008, 20: 0.0006, 23: 0.2384, 24: 0.0004, 25: 0.0005, 26: 0.0014, 27: 0.0003, 29: 0.002, 30: 0.0478, 31: 0.011, 32: 0.0007, 33: 0.0024, 34: 0.0221, 37: 0.0025, 38: 0.0059, 39: 0.0216, 40: 0.0048, 41: 0.0001, 43: 0.0074, 44: 0.0021, 45: 0.0008, 47: 0.0118, 48: 0.0018, 49: 0.0004, 50: 0.0008, 51: 0.0017, 52: 0.0142, 53: 0.0069, 54: 0.0001, 55: 0.001, 56: 0.0016, 57: 0.001, 58: 0.001, 60: 0.0004, 62: 0.0004, 63: 0.0013, 64: 0.001, 66: 0.0007, 68: 0.0003, 69: 0.0003, 70: 0.0008, 73: 0.0003, 76: 0.0001, 79: 0.0007, 82: 0.0008, 83: 0.0004, 84: 0.0005, 88: 0.001, 94: 0.1433, 95: 0.1359, 96: 0.101, 97: 0.0601, 98: 0.0801})

In [202]:
print(gbtModel.toDebugString)

GBTClassificationModel: uid = GBTClassifier_7e1dee46fcd8, numTrees=20, numClasses=2, numFeatures=99
  Tree 0 (weight 1.0):
    If (feature 23 <= 1.0035900941414533)
     If (feature 96 <= 1.0012372970722383)
      If (feature 95 <= 4.873830618810881)
       If (feature 98 <= 3.6074818888073166)
        If (feature 43 <= 1.0192656067064028)
         Predict: -0.9711654955173249
        Else (feature 43 > 1.0192656067064028)
         Predict: 0.3333333333333333
       Else (feature 98 > 3.6074818888073166)
        If (feature 94 <= 3.028403125021806)
         Predict: -0.9086918349429324
        Else (feature 94 > 3.028403125021806)
         Predict: -0.7183770883054893
      Else (feature 95 > 4.873830618810881)
       If (feature 98 <= 3.5264148800700736)
        If (feature 94 <= 2.444614570800735)
         Predict: -0.9536019536019537
        Else (feature 94 > 2.444614570800735)
         Predict: -0.6885644768856448
       Else (feature 98 > 3.5264148800700736)
        If (feature 9

In [203]:
# predykcje
gbtModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[3,15,24,36,4...|[1.51096892197482...|[0.95355542317705...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[1.51096892197482...|[0.95355542317705...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[1.51096892197482...|[0.95355542317705...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[1.51096892197482...|[0.95355542317705...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[1.51096892197482...|[0.95355542317705...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[1.00267317803291...|[0.88135726925531...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[1.51096892197482...|[0.95355542317705...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[1.51096892197482...|[0.95355542317705...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[1.51096892197482...|[0.95355542317705...|       0.0|
|  0.0|(99,[3,15

### Naiwny Bayes

In [204]:
bayes = classification.NaiveBayes()

In [205]:
bayesModel = bayes.fit(df_train)

In [208]:
# predykcje
bayesModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[3,15,24,36,4...|[-130.22225462507...|[0.99999999999999...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[-117.69902941287...|[0.99999999999996...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[-118.08460833743...|[0.99999999999996...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[-118.66297672428...|[0.99999999999996...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[-122.51876596994...|[0.99999999999996...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[-154.62418932850...|[0.99999991987258...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[-123.28386080512...|[0.99999999999994...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[-127.52522897534...|[0.99999999999993...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[-127.52522897534...|[0.99999999999993...|       0.0|
|  0.0|(99,[3,15

### MLP (Multilayer Perceptron Classifier)

In [209]:
mlp = classification.MultilayerPerceptronClassifier(maxIter=100, layers=[99, 40, 2])

In [210]:
mlpModel = mlp.fit(df_train)

                                                                                

In [211]:
mlpModel.layers

Param(parent='MultilayerPerceptronClassifier_031cd4566c78', name='layers', doc='Sizes of layers from input layer to output layer E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 neurons and output layer of 10 neurons.')

In [212]:
mlpModel.weights

DenseVector([0.4554, 0.0799, -0.3209, -0.5527, -0.5395, 0.1238, 0.4076, 0.1991, 0.0017, 0.0417, -0.246, 0.0569, 0.0098, 0.0508, 0.1935, 0.2084, 0.0494, -0.2959, 0.3071, 0.2938, 0.2929, 0.5414, -0.1394, 0.0676, -0.5613, 0.0717, 0.0099, -0.0438, -0.0494, -0.5785, 0.2934, -0.5305, 0.7413, 0.3741, 0.3226, -0.2936, -0.0678, -0.2394, -0.1923, 0.2054, -0.0133, -0.5194, 0.2991, 0.5876, -0.0347, 0.2392, -0.4287, 0.5326, -0.1049, 0.081, 0.6194, 0.0582, -0.005, 0.0321, -0.4554, -0.2595, -0.0199, 0.1833, -0.2674, -0.1143, -0.2589, 0.3905, -0.119, 0.5428, 0.2706, 0.1393, 0.2098, 0.0832, -0.4661, 0.1729, -0.1038, 0.7462, 0.4569, -0.0656, -0.1719, 0.305, 0.5263, 0.1837, 0.248, -0.2002, -0.2307, 0.1757, 0.0525, 0.3714, 0.3729, 0.2492, 0.0255, 0.0313, 0.4708, -0.2347, 0.4038, -0.2095, 0.1342, -0.1592, -0.5457, 0.1209, -0.0947, 0.1603, 0.1936, -0.23, 0.2671, -0.3586, 0.3603, -0.1111, -0.0873, 0.1819, -0.0504, 0.77, -0.1483, 0.6284, -0.2005, 0.4716, 0.1073, -0.2856, -0.0123, -0.0717, 0.2, 0.1138, 0.3074,

In [213]:
# predykcje
mlpModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[3,15,24,36,4...|[8.33850320116604...|[0.99999987502801...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[10.1364571223475...|[0.99999999673114...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[10.0640369548416...|[0.99999999619331...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[9.94242214528446...|[0.99999999509148...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.42715713531269...|[0.99999989949395...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[0.36308311511216...|[0.73634239094714...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[10.0237207801741...|[0.99999999567692...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.58350522878193...|[0.99999992826007...|       0.0|
|  0.0|(99,[3,15,24,36,4...|[8.58350522878193...|[0.99999992826007...|       0.0|
|  0.0|(99,[3,15

## Klasyfikacja - ewaluacja

In [214]:
from pyspark.ml import evaluation

In [215]:
evaluator = evaluation.BinaryClassificationEvaluator()  # metric_name='areuUnderROC' lub metric_name='areaUnderPR'

In [216]:
# AUC - regresja
evaluator.evaluate(lrModel.transform(df_eval))

0.9064497178559369

In [217]:
# AUC - SVM
evaluator.evaluate(svmModel.transform(df_eval))

0.9041854628519912

In [218]:
# AUC - drzewo decyzyjne
evaluator.evaluate(treeModel.transform(df_eval))

0.7600720140232385

In [219]:
# AUC - las losowy
evaluator.evaluate(forestModel.transform(df_eval))

0.8862898458041858

In [220]:
# AUC - gbt
evaluator.evaluate(gbtModel.transform(df_eval))

0.9094762679229919

In [221]:
# AUC - NB
evaluator.evaluate(bayesModel.transform(df_eval))

0.5523512963904423

In [222]:
# AUC - MLP
evaluator.evaluate(mlpModel.transform(df_eval))

0.8964583040990849

> **PRZYKŁAD:**
- napisz funkcję do obliczania `accuracy`
- oblicz `accuracy` powyższych modeli

In [223]:
def calculate_acc(df, label="label", prediction="prediction"):
    
    temp = df.select(f.when(df[label] == df[prediction], 1).otherwise(0).alias("same"))
    
    return temp.select(f.avg("same")).collect()[0][0]

In [224]:
calculate_acc(lrModel.transform(df_eval))

0.8526739624199876

In [225]:
calculate_acc(svmModel.transform(df_eval))

0.8512285773281024

In [226]:
calculate_acc(treeModel.transform(df_eval))

0.840491430931241

In [227]:
calculate_acc(forestModel.transform(df_eval))

0.829444559157547

In [228]:
calculate_acc(gbtModel.transform(df_eval))

0.8540161057195953

In [229]:
calculate_acc(bayesModel.transform(df_eval))

0.7901094362998141

In [230]:
calculate_acc(mlpModel.transform(df_eval))

0.8476151145983895

### **ZADANIE 3**
- popraw `accuracy` dwóch modeli

In [231]:
# model 1 - przed optymalizacją

tree = classification.DecisionTreeClassifier()
treeModel = tree.fit(df_train)

print(f'AUC: {evaluator.evaluate(treeModel.transform(df_eval))}')
print(f'acc: {calculate_acc(treeModel.transform(df_eval))}')

AUC: 0.7600720140232385
acc: 0.840491430931241


In [238]:
# model 1 - po optymalizacji

tree = classification.DecisionTreeClassifier(maxDepth=10, maxBins=100)
treeModel = tree.fit(df_train)

print(f'AUC: {evaluator.evaluate(treeModel.transform(df_eval))}')
print(f'acc: {calculate_acc(treeModel.transform(df_eval))}')

AUC: 0.790780054151417
acc: 0.8547387982655379


In [239]:
# model 2 - przed optymalizacją

bayes = classification.NaiveBayes()
bayesModel = bayes.fit(df_train)

print(f'AUC: {evaluator.evaluate(bayesModel.transform(df_eval))}')
print(f'acc: {calculate_acc(bayesModel.transform(df_eval))}')

AUC: 0.5523512963904423
acc: 0.7901094362998141


In [250]:
# model 2 - po optymalizacji

bayes = classification.NaiveBayes(modelType='gaussian')
bayesModel = bayes.fit(df_train)

print(f'AUC: {evaluator.evaluate(bayesModel.transform(df_eval))}')
print(f'acc: {calculate_acc(bayesModel.transform(df_eval))}')

AUC: 0.7577921738090253
acc: 0.4894693371876936


## Regresja - przygotowanie danych

In [251]:
from pyspark.ml import regression

https://archive.ics.uci.edu/ml/datasets/wine+quality

In [254]:
wine_red = spark.read.csv("./winequality-red.csv", header=True, inferSchema=True, sep=";") \
.withColumn("type", f.lit(0))

wine_white = spark.read.csv("./winequality-white.csv", header=True, inferSchema=True, sep=";") \
.withColumn("type", f.lit(1))

In [297]:
wine = wine_red.union(wine_white)

In [298]:
cols = [col.replace(" ", "_") for col in wine.columns]
cols

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality',
 'type']

In [299]:
wine = wine.toDF(*cols)
wine.printSchema()

root
 |-- fixed_acidity: double (nullable = true)
 |-- volatile_acidity: double (nullable = true)
 |-- citric_acid: double (nullable = true)
 |-- residual_sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free_sulfur_dioxide: double (nullable = true)
 |-- total_sulfur_dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)
 |-- type: integer (nullable = false)



### **ZADANIE 4:**

1. usuń wiersze zawierające braki danych
2. zmień nazwę kolumny `quality` na `label`
3. podziel `wine` na zbiór treningowy i ewaluacyjny
4. z pozostałych zmiennych stwórz (przeskalowaną) kolumnę `features` zawierającą wektory
5. wynikowym DFom nadaj nazwę `wine_train` i `wine_eval`

In [300]:
# 1
wine = wine.dropna()
wine.count()

6497

In [301]:
# 2
wine = wine.withColumnRenamed("quality", "label")
wine.toPandas().head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,label,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [302]:
# 3
wine_train, wine_test = wine.randomSplit([0.7, 0.3], 42)

In [305]:
# 4
features_col = wine.columns
features_col.remove('label')
print(features_col)

['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'type']


In [306]:
#przygotowanie estymatorów/transformerów

vectAssembler = feature.VectorAssembler(inputCols=features_col, outputCol="featuresFull")
scaler = feature.StandardScaler(inputCol="featuresFull", outputCol="features")

pipeline = Pipeline(stages=[vectAssembler, scaler])

In [307]:
# 5
pipe_model = pipeline.fit(wine_train)

wine_train = pipe_model.transform(wine_train)
wine_eval = pipe_model.transform(wine_test)

**Ostatnie przygotowania**

In [308]:
wine_train = wine_train.select("label", "features")
wine_eval = wine_eval.select("label", "features")

wine_train.toPandas()

Unnamed: 0,label,features
0,4,"[3.567142004687321, 3.13945111998625, 1.036114..."
1,6,"[3.6446885700066107, 3.6224435999841345, 1.174..."
2,6,"[3.877328265964479, 2.294214279989952, 0.06907..."
3,6,"[3.877328265964479, 2.4149623999894234, 3.4537..."
4,8,"[3.877328265964479, 2.535710519988894, 1.65778..."
...,...,...
4626,5,"[7.987296227886827, 1.5093514999933895, 3.3155..."
4627,6,"[8.297482489163984, 1.3282293199941828, 3.8681..."
4628,6,"[8.297482489163984, 1.3282293199941828, 3.8681..."
4629,3,"[9.150494707676172, 1.3886033799939184, 2.6248..."


In [309]:
wine_train.cache()
wine_eval.cache()

DataFrame[label: int, features: vector]

In [310]:
print("Train:")
wine_train.describe("label").show()

print("Eval:")
wine_eval.describe("label").show()

Train:
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              4631|
|   mean|5.8168862016843015|
| stddev|0.8791709597478697|
|    min|                 3|
|    max|                 9|
+-------+------------------+

Eval:
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              1866|
|   mean| 5.822079314040729|
| stddev|0.8586198577914707|
|    min|                 3|
|    max|                 9|
+-------+------------------+



## Regresja - różne modele

### Regresja liniowa

In [311]:
reg = regression.LinearRegression(maxIter=500)

In [312]:
regModel = reg.fit(wine_train)

23/08/13 14:20:47 WARN Instrumentation: [67d4641a] regParam is zero, which might cause numerical instability and overfitting.


In [313]:
regModel.coefficients

DenseVector([0.1008, -0.2373, -0.0113, 0.2762, -0.0232, 0.0946, -0.0873, -0.2824, 0.0662, 0.1057, 0.2789, -0.1337])

In [314]:
regModel.intercept

94.4248010100462

In [315]:
trainSummary = regModel.summary
type(trainSummary)

pyspark.ml.regression.LinearRegressionTrainingSummary

In [316]:
trainSummary.meanAbsoluteError

0.5735071995096598

In [317]:
trainSummary.meanSquaredError

0.5486947582555972

In [318]:
trainSummary.r2

0.2899679826341164

In [319]:
# predykcje
regModel.transform(wine_eval).show()

+-----+--------------------+-----------------+
|label|            features|       prediction|
+-----+--------------------+-----------------+
|    7|[3.79978170064518...|7.088596128745706|
|    6|[3.87732826596447...| 6.15688443368731|
|    5|[3.87732826596447...|5.333178584375986|
|    7|[3.95487483128376...|6.914388932219808|
|    5|[4.03242139660305...|5.379341371926159|
|    6|[4.03242139660305...|7.250840950574997|
|    6|[4.03242139660305...|7.250840950574997|
|    7|[4.10996796192234...|6.874580727314779|
|    7|[4.10996796192234...|6.332254543381978|
|    7|[4.18751452724163...|6.355781707478641|
|    6|[4.18751452724163...|5.561944831913337|
|    8|[4.26506109256092...|7.051003659599701|
|    5|[4.34260765788021...|5.360276610654935|
|    6|[4.34260765788021...|6.242332258147528|
|    5|[4.34260765788021...|6.405827801454379|
|    5|[4.34260765788021...|5.969597420455628|
|    5|[4.34260765788021...|6.368553108958139|
|    7|[4.34260765788021...|6.193197654055908|
|    5|[4.342

### Drzewo regresyjne

In [320]:
tree_reg = regression.DecisionTreeRegressor()

In [321]:
tree_regModel = tree_reg.fit(wine_train)

In [322]:
print(tree_regModel.toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_a8fd50d6a493, depth=5, numNodes=63, numFeatures=12
  If (feature 10 <= 9.113586134636481)
   If (feature 1 <= 1.5244450149933235)
    If (feature 1 <= 1.2527617449945132)
     If (feature 7 <= 329.8139994767736)
      If (feature 0 <= 6.552684769479971)
       Predict: 5.988207547169812
      Else (feature 0 > 6.552684769479971)
       Predict: 5.321428571428571
     Else (feature 7 > 329.8139994767736)
      If (feature 2 <= 2.1067668075847323)
       Predict: 7.085106382978723
      Else (feature 2 > 2.1067668075847323)
       Predict: 6.0625
    Else (feature 1 > 1.2527617449945132)
     If (feature 9 <= 3.0223311418752203)
      If (feature 10 <= 7.3882660114381355)
       Predict: 5.2
      Else (feature 10 > 7.3882660114381355)
       Predict: 5.565217391304348
     Else (feature 9 > 3.0223311418752203)
      If (feature 10 <= 8.710083847759474)
       Predict: 5.7406143344709895
      Else (feature 10 > 8.710083847759474)
  

In [323]:
# predykcje
tree_regModel.transform(wine_eval).show()

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|    7|[3.79978170064518...| 6.794520547945205|
|    6|[3.87732826596447...| 5.605263157894737|
|    5|[3.87732826596447...| 6.357142857142857|
|    7|[3.95487483128376...| 6.794520547945205|
|    5|[4.03242139660305...|5.3056768558951966|
|    6|[4.03242139660305...|6.8428571428571425|
|    6|[4.03242139660305...|6.8428571428571425|
|    7|[4.10996796192234...| 6.794520547945205|
|    7|[4.10996796192234...| 6.794520547945205|
|    7|[4.18751452724163...|6.8428571428571425|
|    6|[4.18751452724163...| 5.477611940298507|
|    8|[4.26506109256092...|6.8428571428571425|
|    5|[4.34260765788021...|5.3056768558951966|
|    6|[4.34260765788021...|  5.59070796460177|
|    5|[4.34260765788021...| 5.605263157894737|
|    5|[4.34260765788021...| 5.066666666666666|
|    5|[4.34260765788021...| 5.605263157894737|
|    7|[4.34260765788021...| 5.605263157

### Las regresyjny

In [324]:
forest_reg = regression.RandomForestRegressor()

In [325]:
forest_regModel = forest_reg.fit(wine_train)

In [326]:
forest_regModel.featureImportances

SparseVector(12, {0: 0.0162, 1: 0.1326, 2: 0.0747, 3: 0.0244, 4: 0.041, 5: 0.0711, 6: 0.029, 7: 0.1845, 8: 0.0208, 9: 0.0379, 10: 0.3644, 11: 0.0035})

In [327]:
print(forest_regModel.toDebugString)

RandomForestRegressionModel: uid=RandomForestRegressor_51d3bf614acb, numTrees=20, numFeatures=12
  Tree 0 (weight 1.0):
    If (feature 4 <= 1.1167202647811711)
     If (feature 10 <= 9.113586134636481)
      If (feature 5 <= 0.9148754674724742)
       If (feature 10 <= 8.111787353424543)
        If (feature 8 <= 18.28583927012722)
         Predict: 3.0
        Else (feature 8 > 18.28583927012722)
         Predict: 5.3
       Else (feature 10 > 8.111787353424543)
        If (feature 7 <= 328.009344053748)
         Predict: 5.769230769230769
        Else (feature 7 > 328.009344053748)
         Predict: 5.148936170212766
      Else (feature 5 > 0.9148754674724742)
       If (feature 10 <= 8.306581560882423)
        If (feature 1 <= 2.0376245249910756)
         Predict: 5.809160305343512
        Else (feature 1 > 2.0376245249910756)
         Predict: 5.107142857142857
       Else (feature 10 > 8.306581560882423)
        If (feature 2 <= 0.10361148234023275)
         Predict: 4.0
        E

In [328]:
# predykcje
forest_regModel.transform(wine_eval).show()

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|    7|[3.79978170064518...| 6.584500398216221|
|    6|[3.87732826596447...| 5.950576126991017|
|    5|[3.87732826596447...| 5.882790160553404|
|    7|[3.95487483128376...| 6.556831526779023|
|    5|[4.03242139660305...| 5.269292408362675|
|    6|[4.03242139660305...| 6.537117540160466|
|    6|[4.03242139660305...| 6.537117540160466|
|    7|[4.10996796192234...|  6.51070240501822|
|    7|[4.10996796192234...| 5.876062314384201|
|    7|[4.18751452724163...|  6.26620611472278|
|    6|[4.18751452724163...| 5.601050485098492|
|    8|[4.26506109256092...| 6.577973448723993|
|    5|[4.34260765788021...|5.4178974575959735|
|    6|[4.34260765788021...| 5.655227534556642|
|    5|[4.34260765788021...| 6.026719908638333|
|    5|[4.34260765788021...|  5.52844747427398|
|    5|[4.34260765788021...|5.9635899289825405|
|    7|[4.34260765788021...| 5.886848121

### Gradient-Boosted Trees regression

In [329]:
gbt_reg = regression.GBTRegressor()

In [330]:
gbt_regModel = gbt_reg.fit(wine_train)

In [331]:
gbt_regModel.featureImportances

SparseVector(12, {0: 0.0813, 1: 0.1014, 2: 0.0625, 3: 0.0714, 4: 0.0766, 5: 0.0978, 6: 0.1021, 7: 0.0487, 8: 0.0867, 9: 0.1064, 10: 0.1651})

In [332]:
print(gbt_regModel.toDebugString)

GBTRegressionModel: uid=GBTRegressor_7ec4f311bfc8, numTrees=20, numFeatures=12
  Tree 0 (weight 1.0):
    If (feature 10 <= 9.113586134636481)
     If (feature 1 <= 1.5244450149933235)
      If (feature 1 <= 1.2527617449945132)
       If (feature 7 <= 329.8139994767736)
        If (feature 0 <= 6.552684769479971)
         Predict: 5.988207547169812
        Else (feature 0 > 6.552684769479971)
         Predict: 5.321428571428571
       Else (feature 7 > 329.8139994767736)
        If (feature 2 <= 2.1067668075847323)
         Predict: 7.085106382978723
        Else (feature 2 > 2.1067668075847323)
         Predict: 6.0625
      Else (feature 1 > 1.2527617449945132)
       If (feature 9 <= 3.0223311418752203)
        If (feature 10 <= 7.3882660114381355)
         Predict: 5.2
        Else (feature 10 > 7.3882660114381355)
         Predict: 5.565217391304348
       Else (feature 9 > 3.0223311418752203)
        If (feature 10 <= 8.710083847759474)
         Predict: 5.7406143344709895
      

In [333]:
# predykcje
gbt_regModel.transform(wine_eval).show()

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|    7|[3.79978170064518...| 6.990817372018714|
|    6|[3.87732826596447...| 6.083849171585131|
|    5|[3.87732826596447...| 5.980633936018512|
|    7|[3.95487483128376...| 6.985277089162348|
|    5|[4.03242139660305...| 5.201648455120008|
|    6|[4.03242139660305...| 6.896515093351612|
|    6|[4.03242139660305...| 6.896515093351612|
|    7|[4.10996796192234...| 6.953975621909426|
|    7|[4.10996796192234...| 6.927208695053671|
|    7|[4.18751452724163...| 6.774395135531267|
|    6|[4.18751452724163...| 5.458805532864422|
|    8|[4.26506109256092...| 7.064131040385137|
|    5|[4.34260765788021...| 5.032879331609377|
|    6|[4.34260765788021...|5.4731954147848985|
|    5|[4.34260765788021...| 5.675287677020733|
|    5|[4.34260765788021...|  5.04726130604063|
|    5|[4.34260765788021...|  5.73451423476066|
|    7|[4.34260765788021...|  5.84389719

## Regresja - ewaluacja

**RMSE (root mean-squared error)**

In [334]:
evaluator_reg = evaluation.RegressionEvaluator()

In [335]:
# rmse - regresja
evaluator_reg.evaluate(regModel.transform(wine_eval))

0.7116825029693381

In [336]:
# rmse - drzewo
evaluator_reg.evaluate(tree_regModel.transform(wine_eval))

0.7188322423963592

In [337]:
# rmse - las
evaluator_reg.evaluate(forest_regModel.transform(wine_eval))

0.7053457505731172

In [338]:
# rmse - gbt
evaluator_reg.evaluate(gbt_regModel.transform(wine_eval))

0.6874828494648342

**MSE (mean-squared error)**

In [339]:
er = evaluation.RegressionEvaluator(metricName="mse")

In [340]:
er.evaluate(regModel.transform(wine_eval))

0.5064919850327019

In [341]:
er.evaluate(tree_regModel.transform(wine_eval))

0.5167197927085782

In [342]:
er.evaluate(forest_regModel.transform(wine_eval))

0.49751262785155403

In [343]:
er.evaluate(gbt_regModel.transform(wine_eval))

0.4726326683082878

**R^2 (determination coefficient)**

In [344]:
er2 = evaluation.RegressionEvaluator(metricName="r2")

In [345]:
er2.evaluate(regModel.transform(wine_eval))

0.31260950326372294

In [346]:
er2.evaluate(tree_regModel.transform(wine_eval))

0.2987287351437897

In [347]:
er2.evaluate(forest_regModel.transform(wine_eval))

0.324795924718592

In [348]:
er2.evaluate(gbt_regModel.transform(wine_eval))

0.3585620024742351

### **ZADANIE 5:**
- popraw `R^2` jednego modelu

In [350]:
# model przed optymalizacją

tree_reg = regression.DecisionTreeRegressor()
tree_regModel = tree_reg.fit(wine_train)

print(f'MSE: {er.evaluate(tree_regModel.transform(wine_eval))}')
print(f'R^2: {er2.evaluate(tree_regModel.transform(wine_eval))}')

MSE: 0.5167197927085782
R^2: 0.2987287351437897


In [392]:
# model po optymalizacji

tree_reg = regression.DecisionTreeRegressor(maxDepth=5, minInstancesPerNode=10, minInfoGain=0.01)
tree_regModel = tree_reg.fit(wine_train)

print(f'MSE: {er.evaluate(tree_regModel.transform(wine_eval))}')
print(f'R^2: {er2.evaluate(tree_regModel.transform(wine_eval))}')

MSE: 0.5149273162422765
R^2: 0.30116141191848655


### Wybór najlepszych parametrów

In [393]:
from pyspark.ml import tuning

In [394]:
reg2 = regression.LinearRegression()

In [395]:
grid = tuning.ParamGridBuilder() \
    .addGrid(reg2.maxIter, [100, 500, 1000]) \
    .addGrid(reg2.regParam, [0.0, 0.1, 0.2]) \
    .build()

In [396]:
reg_eval = evaluation.RegressionEvaluator(metricName='r2')

In [397]:
cv = tuning.CrossValidator(estimator=reg2, estimatorParamMaps=grid, evaluator=reg_eval, parallelism=2)

In [398]:
cvModel = cv.fit(wine_train)

23/08/13 15:17:42 WARN Instrumentation: [54be044d] regParam is zero, which might cause numerical instability and overfitting.
23/08/13 15:17:42 WARN BlockManager: Block rdd_7307_0 already exists on this machine; not re-adding it
23/08/13 15:17:43 WARN Instrumentation: [93afa42d] regParam is zero, which might cause numerical instability and overfitting.
23/08/13 15:17:43 WARN Instrumentation: [79105d65] regParam is zero, which might cause numerical instability and overfitting.
23/08/13 15:17:44 WARN Instrumentation: [04da7c13] regParam is zero, which might cause numerical instability and overfitting.
23/08/13 15:17:44 WARN Instrumentation: [9df377b0] regParam is zero, which might cause numerical instability and overfitting.
23/08/13 15:17:44 WARN Instrumentation: [6a0d4da2] regParam is zero, which might cause numerical instability and overfitting.
23/08/13 15:17:45 WARN Instrumentation: [a3e271e7] regParam is zero, which might cause numerical instability and overfitting.
23/08/13 15:17:

In [399]:
cvModel.avgMetrics

[0.28393382951825347,
 0.2789748315802018,
 0.2698947573689127,
 0.28393382951825347,
 0.2789748315802018,
 0.2698947573689127,
 0.28393382951825347,
 0.2789748315802018,
 0.2698947573689127]

**Parametry najlepszego modelu**

In [400]:
cvModel.bestModel._java_obj.getMaxIter()

100

In [401]:
cvModel.bestModel._java_obj.getRegParam()

0.0

**Ewaluacja na danych testowych - najlepszy model**

In [402]:
reg_eval.evaluate(cvModel.transform(wine_eval))

0.31260950326372294

**Zapisanie i wczytanie modelu**

In [403]:
cvModel.save("model")

                                                                                

In [404]:
readInModel = tuning.CrossValidatorModel.load("model")

In [405]:
reg_eval.evaluate(readInModel.transform(wine_eval))

0.31260950326372294

## Zadanie 6



1. Za pomocą `VectorAssembler` połącz kolumny `cylinders`, `displacement`, `horsepower`, `weight`, `acceleration` w jeden wektor
2. Podziel dane na treningowe i testowe (`label=mpg`)
3. Zastosuj `StandardScaler` na danych treningowych
4. Wytrenuj `RandomForestRegressor`
5. Zastosuj powyższe algorytmy na danych testowych
6. Za pomocą `RegressionEvaluator` oblicz `R^2` i `RMSE`
7. Za pomocą `ParamGridBuilder` i `CrossValidator` znajdź najlepszy model


In [469]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

auto_df_fixed = spark.read.format("csv").option("header", True).load("./auto-mpg_fixed.csv")
auto_df_fixed.show(5, truncate=False)

auto_df_fixed.printSchema()

+----+---------+------------+----------+------+------------+---------+------+-------------------------+
|mpg |cylinders|displacement|horsepower|weight|acceleration|modelyear|origin|carname                  |
+----+---------+------------+----------+------+------------+---------+------+-------------------------+
|18.0|8        |307.0       |130.0     |3504.0|12.0        |70       |1     |chevrolet chevelle malibu|
|15.0|8        |350.0       |165.0     |3693.0|11.5        |70       |1     |buick skylark 320        |
|18.0|8        |318.0       |150.0     |3436.0|11.0        |70       |1     |plymouth satellite       |
|16.0|8        |304.0       |150.0     |3433.0|12.0        |70       |1     |amc rebel sst            |
|17.0|8        |302.0       |140.0     |3449.0|10.5        |70       |1     |ford torino              |
+----+---------+------------+----------+------+------------+---------+------+-------------------------+
only showing top 5 rows

root
 |-- mpg: string (nullable = true)

In [470]:
for column_name in ("mpg displacement horsepower weight acceleration".split()):
    auto_df_fixed = auto_df_fixed.withColumn(column_name, col(column_name).cast("double"))
    
for column_name in ("cylinders modelyear origin".split()):
    auto_df_fixed = auto_df_fixed.withColumn(column_name, col(column_name).cast("int"))
    
auto_df_fixed.show(5)

auto_df_fixed.printSchema()

+----+---------+------------+----------+------+------------+---------+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|modelyear|origin|             carname|
+----+---------+------------+----------+------+------------+---------+------+--------------------+
|18.0|        8|       307.0|     130.0|3504.0|        12.0|       70|     1|chevrolet chevell...|
|15.0|        8|       350.0|     165.0|3693.0|        11.5|       70|     1|   buick skylark 320|
|18.0|        8|       318.0|     150.0|3436.0|        11.0|       70|     1|  plymouth satellite|
|16.0|        8|       304.0|     150.0|3433.0|        12.0|       70|     1|       amc rebel sst|
|17.0|        8|       302.0|     140.0|3449.0|        10.5|       70|     1|         ford torino|
+----+---------+------------+----------+------+------------+---------+------+--------------------+
only showing top 5 rows

root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |

In [471]:
# 1
features_col = "mpg displacement horsepower weight acceleration cylinders modelyear origin".split()
vectAssembler = VectorAssembler(inputCols=features_col, outputCol="featuresFull", handleInvalid="skip")

auto_df_fixed = vectAssembler.transform(auto_df_fixed)
auto_df_fixed.show()

+----+---------+------------+----------+------+------------+---------+------+--------------------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|modelyear|origin|             carname|        featuresFull|
+----+---------+------------+----------+------+------------+---------+------+--------------------+--------------------+
|18.0|        8|       307.0|     130.0|3504.0|        12.0|       70|     1|chevrolet chevell...|[18.0,307.0,130.0...|
|15.0|        8|       350.0|     165.0|3693.0|        11.5|       70|     1|   buick skylark 320|[15.0,350.0,165.0...|
|18.0|        8|       318.0|     150.0|3436.0|        11.0|       70|     1|  plymouth satellite|[18.0,318.0,150.0...|
|16.0|        8|       304.0|     150.0|3433.0|        12.0|       70|     1|       amc rebel sst|[16.0,304.0,150.0...|
|17.0|        8|       302.0|     140.0|3449.0|        10.5|       70|     1|         ford torino|[17.0,302.0,140.0...|
|15.0|        8|       429.0|     198.0|

In [472]:
# 2
auto_df_fixed = auto_df_fixed.withColumnRenamed("mpg", "label")
auto_train, auto_eval = auto_df_fixed.randomSplit([0.7, 0.3], 42)

auto_train.show()

+-----+---------+------------+----------+------+------------+---------+------+--------------------+--------------------+
|label|cylinders|displacement|horsepower|weight|acceleration|modelyear|origin|             carname|        featuresFull|
+-----+---------+------------+----------+------+------------+---------+------+--------------------+--------------------+
|  9.0|        8|       304.0|     193.0|4732.0|        18.5|       70|     1|            hi 1200d|[9.0,304.0,193.0,...|
| 10.0|        8|       307.0|     200.0|4376.0|        15.0|       70|     1|       chevrolet c20|[10.0,307.0,200.0...|
| 11.0|        8|       318.0|     210.0|4382.0|        13.5|       70|     1|          dodge d200|[11.0,318.0,210.0...|
| 11.0|        8|       350.0|     180.0|3664.0|        11.0|       73|     1|    oldsmobile omega|[11.0,350.0,180.0...|
| 11.0|        8|       400.0|     150.0|4997.0|        14.0|       73|     1|    chevrolet impala|[11.0,400.0,150.0...|
| 12.0|        8|       350.0|  

In [473]:
# 3 i 5
scaler = StandardScaler(inputCol="featuresFull", outputCol="features")

scalerModel = scaler.fit(auto_train)

auto_train = scalerModel.transform(auto_train)
auto_eval = scalerModel.transform(auto_eval)

auto_train = auto_train.select("label", "features")
auto_eval = auto_eval.select("label", "features")

In [474]:
# 4 i 5
forest_reg = regression.RandomForestRegressor(maxDepth=8)

forest_regModel = forest_reg.fit(auto_train)
predictions = forest_regModel.transform(auto_eval)

predictions.show(5)

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
| 10.0|[1.29492403688120...|11.030357142857143|
| 11.0|[1.42441644056932...| 11.96438492063492|
| 12.0|[1.55390884425744...|11.985218253968254|
| 12.0|[1.55390884425744...|12.222718253968255|
| 13.0|[1.68340124794556...|14.182333333333332|
+-----+--------------------+------------------+
only showing top 5 rows



In [476]:
# 6
er = evaluation.RegressionEvaluator(metricName='rmse')
er2 = evaluation.RegressionEvaluator(metricName="r2")

print(f'RMSE: {er.evaluate(predictions)}')
print(f'R^2: {er2.evaluate(predictions)}')

RMSE: 0.859012033225024
R^2: 0.9879090290193802


In [485]:
# 7
grid = tuning.ParamGridBuilder() \
    .addGrid(forest_reg.maxDepth, [5, 10, 15, 20, 25]) \
    .build()

cv = tuning.CrossValidator(estimator=forest_reg, estimatorParamMaps=grid, evaluator=er, parallelism=2)
cvModel = cv.fit(auto_train)

cvModel.bestModel._java_obj.getMaxDepth()

15

In [486]:
print(f'RMSE: {er.evaluate(cvModel.transform(auto_eval))}')
print(f'R^2: {er2.evaluate(cvModel.transform(auto_eval))}')

RMSE: 0.8481621536341377
R^2: 0.9882125337311262
