![](imgs/kodolamaczlogo.png)

# Przetwarzanie Big Data z użyciem Apache Spark

Autor notebooka: Jakub Nowacki.

## Uczenie Maszynowe (Machine Learning) na Spark

Spark ML zawiera wiele algorytmów uczenia maszynowego, które się dobrze działają w sposób rozproszony i się skalują, w tym:

* regresja liniowa,
* regresja logistyczna,
* algorytm random forest,
* algorytm centroidów (k-means).

To API wykorzystuje DataFrames bezpośrednio, więc jest nieco łatwiejsze w użyciu. 

In [1]:
import pyspark
import pyspark.sql.functions as func

spark = pyspark.sql.SparkSession.builder \
    .appName("SparkML") \
    .getOrCreate()
sc = spark.sparkContext

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [3]:
# Ładujemy dane z pliku CSV.
# Szczegółowy opis danych jest dostępny w pliku README
rdd = sc.textFile("data/Bike-Sharing-Dataset/day.csv")

In [4]:
rdd.first()

'instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt'

In [5]:
def from_csv(line):
    cols = line.split(",")
    d = dict()
    d['yr'] = float(cols[3])
    d['workingday'] = float(cols[7])
    d['weathersit'] = float(cols[8])
    d['temp'] = float(cols[9])
    d['hum'] = float(cols[11])
    d['windspeed'] = float(cols[12])
    d['casual'] = float(cols[13])   
    return pyspark.Row(**d)

In [6]:
rows = rdd \
  .filter(lambda line: not line.startswith("instant")) \
  .map(lambda line: from_csv(line))

In [7]:
rows.take(5)

[Row(casual=331.0, hum=0.805833, temp=0.344167, weathersit=2.0, windspeed=0.160446, workingday=0.0, yr=0.0),
 Row(casual=131.0, hum=0.696087, temp=0.363478, weathersit=2.0, windspeed=0.248539, workingday=0.0, yr=0.0),
 Row(casual=120.0, hum=0.437273, temp=0.196364, weathersit=1.0, windspeed=0.248309, workingday=1.0, yr=0.0),
 Row(casual=108.0, hum=0.590435, temp=0.2, weathersit=1.0, windspeed=0.160296, workingday=1.0, yr=0.0),
 Row(casual=82.0, hum=0.436957, temp=0.226957, weathersit=1.0, windspeed=0.1869, workingday=1.0, yr=0.0)]

In [24]:
train_df = spark.createDataFrame(rows)\
    .withColumn('log_casual', func.log10('casual'))
train_df.show()

+------+--------+--------+----------+---------+----------+---+------------------+
|casual|     hum|    temp|weathersit|windspeed|workingday| yr|        log_casual|
+------+--------+--------+----------+---------+----------+---+------------------+
| 331.0|0.805833|0.344167|       2.0| 0.160446|       0.0|0.0| 2.519827993775719|
| 131.0|0.696087|0.363478|       2.0| 0.248539|       0.0|0.0|2.1172712956557644|
| 120.0|0.437273|0.196364|       1.0| 0.248309|       1.0|0.0|2.0791812460476247|
| 108.0|0.590435|     0.2|       1.0| 0.160296|       1.0|0.0|  2.03342375548695|
|  82.0|0.436957|0.226957|       1.0|   0.1869|       1.0|0.0|1.9138138523837167|
|  88.0|0.518261|0.204348|       1.0|0.0895652|       1.0|0.0|1.9444826721501687|
| 148.0|0.498696|0.196522|       2.0| 0.168726|       1.0|0.0|2.1702617153949575|
|  68.0|0.535833|   0.165|       2.0| 0.266804|       0.0|0.0|1.8325089127062364|
|  54.0|0.434167|0.138333|       1.0|  0.36195|       0.0|0.0|1.7323937598229686|
|  41.0|0.482917

In [9]:
VectorAssembler?

In [25]:
va = VectorAssembler(inputCols=['yr', 'workingday', 'weathersit', 'temp', 'hum', 'windspeed'], outputCol='features')
t = va.transform(train_df)
t.printSchema()
t.show()

root
 |-- casual: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- weathersit: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- workingday: double (nullable = true)
 |-- yr: double (nullable = true)
 |-- log_casual: double (nullable = true)
 |-- features: vector (nullable = true)

+------+--------+--------+----------+---------+----------+---+------------------+--------------------+
|casual|     hum|    temp|weathersit|windspeed|workingday| yr|        log_casual|            features|
+------+--------+--------+----------+---------+----------+---+------------------+--------------------+
| 331.0|0.805833|0.344167|       2.0| 0.160446|       0.0|0.0| 2.519827993775719|[0.0,0.0,2.0,0.34...|
| 131.0|0.696087|0.363478|       2.0| 0.248539|       0.0|0.0|2.1172712956557644|[0.0,0.0,2.0,0.36...|
| 120.0|0.437273|0.196364|       1.0| 0.248309|       1.0|0.0|2.0791812460476247|[0.0,1.0,1.0,0.19...|
| 108.0|0.590435|     0.2|

In [26]:
# Regresja liniowa oczekuje wektora zmiennych objaśniających (features) typu Vector (albo SparseVector).
# Nazwa 'features' jest standardowa ale można przekazać inną nazwę kolumny do modelu.
t.select(t.features).show()

+--------------------+
|            features|
+--------------------+
|[0.0,0.0,2.0,0.34...|
|[0.0,0.0,2.0,0.36...|
|[0.0,1.0,1.0,0.19...|
|[0.0,1.0,1.0,0.2,...|
|[0.0,1.0,1.0,0.22...|
|[0.0,1.0,1.0,0.20...|
|[0.0,1.0,2.0,0.19...|
|[0.0,0.0,2.0,0.16...|
|[0.0,0.0,1.0,0.13...|
|[0.0,1.0,1.0,0.15...|
|[0.0,1.0,2.0,0.16...|
|[0.0,1.0,1.0,0.17...|
|[0.0,1.0,1.0,0.16...|
|[0.0,1.0,1.0,0.16...|
|[0.0,0.0,2.0,0.23...|
|[0.0,0.0,1.0,0.23...|
|[0.0,0.0,2.0,0.17...|
|[0.0,1.0,2.0,0.21...|
|[0.0,1.0,2.0,0.29...|
|[0.0,1.0,2.0,0.26...|
+--------------------+
only showing top 20 rows



In [27]:
# Regresja liniowa oczekuje standardowo żeby kolumna wartości objaśnianych była nazwana 'label',
# możemy zatem przezwać kolumnę lub przekazać inną nazwę.
t.select(t.casual).show()

+------+
|casual|
+------+
| 331.0|
| 131.0|
| 120.0|
| 108.0|
|  82.0|
|  88.0|
| 148.0|
|  68.0|
|  54.0|
|  41.0|
|  43.0|
|  25.0|
|  38.0|
|  54.0|
| 222.0|
| 251.0|
| 117.0|
|   9.0|
|  78.0|
|  83.0|
+------+
only showing top 20 rows



In [35]:
# tworzenie estymatora
lr = LinearRegression(maxIter=100, labelCol='log_casual')

In [36]:
# Możemy wytrenować estymator bezpośrednio używając metody fit(),
# ale lepiej jest użyć pipeline który połączy transormatory i estymatory ze sobą.
p = Pipeline(stages=[va, lr])
p

Pipeline_4cd9a4b42f0c4ab85aa7

In [37]:
# Trenujemy model używając podstawowej formy danych; 
# niezbędne przetworzenia wykona za nas pipeline
lrmodel = p.fit(train_df)

In [38]:
# predykcja
test_df = spark.createDataFrame([(0.0,0.0,2.0,0.344167,0.805833,0.160446)], va.getInputCols())
lrmodel.transform(test_df).show()

+---+----------+----------+--------+--------+---------+--------------------+------------------+
| yr|workingday|weathersit|    temp|     hum|windspeed|            features|        prediction|
+---+----------+----------+--------+--------+---------+--------------------+------------------+
|0.0|       0.0|       2.0|0.344167|0.805833| 0.160446|[0.0,0.0,2.0,0.34...|2.5902189240944224|
+---+----------+----------+--------+--------+---------+--------------------+------------------+



In [43]:
# prawdziwe odpowiedzi i predykcja
y_ypred = lrmodel.transform(train_df).select(func.col('log_casual').alias('casual'), 'prediction')
y_ypred.show()

+------------------+------------------+
|            casual|        prediction|
+------------------+------------------+
| 2.519827993775719|2.5902189240944224|
|2.1172712956557644|2.5854908064631426|
|2.0791812460476247|  2.14255685253507|
|  2.03342375548695| 2.173527588423627|
|1.9138138523837167|2.2325089974868315|
|1.9444826721501687|2.2447546495057007|
|2.1702617153949575|  2.04912273015218|
|1.8325089127062364|2.2970068066044864|
|1.7323937598229686| 2.347242589312237|
|1.6127838567197355| 2.077643305979551|
|1.6334684555795864|1.9952654305639337|
|1.3979400086720377| 2.030436176306222|
|1.5797835966168101| 2.049905585304254|
|1.7323937598229686| 2.146791110584616|
| 2.346352974450639|2.4868839384681407|
| 2.399673721481038| 2.600852937670821|
|2.0681858617461617| 2.363159388727811|
|0.9542425094393249|2.0137727695990955|
|1.8920946026904804|2.1178055453286775|
| 1.919078092376074| 2.124183386350578|
+------------------+------------------+
only showing top 20 rows



In [44]:
# policzmy R^2 ręcznie
df_r2 = y_ypred.select(
    func.mean('casual').alias('mean'), 
    func.variance('casual').alias('variance'), 
    (func.sum(func.pow(y_ypred.casual - y_ypred.prediction, 2))/func.count('casual')).alias('residual_variance')) 
df_r2.show()

+-----------------+-------------------+------------------+
|             mean|           variance| residual_variance|
+-----------------+-------------------+------------------+
|2.756815588647298|0.19559615147083714|0.0569941052663099|
+-----------------+-------------------+------------------+



In [45]:
r = df_r2.collect()[0]

R2 = 1 - r.residual_variance/r.variance

print("R^2 is: {:.3f}".format(R2))

R^2 is: 0.709


### Zadanie

* Użyj skali logarytmicznej dla `casual`.
* ★ Przeprowadź kroswalidację (podpowiedź: `train_df.sample(False, 0.75)` lub sprawdź `pyspark.ml.tuning.CrossValidator`).