# `Промышленное машинное обучение на Spark`
## `Занятие 05: Feature Engineering`

### `Находнов Максим (nakhodnov17@gmail.com)`
#### `Москва, 2023`

О чём можно узнать из этого ноутбука:

* Accumulator/Broadcast
* Broadcast JOIN
* Градиентный спуск на Spark RDD
* Winsorizing на Spark

In [1]:
# Use the %pip magic (not a `!` shell call) so that the packages are installed into the
# environment of the kernel that is running this notebook.
%pip install pyspark pyarrow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [2]:
import numpy as np

import pyspark
import pyspark.sql.types as T
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

from pyspark import SparkConf, SparkContext

# Local Spark configuration: master `local[*]`, web UI on port 4050.
conf = (
    SparkConf()
        .set('spark.ui.port', '4050')
        .setMaster('local[*]')
)
# getOrCreate() returns the already running context if there is one, so re-executing
# this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once". `conf` is applied only when a new
# context is actually created.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/09 20:34:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### `Accumulator`

In [3]:
acc = sc.accumulator(value=0)
acc

Accumulator<id=0, value=0>

In [4]:
acc.value

0

In [5]:
rdd = sc.parallelize([1, 2, 3, -4, 5])
rdd.foreach(lambda x: acc.add(x))
acc.value

                                                                                

7

In [6]:
acc_sum = sc.accumulator(0)

def count(x):
    """Add ``x`` to the module-level accumulator ``acc_sum``.

    Passed to ``rdd.foreach`` below, so it is called once per RDD element.
    """
    # `+=` rebinds the name, therefore `global` is required: without it Python would
    # treat `acc_sum` as a local variable and raise UnboundLocalError.
    global acc_sum
    acc_sum += x
    
rdd.foreach(count)
acc_sum.value

7

In [7]:
acc_cnt = sc.accumulator(0)
rdd_02 = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8])

rdd.foreach(lambda x: acc_cnt.add(1))
rdd_02.foreach(lambda x: acc_cnt.add(1))
acc_cnt.value

13

In [8]:
from pyspark.accumulators import AccumulatorParam

class VectorAccumulatorParam(AccumulatorParam):
    """Accumulator parameter that sums fixed-length numeric vectors element-wise.

    Vectors are expected to be mutable sequences (the notebook uses Python lists).
    """

    def zero(self, value):
        """Return the neutral element: a list of ``0.0`` as long as ``value``."""
        return [0.0] * len(value)

    def addInPlace(self, value_left, value_right):
        """Add ``value_right`` to ``value_left`` element-wise and return ``value_left``.

        ``value_left`` is modified in place.

        Raises:
            ValueError: if the two vectors have different lengths. Previously a longer
                ``value_right`` was silently truncated, corrupting the sum without any
                error.
        """
        if len(value_left) != len(value_right):
            raise ValueError(
                f"Vector length mismatch: {len(value_left)} != {len(value_right)}"
            )
        for idx, term in enumerate(value_right):
            value_left[idx] += term
        return value_left
    
vector_acc = sc.accumulator([1.0, 2.0, 3.0], VectorAccumulatorParam())
vector_acc.value

[1.0, 2.0, 3.0]

In [9]:
def vector_add(x):
    """Add the vector ``[x, x, x]`` to the module-level accumulator ``vector_acc``."""
    # `global` is needed because `+=` rebinds the name `vector_acc`.
    global vector_acc
    # The length 3 is hard-coded; it matches the 3-element initial value that
    # `vector_acc` was created with.
    vector_acc += [x] * 3
    
rdd = sc.parallelize([1, 2, 3])
rdd.foreach(vector_add)
vector_acc.value

[7.0, 8.0, 9.0]

### `Broadcast`

In [10]:
states = {"NY": "New York", "CA": "California", "FL": "Florida"}
broadcast_states = sc.broadcast(states)
broadcast_states.value

{'NY': 'New York', 'CA': 'California', 'FL': 'Florida'}

In [11]:
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL")
]

rdd_03 = sc.parallelize(data)

def state_convert(code):
    """Return the full state name for ``code`` via the broadcast variable ``broadcast_states``."""
    # `.value` exposes the dictionary wrapped by the broadcast variable;
    # a code that is not a key of that dictionary raises KeyError.
    return broadcast_states.value[code]

result = rdd_03.map(
    lambda x: (x[0], x[1], x[2], state_convert(x[3]))
).collect()
result

[('James', 'Smith', 'USA', 'California'),
 ('Michael', 'Rose', 'USA', 'New York'),
 ('Robert', 'Williams', 'USA', 'California'),
 ('Maria', 'Jones', 'USA', 'Florida')]

In [12]:
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL")
]

rdd_03 = sc.parallelize(data)

def state_convert(code):
    """Return the full state name for ``code`` by indexing the plain ``states`` dict.

    NOTE: this redefines ``state_convert`` from the previous cell. This variant reads
    the module-level dictionary directly instead of going through
    ``broadcast_states``; the result shown below is the same.
    """
    return states[code]

result = rdd_03.map(
    lambda x: (x[0], x[1], x[2], state_convert(x[3]))
).collect()
result

[('James', 'Smith', 'USA', 'California'),
 ('Michael', 'Rose', 'USA', 'New York'),
 ('Robert', 'Williams', 'USA', 'California'),
 ('Maria', 'Jones', 'USA', 'Florida')]

#### `Broadcast JOIN`

In [13]:
# Enable automatic broadcast join and set the size threshold (in bytes) below which
# a DataFrame is broadcast: 104857600 bytes = 100 MiB
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 104857600)

# Disable automatic broadcast join (-1).
# NOTE: this call overrides the one above, so after the cell has run the threshold
# is -1, i.e. automatic broadcasting is switched off for the cells that follow.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [14]:
path = "./m5-forecasting-accuracy"
# ! kaggle competitions download -c m5-forecasting-accuracy
# ! unzip m5-forecasting-accuracy.zip -d $path

In [15]:
# Paths to the files of the dataset
file_calendar = f"{path}/calendar.csv"
file_validation = f"{path}/sales_train_validation.csv"
file_evaluation = f"{path}/sales_train_evaluation.csv"
file_prices = f"{path}/sell_prices.csv"

# Reader options shared by all four files
file_type = "csv"
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","


def _read_m5_csv(file_path):
    """Load one dataset file into a Spark DataFrame with the shared reader options.

    Args:
        file_path: path of the file to load.

    Returns:
        The DataFrame returned by ``spark.read`` for ``file_path``, read with the
        module-level ``file_type``, ``infer_schema``, ``first_row_is_header`` and
        ``delimiter`` settings.
    """
    return (
        spark.read.format(file_type)
          .option("inferSchema", infer_schema)
          .option("header", first_row_is_header)
          .option("sep", delimiter)
          .load(file_path)
    )


# Same loading order as before: validation, evaluation, prices, calendar.
df_validation = _read_m5_csv(file_validation)
df_evaluation = _read_m5_csv(file_evaluation)
df_prices = _read_m5_csv(file_prices)
df_calendar = _read_m5_csv(file_calendar)

                                                                                

In [16]:
df_evaluation.limit(5).toPandas()

23/02/09 20:34:35 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


In [17]:
df_evaluation.select(
    df_evaluation.cat_id
).distinct().show()

[Stage 16:====>                                                   (1 + 11) / 12]

+---------+
|   cat_id|
+---------+
|    FOODS|
|HOUSEHOLD|
|  HOBBIES|
+---------+



                                                                                

In [18]:
cat_id_hex =[
    ('FOODS', '0x001'),
    ('HOUSEHOLD', '0x002'),
    ('HOBBIES', '0x003')
]
small_df = spark.createDataFrame(data=cat_id_hex, schema=['cat_id', 'hex_code'])

In [19]:
small_df.show()

+---------+--------+
|   cat_id|hex_code|
+---------+--------+
|    FOODS|   0x001|
|HOUSEHOLD|   0x002|
|  HOBBIES|   0x003|
+---------+--------+



In [20]:
# F.broadcast() marks `small_df` for broadcasting in this join; the physical plan
# printed in the next cell shows a BroadcastHashJoin.
# NOTE: the join condition is an expression on two columns, so the result keeps the
# `cat_id` column of both inputs (it appears twice in the output below).
broadcast_join_df = df_evaluation.join(
  F.broadcast(small_df), small_df.cat_id == df_evaluation.cat_id
)
broadcast_join_df.limit(1).toPandas()

                                                                                

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941,cat_id.1,hex_code
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,3,3,0,1,HOBBIES,0x003


In [21]:
broadcast_join_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [cat_id#3875], [cat_id#9791], Inner, BuildRight, false
   :- Filter isnotnull(cat_id#3875)
   :  +- FileScan csv [id#3872,item_id#3873,dept_id#3874,cat_id#3875,store_id#3876,state_id#3877,d_1#3878,d_2#3879,d_3#3880,d_4#3881,d_5#3882,d_6#3883,d_7#3884,d_8#3885,d_9#3886,d_10#3887,d_11#3888,d_12#3889,d_13#3890,d_14#3891,d_15#3892,d_16#3893,d_17#3894,d_18#3895,... 1923 more fields] Batched: false, DataFilters: [isnotnull(cat_id#3875)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/nakhodnov/HSE_DPO_Spark_2022/Seminars/Seminar 05/m5-foreca..., PartitionFilters: [], PushedFilters: [IsNotNull(cat_id)], ReadSchema: struct<id:string,item_id:string,dept_id:string,cat_id:string,store_id:string,state_id:string,d_1:...
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, false]),false), [plan_id=208]
      +- Filter isnotnull(cat_id#9791)
         +- Scan ExistingRDD[cat_id#9791,hex_cod

In [22]:
# The same join without the F.broadcast() hint. Automatic broadcasting was disabled
# earlier (spark.sql.autoBroadcastJoinThreshold = -1), and the physical plan printed
# below is a SortMergeJoin.
join_df = df_evaluation.join(
  small_df, small_df.cat_id == df_evaluation.cat_id
)
join_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [cat_id#3875], [cat_id#9791], Inner
   :- Sort [cat_id#3875 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(cat_id#3875, 200), ENSURE_REQUIREMENTS, [plan_id=232]
   :     +- Filter isnotnull(cat_id#3875)
   :        +- FileScan csv [id#3872,item_id#3873,dept_id#3874,cat_id#3875,store_id#3876,state_id#3877,d_1#3878,d_2#3879,d_3#3880,d_4#3881,d_5#3882,d_6#3883,d_7#3884,d_8#3885,d_9#3886,d_10#3887,d_11#3888,d_12#3889,d_13#3890,d_14#3891,d_15#3892,d_16#3893,d_17#3894,d_18#3895,... 1923 more fields] Batched: false, DataFilters: [isnotnull(cat_id#3875)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/nakhodnov/HSE_DPO_Spark_2022/Seminars/Seminar 05/m5-foreca..., PartitionFilters: [], PushedFilters: [IsNotNull(cat_id)], ReadSchema: struct<id:string,item_id:string,dept_id:string,cat_id:string,store_id:string,state_id:string,d_1:...
   +- Sort [cat_id#9791 ASC NULLS FIRST], false, 0
      +- Ex

### `Spark RDD Gradient Descent`

$$
D = \{(x_{i}, y_{i}) \mid x_{i} \in \mathbb{R}^{d},\ y_{i} \in \mathbb{R}\}_{i=1}^{n}
$$
$$
\hat{y}_{i} = \langle x_{i}, w \rangle + b
$$
$$
L_{i} = \frac{1}{2} (\hat{y}_{i} - y_{i})^{2}
$$
$$
\mathfrak{L} = \frac{1}{n}\sum\limits_{i=1}^{n} L_{i}
$$

Необходимо найти оптимальные $w \in \mathbb{R}^{d}, b \in \mathbb{R}$

Один из вариантов решения задачи — градиентный спуск (GD):

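$$
w^{t+1} = w^{t} - \alpha \nabla_{w}\mathfrak{L}(w^{t}, b^{t})
$$
$$
b^{t+1} = b^{t} - \alpha \nabla_{b}\mathfrak{L}(w^{t}, b^{t})
$$

где $t$ — номер итерации, а $\alpha > 0$ — размер шага (learning rate).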

In [23]:
# Fix the seed so that the synthetic data, and every result derived from it,
# is identical after "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)

# Synthetic linear-regression problem: 1000 objects with 10 features,
# targets y = <x, w_star> + small Gaussian noise.
X = np.random.randn(1000, 10)
w_star = np.random.randn(X.shape[1])
y = X.dot(w_star) + 0.001 * np.random.randn(X.shape[0])

In [24]:
from functools import partial

def gradient_descent(X, y, alpha=0.1, epochs=1):
    """Exercise stub for the "Spark RDD Gradient Descent" section (not implemented).

    The body is only a placeholder (``...``), so calling the function as written
    returns ``None``.

    Args:
        X: feature matrix; the notebook passes a NumPy array of shape (1000, 10).
        y: target vector; the notebook passes a NumPy array of shape (1000,).
        alpha: defaults to 0.1; presumably the step size of the update rule
            described above - confirm.
        epochs: defaults to 1; presumably the number of gradient steps - confirm.

    NOTE(review): the recorded output of the call below suggests that the reference
    solution prints the loss every few epochs and returns the fitted weight vector -
    confirm before relying on it.
    """
    # YOUR CODE HERE:
    ...

In [25]:
gradient_descent(X, y, alpha=0.1, epochs=10)

Epoch: 0, Loss 0.006
Epoch: 3, Loss 0.003
Epoch: 6, Loss 0.002
Epoch: 9, Loss 0.001


array([ 0.94800571,  0.41386065,  0.66360389,  0.12221738,  1.15732219,
        0.87192051, -0.23238663, -0.36209349, -0.97724333, -0.39244455])

### `Spark Winsorizing`

In [29]:
def winsorizing(
    df: pyspark.sql.dataframe.DataFrame, 
    lower_percentile: float = 0.1,
    higher_percentile: float = 0.9
) -> pyspark.sql.dataframe.DataFrame:
    """Exercise stub for the "Spark Winsorizing" section (not implemented).

    The body is only a placeholder (``...``), so calling the function as written
    returns ``None`` rather than a DataFrame.

    Args:
        df: input DataFrame; the notebook passes a frame with a single ``sales`` column.
        lower_percentile: defaults to 0.1; presumably the lower cut-off given as a
            fraction - confirm.
        higher_percentile: defaults to 0.9; presumably the upper cut-off given as a
            fraction - confirm.

    NOTE(review): the recorded output of the call below shows the columns ``sales``,
    ``10_percentile``, ``90_percentile`` and ``sales_winzored``, which suggests the
    reference solution appends the two percentile columns and a clipped copy of
    ``sales`` - confirm the exact naming and clipping rule.
    """
    # YOUR CODE HERE:
    ...

In [27]:
# Twenty sample values for the `sales` column; each row is a one-element tuple.
data = [
    (0.77347701,),
    (0.77617723,),
    (-0.26191574,),
    (0.06015559,),
    (-0.18058041,),
    (1.15605904,),
    (-0.54163328,),
    (0.83280377,),
    (-0.69920523,),
    (-0.33986035,),
    (-0.94114708,),
    (-0.88438698,),
    (1.18682329,),
    (1.21287342,),
    (-0.82575258,),
    (0.5895868,),
    (-1.646899,),
    (-1.5341987,),
    (-0.94135006,),
    (0.5699716,),
]
# Single-column DataFrame built from the rows above.
df = spark.createDataFrame(data=data, schema=['sales'])
df.show()


+-----------+
|      sales|
+-----------+
| 0.77347701|
| 0.77617723|
|-0.26191574|
| 0.06015559|
|-0.18058041|
| 1.15605904|
|-0.54163328|
| 0.83280377|
|-0.69920523|
|-0.33986035|
|-0.94114708|
|-0.88438698|
| 1.18682329|
| 1.21287342|
|-0.82575258|
|  0.5895868|
|  -1.646899|
| -1.5341987|
|-0.94135006|
|  0.5699716|
+-----------+



In [30]:
winsorizing(df).show()

23/02/09 20:35:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/09 20:35:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/09 20:35:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/09 20:35:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/02/09 20:35:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+-----------+-------------+-------------+--------------+
|      sales|10_percentile|90_percentile|sales_winzored|
+-----------+-------------+-------------+--------------+
| 0.