# 0. 設定 Spark

In [1]:
spark

In [2]:
import pandas as pd
import pyspark.pandas as ps
import numpy as np
import gc
import pyspark.sql.functions as sf
from pyspark.context import SparkContext
from pyspark.sql.functions import array_max
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.pandas.config import set_option, reset_option
from dateutil.relativedelta import relativedelta

In [3]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
ps.set_option("compute.default_index_type", "distributed")
set_option("compute.ops_on_diff_frames", True)

In [4]:
spark.conf.get("spark.kryoserializer.buffer.max")

'512m'

In [5]:
spark.conf.get("spark.executorEnv.SURPRISE_DATA_FOLDER")

'/home/hadoop'

In [6]:
TRAIN_PERIOD = 30

# 1. 讀取檔案 => tran_ps

In [7]:
# 讀取檔案
# customers = ps.read_parquet('/user/HM_parquet/customers.parquet')
# articles = ps.read_parquet('/user/HM_parquet/articles.parquet')
tran_ps = ps.read_parquet('/user/HM_parquet/transactions_train.parquet').drop(['price', 'sales_channel_id'], axis=1)

                                                                                

In [8]:
tran_ps.set_index('t_dat',inplace=True)
tran_ps['start_test'] = ''
tran_ps['split_id'] = ''
tran_ps.head(5)

                                                                                

Unnamed: 0_level_0,customer_id,article_id,start_test,split_id
t_dat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,,
2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,,
2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,,
2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,,
2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,,


# 2. 作時間分割 tran_ps => split_data, split_id

In [9]:
def split(data,train_period=30, test_period=7, stride=30,show_progress=False):
    
    split_data = ps.DataFrame(columns = ['t_dat','customer_id', 'article_id', 'split_id', 'start_test']).set_index('t_dat',inplace=True)

    end_test = data.index.max()
    start_test = end_test - relativedelta(days=test_period)
    start_train = start_test - relativedelta(days=train_period)
    split_id=0

    while start_train >= data.index.min():

        df = data.loc[start_train:end_test]
        df['start_test']=start_test
        df['split_id'] = split_id
        split_data = ps.concat([split_data,df])

        if(show_progress):
            print("Split_id:",split_id,", Train period:",start_train,"-" , start_test, ", test period", start_test, "-", end_test)

        # update dates:
        end_test = end_test - relativedelta(days=stride)
        start_test = end_test - relativedelta(days=test_period)
        start_train = start_test - relativedelta(days=train_period)
        split_id += 1
    
    return split_data, split_id

In [10]:
split_data, split_id = split(tran_ps,TRAIN_PERIOD,7,30,True)

                                                                                

Split_id: 0 , Train period: 2020-08-16 - 2020-09-15 , test period 2020-09-15 - 2020-09-22


                                                                                

Split_id: 1 , Train period: 2020-07-17 - 2020-08-16 , test period 2020-08-16 - 2020-08-23


                                                                                

Split_id: 2 , Train period: 2020-06-17 - 2020-07-17 , test period 2020-07-17 - 2020-07-24


                                                                                

Split_id: 3 , Train period: 2020-05-18 - 2020-06-17 , test period 2020-06-17 - 2020-06-24


                                                                                

Split_id: 4 , Train period: 2020-04-18 - 2020-05-18 , test period 2020-05-18 - 2020-05-25


                                                                                

Split_id: 5 , Train period: 2020-03-19 - 2020-04-18 , test period 2020-04-18 - 2020-04-25


                                                                                

Split_id: 6 , Train period: 2020-02-18 - 2020-03-19 , test period 2020-03-19 - 2020-03-26


                                                                                

Split_id: 7 , Train period: 2020-01-19 - 2020-02-18 , test period 2020-02-18 - 2020-02-25


                                                                                

Split_id: 8 , Train period: 2019-12-20 - 2020-01-19 , test period 2020-01-19 - 2020-01-26


                                                                                

Split_id: 9 , Train period: 2019-11-20 - 2019-12-20 , test period 2019-12-20 - 2019-12-27


                                                                                

Split_id: 10 , Train period: 2019-10-21 - 2019-11-20 , test period 2019-11-20 - 2019-11-27


                                                                                

Split_id: 11 , Train period: 2019-09-21 - 2019-10-21 , test period 2019-10-21 - 2019-10-28


                                                                                

Split_id: 12 , Train period: 2019-08-22 - 2019-09-21 , test period 2019-09-21 - 2019-09-28


                                                                                

Split_id: 13 , Train period: 2019-07-23 - 2019-08-22 , test period 2019-08-22 - 2019-08-29


                                                                                

Split_id: 14 , Train period: 2019-06-23 - 2019-07-23 , test period 2019-07-23 - 2019-07-30


                                                                                

Split_id: 15 , Train period: 2019-05-24 - 2019-06-23 , test period 2019-06-23 - 2019-06-30


                                                                                

Split_id: 16 , Train period: 2019-04-24 - 2019-05-24 , test period 2019-05-24 - 2019-05-31


                                                                                

Split_id: 17 , Train period: 2019-03-25 - 2019-04-24 , test period 2019-04-24 - 2019-05-01


                                                                                

Split_id: 18 , Train period: 2019-02-23 - 2019-03-25 , test period 2019-03-25 - 2019-04-01


                                                                                

Split_id: 19 , Train period: 2019-01-24 - 2019-02-23 , test period 2019-02-23 - 2019-03-02


                                                                                

Split_id: 20 , Train period: 2018-12-25 - 2019-01-24 , test period 2019-01-24 - 2019-01-31


                                                                                

Split_id: 21 , Train period: 2018-11-25 - 2018-12-25 , test period 2018-12-25 - 2019-01-01


                                                                                

Split_id: 22 , Train period: 2018-10-26 - 2018-11-25 , test period 2018-11-25 - 2018-12-02


                                                                                

Split_id: 23 , Train period: 2018-09-26 - 2018-10-26 , test period 2018-10-26 - 2018-11-02


                                                                                

In [11]:
split_data.reset_index(inplace=True)

In [12]:
split_data.count()

                                                                                

start_test     39919883
t_dat          39919883
split_id       39919883
article_id     39919883
customer_id    39919883
dtype: int64

In [None]:
split_data.head(5)

# 3. 製作參數表 split_id => para_cross_split

In [13]:
# 製作參數表 paras_grid
from itertools import product

paras = list(
    product(
        [25,50,100,150,200],
        [20,30,40,50],
        [0.01]
    )
)
paras_grid = ps.DataFrame(paras,columns= ['n_factors','n_epochs','reg_all'])
paras_grid.count()

n_factors    20
reg_all      20
n_epochs     20
dtype: int64

In [14]:
# 製作 split_id 表
split_id_ps = ps.DataFrame({'split_id': range(split_id)})
split_id_ps.count()

split_id    24
dtype: int64

In [15]:
# 將 paras_grid 與 split_id 做 cross join
paras_grid['key'] = 1
split_id_ps['key'] = 1

para_cross_split = ps.merge(paras_grid, split_id_ps, on ='key').drop('key')
del split_id_ps
len(para_cross_split)

480

In [22]:
# 將 cross join 後的表新增遞增的 group_id 欄位，之後要用來做 pandas_udf 的 groupby
para_cross_split['group_id'] = 0
para_cross_split['group_id'] = np.arange(len(para_cross_split)).tolist()
para_cross_split

Unnamed: 0,n_factors,n_epochs,reg_all,split_id,group_id
60129542144,25,20,0.01,14,14
60129542145,25,20,0.01,15,15
154618822657,25,30,0.01,13,37
154618822656,25,30,0.01,12,36
429496729600,50,20,0.01,4,100
429496729601,50,20,0.01,5,101
438086664193,50,20,0.01,7,103
438086664192,50,20,0.01,6,102
541165879296,50,30,0.01,6,126
541165879297,50,30,0.01,7,127


In [None]:
# para_cross_split.to_parquet('/user/HM_parquet/SVD_model/para_cross_split.parquet')

# 4. join 參數表和資料表 split_data, para_cross_split => join_data

In [16]:
join_data = split_data.join(para_cross_split.set_index('split_id'), on='split_id')
join_data.set_index('t_dat',inplace=True)

In [17]:
join_data.head(5)

                                                                                

Unnamed: 0_level_0,split_id,customer_id,article_id,start_test,n_factors,n_epochs,reg_all
t_dat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-08-16,0,000fb6e772c5d0023892065e659963da90b1866035558e...,902999001,2020-09-15,50,20,0.01
2020-08-16,0,000fb6e772c5d0023892065e659963da90b1866035558e...,902999001,2020-09-15,50,30,0.01
2020-08-16,0,000fb6e772c5d0023892065e659963da90b1866035558e...,902999001,2020-09-15,100,20,0.01
2020-08-16,0,000fb6e772c5d0023892065e659963da90b1866035558e...,902999001,2020-09-15,100,30,0.01
2020-08-16,0,000fb6e772c5d0023892065e659963da90b1866035558e...,902999001,2020-09-15,100,50,0.01


In [None]:
39919883*20

In [25]:
join_data.count()

                                                                                ]

start_test     798397660
article_id     798397660
n_factors      798397660
reg_all        798397660
split_id       798397660
customer_id    798397660
n_epochs       798397660
group_id       798397660
dtype: int64

In [27]:
join_data.to_parquet('/user/HM_parquet/SVD_model/join_data30.parquet')

                                                                                ]

In [18]:
# 刪除用不到的資料表
del split_data, para_cross_split
gc.collect()

1329

In [19]:
df = join_data.to_spark()
df.count()

                                                                                

798397660

In [23]:
spark.catalog.clearCache()

In [22]:
del join_data
gc.collect()

NameError: name 'join_data' is not defined

# -----------------------------

# 讀取資料表

In [28]:
df = spark.read.option('header','true').parquet('/user/HM_parquet/SVD_model/join_data30.parquet')
# join_data.set_index('t_dat',inplace=True)
df.count()

798397660

In [None]:
# df.head(5)

# surpriseSVD

In [25]:
import pandas as pd
# from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import SVDpp,SVD
from surprise import accuracy
from surprise.model_selection import train_test_split
from collections import defaultdict
import numpy as np
import module.average_precision as metrics

class surpriseSVD():
    def __init__(self):
        self = self

    def get_top_n(self, predictions, n=12):
        """Return the top-N recommendation for each user from a set of predictions.
        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.
        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        """

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n

    def get_set(self,df):
        reader = Reader(rating_scale=(1, 500))
        data_set = Dataset.load_from_df(df[['customer_id','article_id','rating']], reader)
        return data_set

    def get_rating_set(self,df):
        rating = df[['customer_id','article_id','split_id']].groupby(['customer_id','article_id']).count().reset_index()
        rating.columns = ['customer_id','article_id','rating']
        rating_set = self.get_set(rating)
        return rating_set


    def train_SVD(self, train_data, test_data, paras={}):
        
        ## 讀取評分資料為surprise可以訓練的格式
        trainset = self.get_rating_set(train_data)
        testset = self.get_rating_set(test_data)

        ## rmse 需要的資料
        testset2 = [testset.df.loc[i].to_list() for i in range(len(testset.df))]

        ## map@k testing 需要產的資料
        test_data.loc[:,'rating']=0
        test_processed = self.get_set(test_data)
        NA, test2 = train_test_split(test_processed, test_size=1.0)

        # ======= 消費者的實際購買清單 =======
        test_data['article_id'] = test_data['article_id'].astype('str')
        test_uni = test_data.drop_duplicates(subset=['customer_id', 'article_id'], keep='first')
        buy_n = test_uni[['customer_id','article_id']].groupby('customer_id')['article_id'].apply(list).to_dict()

        cust_actual_list = []
        for uid, user_ratings in buy_n.items():
            cust_pred_tuple = (uid, [iid for iid in user_ratings])
            cust_actual_list.append(cust_pred_tuple)

        # ======= 訓練 SVD 模型 =======
        algo = SVD(random_state=42,**paras)

        # 訓練模型
        algo.fit(trainset.build_full_trainset())

        ##### rmse #####
        predictions = algo.test(testset2)
        rmse = accuracy.rmse(predictions)

        ##### map@k #####
        predictions_map = algo.test(test2)
        # est = [i.est for i in predictions_map] 

        ##  消費者的預測清單 
        top_n = self.get_top_n(predictions=predictions_map, n=12)

        cust_pred_list = []
        for uid, user_ratings in top_n.items():
            cust_pred_tuple = (uid, [str(iid) for (iid, _) in user_ratings])
            cust_pred_list.append(cust_pred_tuple)

        final_list = list(zip(cust_actual_list, cust_pred_list))

        # map@k計算 
        mapk_list = []
        for i in range(len(final_list)):
            map_k = metrics.mapk([final_list[i][0][1]],[final_list[i][1][1]],12)
            mapk_list.append(map_k)

        map_k = sum(mapk_list)/len(mapk_list)

        return rmse, map_k

# pandas_udf

In [None]:
# pdf = df.where("group_id == 43").toPandas()

In [26]:
def time_split_hyperparameter_search(data):

    paras = {
        'n_factors':data.n_factors.values[0], 
        'n_epochs':data.n_epochs.values[0], 
        'reg_all':data.reg_all.values[0]
    }
    
    test_index = data['t_dat'] > data['start_test'][0]
    train_data = data.loc[~test_index]
    test_data = data.loc[test_index]
    
    model = surpriseSVD()
    rmse, map12 = model.train_SVD(train_data, test_data, paras)
    
    paras.update({
        'stride': TRAIN_PERIOD,
        'start_test' : data.start_test.values[0],
        'rmse' : rmse,
        'map12' : map12
    })
    
    results = pd.DataFrame([paras])
    
    return results


In [27]:
from pyspark.sql.types import (
    DateType, DoubleType, FloatType, IntegerType, StringType, StructField, StructType
)
# pandas_udf

schema = StructType(
    [
        StructField("stride", IntegerType(), True),
        StructField('start_test', DateType(),True),
        StructField("n_factors", IntegerType(), True),
        StructField("n_epochs", IntegerType(), True),
        StructField('reg_all', FloatType(),True),
        StructField('rmse', FloatType(),True),
        StructField('map12', FloatType(),True),
     ]
)

results = df.groupby('group_id').applyInPandas(time_split_hyperparameter_search, schema).cache()

AnalysisException: cannot resolve 'group_id' given input columns: [article_id, customer_id, n_epochs, n_factors, reg_all, split_id, start_test];
'Project ['group_id, split_id#2543, customer_id#2545, article_id#2546L, start_test#2547, n_factors#2548L, n_epochs#2549L, reg_all#2550]
+- Project [split_id#2543, customer_id#2545, article_id#2546L, start_test#2547, n_factors#2548L, n_epochs#2549L, reg_all#2550]
   +- Project [__index_level_0__#2559L, split_id#2543, t_dat#2544, customer_id#2545, article_id#2546L, start_test#2547, n_factors#2548L, n_epochs#2549L, reg_all#2550, monotonically_increasing_id() AS __natural_order__#2569L]
      +- Project [monotonically_increasing_id() AS __index_level_0__#2559L, split_id#2543, t_dat#2544, customer_id#2545, article_id#2546L, start_test#2547, n_factors#2548L, n_epochs#2549L, reg_all#2550]
         +- Project [split_id#2096 AS split_id#2543, t_dat#2092 AS t_dat#2544, customer_id#2093 AS customer_id#2545, article_id#2094L AS article_id#2546L, start_test#2095 AS start_test#2547, n_factors#2513L AS n_factors#2548L, n_epochs#2514L AS n_epochs#2549L, reg_all#2515 AS reg_all#2550]
            +- Project [t_dat#2092, customer_id#2093, article_id#2094L, start_test#2095, n_factors#2513L, n_epochs#2514L, reg_all#2515, split_id#2096, monotonically_increasing_id() AS __natural_order__#2524L]
               +- Project [t_dat#2092, customer_id#2093, article_id#2094L, start_test#2095, __right_n_factors#2483L AS n_factors#2513L, __right_n_epochs#2484L AS n_epochs#2514L, __right_reg_all#2485 AS reg_all#2515, split_id#2096]
                  +- Join LeftOuter, (cast(split_id#2096 as bigint) = __right_split_id#2482L)
                     :- SubqueryAlias left_table
                     :  +- Project [split_id#2096, t_dat#2092, customer_id#2093, article_id#2094L, start_test#2095, __natural_order__#2109L]
                     :     +- Project [__index_level_0__#2102L, t_dat#2092, customer_id#2093, article_id#2094L, start_test#2095, split_id#2096, monotonically_increasing_id() AS __natural_order__#2109L]
                     :        +- Project [monotonically_increasing_id() AS __index_level_0__#2102L, t_dat#2092, customer_id#2093, article_id#2094L, start_test#2095, split_id#2096]
                     :           +- Project [t_dat#0 AS t_dat#2092, customer_id#1 AS customer_id#2093, article_id#2L AS article_id#2094L, start_test#163 AS start_test#2095, split_id#168 AS split_id#2096]
                     :              +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#2067L]
                     :                 +- Union false, false
                     :                    :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1985L]
                     :                    :     +- Union false, false
                     :                    :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1903L]
                     :                    :        :     +- Union false, false
                     :                    :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1821L]
                     :                    :        :        :     +- Union false, false
                     :                    :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1739L]
                     :                    :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1657L]
                     :                    :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1575L]
                     :                    :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1493L]
                     :                    :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1411L]
                     :                    :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1329L]
                     :                    :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1247L]
                     :                    :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1165L]
                     :                    :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1083L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#1001L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#919L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#837L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#755L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#673L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#591L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#509L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#427L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#345L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#263L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :     +- Union false, false
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :  +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#163, split_id#168, monotonically_increasing_id() AS __natural_order__#181L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :     +- Project [t_dat#0, customer_id#1, article_id#2L, 2020-09-15 AS start_test#163, 0 AS split_id#168]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#0, customer_id#1, article_id#2L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#156L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#0, customer_id#1, article_id#2L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :              +- Filter ((__natural_order__#17L >= cast(85929744152 as bigint)) AND (__natural_order__#17L <= cast(85931134243 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Project [__index_level_0__#10L, t_dat#0, customer_id#1, article_id#2L, price#3, sales_channel_id#4L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#0, customer_id#1, article_id#2L, price#3, sales_channel_id#4L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Relation [t_dat#0,customer_id#1,article_id#2L,price#3,sales_channel_id#4L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#253, customer_id#254, article_id#255L, 2020-08-16 AS start_test#227, 1 AS split_id#232]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#253, customer_id#254, article_id#255L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#220L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#253, customer_id#254, article_id#255L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85928468224 as bigint)) AND (__natural_order__#17L <= cast(85930011341 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#253, customer_id#254, article_id#255L, price#256, sales_channel_id#257L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#253, customer_id#254, article_id#255L, price#256, sales_channel_id#257L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#253,customer_id#254,article_id#255L,price#256,sales_channel_id#257L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#335, customer_id#336, article_id#337L, 2020-07-17 AS start_test#309, 2 AS split_id#314]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#335, customer_id#336, article_id#337L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#302L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#335, customer_id#336, article_id#337L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85926680410 as bigint)) AND (__natural_order__#17L <= cast(85928798739 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#335, customer_id#336, article_id#337L, price#338, sales_channel_id#339L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#335, customer_id#336, article_id#337L, price#338, sales_channel_id#339L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#335,customer_id#336,article_id#337L,price#338,sales_channel_id#339L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#417, customer_id#418, article_id#419L, 2020-06-17 AS start_test#391, 3 AS split_id#396]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#417, customer_id#418, article_id#419L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#384L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#417, customer_id#418, article_id#419L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85925224726 as bigint)) AND (__natural_order__#17L <= cast(85927328642 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#417, customer_id#418, article_id#419L, price#420, sales_channel_id#421L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#417, customer_id#418, article_id#419L, price#420, sales_channel_id#421L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#417,customer_id#418,article_id#419L,price#420,sales_channel_id#421L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#499, customer_id#500, article_id#501L, 2020-05-18 AS start_test#473, 4 AS split_id#478]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#499, customer_id#500, article_id#501L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#466L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#499, customer_id#500, article_id#501L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85924119438 as bigint)) AND (__natural_order__#17L <= cast(85925650836 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#499, customer_id#500, article_id#501L, price#502, sales_channel_id#503L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#499, customer_id#500, article_id#501L, price#502, sales_channel_id#503L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#499,customer_id#500,article_id#501L,price#502,sales_channel_id#503L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#581, customer_id#582, article_id#583L, 2020-04-18 AS start_test#555, 5 AS split_id#560]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#581, customer_id#582, article_id#583L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#548L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#581, customer_id#582, article_id#583L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85922869900 as bigint)) AND (__natural_order__#17L <= cast(85924423834 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#581, customer_id#582, article_id#583L, price#584, sales_channel_id#585L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#581, customer_id#582, article_id#583L, price#584, sales_channel_id#585L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#581,customer_id#582,article_id#583L,price#584,sales_channel_id#585L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#663, customer_id#664, article_id#665L, 2020-03-19 AS start_test#637, 6 AS split_id#642]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#663, customer_id#664, article_id#665L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#630L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#663, customer_id#664, article_id#665L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85921812442 as bigint)) AND (__natural_order__#17L <= cast(85923111794 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#663, customer_id#664, article_id#665L, price#666, sales_channel_id#667L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#663, customer_id#664, article_id#665L, price#666, sales_channel_id#667L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#663,customer_id#664,article_id#665L,price#666,sales_channel_id#667L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#745, customer_id#746, article_id#747L, 2020-02-18 AS start_test#719, 7 AS split_id#724]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#745, customer_id#746, article_id#747L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#712L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#745, customer_id#746, article_id#747L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85920772459 as bigint)) AND (__natural_order__#17L <= cast(85922076872 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#745, customer_id#746, article_id#747L, price#748, sales_channel_id#749L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#745, customer_id#746, article_id#747L, price#748, sales_channel_id#749L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#745,customer_id#746,article_id#747L,price#748,sales_channel_id#749L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#827, customer_id#828, article_id#829L, 2020-01-19 AS start_test#801, 8 AS split_id#806]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#827, customer_id#828, article_id#829L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#794L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#827, customer_id#828, article_id#829L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85919701191 as bigint)) AND (__natural_order__#17L <= cast(85921053199 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#827, customer_id#828, article_id#829L, price#830, sales_channel_id#831L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#827, customer_id#828, article_id#829L, price#830, sales_channel_id#831L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#827,customer_id#828,article_id#829L,price#830,sales_channel_id#831L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#909, customer_id#910, article_id#911L, 2019-12-20 AS start_test#883, 9 AS split_id#888]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#909, customer_id#910, article_id#911L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#876L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#909, customer_id#910, article_id#911L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85918449254 as bigint)) AND (__natural_order__#17L <= cast(85920020372 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#909, customer_id#910, article_id#911L, price#912, sales_channel_id#913L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#909, customer_id#910, article_id#911L, price#912, sales_channel_id#913L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#909,customer_id#910,article_id#911L,price#912,sales_channel_id#913L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#991, customer_id#992, article_id#993L, 2019-11-20 AS start_test#965, 10 AS split_id#970]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#991, customer_id#992, article_id#993L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#958L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#991, customer_id#992, article_id#993L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85917463116 as bigint)) AND (__natural_order__#17L <= cast(85918703283 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#991, customer_id#992, article_id#993L, price#994, sales_channel_id#995L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#991, customer_id#992, article_id#993L, price#994, sales_channel_id#995L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#991,customer_id#992,article_id#993L,price#994,sales_channel_id#995L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#1073, customer_id#1074, article_id#1075L, 2019-10-21 AS start_test#1047, 11 AS split_id#1052]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#1073, customer_id#1074, article_id#1075L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1040L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#1073, customer_id#1074, article_id#1075L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85916181329 as bigint)) AND (__natural_order__#17L <= cast(85917739984 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#1073, customer_id#1074, article_id#1075L, price#1076, sales_channel_id#1077L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1073, customer_id#1074, article_id#1075L, price#1076, sales_channel_id#1077L]
                     :                    :        :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#1073,customer_id#1074,article_id#1075L,price#1076,sales_channel_id#1077L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#1155, customer_id#1156, article_id#1157L, 2019-09-21 AS start_test#1129, 12 AS split_id#1134]
                     :                    :        :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#1155, customer_id#1156, article_id#1157L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1122L]
                     :                    :        :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#1155, customer_id#1156, article_id#1157L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85915126078 as bigint)) AND (__natural_order__#17L <= cast(85916619201 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#1155, customer_id#1156, article_id#1157L, price#1158, sales_channel_id#1159L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1155, customer_id#1156, article_id#1157L, price#1158, sales_channel_id#1159L]
                     :                    :        :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#1155,customer_id#1156,article_id#1157L,price#1158,sales_channel_id#1159L] parquet
                     :                    :        :        :        :        :        :        :        :        :        :        +- Project [t_dat#1237, customer_id#1238, article_id#1239L, 2019-08-22 AS start_test#1211, 13 AS split_id#1216]
                     :                    :        :        :        :        :        :        :        :        :        :           +- Project [t_dat#1237, customer_id#1238, article_id#1239L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1204L]
                     :                    :        :        :        :        :        :        :        :        :        :              +- Project [t_dat#1237, customer_id#1238, article_id#1239L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85913637248 as bigint)) AND (__natural_order__#17L <= cast(85915399269 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#1237, customer_id#1238, article_id#1239L, price#1240, sales_channel_id#1241L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1237, customer_id#1238, article_id#1239L, price#1240, sales_channel_id#1241L]
                     :                    :        :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#1237,customer_id#1238,article_id#1239L,price#1240,sales_channel_id#1241L] parquet
                     :                    :        :        :        :        :        :        :        :        :        +- Project [t_dat#1319, customer_id#1320, article_id#1321L, 2019-07-23 AS start_test#1293, 14 AS split_id#1298]
                     :                    :        :        :        :        :        :        :        :        :           +- Project [t_dat#1319, customer_id#1320, article_id#1321L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1286L]
                     :                    :        :        :        :        :        :        :        :        :              +- Project [t_dat#1319, customer_id#1320, article_id#1321L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85911759152 as bigint)) AND (__natural_order__#17L <= cast(85914154944 as bigint)))
                     :                    :        :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#1319, customer_id#1320, article_id#1321L, price#1322, sales_channel_id#1323L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1319, customer_id#1320, article_id#1321L, price#1322, sales_channel_id#1323L]
                     :                    :        :        :        :        :        :        :        :        :                          +- Relation [t_dat#1319,customer_id#1320,article_id#1321L,price#1322,sales_channel_id#1323L] parquet
                     :                    :        :        :        :        :        :        :        :        +- Project [t_dat#1401, customer_id#1402, article_id#1403L, 2019-06-23 AS start_test#1375, 15 AS split_id#1380]
                     :                    :        :        :        :        :        :        :        :           +- Project [t_dat#1401, customer_id#1402, article_id#1403L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1368L]
                     :                    :        :        :        :        :        :        :        :              +- Project [t_dat#1401, customer_id#1402, article_id#1403L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85910042730 as bigint)) AND (__natural_order__#17L <= cast(85912402789 as bigint)))
                     :                    :        :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#1401, customer_id#1402, article_id#1403L, price#1404, sales_channel_id#1405L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1401, customer_id#1402, article_id#1403L, price#1404, sales_channel_id#1405L]
                     :                    :        :        :        :        :        :        :        :                          +- Relation [t_dat#1401,customer_id#1402,article_id#1403L,price#1404,sales_channel_id#1405L] parquet
                     :                    :        :        :        :        :        :        :        +- Project [t_dat#1483, customer_id#1484, article_id#1485L, 2019-05-24 AS start_test#1457, 16 AS split_id#1462]
                     :                    :        :        :        :        :        :        :           +- Project [t_dat#1483, customer_id#1484, article_id#1485L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1450L]
                     :                    :        :        :        :        :        :        :              +- Project [t_dat#1483, customer_id#1484, article_id#1485L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85908551174 as bigint)) AND (__natural_order__#17L <= cast(85910496587 as bigint)))
                     :                    :        :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#1483, customer_id#1484, article_id#1485L, price#1486, sales_channel_id#1487L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1483, customer_id#1484, article_id#1485L, price#1486, sales_channel_id#1487L]
                     :                    :        :        :        :        :        :        :                          +- Relation [t_dat#1483,customer_id#1484,article_id#1485L,price#1486,sales_channel_id#1487L] parquet
                     :                    :        :        :        :        :        :        +- Project [t_dat#1565, customer_id#1566, article_id#1567L, 2019-04-24 AS start_test#1539, 17 AS split_id#1544]
                     :                    :        :        :        :        :        :           +- Project [t_dat#1565, customer_id#1566, article_id#1567L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1532L]
                     :                    :        :        :        :        :        :              +- Project [t_dat#1565, customer_id#1566, article_id#1567L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85907135290 as bigint)) AND (__natural_order__#17L <= cast(85909009143 as bigint)))
                     :                    :        :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#1565, customer_id#1566, article_id#1567L, price#1568, sales_channel_id#1569L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1565, customer_id#1566, article_id#1567L, price#1568, sales_channel_id#1569L]
                     :                    :        :        :        :        :        :                          +- Relation [t_dat#1565,customer_id#1566,article_id#1567L,price#1568,sales_channel_id#1569L] parquet
                     :                    :        :        :        :        :        +- Project [t_dat#1647, customer_id#1648, article_id#1649L, 2019-03-25 AS start_test#1621, 18 AS split_id#1626]
                     :                    :        :        :        :        :           +- Project [t_dat#1647, customer_id#1648, article_id#1649L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1614L]
                     :                    :        :        :        :        :              +- Project [t_dat#1647, customer_id#1648, article_id#1649L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85905893370 as bigint)) AND (__natural_order__#17L <= cast(85907503795 as bigint)))
                     :                    :        :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#1647, customer_id#1648, article_id#1649L, price#1650, sales_channel_id#1651L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1647, customer_id#1648, article_id#1649L, price#1650, sales_channel_id#1651L]
                     :                    :        :        :        :        :                          +- Relation [t_dat#1647,customer_id#1648,article_id#1649L,price#1650,sales_channel_id#1651L] parquet
                     :                    :        :        :        :        +- Project [t_dat#1729, customer_id#1730, article_id#1731L, 2019-02-23 AS start_test#1703, 19 AS split_id#1708]
                     :                    :        :        :        :           +- Project [t_dat#1729, customer_id#1730, article_id#1731L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1696L]
                     :                    :        :        :        :              +- Project [t_dat#1729, customer_id#1730, article_id#1731L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :        :                 +- Filter ((__natural_order__#17L >= cast(85904684678 as bigint)) AND (__natural_order__#17L <= cast(85906255957 as bigint)))
                     :                    :        :        :        :                    +- Project [__index_level_0__#10L, t_dat#1729, customer_id#1730, article_id#1731L, price#1732, sales_channel_id#1733L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1729, customer_id#1730, article_id#1731L, price#1732, sales_channel_id#1733L]
                     :                    :        :        :        :                          +- Relation [t_dat#1729,customer_id#1730,article_id#1731L,price#1732,sales_channel_id#1733L] parquet
                     :                    :        :        :        +- Project [t_dat#1811, customer_id#1812, article_id#1813L, 2019-01-24 AS start_test#1785, 20 AS split_id#1790]
                     :                    :        :        :           +- Project [t_dat#1811, customer_id#1812, article_id#1813L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1778L]
                     :                    :        :        :              +- Project [t_dat#1811, customer_id#1812, article_id#1813L,  AS start_test#35,  AS split_id#39]
                     :                    :        :        :                 +- Filter ((__natural_order__#17L >= cast(85903518490 as bigint)) AND (__natural_order__#17L <= cast(85905020652 as bigint)))
                     :                    :        :        :                    +- Project [__index_level_0__#10L, t_dat#1811, customer_id#1812, article_id#1813L, price#1814, sales_channel_id#1815L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1811, customer_id#1812, article_id#1813L, price#1814, sales_channel_id#1815L]
                     :                    :        :        :                          +- Relation [t_dat#1811,customer_id#1812,article_id#1813L,price#1814,sales_channel_id#1815L] parquet
                     :                    :        :        +- Project [t_dat#1893, customer_id#1894, article_id#1895L, 2018-12-25 AS start_test#1867, 21 AS split_id#1872]
                     :                    :        :           +- Project [t_dat#1893, customer_id#1894, article_id#1895L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1860L]
                     :                    :        :              +- Project [t_dat#1893, customer_id#1894, article_id#1895L,  AS start_test#35,  AS split_id#39]
                     :                    :        :                 +- Filter ((__natural_order__#17L >= cast(85902370057 as bigint)) AND (__natural_order__#17L <= cast(85903775326 as bigint)))
                     :                    :        :                    +- Project [__index_level_0__#10L, t_dat#1893, customer_id#1894, article_id#1895L, price#1896, sales_channel_id#1897L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :        :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1893, customer_id#1894, article_id#1895L, price#1896, sales_channel_id#1897L]
                     :                    :        :                          +- Relation [t_dat#1893,customer_id#1894,article_id#1895L,price#1896,sales_channel_id#1897L] parquet
                     :                    :        +- Project [t_dat#1975, customer_id#1976, article_id#1977L, 2018-11-25 AS start_test#1949, 22 AS split_id#1954]
                     :                    :           +- Project [t_dat#1975, customer_id#1976, article_id#1977L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#1942L]
                     :                    :              +- Project [t_dat#1975, customer_id#1976, article_id#1977L,  AS start_test#35,  AS split_id#39]
                     :                    :                 +- Filter ((__natural_order__#17L >= cast(85901048458 as bigint)) AND (__natural_order__#17L <= cast(85902669034 as bigint)))
                     :                    :                    +- Project [__index_level_0__#10L, t_dat#1975, customer_id#1976, article_id#1977L, price#1978, sales_channel_id#1979L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                    :                       +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#1975, customer_id#1976, article_id#1977L, price#1978, sales_channel_id#1979L]
                     :                    :                          +- Relation [t_dat#1975,customer_id#1976,article_id#1977L,price#1978,sales_channel_id#1979L] parquet
                     :                    +- Project [t_dat#2057, customer_id#2058, article_id#2059L, 2018-10-26 AS start_test#2031, 23 AS split_id#2036]
                     :                       +- Project [t_dat#2057, customer_id#2058, article_id#2059L, start_test#35, split_id#39, monotonically_increasing_id() AS __natural_order__#2024L]
                     :                          +- Project [t_dat#2057, customer_id#2058, article_id#2059L,  AS start_test#35,  AS split_id#39]
                     :                             +- Filter ((__natural_order__#17L >= cast(85899601568 as bigint)) AND (__natural_order__#17L <= cast(85901414957 as bigint)))
                     :                                +- Project [__index_level_0__#10L, t_dat#2057, customer_id#2058, article_id#2059L, price#2060, sales_channel_id#2061L, monotonically_increasing_id() AS __natural_order__#17L]
                     :                                   +- Project [monotonically_increasing_id() AS __index_level_0__#10L, t_dat#2057, customer_id#2058, article_id#2059L, price#2060, sales_channel_id#2061L]
                     :                                      +- Relation [t_dat#2057,customer_id#2058,article_id#2059L,price#2060,sales_channel_id#2061L] parquet
                     +- SubqueryAlias right_table
                        +- Project [split_id#2427L AS __right_split_id#2482L, n_factors#2423L AS __right_n_factors#2483L, n_epochs#2424L AS __right_n_epochs#2484L, reg_all#2425 AS __right_reg_all#2485, __natural_order__#2440L]
                           +- Project [split_id#2427L, n_factors#2423L, n_epochs#2424L, reg_all#2425, __natural_order__#2440L]
                              +- Project [__index_level_0__#2433L, n_factors#2423L, n_epochs#2424L, reg_all#2425, key#2426, split_id#2427L, monotonically_increasing_id() AS __natural_order__#2440L]
                                 +- Project [monotonically_increasing_id() AS __index_level_0__#2433L, n_factors#2423L, n_epochs#2424L, reg_all#2425, key#2426, split_id#2427L]
                                    +- Project [n_factors#2222L AS n_factors#2423L, n_epochs#2223L AS n_epochs#2424L, reg_all#2224 AS reg_all#2425, key#2371 AS key#2426, split_id#2416L AS split_id#2427L]
                                       +- Project [n_factors#2222L, n_epochs#2223L, reg_all#2224, key#2371, __right_split_id#2390L AS split_id#2416L]
                                          +- Join Inner, (key#2371 = __right_key#2391)
                                             :- SubqueryAlias left_table
                                             :  +- Project [__index_level_0__#2221L, n_factors#2222L, n_epochs#2223L, reg_all#2224, 1 AS key#2371, __natural_order__#2229L]
                                             :     +- Project [__index_level_0__#2221L, n_factors#2222L, n_epochs#2223L, reg_all#2224, monotonically_increasing_id() AS __natural_order__#2229L]
                                             :        +- LogicalRDD [__index_level_0__#2221L, n_factors#2222L, n_epochs#2223L, reg_all#2224], false
                                             +- SubqueryAlias right_table
                                                +- Project [__index_level_0__#2312L AS __right___index_level_0__#2389L, split_id#2313L AS __right_split_id#2390L, key#2376 AS __right_key#2391, __natural_order__#2316L]
                                                   +- Project [__index_level_0__#2312L, split_id#2313L, 1 AS key#2376, __natural_order__#2316L]
                                                      +- Project [__index_level_0__#2312L, split_id#2313L, monotonically_increasing_id() AS __natural_order__#2316L]
                                                         +- LogicalRDD [__index_level_0__#2312L, split_id#2313L], false


In [18]:
results.count()

2022-04-01 15:34:10,181 WARN scheduler.TaskSetManager: Lost task 94.0 in stage 12.0 (TID 306) (bdse42.example.com executor 4): FetchFailed(BlockManagerId(9, bdse91.example.com, 36595, None), shuffleId=5, mapIndex=11, mapId=198, reduceId=100, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1165)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:903)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:84)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31)
	at org.apache.sp

480



In [25]:
results.groupBy(['n_factors','n_epochs','reg_all']).mean('rmse','map12').sort("avg(rmse)").limit(5).show()

2022-04-01 19:43:22,068 WARN scheduler.TaskSetManager: Lost task 157.0 in stage 27.0 (TID 980) (bdse89.example.com executor 16): FetchFailed(BlockManagerId(4, bdse42.example.com, 44135, None), shuffleId=11, mapIndex=26, mapId=825, reduceId=174, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1165)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:903)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:84)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31)
	at org.apache

+---------+--------+-------+------------------+--------------------+
|n_factors|n_epochs|reg_all|         avg(rmse)|          avg(map12)|
+---------+--------+-------+------------------+--------------------+
|       25|      20|   0.01| 21.12461559350292|5.767838883912191E-4|
|       50|      20|   0.01|21.125548214962084|5.791609328298364E-4|
|      100|      20|   0.01| 21.12736424803734| 5.80218322284054E-4|
|       25|      30|   0.01| 21.12763550753395|5.803498206660151E-4|
|       50|      30|   0.01|21.128597273180883|5.836315916288489E-4|
+---------+--------+-------+------------------+--------------------+





# 存檔

In [21]:
results.write.parquet(f'/user/HM_parquet/SVD_model/params/para{TRAIN_PERIOD}.parquet',mode='overwrite',partitionBy='rmse')

2022-04-01 17:04:35,033 WARN scheduler.TaskSetManager: Lost task 36.0 in stage 19.0 (TID 541) (bdse106.example.com executor 2): TaskKilled (Stage cancelled)
2022-04-01 17:04:35,033 WARN scheduler.TaskSetManager: Lost task 42.0 in stage 19.0 (TID 543) (bdse106.example.com executor 2): TaskKilled (Stage cancelled)
2022-04-01 17:07:31,391 WARN scheduler.TaskSetManager: Lost task 13.0 in stage 19.0 (TID 503) (bdse91.example.com executor 12): TaskKilled (Stage cancelled)
2022-04-01 17:45:15,407 WARN scheduler.TaskSetManager: Lost task 127.0 in stage 24.0 (TID 690) (bdse106.example.com executor 2): FetchFailed(BlockManagerId(13, bdse89.example.com, 41377, None), shuffleId=10, mapIndex=22, mapId=563, reduceId=137, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1165)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:903)
	at 

In [13]:
results.write.parquet('file:///home/hadoop/git_workspace/BDSE23_HM_recommendation/spark/params/para_30',mode='overwrite',partitionBy='rmse')

                                                                                

In [None]:
para30_ps['stride']=30
para30_ps.head(5)

In [None]:
para30_ps.to_parquet('/user/HM_parquet/SVD_model/params/para30.parquet',mode = 'overwrite',partition_cols='start_test')

In [None]:
para30 = ps.read_parquet('/user/HM_parquet/SVD_model/para_30.parquet',columns=[])

# --------------------------------------

# 讀取結果表

In [8]:
# 合併所有dataframe
periods = [30,60,90,180,270,360]
total_scores = ps.DataFrame()

for period in periods:
    perios_socres = ps.read_parquet(f'/user/HM_parquet/SVD_model/params/para{period}.parquet')
    total_scores = ps.concat([total_scores,perios_socres],axis=0,ignore_index=True)
    
total_scores



                                                                                

AnalysisException: Unable to infer schema for Parquet. It must be specified manually.

In [6]:
results = ps.read_parquet('/user/HM_parquet/SVD_model/params/para30.parquet')
results.count()

                                                                                

rmse          480
n_epochs      480
stride        480
map12         480
n_factors     480
reg_all       480
start_test    480
dtype: int64

In [10]:
total_scores.count()

                                                                                

480

In [9]:
total_scores.groupBy(['n_factors','n_epochs','reg_all','']).mean('rmse','map12').sort_values('rmse',ascending=True)



+---------+--------+-------+------------------+--------------------+
|n_factors|n_epochs|reg_all|         avg(rmse)|          avg(map12)|
+---------+--------+-------+------------------+--------------------+
|       25|      20|   0.01| 21.12461543541667|5.677682480988248E-4|
|       50|      20|   0.01|21.125548057083332|5.719335810378349E-4|
|      100|      20|   0.01|21.127364090416666|5.885357798736853E-4|
|       25|      30|   0.01|    21.12763534875|6.021444278303534E-4|
|       50|      30|   0.01| 21.12859711541667| 5.44418584468076E-4|
+---------+--------+-------+------------------+--------------------+



                                                                                

# (DataFrame)

## 1. 讀取檔案 /DataFrame

In [None]:
# # 讀取檔案
# customers = spark.read.option('header','true').parquet('/user/HM_parquet/customers.parquet')
# articles = spark.read.option('header','true').parquet('/user/HM_parquet/articles.parquet')
# transactions = spark.read.option('header','true').parquet('/user/HM_parquet/transactions_train.parquet')

In [None]:
# transactions.show()

## 2. 將customer_id(字串)轉為customer_index(整數) /DataFrame

In [None]:
# # 將customers的customer_id轉為數字(buffer要增加到512m)
# toIndex = StringIndexer(inputCol="customer_id", outputCol="customer_index").fit(customers)
# customers = toIndex.transform(customers)
# # customers.head(5)

In [None]:
# # 將transactions的customer_id轉為數字
# transactions = toIndex.transform(transactions)
# # transactions.head(5)

In [None]:
# transactions.describe()