In [1]:
# Display the Spark session object.
# NOTE(review): `spark` is not created anywhere in the visible notebook; presumably
# the kernel pre-defines it — confirm, otherwise a fresh "Run All" fails here.
spark

In [2]:
import pandas as pd
import pyspark.pandas as ps
import pyspark.sql.functions as sf
from pyspark.sql.functions import array_max
from pyspark.sql.types import DateType, FloatType, IntegerType, StructField, StructType
from pyspark.ml.feature import StringIndexer, IndexToString
# from model.TimeBasedCV import TimeBasedCV

In [3]:
# Turn on Arrow for conversions between Spark and pandas data.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
# Use the "distributed" default index for pandas-on-Spark objects.
# NOTE(review): the pandas-on-Spark docs describe this index type as not
# reproducible across recomputations — confirm before reusing index labels
# taken from one pandas-on-Spark frame on another.
ps.set_option("compute.default_index_type", "distributed")

In [4]:
# Show the configured Kryo serializer buffer ceiling (the customer_id indexing
# step further down notes that it has to be raised to 512m).
spark.conf.get("spark.kryoserializer.buffer.max")

'512m'

# model.TimeBasedCV (pandas on Spark版本)

In [5]:

from dateutil.relativedelta import *
from datetime import datetime



class TimeBasedCV(object):
    '''
    Time-based cross-validation splitter that walks backwards from the most
    recent date of a date column.

    Every fold is a pair of half-open windows: a train window
    ``[start_train, end_train)`` followed, after an optional gap, by a test
    window ``[start_test, end_test)``.  The first fold's test window ends at
    the latest date in the column; each further fold is shifted ``stride``
    time units into the past, and splitting stops as soon as a train window
    would start on or before the earliest date in the column.

    All window sizes are arguments of :meth:`split`; the class itself takes no
    constructor parameters.

    NOTE: because the test window is half-open and the first fold ends at the
    latest date, records dated exactly on that latest date are not part of
    any test set.

    Attributes
    ----------
    n_splits: int
        number of folds produced by the most recent call to ``split``
        (0 until ``split`` has been called)
    '''

    # Units accepted for ``freq``; each one is a keyword argument of
    # dateutil.relativedelta.relativedelta.
    _FREQ_UNITS = ("days", "months", "years", "weeks", "hours", "minutes", "seconds")

    # Class-level default so get_n_splits() answers 0 instead of raising
    # AttributeError when split() has not been called yet.
    n_splits = 0

    @staticmethod
    def _to_datetime(value):
        '''
        Return ``value`` as a ``datetime``.

        ``datetime`` instances (and objects offering ``to_pydatetime``, such as
        pandas Timestamps) are passed through; anything else is parsed from its
        string form using the ``%Y-%m-%d`` format.
        '''
        if hasattr(value, "to_pydatetime"):
            value = value.to_pydatetime()
        if isinstance(value, datetime):
            return value
        return datetime.strptime(str(value), "%Y-%m-%d")

    def split(self, date_column,train_period=30, test_period=7, gap=0, stride=0,show_progress=False, freq="days"):
        '''
        Generate indices to split date_column into training and test sets

        Parameters
        ----------
        date_column: Series-like of dates
            date of each record, either as ``%Y-%m-%d`` strings or as datetime
            values; must support ``min()``, ``max()``, comparison against a
            datetime, boolean-mask indexing and ``.index.to_numpy()``
        train_period: int, default=30
            number of time units to include in each train set
        test_period: int, default=7
            number of time units to include in each test set
        gap: int, default=0
            for cases the test set does not come right after the train set,
            *gap* time units are left between train and test sets
        stride: int, default=0
            number of time units each fold is shifted into the past relative
            to the previous fold; 0 means "use test_period", which yields
            back-to-back, non-overlapping test windows
        show_progress: bool, default=False
            print the date range and record counts of every fold
        freq: string, default='days'
            time unit of train_period, test_period, gap and stride; one of
            days, months, years, weeks, hours, minutes, seconds

        Returns
        -------
        list of tuples (train index, test index), similar to sklearn model
        selection; each index is a numpy array of index labels of date_column

        Raises
        ------
        ValueError
            if freq is not a supported unit, or if the effective stride is not
            positive (the window would never move and the loop would not end)
        '''
        # Local import keeps the class self-contained instead of relying on
        # the notebook's star-import of dateutil.relativedelta.
        from dateutil.relativedelta import relativedelta

        if freq not in self._FREQ_UNITS:
            raise ValueError("freq must be one of %s, got %r" % (", ".join(self._FREQ_UNITS), freq))

        # A stride of 0 would recompute the same fold forever; fall back to
        # moving by one test window.
        if stride == 0:
            stride = test_period
        if stride <= 0:
            raise ValueError("stride must be positive (or 0 to reuse a positive test_period), got %r" % (stride,))

        def delta(amount):
            # relativedelta takes the unit as a keyword, e.g. relativedelta(days=7).
            return relativedelta(**{freq: amount})

        train_indices_list = []
        test_indices_list = []

        # min()/max() can be expensive (a Spark job each on pandas-on-Spark),
        # so both are evaluated exactly once.
        first_date = self._to_datetime(date_column.min())
        end_test = self._to_datetime(date_column.max())
        start_test = end_test - delta(test_period)
        end_train = start_test - delta(gap)
        start_train = end_train - delta(train_period)

        while start_train > first_date:
            # train indices:
            cur_train_indices = (date_column[(date_column>=start_train) & (date_column<end_train)].index).to_numpy()

            # test indices:
            cur_test_indices = (date_column[(date_column>=start_test) &
                                    (date_column<end_test)].index).to_numpy()

            if(show_progress):
                print("Train period:",start_train,"-" , end_train, ", test period", start_test, "-", end_test,
                    "# train records", len(cur_train_indices), ", # test records", len(cur_test_indices))

            train_indices_list.append(cur_train_indices)
            test_indices_list.append(cur_test_indices)

            # update dates: shift the whole fold `stride` units into the past
            end_test = end_test - delta(stride)
            start_test = end_test - delta(test_period)
            end_train = start_test - delta(gap)
            start_train = end_train - delta(train_period)

        # mimic sklearn output
        index_output = list(zip(train_indices_list, test_indices_list))

        self.n_splits = len(index_output)

        return index_output


    def get_n_splits(self):
        """Returns the number of splitting iterations in the cross-validator
        Returns
        -------
        n_splits : int
            Number of folds produced by the most recent call to ``split``;
            0 if ``split`` has not been called yet.
        """
        return self.n_splits


# 定義函數

In [None]:
def train_and_evaluate_model(dataTrain, dataTest, paras={}):
    """Template: fit a model on ``dataTrain`` and return its ``map12`` score.

    NOTE(review): unfinished skeleton — the ``model = `` and ``map12= `` lines
    below have no right-hand side yet, so this cell does not parse until they
    are filled in.

    Parameters
    ----------
    dataTrain:
        training data handed to ``model.fit``
    dataTest:
        held-out data; not referenced yet (presumably meant for computing
        map12 — confirm)
    paras: dict
        hyper-parameters; per the placeholder they are meant to be passed to
        the model constructor.
        NOTE(review): the ``{}`` default is a single dict shared by all calls;
        do not mutate it inside the function.

    Returns
    -------
    map12:
        the value assigned to ``map12``
    """
    
    # == convert to the data types the model needs ==
    
    model = # modelName(paras)
    model.fit(dataTrain)
    
    map12= # compute map12
    
    return map12
    

In [None]:
def time_split_hyperparameter_search(data):
    """Template: score one hyper-parameter set on one time split.

    NOTE(review): unfinished skeleton — the ``dataTrain = `` and
    ``dataTest = `` lines have no right-hand side yet, so this cell does not
    parse until they are filled in.

    Parameters
    ----------
    data:
        frame for a single group; it must expose ``date_x_paras_id`` and
        ``val_date`` columns, whose first values are copied into the result

    Returns
    -------
    pandas.DataFrame
        one row holding the entries of ``paras`` plus ``date_x_paras_id``,
        ``val_date`` and ``map12``
    """
    # Hyper-parameters for this run; the entries are still commented out.
    paras = {
        # 'para1' : para1.values[0],
        # 'para2' : para2.values[0]
    }
    
    dataTrain = # rows of data before the last seven days
    dataTest = # the last seven days of data
    
    map12 = train_and_evaluate_model(dataTrain, dataTest, paras)
    
    # Reuse the parameter dict as the result record by adding the group
    # identifiers and the score to it.
    paras.update({
        'date_x_paras_id' : data.date_x_paras_id.values[0],
        'val_date' : data.val_date.values[0],
        'map12' : map12
    })
    
    results = pd.DataFrame([paras])
    
    return results


# main

## 1. 讀取檔案 /DataFrame

In [6]:
# Load the customers, articles and transactions CSV files; the first row of
# each file is treated as the header.
customers, articles, transactions = (
    spark.read.option('header','true').csv(csv_path)
    for csv_path in (
        '/user/HM_csv/customers.csv',
        '/user/HM_csv/articles.csv',
        '/user/HM_csv/transactions_train.csv',
    )
)

                                                                                

## 2. 將customer_id(字串)轉為customer_index(整數) /DataFrame

In [7]:
# Map customers' string customer_id to a numeric customer_index
# (the Kryo serializer buffer has to be raised to 512m for this step).
# A customer_index column left over from an earlier run of this cell is dropped
# first: StringIndexer rejects a frame that already contains its output column,
# so without the drop the cell fails when it is executed a second time.
# DataFrame.drop is a no-op when the column is absent.
customers = customers.drop("customer_index")
toIndex = StringIndexer(inputCol="customer_id", outputCol="customer_index").fit(customers)
customers = toIndex.transform(customers)
# customers.head(5)

                                                                                

In [8]:
# Map transactions' customer_id to the same numeric customer_index by reusing
# the indexer fitted on customers. Dropping a customer_index left over from an
# earlier run keeps the cell re-runnable (drop is a no-op when it is absent).
transactions = toIndex.transform(transactions.drop("customer_index"))
# transactions.head(5)

In [9]:
# transactions.describe()

## 3. 作時間分割 /pandas on spark

In [10]:
# date_column = transactions.select('t_dat')

In [11]:
# date_column.withColumn("add_one_month", sf.date_add(date_column['t_dat'], 10)).show(5)

In [12]:
# date_column_pd['t_dat'].max()

In [13]:
# cur_train_indices = (date_column[(date_column['t_dat']>="2019-05-01") & 
#                                      (date_column['t_dat']<"2019-06-01")].index).to_numpy()
# cur_train_indices

In [14]:
# tran_ps = transactions.to_pandas_on_spark()
# tran_ps.loc[85908936269]

In [15]:
# cur_train_indices = [(date_column['t_dat']>="2019-05-01") & (date_column['t_dat']<"2019-06-01")]
# tran_ps[cur_train_indices]

In [16]:
# Split the data by time: create the splitter and expose the transaction dates
# (column t_dat) as a pandas-on-Spark Series.
tscv = TimeBasedCV()
# pandas-on-Spark view of the Spark DataFrame; the index labels returned by
# tscv.split refer to this object.
tran_ps = transactions.to_pandas_on_spark()
date_column = tran_ps['t_dat']

In [17]:
# date = date_column.max()
# datetime.strptime(date, "%Y-%m-%d")

In [18]:
# Window sizes handed to TimeBasedCV.split: 30-unit train window, 7-unit test
# window, consecutive folds 30 units apart.
train_period, test_period, stride = 30, 7, 30
# show_progress=True prints each fold's date range and record counts.
index_output = tscv.split(date_column=date_column, train_period=train_period, test_period=test_period, stride=stride,show_progress=True)

2022-03-22 17:04:12,092 ERROR shuffle.RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks 
java.io.IOException: Failed to connect to bdse93.example.com:38167
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288)
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:126)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:154)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.start(RetryingBlockTransferor.java:133)
	at org.apache.spark.network.netty.NettyBlockTransferService.fetchBlocks(NettyBlockTransferService.scala:146)
	at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:102)
	at org.apache.spark.storage.BlockManager.fetchRemoteMan

Train period: 2020-08-16 00:00:00 - 2020-09-15 00:00:00 , test period 2020-09-15 00:00:00 - 2020-09-22 00:00:00 # train records 1123728 , # test records 233498


2022-03-22 17:05:26,760 ERROR shuffle.RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks 
java.io.IOException: Failed to connect to bdse108.example.com:46797
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288)
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:126)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:154)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.start(RetryingBlockTransferor.java:133)
	at org.apache.spark.network.netty.NettyBlockTransferService.fetchBlocks(NettyBlockTransferService.scala:146)
	at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:102)
	at org.apache.spark.storage.BlockManager.fetchRemoteMa

Train period: 2020-07-17 00:00:00 - 2020-08-16 00:00:00 , test period 2020-08-16 00:00:00 - 2020-08-23 00:00:00 # train records 1275928 , # test records 234159


2022-03-22 17:06:25,217 ERROR shuffle.RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks 
java.io.IOException: Failed to connect to bdse109.example.com:46245
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288)
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:126)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:154)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.start(RetryingBlockTransferor.java:133)
	at org.apache.spark.network.netty.NettyBlockTransferService.fetchBlocks(NettyBlockTransferService.scala:146)
	at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:102)
	at org.apache.spark.storage.BlockManager.fetchRemoteMa

Train period: 2020-06-17 00:00:00 - 2020-07-17 00:00:00 , test period 2020-07-17 00:00:00 - 2020-07-24 00:00:00 # train records 1787814 , # test records 284020


2022-03-22 17:07:19,627 ERROR shuffle.RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks 
java.io.IOException: Failed to connect to bdse92.example.com:36545
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288)
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:126)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:154)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.start(RetryingBlockTransferor.java:133)
	at org.apache.spark.network.netty.NettyBlockTransferService.fetchBlocks(NettyBlockTransferService.scala:146)
	at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:102)
	at org.apache.spark.storage.BlockManager.fetchRemoteMan

Train period: 2020-05-18 00:00:00 - 2020-06-17 00:00:00 , test period 2020-06-17 00:00:00 - 2020-06-24 00:00:00 # train records 1455684 , # test records 549443


                                                                                

Train period: 2020-04-18 00:00:00 - 2020-05-18 00:00:00 , test period 2020-05-18 00:00:00 - 2020-05-25 00:00:00 # train records 1105288 , # test records 374636


                                                                                

Train period: 2020-03-19 00:00:00 - 2020-04-18 00:00:00 , test period 2020-04-18 00:00:00 - 2020-04-25 00:00:00 # train records 1249538 , # test records 253001


                                                                                

Train period: 2020-02-18 00:00:00 - 2020-03-19 00:00:00 , test period 2020-03-19 00:00:00 - 2020-03-26 00:00:00 # train records 1057458 , # test records 206631


                                                                                

Train period: 2020-01-19 00:00:00 - 2020-02-18 00:00:00 , test period 2020-02-18 00:00:00 - 2020-02-25 00:00:00 # train records 1039983 , # test records 229317


                                                                                

Train period: 2019-12-20 00:00:00 - 2020-01-19 00:00:00 , test period 2020-01-19 00:00:00 - 2020-01-26 00:00:00 # train records 1071268 , # test records 237155


                                                                                

Train period: 2019-11-20 00:00:00 - 2019-12-20 00:00:00 , test period 2019-12-20 00:00:00 - 2019-12-27 00:00:00 # train records 1251937 , # test records 278504


                                                                                

Train period: 2019-10-21 00:00:00 - 2019-11-20 00:00:00 , test period 2019-11-20 00:00:00 - 2019-11-27 00:00:00 # train records 986138 , # test records 220402


                                                                                

Train period: 2019-09-21 00:00:00 - 2019-10-21 00:00:00 , test period 2019-10-21 00:00:00 - 2019-10-28 00:00:00 # train records 1281787 , # test records 247178


                                                                                

Train period: 2019-08-22 00:00:00 - 2019-09-21 00:00:00 , test period 2019-09-21 00:00:00 - 2019-09-28 00:00:00 # train records 1055251 , # test records 239251


                                                                                

Train period: 2019-07-23 00:00:00 - 2019-08-22 00:00:00 , test period 2019-08-22 00:00:00 - 2019-08-29 00:00:00 # train records 1488830 , # test records 235190


                                                                                

Train period: 2019-06-23 00:00:00 - 2019-07-23 00:00:00 , test period 2019-07-23 00:00:00 - 2019-07-30 00:00:00 # train records 1878096 , # test records 466895


                                                                                

Train period: 2019-05-24 00:00:00 - 2019-06-23 00:00:00 , test period 2019-06-23 00:00:00 - 2019-06-30 00:00:00 # train records 1716422 , # test records 583976


                                                                                

Train period: 2019-04-24 00:00:00 - 2019-05-24 00:00:00 , test period 2019-05-24 00:00:00 - 2019-05-31 00:00:00 # train records 1491556 , # test records 383761


                                                                                

Train period: 2019-03-25 00:00:00 - 2019-04-24 00:00:00 , test period 2019-04-24 00:00:00 - 2019-05-01 00:00:00 # train records 1415884 , # test records 385095


                                                                                

Train period: 2019-02-23 00:00:00 - 2019-03-25 00:00:00 , test period 2019-03-25 00:00:00 - 2019-04-01 00:00:00 # train records 1241920 , # test records 324525


2022-03-22 17:15:32,500 WARN spark.HeartbeatReceiver: Removing executor 4 with no recent heartbeats: 170851 ms exceeds timeout 120000 ms
2022-03-22 17:15:32,502 WARN spark.HeartbeatReceiver: Removing executor 10 with no recent heartbeats: 157836 ms exceeds timeout 120000 ms
2022-03-22 17:15:32,532 ERROR cluster.YarnScheduler: Lost executor 4 on bdse90.example.com: Executor heartbeat timed out after 170851 ms
2022-03-22 17:15:32,536 WARN scheduler.TaskSetManager: Lost task 8.0 in stage 108.0 (TID 1622) (bdse90.example.com executor 4): ExecutorLostFailure (executor 4 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 170851 ms
2022-03-22 17:15:32,536 WARN scheduler.TaskSetManager: Lost task 3.0 in stage 108.0 (TID 1616) (bdse90.example.com executor 4): ExecutorLostFailure (executor 4 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 170851 ms
2022-03-22 17:15:32,542 ERROR cluster.YarnScheduler: Lost executor 10 on bds

Train period: 2019-01-24 00:00:00 - 2019-02-23 00:00:00 , test period 2019-02-23 00:00:00 - 2019-03-02 00:00:00 # train records 1208692 , # test records 321840


                                                                                

Train period: 2018-12-25 00:00:00 - 2019-01-24 00:00:00 , test period 2019-01-24 00:00:00 - 2019-01-31 00:00:00 # train records 1166188 , # test records 297629


                                                                                

Train period: 2018-11-25 00:00:00 - 2018-12-25 00:00:00 , test period 2018-12-25 00:00:00 - 2019-01-01 00:00:00 # train records 1148433 , # test records 238692


                                                                                

Train period: 2018-10-26 00:00:00 - 2018-11-25 00:00:00 , test period 2018-11-25 00:00:00 - 2018-12-02 00:00:00 # train records 1321599 , # test records 272399


                                                                                

Train period: 2018-09-26 00:00:00 - 2018-10-26 00:00:00 , test period 2018-10-26 00:00:00 - 2018-11-02 00:00:00 # train records 1446890 , # test records 330397


                                                                                

In [None]:
# Run the time-based CV: train/evaluate once per (train, test) fold and collect
# the per-fold scores into one DataFrame.
# NOTE(review): `model` is not imported anywhere in the visible notebook —
# import the module that provides train_ALS before running this cell.
# NOTE(review): the fold indices are labels of tran_ps under the "distributed"
# default index; confirm they stay stable if tran_ps is recomputed.
fold_scores = []

for train_index, test_index in index_output:
    # The indices are pandas-on-Spark index labels taken from tran_ps, so rows
    # are looked up there (the Spark DataFrame `transactions` has no .loc).
    train_data = tran_ps.loc[train_index]
    test_data = tran_ps.loc[test_index]
    # Start date of the test fold = its earliest t_dat. Unlike reading "row 0",
    # this does not depend on the order in which Spark returns the rows.
    start_test = test_data['t_dat'].min()
    # Call the model-training function with this fold's data and settings.
    one_fold_scores = model.train_ALS(train_data, test_data, train_period, test_period, stride, start_test)
    fold_scores.append(one_fold_scores)

# Concatenate once at the end rather than growing the frame inside the loop;
# the empty frame carries the same starting columns as before.
scores = pd.concat(
    [pd.DataFrame(columns=["train_period","val_period","stride"])] + fold_scores,
    axis=0, ignore_index=True)

scores.to_parquet('model/params/implicit_ALS_30.parquet')

In [None]:
# pandas_udf: run time_split_hyperparameter_search once per date_x_paras_id
# group through applyInPandas.

# Schema of the pandas DataFrame returned for each group.
# NOTE(review): time_split_hyperparameter_search currently has its para1/para2
# entries commented out, so its result lacks those two columns — align the
# function and this schema before running.
schema = StructType(
    [
        StructField('date_x_paras_id', IntegerType(),True),
        StructField('map12', FloatType(),True),
        # pyspark.sql.types has no `DateTime` class; DateType is used here.
        # NOTE(review): use TimestampType instead if val_date carries a
        # time-of-day component — confirm against the input frame.
        StructField('val_date', DateType(),True),
        StructField("para1", IntegerType(), True),
        StructField("para2", IntegerType(), True)
     ]
)

# NOTE(review): `df` is not defined in the visible notebook; it has to be a
# Spark DataFrame with a date_x_paras_id column.
results = df.groupby('date_x_paras_id').applyInPandas(time_split_hyperparameter_search, schema)
results.show()