In [1]:
spark

In [23]:
import pandas as pd
import pyspark.pandas as ps
from dateutil.relativedelta import *
from pyspark.sql.functions import array_max
from pyspark.ml.feature import StringIndexer, IndexToString
# from model.TimeBasedCV import TimeBasedCV

In [6]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
ps.set_option("compute.default_index_type", "distributed")

In [7]:
spark.conf.get("spark.kryoserializer.buffer.max")

'512m'

# model.TimeBasedCV (pandas on Spark版本)

In [43]:

from dateutil.relativedelta import *


class TimeBasedCV(object):
    '''
    Parameters 
    ----------
    train_period: int
        number of time units to include in each train set
        default is 30
    test_period: int
        number of time units to include in each test set
        default is 7
    freq: string
        frequency of input parameters. possible values are: days, months, years, weeks, hours, minutes, seconds
        possible values designed to be used by dateutil.relativedelta class
        deafault is days
    '''
    
    
    def __init__(self, freq='days'):
        self.freq = freq

        
        
    def split(self, date_column,train_period=30, test_period=7, gap=0, stride=0,show_progress=False):
        '''
        Generate indices to split date_column into training and test set
        
        Parameters 
        ----------
        date_column: string, deafult='record_date'
            date of each record
        gap: int, default=0
            for cases the test set does not come right after the train set,
            *gap* days are left between train and test sets
        stride: int, default=0
        
        
        Returns 
        -------
        train_index ,test_index: 
            list of tuples (train index, test index) similar to sklearn model selection
        '''
                    
        train_indices_list = []
        test_indices_list = []
        
        end_test = date_column.max()
        start_test = end_test - eval('relativedelta('+self.freq+'=test_period)')
        end_train = start_test - eval('relativedelta('+self.freq+'=gap)')
        start_train = end_train - eval('relativedelta('+self.freq+'=train_period)')
        

        while start_train > date_column.min():
            # train indices:
            cur_train_indices = list(date_column[(date_column>=start_train) & 
                                     (date_column<end_train)].index)

            # test indices:
            cur_test_indices = list(date_column[(date_column>=start_test) &
                                    (date_column<end_test)].index)
            
            if(show_progress):
                print("Train period:",start_train,"-" , end_train, ", test period", start_test, "-", end_test,
                    "# train records", len(cur_train_indices), ", # test records", len(cur_test_indices))

            train_indices_list.append(cur_train_indices)
            test_indices_list.append(cur_test_indices)

            # update dates:
            end_test = end_test - eval('relativedelta('+self.freq+'=stride)')
            start_test = end_test - eval('relativedelta('+self.freq+'=test_period)')
            end_train = start_test - eval('relativedelta('+self.freq+'=gap)')
            start_train = end_train - eval('relativedelta('+self.freq+'=train_period)')

        # mimic sklearn output  
        index_output = [(train,test) for train,test in zip(train_indices_list,test_indices_list)]

        self.n_splits = len(index_output)
        
        return index_output
    
    
    def get_n_splits(self):
        """Returns the number of splitting iterations in the cross-validator
        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits 


# 定義函數

In [None]:
def train_and_evaluate_model(dataTrain, dataTest, paras={}):
    
    # == 轉換成 model 需要的資料型態 ==
    
    model = # modelName(paras)
    model.fit(dataTrain)
    
    map12= # 計算map12
    
    return map12
    

In [None]:
def time_split_hyperparameter_search(data):
    paras = {
        # 'para1' : para1.values[0],
        # 'para2' : para2.values[0]
    }
    
    dataTrain = # data最後七天之前
    dataTest = # data最後七天
    
    map12 = train_and_evaluate_model(dataTrain, dataTest, paras)
    
    paras.update({
        'date_x_paras_id' : data.date_x_paras_id.values[0],
        'val_date' : data.val_date.values[0],
        'map12' : map12
    })
    
    results = pd.DataFrame([paras])
    
    return results


# main

## 1. 讀取檔案 /DataFrame

In [42]:
# 讀取檔案
customers = spark.read.option('header','true').parquet('/user/HM_parquet/customers.parquet')
articles = spark.read.option('header','true').parquet('/user/HM_parquet/articles.parquet')
transactions = spark.read.option('header','true').parquet('/user/HM_parquet/transactions_train.parquet')

## 2. 將customer_id(字串)轉為customer_index(整數) /DataFrame

In [9]:
# 將customers的customer_id轉為數字(buffer要增加到512m)
toIndex = StringIndexer(inputCol="customer_id", outputCol="customer_index").fit(customers)
customers = toIndex.transform(customers)
customers.head(5)

2022-03-13 14:44:10,725 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 111.2 MiB
2022-03-13 14:44:15,635 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 111.2 MiB
2022-03-13 14:44:21,946 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 111.2 MiB
                                                                                

[Row(customer_id='00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657', FN=None, Active=None, club_member_status='ACTIVE', fashion_news_frequency='NONE', age=49, postal_code='52043ee2162cf5aa7ee79974281641c6f11a68d276429a91f8ca0d4b6efa8100', customer_index=0.0),
 Row(customer_id='0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa', FN=None, Active=None, club_member_status='ACTIVE', fashion_news_frequency='NONE', age=25, postal_code='2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93f4c830291c32bc3057', customer_index=1.0),
 Row(customer_id='000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318', FN=None, Active=None, club_member_status='ACTIVE', fashion_news_frequency='NONE', age=24, postal_code='64f17e6a330a85798e4998f62d0930d14db8db1c054af6c9090f7dd3e38380dc', customer_index=2.0),
 Row(customer_id='00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e', FN=None, Active=None, club_member_status='ACTIVE', fashion_news_frequency='NONE', age=5

In [10]:
# 將transactions的customer_id轉為數字
transactions = toIndex.transform(transactions)
transactions.head(5)

2022-03-13 14:45:25,030 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 111.2 MiB
2022-03-13 14:45:28,913 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 111.2 MiB
2022-03-13 14:45:41,661 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 111.2 MiB
                                                                                

[Row(t_dat=datetime.date(2018, 9, 20), customer_id='000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318', article_id=663713001, price=0.050830508474576264, sales_channel_id=2, customer_index=2.0),
 Row(t_dat=datetime.date(2018, 9, 20), customer_id='000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318', article_id=541518023, price=0.03049152542372881, sales_channel_id=2, customer_index=2.0),
 Row(t_dat=datetime.date(2018, 9, 20), customer_id='00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2', article_id=505221004, price=0.01523728813559322, sales_channel_id=2, customer_index=7.0),
 Row(t_dat=datetime.date(2018, 9, 20), customer_id='00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2', article_id=685687003, price=0.016932203389830508, sales_channel_id=2, customer_index=7.0),
 Row(t_dat=datetime.date(2018, 9, 20), customer_id='00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2', article_id=685687004, price=0.0169322033898

In [18]:
transactions.describe()

2022-03-13 15:03:36,144 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 111.2 MiB
2022-03-13 15:05:40,726 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 111.2 MiB
                                                                                

DataFrame[summary: string, customer_id: string, article_id: string, price: string, sales_channel_id: string, customer_index: string]

## 3. 作時間分割 /pandas on spark

In [77]:
date_column_ps = transactions.select('t_dat').to_pandas_on_spark()
transactions.to_pandas_on_spark()

                                                                                

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
51539607552,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
51539607553,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
51539607554,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
51539607555,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
51539607556,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2
51539607557,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687001,0.016932,2
51539607558,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221001,0.020322,2
51539607559,2018-09-20,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,688873012,0.030492,1
51539607560,2018-09-20,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,501323011,0.053373,1
51539607561,2018-09-20,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,598859003,0.045746,2


2022-03-13 16:11:15,188 WARN nio.NioEventLoop: Selector.select() returned prematurely 512 times in a row; rebuilding Selector io.netty.channel.nio.SelectedSelectionKeySetSelector@8599857.


In [72]:
# 把資料依據時間做time split
tscv = TimeBasedCV(freq='days')
train_period, test_period, stride = 30, 7, 30
date_column_ps = transactions.select('t_dat').to_pandas_on_spark()
index_output = tscv.split(date_column_ps, train_period, test_period, stride,show_progress=True)

  s = pd.Series(


TypeError: Subtraction can not be applied to given types.

In [None]:
# pandas_udf

schema = StructType(
    [
        StructField('date_x_paras_id', IntegerType(),True),
        StructField('map12', FloatType(),True),
        StructField('val_date', DateTime(),True),
        StructField("para1", IntegerType(), True),
        StructField("para2", IntegerType(), True)
     ]
)

results = df.groupby('date_x_paras_id').applyInPandas(time_split_hyperparameter_search, schema)
results.show()

In [None]:
# 做 time based CV
scores = pd.DataFrame(columns=["train_period","val_period","stride"])

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    val_data = transactions.loc[val_index]
    # 取得val開始日期
    val_data.reset_index(inplace=True, drop=True)
    start_val = val_data['t_dat'][0]
    # 呼叫訓練模型的function
    one_fold_scores = model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    scores = pd.concat([scores,one_fold_scores], axis=0 ,ignore_index=True)

scores.to_parquet('model/params/implicit_ALS_30.parquet')