In [1]:
spark

In [2]:
import pandas as pd
import pyspark.pandas as ps
import pyspark.sql.functions as sf
from pyspark.sql.functions import array_max
from pyspark.ml.feature import StringIndexer, IndexToString
# from model.TimeBasedCV import TimeBasedCV

In [3]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
ps.set_option("compute.default_index_type", "distributed")

In [4]:
spark.conf.get("spark.kryoserializer.buffer.max")

'512m'

# model.TimeBasedCV (pandas on Spark版本)

In [5]:

from dateutil.relativedelta import *
from datetime import datetime



class TimeBasedCV(object):
    '''
    Parameters 
    ----------
    train_period: int
        number of time units to include in each train set
        default is 30
    test_period: int
        number of time units to include in each test set
        default is 7
    freq: string
        frequency of input parameters. possible values are: days, months, years, weeks, hours, minutes, seconds
        possible values designed to be used by dateutil.relativedelta class
        deafault is days
    '''
         
    def split(self, date_column,train_period=30, test_period=7, gap=0, stride=0,show_progress=False):
        '''
        Generate indices to split date_column into training and test set
        
        Parameters 
        ----------
        date_column: string, deafult='record_date'
            date of each record
        gap: int, default=0
            for cases the test set does not come right after the train set,
            *gap* days are left between train and test sets
        stride: int, default=0
        
        
        Returns 
        -------
        train_index ,test_index: 
            list of tuples (train index, test index) similar to sklearn model selection
        '''
                    
        train_indices_list = []
        test_indices_list = []
        test_date_list = []
        
        # end_test = datetime.strptime(date_column.max(), "%Y-%m-%d")
        end_test = date_column.max()
        start_test = end_test - relativedelta(days=test_period)
        end_train = start_test - relativedelta(days=gap)
        start_train = end_train - relativedelta(days=train_period)
        

        while start_train > date_column.min():
            # train indices:
            # cur_train_indices = list(date_column[(date_column['t_dat']>=start_train) & (date_column['t_dat']<end_train)].index)
            cur_train_indices = (date_column[(date_column>=start_train) & (date_column<end_train)].index).to_numpy()

            # test indices:
            cur_test_indices = (date_column[(date_column>=start_test) &
                                    (date_column<end_test)].index).to_numpy()
            
            if(show_progress):
                print("Train period:",start_train,"-" , end_train, ", test period", start_test, "-", end_test,
                    "# train records", len(cur_train_indices), ", # test records", len(cur_test_indices))

            train_indices_list.append(cur_train_indices)
            test_indices_list.append(cur_test_indices)
            test_date_list.append(start_test)
            
            # update dates:
            end_test = end_test - relativedelta(days=stride)
            start_test = end_test - relativedelta(days=test_period)
            end_train = start_test - relativedelta(days=gap)
            start_train = end_train - relativedelta(days=train_period)

        # mimic sklearn output  
        index_output = [(train,test,start_test) for train,test,start_test in zip(train_indices_list,test_indices_list,test_date_list)]

        self.n_splits = len(index_output)
        
        return index_output
    
    
    def get_n_splits(self):
        """Returns the number of splitting iterations in the cross-validator
        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits 


# 定義函數

In [None]:
def train_and_evaluate_model(dataTrain, dataTest, paras={}):
    
    # == 轉換成 model 需要的資料型態 ==
    
    model = # modelName(paras)
    model.fit(dataTrain)
    
    map12= # 計算map12
    
    return map12
    

In [None]:
def time_split_hyperparameter_search(data):
    paras = {
        # 'para1' : para1.values[0],
        # 'para2' : para2.values[0]
    }
    
    dataTrain = # data最後七天之前
    dataTest = # data最後七天
    
    map12 = train_and_evaluate_model(dataTrain, dataTest, paras)
    
    paras.update({
        'date_x_paras_id' : data.date_x_paras_id.values[0],
        'val_date' : data.val_date.values[0],
        'map12' : map12
    })
    
    results = pd.DataFrame([paras])
    
    return results


# main

## 1. 讀取檔案 /DataFrame

In [6]:
# 讀取檔案
customers = spark.read.option('header','true').parquet('/user/HM_parquet/customers.parquet')
articles = spark.read.option('header','true').parquet('/user/HM_parquet/articles.parquet')
transactions = spark.read.option('header','true').parquet('/user/HM_parquet/transactions_train.parquet')

                                                                                

In [None]:
# transactions.show()

## 2. 將customer_id(字串)轉為customer_index(整數) /DataFrame

In [8]:
# 將customers的customer_id轉為數字(buffer要增加到512m)
toIndex = StringIndexer(inputCol="customer_id", outputCol="customer_index").fit(customers)
customers = toIndex.transform(customers)
# customers.head(5)

                                                                                

In [9]:
# 將transactions的customer_id轉為數字
transactions = toIndex.transform(transactions)
# transactions.head(5)

In [None]:
# transactions.describe()

## 3. 作時間分割 /pandas on spark

In [None]:
# date_column = transactions.select('t_dat')

In [None]:
# date_column.withColumn("add_one_month", sf.date_add(date_column['t_dat'], 10)).show(5)

In [None]:
# date_column_pd['t_dat'].max()

In [None]:
# cur_train_indices = (date_column[(date_column['t_dat']>="2019-05-01") & 
#                                      (date_column['t_dat']<"2019-06-01")].index).to_numpy()
# cur_train_indices

In [None]:
# tran_ps = transactions.to_pandas_on_spark()
# tran_ps.loc[85908936269]

In [None]:
# cur_train_indices = [(date_column['t_dat']>="2019-05-01") & (date_column['t_dat']<"2019-06-01")]
# tran_ps[cur_train_indices]

In [10]:
# 把資料依據時間做time split
tscv = TimeBasedCV()
tran_ps = transactions.to_pandas_on_spark()
date_column = tran_ps['t_dat']

In [None]:
# date = date_column.max()
# datetime.strptime(date, "%Y-%m-%d")

In [11]:
train_period, test_period, stride = 30, 7, 30
index_output = tscv.split(date_column=date_column, train_period=train_period, test_period=test_period, stride=stride,show_progress=True)

                                                                                

Train period: 2020-08-16 - 2020-09-15 , test period 2020-09-15 - 2020-09-22 # train records 1123728 , # test records 233498


                                                                                

Train period: 2020-07-17 - 2020-08-16 , test period 2020-08-16 - 2020-08-23 # train records 1275928 , # test records 234159


                                                                                

Train period: 2020-06-17 - 2020-07-17 , test period 2020-07-17 - 2020-07-24 # train records 1787814 , # test records 284020


                                                                                

Train period: 2020-05-18 - 2020-06-17 , test period 2020-06-17 - 2020-06-24 # train records 1455684 , # test records 549443


                                                                                

Train period: 2020-04-18 - 2020-05-18 , test period 2020-05-18 - 2020-05-25 # train records 1105288 , # test records 374636


                                                                                

Train period: 2020-03-19 - 2020-04-18 , test period 2020-04-18 - 2020-04-25 # train records 1249538 , # test records 253001


                                                                                

Train period: 2020-02-18 - 2020-03-19 , test period 2020-03-19 - 2020-03-26 # train records 1057458 , # test records 206631


                                                                                

Train period: 2020-01-19 - 2020-02-18 , test period 2020-02-18 - 2020-02-25 # train records 1039983 , # test records 229317


                                                                                

Train period: 2019-12-20 - 2020-01-19 , test period 2020-01-19 - 2020-01-26 # train records 1071268 , # test records 237155


                                                                                

Train period: 2019-11-20 - 2019-12-20 , test period 2019-12-20 - 2019-12-27 # train records 1251937 , # test records 278504


                                                                                

Train period: 2019-10-21 - 2019-11-20 , test period 2019-11-20 - 2019-11-27 # train records 986138 , # test records 220402


                                                                                

Train period: 2019-09-21 - 2019-10-21 , test period 2019-10-21 - 2019-10-28 # train records 1281787 , # test records 247178


                                                                                

Train period: 2019-08-22 - 2019-09-21 , test period 2019-09-21 - 2019-09-28 # train records 1055251 , # test records 239251


                                                                                

Train period: 2019-07-23 - 2019-08-22 , test period 2019-08-22 - 2019-08-29 # train records 1488830 , # test records 235190


                                                                                

Train period: 2019-06-23 - 2019-07-23 , test period 2019-07-23 - 2019-07-30 # train records 1878096 , # test records 466895


                                                                                

Train period: 2019-05-24 - 2019-06-23 , test period 2019-06-23 - 2019-06-30 # train records 1716422 , # test records 583976


                                                                                

Train period: 2019-04-24 - 2019-05-24 , test period 2019-05-24 - 2019-05-31 # train records 1491556 , # test records 383761


                                                                                

Train period: 2019-03-25 - 2019-04-24 , test period 2019-04-24 - 2019-05-01 # train records 1415884 , # test records 385095


                                                                                

Train period: 2019-02-23 - 2019-03-25 , test period 2019-03-25 - 2019-04-01 # train records 1241920 , # test records 324525


                                                                                

Train period: 2019-01-24 - 2019-02-23 , test period 2019-02-23 - 2019-03-02 # train records 1208692 , # test records 321840


                                                                                

Train period: 2018-12-25 - 2019-01-24 , test period 2019-01-24 - 2019-01-31 # train records 1166188 , # test records 297629


                                                                                

Train period: 2018-11-25 - 2018-12-25 , test period 2018-12-25 - 2019-01-01 # train records 1148433 , # test records 238692


                                                                                

Train period: 2018-10-26 - 2018-11-25 , test period 2018-11-25 - 2018-12-02 # train records 1321599 , # test records 272399


                                                                                

Train period: 2018-09-26 - 2018-10-26 , test period 2018-10-26 - 2018-11-02 # train records 1446890 , # test records 330397


In [12]:
len(index_output)

24

In [15]:
from pyspark.pandas.config import set_option, reset_option
set_option("compute.ops_on_diff_frames", True)

train_index, test_index, start_test = index_output[0]

In [16]:
transactions[train_index]

TypeError: unexpected item type: <class 'numpy.ndarray'>

2022-03-26 18:38:06,551 WARN nio.NioEventLoop: Selector.select() returned prematurely 512 times in a row; rebuilding Selector io.netty.channel.nio.SelectedSelectionKeySetSelector@208fe6c8.
2022-03-26 18:38:06,554 WARN nio.NioEventLoop: Selector.select() returned prematurely 512 times in a row; rebuilding Selector io.netty.channel.nio.SelectedSelectionKeySetSelector@819cf63.
2022-03-26 18:38:06,559 WARN nio.NioEventLoop: Selector.select() returned prematurely 512 times in a row; rebuilding Selector io.netty.channel.nio.SelectedSelectionKeySetSelector@6de0f35c.


In [14]:
# 將transaction加上start_test欄位
transactions[train_index].withColumn('start_test', sf.lit(start_test))
# transactions["start_test"].loc[test_index]= start_test
transactions

AttributeError: 'DataFrame' object has no attribute 'loc'

In [None]:
transactions.tail(5)

In [None]:
for train_index, test_index in index_output:
    train_data = transactions.loc[train_index]
    test_data = transactions.loc[test_index]
    # 取得test開始日期
    test_data.reset_index(inplace=True, drop=True)
    start_test = test_data['t_dat'][0]
   

In [None]:
# pandas_udf

schema = StructType(
    [
        StructField('date_x_paras_id', IntegerType(),True),
        StructField('map12', FloatType(),True),
        StructField('val_date', DateTime(),True),
        StructField("para1", IntegerType(), True),
        StructField("para2", IntegerType(), True)
     ]
)

results = df.groupby('date_x_paras_id').applyInPandas(time_split_hyperparameter_search, schema)
results.show()