In [1]:
import os
import numpy as np
import pandas as pd

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *
import pyspark.pandas as ps

In [2]:
# Session-wide settings, applied once before any data is read.
# Turn on the Arrow setting used for Spark <-> pandas conversions.
spark.conf.set(
    "spark.sql.execution.arrow.pyspark.enabled",
    True,
)
# Use the "distributed" default index type for pandas-on-Spark frames.
ps.set_option(
    "compute.default_index_type",
    "distributed",
)

In [3]:
# Echo the active session object as this cell's output.
# NOTE(review): `spark` is not created in any cell visible here; presumably
# the kernel injects it. Confirm before running on a fresh kernel.
spark

# 讀取速度: DataFrame vs pyspark.pandas

In [None]:
# Load the same parquet dataset through both APIs so each can be timed.
# (1) Spark SQL DataFrame
transactions_df = (
    spark.read
    .option('header','true')
    .parquet('/user/HM_parquet/transactions_train.parquet')
)
# (2) pandas-on-Spark DataFrame
transactions_ps = ps.read_parquet(
    '/user/HM_parquet/transactions_train.parquet'
)

In [None]:
%%timeit
# Time fetching the first 10 rows of the Spark SQL DataFrame `transactions_df`.
transactions_df.head(10)

In [None]:
%%timeit
# Time fetching the first 10 rows of the pandas-on-Spark frame `transactions_ps`.
transactions_ps.head(10)

# 讀取速度: csv/parquet/mysql

## hdfs parquet (ps較快)

In [4]:
%%timeit
# Parquet via pandas-on-Spark. The timed body covers both the read call and
# fetching the first 20 rows.
# NOTE(review): names assigned inside a %%timeit cell are presumably not kept
# in the notebook namespace, so `transactions_pq` is benchmark-local; confirm.
transactions_pq = ps.read_parquet('/user/HM_parquet/transactions_train.parquet')
transactions_pq.head(20)

[Stage 6:>                                                          (0 + 1) / 1]

The slowest run took 16.54 times longer than the fastest. This could mean that an intermediate result is being cached.
1.02 s ± 782 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


                                                                                

In [5]:
%%timeit
# Parquet via the Spark SQL reader. The timed body covers both the read call
# and fetching the first 20 rows.
# NOTE(review): `header` looks like a CSV reader option and is presumably
# ignored for parquet; confirm, and consider dropping it.
transactions_pqdf = spark.read.option('header','true').parquet('/user/HM_parquet/transactions_train.parquet')
transactions_pqdf.head(20)



The slowest run took 4.76 times longer than the fastest. This could mean that an intermediate result is being cached.
4.73 s ± 2.05 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


                                                                                

## hdfs csv (df較快)

In [6]:
%%timeit
# CSV via pandas-on-Spark. The timed body covers both the read call and
# fetching the first 20 rows.
# NOTE(review): ps.read_csv presumably infers column types by scanning the
# file, whereas the spark.read.csv benchmark sets only `header`. If so, the
# two timings do not measure the same work; confirm before comparing.
transactions_csv = ps.read_csv('/user/HM_csv/transactions_train.csv')
transactions_csv.head(20)



8.23 s ± 3.39 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


                                                                                

In [7]:
%%timeit
# CSV via the Spark SQL reader with only the `header` option set. The timed
# body covers both the read call and fetching the first 20 rows.
# NOTE(review): no `inferSchema` option is passed, so columns are presumably
# read as strings without a type-inference pass; confirm this matches what
# the pandas-on-Spark CSV benchmark does before comparing the numbers.
transactions_csvdf = spark.read.option('header','true').csv('/user/HM_csv/transactions_train.csv')
transactions_csvdf.head(20)

234 ms ± 70.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## MySQL (ps較快)

In [8]:
# MySQL connection settings (adjust for your environment).
# User and password can be overridden through environment variables so that
# real credentials need not be stored in the notebook. The fallbacks are the
# notebook's original development values, so behaviour is unchanged when the
# variables are unset.
prop = {'user': os.environ.get('HM_MYSQL_USER', 'maggie'),
        'password': os.environ.get('HM_MYSQL_PASSWORD', 'root'),
        'driver': 'com.mysql.cj.jdbc.Driver'}

# JDBC URL of the HM database (adjust for your environment).
# Host and port are overridable in the same way as the credentials.
url = 'jdbc:mysql://{host}:{port}/HM?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC'.format(
    host=os.environ.get('HM_MYSQL_HOST', '172.22.33.43'),
    port=os.environ.get('HM_MYSQL_PORT', '3306'),
)

In [9]:
%%timeit
# MySQL `articles` table via pandas-on-Spark, using the JDBC `url` and the
# connection properties in `prop`. The timed body covers both the read call
# and fetching the first 20 rows.
articles_sql = ps.read_sql_table(table_name='articles',con=url,options=prop)
articles_sql.head(20)

108 ms ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
# MySQL `articles` table via the Spark SQL JDBC reader, using the same `url`
# and `prop`. The timed body covers both the read call and fetching the
# first 20 rows.
articles_sqldf = spark.read.jdbc(url=url, table='articles', properties=prop)
articles_sqldf.head(20)

                                                                                

986 ms ± 416 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


                                                                                

# MySQL (ps.read_sql_table) vs Parquet (ps.read_parquet)：皆以 pyspark.pandas 讀取 (parquet 較快)

In [12]:
%%timeit
# Read the articles parquet file with pandas-on-Spark and call count(), which
# forces the data to be scanned rather than just the first rows.
# NOTE(review): on a pandas-on-Spark frame count() presumably returns
# per-column non-null counts (pandas semantics), not a single row count;
# confirm.
articles_pq = ps.read_parquet('/user/HM_parquet/articles.parquet')
articles_pq.count()

2022-03-10 17:46:46,581 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

800 ms ± 293 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%timeit
# Read the `articles` table from MySQL with pandas-on-Spark and call count(),
# mirroring the parquet benchmark so the two storage backends can be compared.
articles_sql = ps.read_sql_table(table_name='articles',con=url,options=prop)
articles_sql.count()

[Stage 214:>                                                        (0 + 1) / 1]

1.55 s ± 521 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


                                                                                

# 一個月的資料

In [None]:
# One-month training and validation splits, each read as a Spark SQL DataFrame.
trainOneMonth = (
    spark.read
    .option('header','true')
    .parquet('../data/train_one_month.parquet')
)
valOneMonth = (
    spark.read
    .option('header','true')
    .parquet('../data/val_one_month.parquet')
)

In [None]:
# Show the column names of `customers`.
# NOTE(review): `customers` is not assigned in any cell visible here; confirm
# it is loaded before this cell runs on a fresh kernel.
customers.columns

In [None]:
# Show the column names of `transaction`.
# NOTE(review): `transaction` is not assigned in any cell visible here (the
# cells that load data use `transactions_df` / `transactions_ps`); confirm
# which frame is intended.
transaction.columns

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer


In [None]:
# 看spark設定
# spark.conf.get("spark.serializer")

In [None]:
# 改spark buffer 設定
# spark.conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
# spark.conf.set("spark.kryoserializer.buffer.max", "128m")

In [None]:
# 將customer_id的字串轉為數字(buffer不夠大)
# indexer = StringIndexer(inputCol="customer_id", outputCol="customer_index")
# indexed = indexer.fit(customers).transform(customers)
# indexed.show()

In [None]:
def string_to_index(self,transactions,customers,articles):
    """Attach integer user and item indices to a transactions Spark DataFrame.

    The complete user and item lists are taken from ``customers`` and
    ``articles``. Every distinct ``customer_id`` / ``article_id`` is given a
    contiguous 0-based integer, and those integers are joined onto
    ``transactions`` as the new columns ``user_id`` and ``item_id``.

    The previous implementation enumerated a Spark DataFrame into a Python
    dict and then used pandas-style ``.map`` and column assignment, none of
    which work on Spark DataFrames. This version stays inside the Spark API.

    Args:
        self: Unused. Kept so existing call sites that pass a leading
            argument keep working.
        transactions ([DataFrame]): Spark DataFrame with ``customer_id`` and
            ``article_id`` columns.
        customers ([DataFrame]): Spark DataFrame with a ``customer_id`` column.
        articles ([DataFrame]): Spark DataFrame with an ``article_id`` column.

    Returns:
        transactions ([DataFrame]): A new DataFrame with ``user_id`` and
            ``item_id`` added. The inputs are not modified. Ids that do not
            appear in ``customers`` / ``articles`` get a null index.
    """
    # Imported locally under explicit names: the notebook star-imports
    # pyspark.sql.functions, which shadows several Python builtins.
    from pyspark.sql import Window
    from pyspark.sql import functions as F

    def _build_index(frame, key_col, index_col):
        """Map each distinct value of ``key_col`` to a 0-based integer index."""
        # Ordering by the key makes the numbering reproducible across runs.
        # A window without partitioning is evaluated in a single partition;
        # that is acceptable here because only the distinct keys go through it.
        ordering = Window.orderBy(key_col)
        return (
            frame.select(key_col)
            .distinct()
            .withColumn(index_col, F.row_number().over(ordering) - 1)
        )

    user_index = _build_index(customers, 'customer_id', 'user_id')
    item_index = _build_index(articles, 'article_id', 'item_id')

    return (
        transactions
        # Dropping a missing column is a no-op; this keeps re-runs from
        # producing duplicate `user_id` / `item_id` columns.
        .drop('user_id', 'item_id')
        # Left joins keep every transaction, even when its id is unknown.
        .join(user_index, on='customer_id', how='left')
        .join(item_index, on='article_id', how='left')
    )

In [None]:
# Give every distinct customer id a contiguous 0-based integer and join it
# onto the transactions as `user_id`.
# The earlier version enumerated a Spark DataFrame into a dict and called
# withColumn() without a column expression, neither of which works.
# NOTE(review): `customers` and `transaction` are not assigned in any cell
# visible here; confirm they are loaded before this cell runs.
dfu = customers
ALL_USERS = dfu.select('customer_id').distinct()
# `user_map` is a two-column lookup DataFrame: customer_id -> user_id.
user_map = ALL_USERS.withColumn(
    'user_id',
    row_number().over(Window.orderBy('customer_id')) - 1,
)
# The left join keeps every transaction; unknown customers get a null user_id.
transaction.join(user_map, on='customer_id', how='left')

In [None]:
# ALS needs numeric user and item columns, so train on the integer
# `user_id` / `item_id` indices rather than the raw id columns, which this
# notebook treats as strings that must be converted to numbers.
# NOTE(review): `training` is not assigned in any cell visible here; this
# assumes it carries `user_id`, `item_id` and `count` columns. Confirm.
als = ALS(
    maxIter=5,
    regParam=0.09,
    rank=25,
    userCol="user_id",
    itemCol="item_id",
    ratingCol="count",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=42,  # fixed seed so repeated runs give the same factorization
)
model = als.fit(training)