In [1]:
!aws s3 ls s3://dmetasoul-bucket/demo/

                           PRE criteo_x1/
                           PRE movielens/
                           PRE output/
                           PRE schema/
                           PRE test/
                           PRE tianchi/
                           PRE train/
                           PRE tuner/


In [2]:
!aws s3 ls s3://dmetasoul-bucket/demo/movielens/

                           PRE config/
                           PRE feature_generation/
                           PRE mango/
                           PRE match/
                           PRE ml-1m/
                           PRE model/


In [3]:
num_negs = 200

movies_path='s3://dmetasoul-bucket/demo/movielens/ml-1m/movies.dat'
ratings_path='s3://dmetasoul-bucket/demo/movielens/ml-1m/ratings.dat'
users_path='s3://dmetasoul-bucket/demo/movielens/ml-1m/users.dat'

train_dataset_out_path='s3://dmetasoul-bucket/demo/movielens/feature_generation/num_negs_%d/train.parquet'%num_negs
test_dataset_out_path='s3://dmetasoul-bucket/demo/movielens/feature_generation/num_negs_%d/test.parquet'%num_negs
item_dataset_out_path='s3://dmetasoul-bucket/demo/movielens/feature_generation/num_negs_%d/item.parquet'%num_negs
print(train_dataset_out_path)
print(test_dataset_out_path)
print(item_dataset_out_path)

s3://dmetasoul-bucket/demo/movielens/feature_generation/num_negs_200/train.parquet
s3://dmetasoul-bucket/demo/movielens/feature_generation/num_negs_200/test.parquet
s3://dmetasoul-bucket/demo/movielens/feature_generation/num_negs_200/item.parquet


In [4]:
import metaspore as ms
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, LongType, StringType

spark = (SparkSession.builder
    .appName('Feature generation - movielens')
    .config("spark.executor.memory","5G")
    .config("spark.executor.instances","4")
    .config("spark.network.timeout","500") # 500s
    #.config("spark.kubernetes.executor.deleteOnTermination", "false")
    #.config("spark.sql.execution.arrow.maxRecordsPerBatch", "512")
    #.config("spark.submit.pyFiles", "python.zip")
    .getOrCreate())
    
sc = spark.sparkContext
print(sc.version)
print(sc.applicationId)
print(sc.uiWebUrl)

22/01/28 12:05:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


3.1.2
spark-application-1643371543426
http://jupyter.my.nginx.test/hub/user-redirect/proxy/4040/jobs/


In [5]:
### read movies
movies_schema = StructType([
        StructField("movie_id", LongType(), True),
        StructField("title", StringType(), True),
        StructField("genre", StringType(), True)
])

movies = spark.read.csv(movies_path, sep='::',inferSchema=False, header=False, schema=movies_schema)
print('movies sample:')
movies.show(10)


item_profile=movies.select('movie_id', 'title', 'genre').dropDuplicates(['movie_id'])
print('item profile sample:')
item_profile.sort('movie_id').show(10)

movies sample:


                                                                                

+--------+--------------------+--------------------+
|movie_id|               title|               genre|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|Adventure|Children's|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
+--------+--------------------+--------------------+
only showing top 10 rows

item profile sample:




+--------+--------------------+--------------------+
|movie_id|               title|               genre|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|Adventure|Children's|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
+--------+--------------------+--------------------+
only showing top 10 rows



                                                                                

In [6]:
### read ratings
ratings_schema = StructType([
        StructField("user_id", LongType(), True),
        StructField("movie_id", LongType(), True),
        StructField("rating", FloatType(), True),
        StructField("timestamp", LongType(), True)
])

ratings = spark.read.csv(ratings_path, sep='::', inferSchema=False, header=False, schema=ratings_schema)
print('ratings sample:')
#ratings = ratings.limit(10000) # total: 1000209
ratings.show(10)

ratings sample:


[Stage 3:>                                                          (0 + 1) / 1]

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|      1|    1193|   5.0|978300760|
|      1|     661|   3.0|978302109|
|      1|     914|   3.0|978301968|
|      1|    3408|   4.0|978300275|
|      1|    2355|   5.0|978824291|
|      1|    1197|   3.0|978302268|
|      1|    1287|   5.0|978302039|
|      1|    2804|   5.0|978300719|
|      1|     594|   4.0|978302268|
|      1|     919|   4.0|978301368|
+-------+--------+------+---------+
only showing top 10 rows



                                                                                

In [7]:
### read users
users_schema = StructType([
        StructField("user_id", LongType(), True),
        StructField("gender", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("occupation", StringType(), True),
        StructField("zip", StringType(), True)
])

users = spark.read.csv(users_path, sep='::', inferSchema=False, header=False, schema=users_schema)
print('users sample:')
users.show(10)


user_profile = users.select("user_id", "gender", "age", "occupation", "zip").dropDuplicates(['user_id'])
print('user profile sample:')
user_profile.sort('user_id').show(10)

users sample:
+-------+------+---+----------+-----+
|user_id|gender|age|occupation|  zip|
+-------+------+---+----------+-----+
|      1|     F|  1|        10|48067|
|      2|     M| 56|        16|70072|
|      3|     M| 25|        15|55117|
|      4|     M| 45|         7|02460|
|      5|     M| 25|        20|55455|
|      6|     F| 50|         9|55117|
|      7|     M| 35|         1|06810|
|      8|     M| 25|        12|11413|
|      9|     M| 25|        17|61614|
|     10|     F| 35|         1|95370|
+-------+------+---+----------+-----+
only showing top 10 rows

user profile sample:




+-------+------+---+----------+-----+
|user_id|gender|age|occupation|  zip|
+-------+------+---+----------+-----+
|      1|     F|  1|        10|48067|
|      2|     M| 56|        16|70072|
|      3|     M| 25|        15|55117|
|      4|     M| 45|         7|02460|
|      5|     M| 25|        20|55455|
|      6|     F| 50|         9|55117|
|      7|     M| 35|         1|06810|
|      8|     M| 25|        12|11413|
|      9|     M| 25|        17|61614|
|     10|     F| 35|         1|95370|
+-------+------+---+----------+-----+
only showing top 10 rows



                                                                                

In [8]:
# merge movies, users, ratings
dataset = ratings.join(users, on=ratings.user_id==users.user_id, how='leftouter').drop(users.user_id)
dataset = dataset.join(movies, on=dataset.movie_id==movies.movie_id,how='leftouter').drop(movies.movie_id)
dataset = dataset.select('user_id', \
                         'gender', \
                         'age', \
                         'occupation', \
                         'zip', \
                         'movie_id', \
                         'title', \
                         'genre', \
                         'rating', \
                         'timestamp'
                        )
print('dataset sample:')
dataset.show(10)

dataset sample:
+-------+------+---+----------+-----+--------+--------------------+--------------------+------+---------+
|user_id|gender|age|occupation|  zip|movie_id|               title|               genre|rating|timestamp|
+-------+------+---+----------+-----+--------+--------------------+--------------------+------+---------+
|      1|     F|  1|        10|48067|    1193|One Flew Over the...|               Drama|   5.0|978300760|
|      1|     F|  1|        10|48067|     661|James and the Gia...|Animation|Childre...|   3.0|978302109|
|      1|     F|  1|        10|48067|     914| My Fair Lady (1964)|     Musical|Romance|   3.0|978301968|
|      1|     F|  1|        10|48067|    3408|Erin Brockovich (...|               Drama|   4.0|978300275|
|      1|     F|  1|        10|48067|    2355|Bug's Life, A (1998)|Animation|Childre...|   5.0|978824291|
|      1|     F|  1|        10|48067|    1197|Princess Bride, T...|Action|Adventure|...|   3.0|978302268|
|      1|     F|  1|        10

In [9]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import rand
from functools import reduce

# feature generation
def feature_generation(dataset, user_profile, item_profile, max_len=10, sep=u'\u0001'):
    def get_recent_items(kv_pairs, max_len=max_len):
        # sort
        kv_pairs.sort(key=lambda x: x[1])
        # get recent list
        recent_items = []
        for i in range(0, len(kv_pairs)):
            current, hist_time = kv_pairs[i]
            hist_list = [0] if i == 0 else reduce(lambda x, y:x+y, map(lambda x:[x[0]], kv_pairs[:i]))
            # get last max_len items
            hist_list = hist_list[-max_len:]
            hist_list = str.join(sep, map(str, hist_list))
            recent_items.append((hist_time, current, hist_list))

        return recent_items
    
    # label
    dataset = dataset.withColumn('label',  F.when(F.col('rating')> 0, 1).otherwise(0))
    
    # generate user recent behaviors features
    hist_item_list_df = dataset.filter(dataset['rating']>0).select('user_id','movie_id', 'timestamp').distinct().rdd\
                               .map(lambda x: (x['user_id'], [(x['movie_id'], x['timestamp'])]))\
                               .reduceByKey(lambda x, y: x + y)\
                               .map(lambda x: (x[0], get_recent_items(x[1])))\
                               .flatMapValues(lambda x: x)\
                               .map(lambda x: (x[0], x[1][0], x[1][1], x[1][2]))\
                               .toDF(['user_id', \
                                      'timestamp', \
                                      'movie_id', \
                                      'recent_movie_ids'])
    # merge features
    fg_result = dataset.alias('t1')\
                       .join(hist_item_list_df.alias('t2'), \
                             (col('t1.user_id')==col('t2.user_id')) & (col('t1.timestamp')==col('t2.timestamp')) & (col('t1.movie_id')==col('t2.movie_id')),
                             how='leftouter')\
                       .select('t1.label', \
                               't1.user_id', \
                               't1.gender', \
                               't1.age', \
                               't1.occupation', \
                               't1.zip', \
                               't1.movie_id', \
                               't2.recent_movie_ids', \
                               't1.genre', \
                               't1.timestamp')
    
    # replace sep in genre column
    fg_result = fg_result.withColumn('genre', regexp_replace('genre', '\|', sep))
    
    # shuffle and return
    # fg_result = fg_result.withColumn('rand', rand(seed=100)).orderBy('rand')
    # fg_result = fg_result.drop('rand')
    
    # https://stackoverflow.com/questions/40478018/pyspark-dataframe-convert-multiple-columns-to-float
    # fg_result = fg_result.select(*(col(c).cast('string').alias(c) for c in fg_result.columns))
    
    return fg_result


fg_dataset = feature_generation(dataset, user_profile, item_profile)

                                                                                

In [10]:
# split train, test
def split_train_test(dataset):
    dataset.registerTempTable('dataset')        
    query = """
    select label, user_id, gender, age, occupation, zip, movie_id, recent_movie_ids, genre, timestamp
    from
    (
        select
            *,
            ROW_NUMBER() OVER(PARTITION BY user_id ORDER BY timestamp DESC) as sample_id
        from
            dataset
    ) ta
    where ta.sample_id = 1
    order by user_id ASC
    """
    test_dataset = spark.sql(query)
    train_dataset = dataset.exceptAll(test_dataset)
    return train_dataset, test_dataset

train_fg_dataset, test_fg_dataset = split_train_test(fg_dataset)
train_fg_dataset.cache()
test_fg_dataset.cache()

DataFrame[label: int, user_id: bigint, gender: string, age: int, occupation: string, zip: string, movie_id: bigint, recent_movie_ids: string, genre: string, timestamp: bigint]

In [12]:
def gen_sample_prob(dataset, group_by, alpha=0.75):
    from pyspark.sql.functions import col, pow
    item_weight = dataset.groupBy(col(group_by)).count()
    item_weight = item_weight.withColumn('norm_weight', pow(item_weight['count'], alpha))
    total_freq = item_weight.select('count').groupBy().sum().collect()[0][0]
    total_norm = item_weight.select('norm_weight').groupBy().sum().collect()[0][0]
    item_weight = item_weight.withColumn('sampling_prob', item_weight['norm_weight']/total_norm)    
    return item_weight, total_norm, total_freq

test_df, _, _ = gen_sample_prob(dataset, 'movie_id')
test_dist = test_df.select('movie_id', 'sampling_prob')\
                            .rdd.map(lambda x: (x[0], x[1])).collect()
zipped_dist = [list(t) for t in zip(*test_dist)]
item_list, dist_list = zipped_dist[0], zipped_dist[1]

                                                                                

In [11]:
# negative sampling on original dataset
def negative_sampling(dataset, user_column='user_id', item_column='movie_id', time_column='timestamp', \
                      negative_item_column='trigger_item_id', negative_sample=3):
    
    def gen_sample_prob(dataset, group_by, alpha=0.75):
        from pyspark.sql.functions import col, pow
        item_weight = dataset.groupBy(col(group_by)).count()
        item_weight = item_weight.withColumn('norm_weight', pow(item_weight['count'], alpha))
        total_freq = item_weight.select('count').groupBy().sum().collect()[0][0]
        total_norm = item_weight.select('norm_weight').groupBy().sum().collect()[0][0]
        item_weight = item_weight.withColumn('sampling_prob', item_weight['norm_weight']/total_norm)    
        return item_weight, total_norm, total_freq
    
    def sample(user_id, user_item_list, item_list, dist_list, negative_sample):
        import numpy as np
        # sample negative list
        candidate_list = np.random.choice(list(item_list), size=len(user_item_list)*negative_sample, \
                                          replace=True, p=dist_list).tolist()
        # remove the positive sample from the sampling result
        candidate_list = list(set(candidate_list)-set(user_item_list))
        
        # sample trigger list
        trigger_list = np.random.choice(list(user_item_list), size=len(candidate_list), \
                                        replace=True).tolist()
        
        return list(zip(trigger_list, candidate_list))
    
    # sampling distribution
    item_weight, _, _ = gen_sample_prob(dataset, item_column)
    item_list = item_weight.select(item_column).rdd.flatMap(lambda x: x).collect()
    dist_list = item_weight.select('sampling_prob').rdd.flatMap(lambda x: x).collect()
    
    # generate sampling dataframe
    sampling_df=dataset.rdd\
                       .map(lambda x: (x[user_column], [x[item_column]]))\
                       .reduceByKey(lambda x, y: x + y)\
                       .map(lambda x: (x[0], sample(x[0], x[1], item_list, dist_list, negative_sample)))\
                       .flatMapValues(lambda x: x)\
                       .map(lambda x: (x[0], x[1][0], x[1][1]))\
                       .toDF([user_column, negative_item_column, item_column])
    
    return sampling_df

# negative sampling
neg_sample_df=negative_sampling(dataset=train_fg_dataset, user_column='user_id', item_column='movie_id', time_column='timestamp', negative_sample=num_negs)

# merge into item and user profile information
from pyspark.sql.functions import lit
from pyspark.sql.functions import col
neg_sample_df = neg_sample_df.withColumn('label', lit(0))
neg_sample_df = neg_sample_df.alias('t1')\
                        .join(train_fg_dataset.alias('t2'), \
                             (col('t1.user_id')==col('t2.user_id')) & (col('t1.trigger_item_id')==col('t2.movie_id')),
                             how='leftouter')\
                        .select('t1.label', \
                                't1.user_id', \
                                't2.gender', \
                                't2.age', \
                                't2.occupation', \
                                't2.zip', \
                                't1.movie_id', \
                                't2.recent_movie_ids', \
                                't2.genre')

# show negative sampling result
print('negative sampling result size:%d'%neg_sample_df.count())
print('negative samping result:')
neg_sample_df.show(10)

# show origianl dataset
train_fg_dataset = train_fg_dataset.drop('timestamp')
print('original dataset sample size:%d'%train_fg_dataset.count())
print('original dataset sample:')
train_fg_dataset.show(10)

                                                                                

negative sampling result size:16440817
negative samping result:


                                                                                

+-----+-------+------+---+----------+-----+--------+--------------------+--------------------+
|label|user_id|gender|age|occupation|  zip|movie_id|    recent_movie_ids|               genre|
+-----+-------+------+---+----------+-----+--------+--------------------+--------------------+
|    0|      5|     M| 25|        20|55455|      87|15293260349910...|CrimeDramaRoman...|
|    0|      5|     M| 25|        20|55455|     196|15293260349910...|CrimeDramaRoman...|
|    0|      5|     M| 25|        20|55455|     214|15293260349910...|CrimeDramaRoman...|
|    0|      5|     M| 25|        20|55455|     722|15293260349910...|CrimeDramaRoman...|
|    0|      5|     M| 25|        20|55455|     895|15293260349910...|CrimeDramaRoman...|
|    0|      5|     M| 25|        20|55455|     927|15293260349910...|CrimeDramaRoman...|
|    0|      5|     M| 25|        20|55455|     969|15293260349910...|CrimeDramaRoman...|
|    0|      5|     M| 25|        20|55455|    139

                                                                                

original dataset sample size:994169
original dataset sample:
+-----+-------+------+---+----------+-----+--------+--------------------+----------------+
|label|user_id|gender|age|occupation|  zip|movie_id|    recent_movie_ids|           genre|
+-----+-------+------+---+----------+-----+--------+--------------------+----------------+
|    1|    736|     M| 18|        12|07070|    1036|27222232599288...| ActionThriller|
|    1|    755|     F| 35|         0|94002|    2599|22721196299092...|          Comedy|
|    1|   1260|     M| 25|        17|28262|    3100|10952395227611...|           Drama|
|    1|   1263|     M|  1|        10|81301|    2598|28053354375225...|          Comedy|
|    1|   1298|     M| 35|         6|33615|    3263|56233719473809...|          Comedy|
|    1|   1494|     M| 25|        17|38104|    1390|                   0|          Comedy|
|    1|   1530|     M| 25|         4|53711|    1247|2371317447147...|   DramaRomance|
|    1|   1632|     M| 25|   

In [12]:
train_dataset = train_fg_dataset.union(neg_sample_df)
train_dataset = train_dataset.withColumn('rand', rand(seed=100)).orderBy('rand')
train_dataset = train_dataset.drop('rand')
train_dataset = train_dataset.select(*(col(c).cast('string').alias(c) for c in train_dataset.columns))
train_dataset.cache()

test_dataset = test_fg_dataset.withColumn('rand', rand(seed=100)).orderBy('rand')
test_dataset = test_dataset.drop('rand')
test_dataset = test_dataset.drop('timestamp')
test_dataset = test_dataset.select(*(col(c).cast('string').alias(c) for c in test_dataset.columns))
test_dataset.cache()

DataFrame[label: string, user_id: string, gender: string, age: string, occupation: string, zip: string, movie_id: string, recent_movie_ids: string, genre: string]

In [13]:
# show results
print('final train dataset size: %d'%train_dataset.count())
print('final train dataset sample:')
train_dataset.show(10)

print('final test dataset size: %d'%test_dataset.count())
print('final test dataset sample:')
test_dataset.show(10)

                                                                                

final train dataset size: 17434453
final train dataset sample:
+-----+-------+------+---+----------+-----+--------+--------------------+--------------------+
|label|user_id|gender|age|occupation|  zip|movie_id|    recent_movie_ids|               genre|
+-----+-------+------+---+----------+-----+--------+--------------------+--------------------+
|    0|   6030|     M| 25|        17|32618|    2749|12069682664127...|ActionAdventure...|
|    0|   4547|     M| 18|        12|02115|     100|12104802858270...|               Drama|
|    0|   5087|     F| 25|         6|19102|    2295|9545891947110...|    Children'sDrama|
|    0|   4387|     F| 18|         4|63109|    3089|19612194971139...|        ComedyDrama|
|    0|   5950|     M| 25|         4|19713|       7|12651956300129...|      CrimeThriller|
|    0|   5573|     F| 35|         1|14619|     341|19643044800317...|ActionMysteryTh...|
|    0|   1037|     M| 45|         7|02081|    2111|34813408318933...|         



final test dataset size: 6040
final test dataset sample:
+-----+-------+------+---+----------+-----+--------+--------------------+--------------------+
|label|user_id|gender|age|occupation|  zip|movie_id|    recent_movie_ids|               genre|
+-----+-------+------+---+----------+-----+--------+--------------------+--------------------+
|    1|   5964|     M| 18|         5|97202|    2568|4295204423113...|        ActionCrime|
|    1|   4159|     F| 50|         9|01450|    2701|1239623555933...|ActionSci-FiWes...|
|    1|   1715|     M| 25|        16|53406|    2769|38973898394639...|       CrimeMystery|
|    1|    948|     M| 56|        12|43056|    3616|9965534585903...|      ComedyRomance|
|    1|    920|     M| 18|         4|92173|    1779|3801356590316...|AdventureSci-Fi...|
|    1|   2052|     M|  1|        10|46033|    3016|28272997271026...|              Horror|
|    1|   3214|     F| 56|         7|10019|    2388|16941208952191...|       DramaRo

                                                                                

In [14]:
temp_table = train_dataset.union(test_dataset).where(train_dataset['label'] == '1').distinct()
temp_table.registerTempTable('temp_table')        
query = """
select
    label, user_id, gender, age, occupation, zip, movie_id, recent_movie_ids, genre
from
(
    select
        *,
        ROW_NUMBER() OVER(PARTITION BY movie_id ORDER BY recent_movie_ids DESC) as sample_id
    from
        temp_table
) ta
where 
    sample_id=1
"""
item_dataset=spark.sql(query)
item_dataset.cache()
print('final item dataset size: %d'%item_dataset.count())
print('final item dataset sample:')
item_dataset.show(10)



final item dataset size: 3706
final item dataset sample:
+-----+-------+------+---+----------+-----+--------+--------------------+--------------------+
|label|user_id|gender|age|occupation|  zip|movie_id|    recent_movie_ids|               genre|
+-----+-------+------+---+----------+-----+--------+--------------------+--------------------+
|    1|   3272|     M| 35|         0|08330|    1090|9695031815071...|           DramaWar|
|    1|   5762|     F| 35|         6|55125|    1436|9531119711302...|              Comedy|
|    1|   3299|     F| 25|         4|19119|    1572|9559199542366...|               Drama|
|    1|   1812|     F| 25|        12|48103|    2069|99931861036261...|               Drama|
|    1|   1671|     M| 35|         0|98368|    2088|96327463199195...|AdventureComedy...|
|    1|   2304|     M| 45|        12|94103|    2136|91030881948127...|              Comedy|
|    1|   4568|     F| 25|         4|90034|    2162|92622624971923...|AdventureChild

                                                                                

In [15]:
#fg_result.coalesce(1).write.option("header", True).option('sep','::').csv(out_path, mode="overwrite")
train_dataset.write.parquet(train_dataset_out_path, mode="overwrite")
test_dataset.write.parquet(test_dataset_out_path, mode="overwrite")
item_dataset.write.parquet(item_dataset_out_path, mode="overwrite")

                                                                                

In [16]:
!aws s3 ls s3://dmetasoul-bucket/demo/movielens/feature_generation/num_negs_3/

                           PRE item.parquet/
                           PRE test.parquet/
                           PRE train.parquet/


In [17]:
#test = spark.read.csv(out_path, sep='::', header=True)
train_temp = spark.read.parquet(train_dataset_out_path)
test_temp = spark.read.parquet(test_dataset_out_path)
item_temp = spark.read.parquet(item_dataset_out_path)

In [18]:
print(train_temp.count())
print(test_temp.count())
print(item_temp.count())

                                                                                

17434453


                                                                                

6040




3706


                                                                                

In [19]:
sc.stop()

22/01/21 08:24:55 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed (this is expected if the application is shutting down.)


# Debug:

In [None]:
dataset.cache()
dataset.show(5)
dataset.groupBy(dataset.user_id).count().orderBy(['count'], ascending=[1]).show()
print(dataset.count())

In [None]:
import numpy as np

np.random.choice(dist_list, size=5, \
                                          replace=True, p=dist_list).tolist()