In [None]:
import sys

from pyspark.sql import Window, functions as F
from pyspark.ml.feature import QuantileDiscretizer
import pandas as pd

# Disable pandas' row and column display limits so previews in this notebook are not truncated.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [None]:
import metaspore as ms

# Extra Spark properties handed to the session factory.
spark_confs = {
    "spark.network.timeout": "500",
    "spark.sql.codegen.wholeStage": "false",
}

# Keyword arguments for the local session used by the data-preparation cells.
session_options = dict(
    local=True,
    app_name='soc-pokec Demo',
    batch_size=256,
    worker_count=2,
    server_count=2,
    worker_memory='10G',
    server_memory='10G',
    coordinator_memory='10G',
    spark_confs=spark_confs,
)
spark_session = ms.spark.get_session(**session_options)

In [None]:
# Shell command (IPython `!`): list what is stored under the soc-pokec dataset prefix on S3.
!aws s3 ls s3://dmetasoul-bucket/demo/datasets/soc-pokec/

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, LongType, StringType
import pyspark.sql.functions as F

# Column names applied, in order, to the tab-separated profile file (which has no header row).
profile_colunm_names = ['user_id', 'public', 'completion_percentage', 'gender', 'region', 'last_login', 'registration',
                'AGE', 'body', 'I_am_working_in_field', 'spoken_languages', 'hobbies', 'I_most_enjoy_good_food',
                'pets', 'body_type', 'my_eyesight', 'eye_color', 'hair_color', 'hair_type', 'completed_level_of_education',
                'favourite_color', 'relation_to_smoking', 'relation_to_alcohol', 'sign_in_zodiac',
                'on_pokec_i_am_looking_for', 'love_is_for_me', 'relation_to_casual_sex', 'my_partner_should_be',
                'marital_status', 'children', 'relation_to_children', 'I_like_movies', 'I_like_watching_movie',
                'I_like_music', 'I_mostly_like_listening_to_music', 'the_idea_of_good_evening', 'I_like_specialties_from_kitchen',
                'fun', 'I_am_going_to_concerts', 'my_active_sports', 'my_passive_sports', 'profession', 'I_like_books',
                'life_style', 'music', 'cars', 'politics', 'relationships', 'art_culture', 'hobbies_interests',
                'science_technologies', 'computers_internet', 'education', 'sport', 'movies', 'travelling', 'health',
                'companies_brands', 'more']
# Column names applied to the tab-separated relationship (edge) file.
relationship_colunm_names = ['user_id', 'friend_id']


def _uniform_schema(column_names, data_type):
    """Return a schema in which every listed column is nullable and has the same Spark type."""
    return StructType([StructField(column_name, data_type, True) for column_name in column_names])


profile_schema = _uniform_schema(profile_colunm_names, StringType())
relationship_schema = _uniform_schema(relationship_colunm_names, LongType())

# Both files are read with an explicit schema: tab separated, no header, no schema inference.
_csv_options = dict(sep='\t', header=False, inferSchema=False)
profile_dataset = spark_session.read.csv(
    's3://dmetasoul-bucket/demo/datasets/soc-pokec/soc-pokec-profiles.txt',
    schema=profile_schema,
    **_csv_options,
)
relationship_dataset = spark_session.read.csv(
    's3://dmetasoul-bucket/demo/datasets/soc-pokec/soc-pokec-relationships.txt',
    schema=relationship_schema,
    **_csv_options,
)

# Cast the id to a number so the ordering is numeric, then keep the 16000 lowest ids.
profile_dataset = (
    profile_dataset
    .withColumn('user_id', F.col('user_id').cast(LongType()))
    .orderBy(F.col('user_id'))
    .limit(16000)
)
max_user_id = profile_dataset.agg({"user_id": "max"}).first()['max(user_id)']

# Keep only edges whose two ids do not exceed the largest retained profile id.
_ids_within_range = (F.col('user_id') <= max_user_id) & (F.col('friend_id') <= max_user_id)
relationship_dataset = relationship_dataset.filter(_ids_within_range)

# Ids are converted back to strings in both datasets.
profile_dataset = profile_dataset.withColumn('user_id', F.col('user_id').cast(StringType()))
for _id_column in ('user_id', 'friend_id'):
    relationship_dataset = relationship_dataset.withColumn(_id_column, F.col(_id_column).cast(StringType()))

profile_dataset.cache()
relationship_dataset.cache()


In [None]:
# Preview 10 profile rows as a pandas DataFrame (rendered as the cell output).
profile_dataset.limit(10).toPandas()

In [None]:
# Preview 10 relationship (user_id, friend_id) rows as a pandas DataFrame.
relationship_dataset.limit(10).toPandas()

In [None]:
# relationship_df = relationship_dataset.groupby(F.col('user_id')).agg(F.collect_set(F.col('friend_id')).alias('friends'))
# relationship_df.limit(10).toPandas()

# profile_df = profile_dataset.join(relationship_df, on=profile_dataset.user_id==relationship_df.user_id, how='leftouter').drop(relationship_df.user_id)
# profile_df.limit(10).toPandas()


In [None]:
def _attach_profile_features(edges, id_column, prefix):
    """Left-join profile attributes onto `edges`.

    The profile row is matched on `edges.<id_column> == profile.user_id`; its gender, AGE and
    completion_percentage columns are appended as `<prefix>_gender`, `<prefix>_age` and
    `<prefix>_completion_percentage`. All columns of `edges` are kept.
    """
    joined = edges.alias('t1').join(
        profile_dataset.alias('t2'),
        on=F.col('t1.' + id_column) == F.col('t2.user_id'),
        how='leftouter',
    )
    return joined.select(
        F.col('t1.*'),
        F.col('t2.gender').alias(prefix + '_gender'),
        F.col('t2.AGE').alias(prefix + '_age'),
        F.col('t2.completion_percentage').alias(prefix + '_completion_percentage'),
    )


# First the features of the edge's user, then the features of the edge's friend.
relationship_df = _attach_profile_features(relationship_dataset, 'user_id', 'user')
relationship_df = _attach_profile_features(relationship_df, 'friend_id', 'friend')


In [None]:
# Inspect the schema and the first rows of the edge list after the profile features were joined in.
relationship_df.printSchema()
relationship_df.show(5)

In [None]:
# Tag every edge with the constant string label '1'.
# The label is prepended via select() so the remaining columns keep their order.
# The guard makes the cell idempotent: without it, re-running the cell would prepend a second
# `label` column and leave the frame with duplicate, ambiguous column names.
if 'label' not in relationship_df.columns:
    relationship_df = relationship_df.select(F.lit('1').alias('label'), '*')
relationship_df.show(5)

In [None]:
# Split the labelled edges with weights 0.9 / 0.1 using the fixed seed 24.
splits = relationship_df.randomSplit(weights=[0.9, 0.1], seed=24)
train_dataset, test_dataset = splits

print('train dataset count: ', train_dataset.count())
print('test dataset count: ', test_dataset.count())

In [None]:
# Keep exactly one row per friend_id: the one that comes first when that friend's rows
# are ordered by user_id.
_rows_per_friend = Window.partitionBy('friend_id').orderBy(F.col('user_id'))
item_dataset = (
    relationship_df
    .withColumn('rn', F.row_number().over(_rows_per_friend))
    .filter(F.col('rn') == 1)
    .drop('rn')
)

item_dataset.limit(10).toPandas()

In [None]:
# Write every prepared dataset to S3 as Parquet, overwriting whatever is already at the path.
_parquet_outputs = [
    (train_dataset, 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/train_dataset.parquet'),
    (test_dataset, 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/test_dataset.parquet'),
    (item_dataset, 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/item_dataset.parquet'),
    (profile_dataset, 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/profile_dataset.parquet'),
    (relationship_dataset, 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/relationship_dataset.parquet'),
]
for _output_df, _output_path in _parquet_outputs:
    _output_df.write.parquet(_output_path, mode="overwrite")

# ItemCF

In [None]:
# Number each user's test edges by friend_id order. The rank-1 edge of a user becomes the
# trigger row; all later edges of that user are collected as the expected friends.
_ranked_test_df = test_dataset.withColumn(
    'rn',
    F.row_number().over(Window.partitionBy('user_id').orderBy(F.col('friend_id'))),
)

tigger_df = _ranked_test_df.filter(F.col('rn') == 1).drop('rn')

label_df = (
    _ranked_test_df
    .filter(F.col('rn') > 1)
    .drop('rn')
    .groupby('user_id')
    .agg(F.collect_list('friend_id').alias('label_friends'))
)

# Right outer join keeps every row of label_df and pairs it with that user's trigger row.
_trigger_with_labels = tigger_df.alias('t1').join(
    label_df.alias('t2'),
    on=F.col('t1.user_id') == F.col('t2.user_id'),
    how='rightouter',
)
test_df = _trigger_with_labels.select(F.col('t1.*'), F.col('t2.label_friends'))

test_df.limit(10).toPandas()

In [None]:
# Put the MetaSpore checkout on the import path so that `python.algos` can be imported below.
sys.path.append('/home/spark/work/MetaSpore/')
from python.algos.item_cf_retrieval import ItemCFEstimator

# ItemCF configuration: user/item/behavior columns of the training frame, the names of the
# key/value output columns, and the maximum number of recommendations.
_item_cf_options = dict(
    user_id_column_name='user_id',
    item_id_column_name='friend_id',
    behavior_column_name='label',
    behavior_filter_value='1',
    key_column_name='key',
    value_column_name='value',
    max_recommendation_count=20,
    debug=True,
)
estimator = ItemCFEstimator(**_item_cf_options)

model = estimator.fit(train_dataset)

In [None]:
# Apply the fitted ItemCF model to the test frame and expose its `value` column as `rec_info`.
prediction_df = model.transform(test_df).withColumnRenamed('value', 'rec_info')
prediction_df.limit(10).toPandas()

In [None]:
from pyspark.mllib.evaluation import RankingMetrics
def _itemcf_ranking_pair(row):
    """Map one prediction row to (predicted list, label list) for RankingMetrics.

    The predicted list holds the `_1` field of every `rec_info` entry, or is empty when
    `rec_info` is None; the label list is the row's `label_friends`.
    """
    predicted = [] if row.rec_info is None else [entry._1 for entry in row.rec_info]
    return (predicted, row.label_friends)


prediction_label_rdd = prediction_df.rdd.map(_itemcf_ranking_pair)
recall_metrics = RankingMetrics(prediction_label_rdd)

print("Debug -- Precision@20: ", recall_metrics.precisionAt(20))
print("Debug -- Recall@20: ", recall_metrics.recallAt(20))
print("Debug -- MAP@20: ", recall_metrics.meanAveragePrecisionAt(20))

# Swing

In [None]:
# Swing configuration: same column mapping as the ItemCF run, plus the Swing-specific
# weighting switch and smoothing coefficient.
_swing_options = dict(
    user_id_column_name='user_id',
    item_id_column_name='friend_id',
    behavior_column_name='label',
    behavior_filter_value='1',
    key_column_name='key',
    value_column_name='value',
    use_plain_weight=False,
    smoothing_coefficient=1.0,
    max_recommendation_count=20,
)
swing_estimator = ms.SwingEstimator(**_swing_options)

swing_model = swing_estimator.fit(train_dataset)

In [None]:
# Apply the fitted Swing model to the test frame and expose its `value` column as `rec_info`.
swing_prediction_df = swing_model.transform(test_df).withColumnRenamed('value', 'rec_info')
swing_prediction_df.limit(10).toPandas()

In [None]:
from pyspark.mllib.evaluation import RankingMetrics
def _swing_ranking_pair(row):
    """Map one Swing prediction row to (predicted list, label list) for RankingMetrics.

    The predicted list holds the `_1` field of every `rec_info` entry, or is empty when
    `rec_info` is None; the label list is the row's `label_friends`.
    """
    predicted = [] if row.rec_info is None else [entry._1 for entry in row.rec_info]
    return (predicted, row.label_friends)


swing_prediction_label_rdd = swing_prediction_df.rdd.map(_swing_ranking_pair)
swing_recall_metrics = RankingMetrics(swing_prediction_label_rdd)

print("Debug -- Swing Precision@20: ", swing_recall_metrics.precisionAt(20))
print("Debug -- Swing Recall@20: ", swing_recall_metrics.recallAt(20))
print("Debug -- Swing MAP@20: ", swing_recall_metrics.meanAveragePrecisionAt(20))

# TwoTowers

In [None]:
import yaml
import subprocess
import sys
import metaspore as ms

# Read the two-tower run configuration from its YAML file into `model_params`.
# NOTE(review): yaml.FullLoader is used on a local config file; consider yaml.safe_load if the
# file could ever come from an untrusted source.
model_params = {}
with open('conf/soc_pokec_dssm_inbatch_new.yaml', 'r') as stream:
    model_params = yaml.load(stream, Loader=yaml.FullLoader)
print('Debug -- load config: ', model_params)

In [None]:
# Zip the `python` directory found under the recommend-algos checkout; the archive is what
# `spark.submit.pyFiles` below points at.
# check=True makes a non-zero `zip` exit status raise CalledProcessError instead of being
# ignored, so the session is never started against a missing or stale python.zip.
subprocess.run(
    ['zip', '-r', '../MetaSpore/solutions/recommend/offline/social_network/python.zip', 'python'],
    cwd='../../../../../recommend-algos',
    check=True,
)
spark_confs={
    "spark.network.timeout":"500",
    "spark.ui.showConsoleProgress": "true",
    "spark.kubernetes.executor.deleteOnTermination":"true",
    "spark.submit.pyFiles":"python.zip",
}
# Session sizing (local mode, worker/server counts, memory) comes from the loaded YAML config.
spark = ms.spark.get_session(local=model_params['local'],
                             app_name=model_params['app_name'],
                             batch_size=model_params['batch_size'],
                             worker_count=model_params['worker_count'],
                             server_count=model_params['server_count'],
                             worker_memory=model_params['worker_memory'],
                             server_memory=model_params['server_memory'],
                             coordinator_memory=model_params['coordinator_memory'],
                             spark_confs=spark_confs)
sc = spark.sparkContext
print('Debug -- spark init')
print('Debug -- version:', sc.version)
print('Debug -- applicaitonId:', sc.applicationId)
print('Debug -- uiWebUrl:', sc.uiWebUrl)

In [None]:
# sys.path.append('/home/spark/work/recommend-algos')
from python.dssm_net import UserModule, ItemModule, SimilarityModule
from python.training import TwoTowerBatchNegativeSamplingAgent, TwoTowerBatchNegativeSamplingModule

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, LongType, StringType
import pyspark.sql.functions as F

# Read the train / test / item Parquet datasets from the paths given in the YAML config.
train_dataset, test_dataset, item_dataset = [
    spark.read.parquet(model_params[path_key])
    for path_key in ('train_path', 'test_path', 'item_path')
]


In [None]:
# Preview 100 training rows as a pandas DataFrame.
train_dataset.limit(100).toPandas()

In [None]:
## init user module, item module, similarity module
# Both towers receive the same embedding size, FTRL settings and dense structure from the config.
_tower_options = dict(
    emb_size=model_params['vector_embedding_size'],
    alpha=model_params['ftrl_learning_rate'],
    beta=model_params['ftrl_smothing_rate'],
    l1=model_params['ftrl_l1_regularization'],
    l2=model_params['ftrl_l2_regularization'],
    dense_structure=model_params['dense_structure'],
)
user_module = UserModule(
    model_params['user_column_name'],
    model_params['user_combine_schema'],
    **_tower_options,
)
item_module = ItemModule(
    model_params['item_column_name'],
    model_params['item_combine_schema'],
    **_tower_options,
)
similarity_module = SimilarityModule(model_params['tau'])
module = TwoTowerBatchNegativeSamplingModule(user_module, item_module, similarity_module)

import importlib
module_lib = importlib.import_module(model_params['two_tower_module'])
## init estimator class
# The estimator class is looked up by name in the module named by the config.
estimator_class_ = getattr(module_lib, model_params['two_tower_estimator_class'])
# Every config entry is also forwarded to the estimator as a keyword argument.
_estimator_options = dict(
    module=module,
    item_dataset=item_dataset,
    item_ids_column_indices=[2],
    retrieval_item_count=20,
    metric_update_interval=500,
    agent_class=TwoTowerBatchNegativeSamplingAgent,
)
estimator = estimator_class_(**_estimator_options, **model_params)
## dnn learning rate
estimator.updater = ms.AdamTensorUpdater(model_params['adam_learning_rate'])
## model train
model = estimator.fit(train_dataset)

In [None]:
# Run the fitted two-tower model on the test set and print the first 20 result rows.
test_result = model.transform(test_dataset)
print('Debug -- test result sample:')
test_result.show(20)

In [None]:
from pyspark.sql import functions as F
print('Debug -- test sample:')
# posexplode turns each `rec_info` array into one row per entry together with its position.
_exploded_columns = ['user_id', F.posexplode('rec_info').alias('pos', 'rec_info')]
test_result.select(*_exploded_columns).show(60)

# Same exploded view, restricted to the rows whose user_id equals 100.
test_result.filter(test_result['user_id'] == 100).select(*_exploded_columns).show(60)

## evaluation
from pyspark.mllib.evaluation import RankingMetrics


def _two_tower_ranking_pair(row):
    """Map one result row to (predicted list, label list) for RankingMetrics.

    The predicted list holds the `name` field of every `rec_info` entry, or is empty when
    `rec_info` is None; the label list contains only the row's own `friend_id`.
    """
    predicted = [] if row.rec_info is None else [entry.name for entry in row.rec_info]
    return (predicted, [row.friend_id])


prediction_label_rdd = test_result.rdd.map(_two_tower_ranking_pair)

recall_metrics = RankingMetrics(prediction_label_rdd)

print("Debug -- Precision@20: ", recall_metrics.precisionAt(20))
print("Debug -- Recall@20: ", recall_metrics.recallAt(20))
print("Debug -- MAP@20: ", recall_metrics.meanAveragePrecisionAt(20))

# Negative Sampling for CTR

In [None]:
import yaml
import subprocess
import sys
import metaspore as ms

# Zip the `common` directory found under demo/dataset; the archive is what
# `spark.submit.pyFiles` below points at.
# check=True makes a non-zero `zip` exit status raise CalledProcessError instead of being
# ignored, so the session is never started against a missing or stale python.zip.
subprocess.run(
    ['zip', '-r', '../../solutions/recommend/offline/social_network/python.zip', 'common'],
    cwd='../../../../demo/dataset',
    check=True,
)
spark_confs={
    "spark.network.timeout":"500",
    "spark.ui.showConsoleProgress": "true",
    "spark.kubernetes.executor.deleteOnTermination":"true",
    "spark.submit.pyFiles":"python.zip",
}
# Non-local session with fixed sizing for the negative-sampling step.
spark = ms.spark.get_session(local=False,
                             app_name='soc-pokec ng sampling',
                             batch_size=128,
                             worker_count=2,
                             server_count=2,
                             worker_memory='10G',
                             server_memory='10G',
                             coordinator_memory='10G',
                             spark_confs=spark_confs)
sc = spark.sparkContext
print('Debug -- spark init')
print('Debug -- version:', sc.version)
print('Debug -- applicaitonId:', sc.applicationId)
print('Debug -- uiWebUrl:', sc.uiWebUrl)

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, LongType, StringType
import pyspark.sql.functions as F

# Read the train / test / item Parquet datasets from S3.
_demo_fg_paths = [
    's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/train_dataset.parquet',
    's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/test_dataset.parquet',
    's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/item_dataset.parquet',
]
train_dataset, test_dataset, item_dataset = [
    spark.read.parquet(dataset_path) for dataset_path in _demo_fg_paths
]

# Stack the train and test rows back into a single frame.
all_dataset = train_dataset.union(test_dataset)


In [None]:
from common.neg_sampler import negative_sampling

# Generate negative samples from the stacked dataset; the result is cached because it is
# counted here and reused afterwards.
neg_sample_df = negative_sampling(
    spark,
    dataset=all_dataset,
    user_column='user_id',
    item_column='friend_id',
    time_column=None,
    negative_item_column='trigger_item_id',
    negative_sample=3,
)
neg_sample_df.cache()

print('count of neg_sample_df: ', neg_sample_df.count())
neg_sample_df.limit(10).toPandas()


In [None]:
# Negative samples get the constant string label '0'.
neg_sample_df = neg_sample_df.withColumn('label', F.lit('0'))

# User-side features are taken from the all_dataset row that has the same user_id and whose
# friend_id equals the sample's trigger_item_id.
_same_user_and_trigger = (
    (F.col('t1.user_id') == F.col('t2.user_id'))
    & (F.col('t1.trigger_item_id') == F.col('t2.friend_id'))
)
neg_sample_df = (
    neg_sample_df.alias('t1')
    .join(all_dataset.alias('t2'), _same_user_and_trigger, how='leftouter')
    .select(
        't1.label',
        't1.user_id',
        't1.friend_id',
        't2.user_gender',
        't2.user_age',
        't2.user_completion_percentage',
    )
)

# Friend-side features are taken from item_dataset, matched on friend_id.
neg_sample_df = (
    neg_sample_df.alias('t1')
    .join(item_dataset.alias('t2'), F.col('t1.friend_id') == F.col('t2.friend_id'), how='leftouter')
    .select('t1.*', 't2.friend_gender', 't2.friend_age', 't2.friend_completion_percentage')
)

neg_sample_df.limit(10).toPandas()

In [None]:
# Combine the label-'1' rows (all_dataset) with the label-'0' negative samples.
# - The result goes into a NEW variable: overwriting `all_dataset` with itself plus the
#   negatives made the cell non-idempotent (each re-run appended the negatives again).
# - unionByName matches columns by name, so a difference in column order between the two
#   frames raises or is resolved correctly instead of silently misaligning features, as the
#   positional `union` would.
rank_dataset = all_dataset.unionByName(neg_sample_df)

# Split with weights 0.9 / 0.1 using the fixed seed 24.
splits = rank_dataset.randomSplit([0.9, 0.1], 24)
train_dataset_rank, test_dataset_rank = splits[0], splits[1]

In [None]:
# Cache both splits: each one is counted twice and then written out.
train_dataset_rank.cache()
test_dataset_rank.cache()


def _positive_fraction(dataset):
    """Return the share of rows in `dataset` whose `label` column equals '1'."""
    return dataset.filter(F.col('label') == '1').count() / dataset.count()


print('Percentage of positive sample in train_dataset: ', _positive_fraction(train_dataset_rank))
print('Percentage of positive sample in test_dataset: ', _positive_fraction(test_dataset_rank))

# Write both ranking splits to S3 as Parquet, overwriting previous output.
for _rank_df, _rank_path in [
    (train_dataset_rank, 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/train_dataset_rank.parquet'),
    (test_dataset_rank, 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/test_dataset_rank.parquet'),
]:
    _rank_df.write.parquet(_rank_path, mode="overwrite")


# DeepFM

In [None]:
import yaml
import subprocess
import sys
import metaspore as ms

# Read the DeepFM run configuration from its YAML file into `model_params`.
# NOTE(review): yaml.FullLoader is used on a local config file; consider yaml.safe_load if the
# file could ever come from an untrusted source.
model_params = dict()
with open('conf/soc_pokec_deepfm.yaml', 'r') as stream:
    model_params = yaml.load(stream, Loader=yaml.FullLoader)
    print('Debug -- load config: ', model_params)

# Publish every config entry as a notebook-level variable (use_wide, ftrl_l1, ...), because
# later cells refer to them by bare name.
# globals() is used explicitly: writing through locals() is only effective at module level,
# where it happens to be the same mapping, and is documented as something not to rely on.
globals().update(model_params)

In [None]:
# Zip the `python` directory found four levels up; the archive is what
# `spark.submit.pyFiles` below points at.
# check=True makes a non-zero `zip` exit status raise CalledProcessError instead of being
# ignored, so the session is never started against a missing or stale python.zip.
subprocess.run(
    ['zip', '-r', 'solutions/recommend/offline/social_network/python.zip', 'python'],
    cwd='../../../../',
    check=True,
)
spark_confs={
    "spark.network.timeout":"500",
    "spark.ui.showConsoleProgress": "true",
    "spark.kubernetes.executor.deleteOnTermination":"true",
    "spark.submit.pyFiles":"python.zip",
}
# Session sizing (local mode, worker/server counts, memory) comes from the loaded YAML config.
spark = ms.spark.get_session(local=model_params['local'],
                             app_name=model_params['app_name'],
                             batch_size=model_params['batch_size'],
                             worker_count=model_params['worker_count'],
                             server_count=model_params['server_count'],
                             worker_memory=model_params['worker_memory'],
                             server_memory=model_params['server_memory'],
                             coordinator_memory=model_params['coordinator_memory'],
                             spark_confs=spark_confs)
sc = spark.sparkContext
print('Debug -- spark init')
print('Debug -- version:', sc.version)
print('Debug -- applicaitonId:', sc.applicationId)
print('Debug -- uiWebUrl:', sc.uiWebUrl)

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, LongType, StringType
import pyspark.sql.functions as F

# Read the train / test Parquet datasets from the paths given in the YAML config.
train_dataset, test_dataset = [
    spark.read.parquet(model_params[path_key])
    for path_key in ('train_path', 'test_path')
]

In [None]:
from python.algos.deepfm_net import DeepFM

# Build the DeepFM network.
# None of the argument values (use_wide, ftrl_l1, ...) is defined in this cell: they are
# expected to exist as notebook-level variables, which the config-loading cell creates by
# injecting every `model_params` entry into the namespace. Run that cell first.
module = DeepFM(use_wide=use_wide,
            use_dnn=use_dnn,
            use_fm=use_fm,
            wide_embedding_dim=wide_embedding_dim,
            deep_embedding_dim=deep_embedding_dim,
            wide_column_name_path=wide_column_name_path,
            wide_combine_schema_path=wide_combine_schema_path,
            deep_column_name_path=deep_column_name_path,
            deep_combine_schema_path=deep_combine_schema_path,
            sparse_init_var=sparse_init_var,
            dnn_hidden_units=dnn_hidden_units,
            dnn_hidden_activations=dnn_hidden_activations,
            use_bias=use_bias,
            batch_norm=batch_norm,
            net_dropout=net_dropout,
            net_regularizer=net_regularizer,
            ftrl_l1=ftrl_l1,
            ftrl_l2=ftrl_l2,
            ftrl_alpha=ftrl_alpha,
            ftrl_beta=ftrl_beta)

# The whole config dict is forwarded to the estimator as keyword arguments.
estimator = ms.PyTorchEstimator(module=module, **model_params)

# Replace the estimator's updater with Adam at the configured learning rate, then train.
estimator.updater = ms.AdamTensorUpdater(adam_learning_rate)
model = estimator.fit(train_dataset)


In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score both splits with the trained model.
train_result = model.transform(train_dataset)
test_result = model.transform(test_dataset)

# One default-configured evaluator per split.
train_evaluator = BinaryClassificationEvaluator()
test_evaluator = BinaryClassificationEvaluator()

train_auc = train_evaluator.evaluate(train_result)
test_auc = test_evaluator.evaluate(test_result)

print('Debug -- Train AUC: ', train_auc)
print('Debug -- Test AUC: ', test_auc)

# Pipeline Test

In [None]:
from pipelines.pipeline import Pipeline
from pipelines.nodes.init_spark import InitSparkNode
from pipelines.nodes.data_loader import DataLoaderNode
from pipelines.nodes.two_towers_estimator import TwoTowersEstimatorNode
from pipelines.nodes.retrieval_evaluator import RetrievalEvaluatorNode

# Build the pipeline from its YAML description, register the nodes in order, then run it.
p = Pipeline('pipelines/test.yaml')
for _node_class in (InitSparkNode, DataLoaderNode, TwoTowersEstimatorNode, RetrievalEvaluatorNode):
    p.add_node(_node_class())

p.run()

  from .autonotebook import tqdm as notebook_tqdm


Debug -- load config:  {'app_name': 'Pipeline Demo', 'local': False, 'worker_count': 2, 'server_count': 2, 'batch_size': 128, 'worker_memory': '4G', 'server_memory': '4G', 'coordinator_memory': '4G', 'zip_path': '../MetaSpore/solutions/recommend/offline/social_network/python.zip', 'zip_cwd': '../../../../../recommend-algos', 'train_path': 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/train_dataset.parquet', 'test_path': 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/test_dataset.parquet', 'item_path': 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_fg/item_dataset.parquet', 'user_id': 'user_id', 'item_id': 'friend_id', 'two_tower_module': 'metaspore', 'two_tower_module_class': 'TwoTowerBatchNegativeSamplingModule', 'two_tower_estimator_class': 'TwoTowerRetrievalEstimator', 'user_column_name': 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_schema/column_name.txt', 'user_combine_schema': 's3://dmetasoul-bucket/demo/datasets/soc-pokec/demo_schema/user_combine_sch

22/06/22 10:46:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/22 10:46:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
