In [1]:
import metaspore as ms

# Extra Spark settings forwarded to the session builder.
spark_confs = {
    "spark.network.timeout": "500",
    "spark.ui.showConsoleProgress": "true",
    "spark.kubernetes.executor.deleteOnTermination": "true",
}

# Start a local session with two workers and two servers, 5G of memory each.
spark = ms.spark.get_session(
    local=True,
    app_name='ESMM read data',
    batch_size=256,
    worker_count=2,
    server_count=2,
    worker_memory='5G',
    server_memory='5G',
    coordinator_memory='5G',
    spark_confs=spark_confs,
)
sc = spark.sparkContext

# Report where the session is running so the Spark UI can be opened while the job runs.
print('Debug -- spark init')
print('Debug -- version:', sc.version)
print('Debug -- applicaitonId:', sc.applicationId)
print('Debug -- uiWebUrl:', sc.uiWebUrl)

22/05/17 06:57:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Debug -- spark init
Debug -- version: 3.1.2
Debug -- applicaitonId: local-1652770633175
Debug -- uiWebUrl: http://jupyter.my.nginx.test/hub/user-redirect/proxy/4040/jobs/


In [2]:
import os

# `${MY_S3_BUCKET}` is a placeholder for the bucket URI. A plain Python string
# never expands it, so resolve it from the environment here; this matches the
# `!aws s3 ls ${MY_S3_BUCKET}/...` cells further down. If the variable is not
# set, `os.path.expandvars` leaves the text unchanged.

# Raw CSV inputs.
train_path = os.path.expandvars('${MY_S3_BUCKET}/aliccp/traindata_10w.csv')
test_path = os.path.expandvars('${MY_S3_BUCKET}/aliccp/testdata_10w.csv')

# Parquet outputs written after feature generation.
train_output_path = os.path.expandvars('${MY_S3_BUCKET}/aliccp/traindata_10w.parquet')
test_output_path = os.path.expandvars('${MY_S3_BUCKET}/aliccp/testdata_10w.parquet')

In [3]:
# Load the raw comma-separated files; no schema is given, so columns are named _c0, _c1, ...
train_dataset = spark.read.option("sep", ",").csv(train_path)
test_dataset = spark.read.option("sep", ",").csv(test_path)

In [4]:
from collections import defaultdict

# Field ids kept from the raw data. A field's position in this list is its
# index in the lookup returned by get_aliccp_fields() and fixes the order of
# the feature columns.
all_field_list = [
    '101',
    '109_14', '110_14', '127_14', '150_14',
    '121', '122', '124', '125', '126', '127', '128', '129',
    '205', '206', '207', '210', '216',
    '508', '509',
    '702', '853', '301',
]


def get_aliccp_fields():
    """Return the field id list and a ``field id -> list position`` lookup.

    The lookup is a ``defaultdict(int)`` rebuilt on every call; the list is the
    module-level ``all_field_list`` itself (not a copy).
    """
    field_index = defaultdict(int)
    field_index.update(
        (field_id, position) for position, field_id in enumerate(all_field_list)
    )
    return all_field_list, field_index


def get_aliccp_columns():
    """Return the output column names: the three label columns, then every field id."""
    return ['label', 'ctr_label', 'cvr_label', *all_field_list]
    

def transform(row, max_len=10, sep=u'\u0001', default_padding='-1'):
    """Convert one raw row into the flat list of string column values.

    Args:
        row: object exposing ``asDict()`` that returns ``{column name: value}``.
            ``_c0`` is skipped as the row number, ``_c1`` is the ctr label and
            ``_c2`` the cvr label. Every other non-empty value must look like
            ``'<field_id>:<feature_id>'``.
        max_len: maximum number of feature ids kept per field (first ones win).
        sep: separator used to join several feature ids of the same field.
        default_padding: value emitted for a field with no feature id in the row.

    Returns:
        A list of strings: ``ctr_label * cvr_label``, ``ctr_label``,
        ``cvr_label``, then one entry per field in ``all_field_list`` order.

    Raises:
        ValueError: if a feature value does not split into exactly two parts on
            ``':'``, or a label / feature id is not an integer literal.
    """
    all_field_list, all_field_dict = get_aliccp_fields()
    # One list of feature ids per field, addressed by the field's position.
    feature_buckets = [[] for _ in all_field_list]

    ctr_label = 0
    # Previously `ctr_label` was assigned twice and `cvr_label` never was, so a
    # row without a `_c2` column failed with UnboundLocalError below.
    cvr_label = 0
    for key, value in row.asDict().items():
        if key == '_c0':  # row number
            continue
        if key == '_c1':
            ctr_label = int(value)
        elif key == '_c2':
            cvr_label = int(value)
        elif value is not None and value != '':
            field_id, feature_id = value.strip().split(':')
            # Fields outside the configured list are dropped.
            if field_id in all_field_dict:
                feature_buckets[all_field_dict[field_id]].append(int(feature_id))

    output_list = [str(ctr_label * cvr_label), str(ctr_label), str(cvr_label)]
    for feature_ids in feature_buckets:
        if feature_ids:
            # Slicing covers both the "too long" and the "short enough" case.
            output_list.append(sep.join(str(x) for x in feature_ids[:max_len]))
        else:
            output_list.append(default_padding)
    return output_list

In [5]:
# Reshape every raw training row with transform() and attach the column names.
fg_train_dataset = (
    train_dataset.rdd
    .map(transform)
    .toDF(get_aliccp_columns())
)

                                                                                

In [6]:
# Row count of the transformed training set, shown as the cell result.
fg_train_dataset.count()

22/05/17 06:57:22 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

100001

In [7]:
# Persist the transformed training set as parquet, replacing any previous output.
fg_train_dataset.write.mode("overwrite").parquet(train_output_path)

                                                                                

In [8]:
# Read the parquet output back and print up to three untruncated rows whose field 101 is '12'.
fg_train_dataset_load = spark.read.parquet(train_output_path)
fg_train_dataset_load.filter(fg_train_dataset_load['101'] == '12').show(n=3, truncate=False)



+-----+---------+---------+---+---------------------------------------+---------------------------------------+---------------------------------------+-----------------------------+---+---+---+---+---+---+---+---+---+---+---+---------------------------------------+---+---+---+---+---+---+
|label|ctr_label|cvr_label|101|109_14                                 |110_14                                 |127_14                                 |150_14                       |121|122|124|125|126|127|128|129|205|206|207|210                                    |216|508|509|702|853|301|
+-----+---------+---------+---+---------------------------------------+---------------------------------------+---------------------------------------+-----------------------------+---+---+---+---+---+---+---+---+---+---+---+---------------------------------------+---+---+---+---+---+---+
|0    |0        |0        |12 |418419420421422423424425426427|535536537538539540541542543544|209210211212213

                                                                                

In [9]:
# Reshape every raw test row with transform() and attach the column names.
fg_test_dataset = (
    test_dataset.rdd
    .map(transform)
    .toDF(get_aliccp_columns())
)

In [10]:
# Row count of the transformed test set, shown as the cell result.
fg_test_dataset.count()

                                                                                

100001

In [11]:
# Persist the transformed test set as parquet, replacing any previous output.
fg_test_dataset.write.mode("overwrite").parquet(test_output_path)

                                                                                

In [12]:
# Read the parquet output back and print up to three untruncated rows whose field 101 is '12'.
fg_test_dataset_load = spark.read.parquet(test_output_path)
fg_test_dataset_load.filter(fg_test_dataset_load['101'] == '12').show(n=3, truncate=False)



+-----+---------+---------+---+---------------------------------------------+---------------------------------------+-----------------------------------------------+----------------------------------------+---+---+---+---+---+---+---+---+------+----+------+-----------------------------------------+------+----+----+----+----+---+
|label|ctr_label|cvr_label|101|109_14                                       |110_14                                 |127_14                                         |150_14                                  |121|122|124|125|126|127|128|129|205   |206 |207   |210                                      |216   |508 |509 |702 |853 |301|
+-----+---------+---------+---+---------------------------------------------+---------------------------------------+-----------------------------------------------+----------------------------------------+---+---+---+---+---+---+---+---+------+----+------+-----------------------------------------+------+----+----+----+----+---+
|0    |

                                                                                

In [13]:
# List the files under the training parquet output prefix.
# NOTE(review): ${MY_S3_BUCKET} is presumably expanded by the shell from the environment - confirm it is exported.
!aws s3 ls ${MY_S3_BUCKET}/aliccp/traindata_10w.parquet/

2022-05-17 06:58:25          0 _SUCCESS
2022-05-17 06:58:15     947384 part-00000-d6feb948-d17c-41e3-9f79-8f74780ed618-c000.snappy.parquet
2022-05-17 06:58:16    1079777 part-00001-d6feb948-d17c-41e3-9f79-8f74780ed618-c000.snappy.parquet
2022-05-17 06:58:16    1069582 part-00002-d6feb948-d17c-41e3-9f79-8f74780ed618-c000.snappy.parquet
2022-05-17 06:58:15     927070 part-00003-d6feb948-d17c-41e3-9f79-8f74780ed618-c000.snappy.parquet
2022-05-17 06:58:24    1065867 part-00004-d6feb948-d17c-41e3-9f79-8f74780ed618-c000.snappy.parquet
2022-05-17 06:58:22     607252 part-00005-d6feb948-d17c-41e3-9f79-8f74780ed618-c000.snappy.parquet


In [14]:
# List the files under the test parquet output prefix.
# NOTE(review): ${MY_S3_BUCKET} is presumably expanded by the shell from the environment - confirm it is exported.
!aws s3 ls ${MY_S3_BUCKET}/aliccp/testdata_10w.parquet/

2022-05-17 06:59:34          0 _SUCCESS
2022-05-17 06:59:24     981117 part-00000-b93ffec3-8b57-4ba2-b06f-626fedbaea29-c000.snappy.parquet
2022-05-17 06:59:24    1113958 part-00001-b93ffec3-8b57-4ba2-b06f-626fedbaea29-c000.snappy.parquet
2022-05-17 06:59:24    1120122 part-00002-b93ffec3-8b57-4ba2-b06f-626fedbaea29-c000.snappy.parquet
2022-05-17 06:59:23     970861 part-00003-b93ffec3-8b57-4ba2-b06f-626fedbaea29-c000.snappy.parquet
2022-05-17 06:59:33    1103592 part-00004-b93ffec3-8b57-4ba2-b06f-626fedbaea29-c000.snappy.parquet
2022-05-17 06:59:30     615321 part-00005-b93ffec3-8b57-4ba2-b06f-626fedbaea29-c000.snappy.parquet


In [None]:
# Shut the Spark session down once both datasets have been written and checked.
spark.stop()