In [12]:
import io
import os
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from IPython.display import display

# 1. 数据处理

In [13]:
# 读取之前处理好的features.csv文件
features_df = pd.read_csv('features.csv')
features_df

Unnamed: 0,item_id,live_id,comment_count,fav_count,live_price,after_speak_sale,anchor_id,avg_pv,fans_num,item_count_x,...,channel_id_69,channel_id_7,channel_id_8,channel_id_81,channel_id_82,channel_id_9,channel_id_97,re_location_0,re_location_1,re_location_2
0,6.140904e+11,2.724494e+11,969.0,3040.0,89.9,341.0,427785947,16522.0,1788740.0,189.0,...,0,1,0,0,0,0,0,0,0,1
1,6.216509e+11,2.724494e+11,17.0,428.0,89.9,165.0,427785947,16522.0,1788740.0,189.0,...,0,1,0,0,0,0,0,0,0,1
2,6.184656e+11,2.724494e+11,415.0,4788.0,139.0,345.0,427785947,16522.0,1788740.0,189.0,...,0,1,0,0,0,0,0,0,0,1
3,6.222409e+11,2.724494e+11,15.0,700.0,69.9,379.0,427785947,16522.0,1788740.0,189.0,...,0,1,0,0,0,0,0,0,0,1
4,6.226983e+11,2.724494e+11,0.0,166.0,129.0,108.0,427785947,16522.0,1788740.0,189.0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64151,6.186700e+11,2.715348e+11,1.0,2.0,1000.0,1.0,2333208945,7643.0,4615.0,17.0,...,0,0,0,0,0,0,0,0,0,1
64152,6.225621e+11,2.708427e+11,0.0,0.0,888.0,0.0,2208537528,6793.0,14183.0,221.0,...,0,1,0,0,0,0,0,0,0,1
64153,5.887962e+11,2.720551e+11,211.0,498.0,98.0,16.0,754205053,3041.0,5383.0,91.0,...,0,0,1,0,0,0,0,0,0,1
64154,6.186530e+11,2.718611e+11,0.0,0.0,456.0,0.0,1730501906,3509.0,18202.0,361.0,...,0,1,0,0,0,0,0,0,0,1


In [14]:
# 切分数据集
y = features_df[['after_speak_sale']]
X = features_df.drop(['after_speak_sale', 'item_id', 'live_id', 'anchor_id'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [15]:
# 拼接label与特征，并保存csv文件
train_data =  pd.concat([y_train, X_train], axis=1)
validation_data = pd.concat([y_val, X_val], axis=1)
train_data.to_csv('xgb_train.csv', header=False, index=False)
validation_data.to_csv('xgb_validation.csv', header=False, index=False)

In [16]:
# 上传数据
sess = sagemaker.Session()
role = get_execution_role()
bucket = sess.default_bucket()
prefix = 'recommendation-xgb-sort'
train_key = 'train.csv'
val_key = 'validation.csv'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', train_key)).upload_file('xgb_train.csv')
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', val_key)).upload_file('xgb_validation.csv')

# 2. 模型训练

In [17]:
# 定义模型训练相关参数
container = get_image_uri(boto3.Session().region_name, 'xgboost', repo_version='latest')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [18]:
# 定义estimator

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m5.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)

xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='reg:linear',
                        num_round=100)

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation}) 

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2021-07-14 07:06:06 Starting - Starting the training job...
2021-07-14 07:06:08 Starting - Launching requested ML instancesProfilerReport-1626246366: InProgress
......
2021-07-14 07:07:24 Starting - Preparing the instances for training.........
2021-07-14 07:09:04 Downloading - Downloading input data...
2021-07-14 07:09:25 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2021-07-14:07:09:26:INFO] Running standalone xgboost training.[0m
[34m[2021-07-14:07:09:26:INFO] File size need to be processed in the node: 11.99mb. Available memory size in the node: 8063.86mb[0m
[34m[2021-07-14:07:09:26:INFO] Determined delimiter of CSV input is ','[0m
[34m[07:09:26] S3DistributionType set as FullyReplicated[0m
[34m[07:09:26] 46192x82 matrix with 3787744 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-07-14:07:09:26:INFO] Determined delimiter of CSV input is ','[0m
[34m[07:09:26] S3Distribut

# 3. 模型部署与推理

In [19]:
# 部署模型
from sagemaker.serializers import CSVSerializer

xgb_predictor = xgb.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge', serializer=CSVSerializer())

----------------!

In [20]:
# 推理
print(X_test.to_numpy()[0:1])
y_pred = xgb_predictor.predict(X_test.to_numpy()[0:50])
y_pred = str(y_pred)[2:-1].split(',')
y_pred = pd.Series([ float(i) for i in y_pred ])
compare_df = pd.DataFrame({'y_test': y_test['after_speak_sale'][:50].reset_index(drop=True), 'y_pred': y_pred})
compare_df

[[5.530000e+02 1.426000e+03 4.990000e+02 6.357800e+04 2.437381e+06
  1.890000e+02 1.500000e+01 4.290000e+02 7.300000e+01 5.040800e+04
  2.031060e+05 4.314900e+04 1.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
  0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 0.000000

Unnamed: 0,y_test,y_pred
0,33.0,642.925842
1,757.0,483.187531
2,1226.0,677.491089
3,0.0,253.118683
4,1684.0,3636.695068
5,34.0,103.408485
6,21.0,494.175323
7,58.0,129.683502
8,36.0,308.347504
9,29.0,103.408485


In [21]:
for i in sorted(compare_df['y_test'][:50], reverse=True):
    print(list(y_test['after_speak_sale'][:50]).index(i), end=' ')

30 44 48 13 12 4 2 19 40 39 1 31 17 27 34 21 21 18 24 49 35 28 47 41 42 7 8 23 5 0 0 9 11 38 6 10 25 46 15 32 14 14 14 3 3 3 3 3 3 3 

In [22]:
for i in sorted(compare_df['y_pred'], reverse=True):
    print(list(y_pred).index(i), end=' ')

44 30 40 48 4 13 22 34 17 41 12 47 2 0 19 19 27 6 1 1 35 18 24 28 8 38 3 46 29 42 49 21 33 7 37 16 5 5 5 5 5 5 5 5 5 32 45 14 43 36 