In [1]:
import pandas as pd
import numpy as np 
%matplotlib inline
import time
import random
import pickle
from datetime import datetime
from typing import List, Any
import yaml
import boto3
from tqdm import tqdm

import sagemaker
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.sklearn.processing import SKLearnProcessor, ScriptProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.feature_extraction import FeatureHasher
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

https://www.alpha-quantum.com/blog/ctr-prediction/ctr-prediction-using-hashing-trick-logistic-regression-sgd-and-only-simple-python/

In [2]:
SETTING_FILE_PATH = '../settings.yaml'

In [3]:
# df_train = pd.read_csv('../avazu-ctr-prediction/train')
# df_train_partial = df_train[df_train.index % 100 == 0]
# df_train_partial.to_csv('../avazu-ctr-prediction/train_partial', index=False)
df_train = pd.read_csv('../avazu-ctr-prediction/train_partial')

In [4]:
df_test = pd.read_csv('../avazu-ctr-prediction/test')

In [74]:
pd.concat([df_train_partial.nunique().to_frame(), df_train_partial.dtypes.to_frame(), df_train_partial.isna().any().to_frame()], axis=1)

Unnamed: 0,0,0.1,0.2
id,404290,float64,False
click,2,int64,False
hour,240,int64,False
C1,7,int64,False
banner_pos,7,int64,False
site_id,2171,object,False
site_domain,2147,object,False
site_category,20,object,False
app_id,2245,object,False
app_domain,132,object,False


In [73]:
## hasing trick, train valid test split

In [5]:
df_train, df_valid = train_test_split(df_train, train_size=0.8)

In [6]:
def preprocess(df: pd.DataFrame):
    df['hour'] = df['hour'].map(lambda x: datetime.strptime(str(x), "%y%m%d%H"))
    df['day_of_week'] = df['hour'].map(lambda x: x.hour)
    
    feature_hasher = FeatureHasher(n_features=2**24, input_type='string')
    hashed_feature = feature_hasher.fit_transform(np.asanyarray(df.astype(str)))
    
    return hashed_feature

In [7]:
current_time_sec = int(round(time.time()))
df_train['event_time'] = pd.Series([current_time_sec]*len(df_train), dtype="float64")


In [8]:
feature_cols = ['id', 'event_time', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
                            'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
                            'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
                            'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

target = 'click'


In [3]:

def hashing(x: str, n_features=2**24)-> int:
    return hash(x) % n_features

def hashing_from_dataframe(df: pd.DataFrame, n_features=2**24):

    df_hashed = np.zeros((df.shape[0],n_features), dtype=int)
    for row in tqdm(range(df.shape[0])):
        for col in range(df.shape[1]):
            index = hashing(str(df.iloc[row, col])) + 1
            df_hashed[row, index] += 1
    return df_hashed

In [9]:
y_train = df_train[target].values
y_train = np.asarray(y_train).ravel()

X_train = df_train[feature_cols]
X_train_hashed = preprocess(X_train)

In [None]:
# np.zeros((X_train.shape[0], 2*24), dtype=int)
hashing_from_dataframe(X_train)

 74%|██████████████████████████████████            | 239677/323432 [03:25<01:52, 741.72it/s]

In [3]:
with open('train_data', 'wb') as p:
    pickle.dump({"feature": X_train_hashed, 'target': y_train}, p)

NameError: name 'X_train_hashed' is not defined

In [148]:
model = SGDClassifier(loss='log', alpha=0.00001, penalty='l2', eta0=2.0, n_jobs=-1, random_state=42)
# model = LogisticRegression()
model.partial_fit(X_train_hashed, y_train, classes=[0, 1])


SGDClassifier(alpha=1e-05, eta0=2.0, loss='log', n_jobs=-1, random_state=42)

In [149]:
y_valid = df_valid[target]
y_valid = np.asarray(y_valid).ravel()

X_valid = df_valid[feature_cols]
X_valid_hashed = preprocess(X_valid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [156]:
y_pred = model.predict_proba(X_valid_hashed)

In [4]:
with open(SETTING_FILE_PATH) as file:
    aws_info = yaml.safe_load(file)

Unnamed: 0,id,event_time,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,...,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,day_of_week
33676,1.508678e+19,1.651367e+09,0,2014-10-21 17:00:00,1005,1,791b5571,fd13bc73,f028772b,ecad2386,...,0,16208,320,50,1800,3,167,100077,23,17
291370,7.673021e+18,1.651367e+09,0,2014-10-28 10:00:00,1005,1,e4d8dd7b,a17bde68,3e814130,ecad2386,...,0,22682,320,50,2528,0,39,100081,221,10
338742,1.389177e+18,,0,2014-10-29 09:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,0,23160,320,50,2667,0,35,100188,221,9
247910,4.019017e+18,1.651367e+09,0,2014-10-27 07:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,0,15705,320,50,1722,0,35,-1,79,7
327883,1.377997e+19,,0,2014-10-29 03:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,f0d41ff1,...,0,22988,320,50,2657,3,38,100032,23,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81005,1.506654e+19,1.651367e+09,0,2014-10-22 15:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,0,15701,320,50,1722,0,35,-1,79,15
201538,4.093370e+18,1.651367e+09,0,2014-10-26 01:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,92f5800b,...,3,21191,320,50,2424,1,161,100193,71,1
177333,2.093357e+18,1.651367e+09,0,2014-10-25 10:00:00,1010,1,85f751fd,c4e18dd6,50e219e0,7eee9ec3,...,0,22335,320,50,2578,0,1315,100131,110,10
184858,1.178875e+18,1.651367e+09,0,2014-10-25 14:00:00,1005,1,b7e9786d,b12b9f85,f028772b,ecad2386,...,0,19950,320,50,1800,3,167,100076,23,14


In [5]:
sess = sagemaker.Session()
role = aws_info['aws']['sagemaker']['role']
bucket = aws_info['aws']['sagemaker']['s3bucket']
region = boto3.Session().region_name

sm = boto3.client('sagemaker')
featurestore_runtime = boto3.client("sagemaker-featurestore-runtime")
s3 = boto3.client('s3')

'ap-northeast-1'

In [35]:
prefix = "ctr-prediction-feature-store"
feature_group_name = "ctr-prediction-group"

feature_definitions = [
    FeatureDefinition(feature_name=feature_name, feature_type=FeatureTypeEnum.STRING)
    for feature_name in feature_cols
]

feature_group = FeatureGroup(name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sess)


In [243]:
record_identifier_name = "id"
event_time_feature_name = "event_time"

In [247]:
feature_group.create(
    s3_uri=f"s3://{bucket}/{prefix}",
    record_identifier_name = record_identifier_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=False)

{'FeatureGroupArn': 'arn:aws:sagemaker:ap-northeast-1:547760918250:feature-group/ctr-prediction-group',
 'ResponseMetadata': {'RequestId': '38cc9a61-2b99-4ad5-a6af-069e44d781fd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '38cc9a61-2b99-4ad5-a6af-069e44d781fd',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '102',
   'date': 'Thu, 21 Apr 2022 16:16:21 GMT'},
  'RetryAttempts': 0}}

In [248]:
import time


def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")
    
wait_for_feature_group_creation_complete(feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup ctr-prediction-group successfully created.


In [249]:
def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        data_frame[label] = data_frame[label].astype("str").astype("string")
    return data_frame

In [251]:
output_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
df_train['event_time'] = output_date

feature_group.ingest(data_frame=cast_object_to_string(df_train), max_workers=3, wait=True)


KeyboardInterrupt: 

In [218]:
feature_store_query = feature_group.athena_query()

In [220]:
feature_store_table = feature_store_query.table_name

In [221]:
print(feature_group.as_hive_ddl())

CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.ctr-prediction-group (
  id STRING
  event_time STRING
  click STRING
  hour STRING
  C1 STRING
  banner_pos STRING
  site_id STRING
  site_domain STRING
  site_category STRING
  app_id STRING
  app_domain STRING
  app_category STRING
  device_id STRING
  device_ip STRING
  device_model STRING
  device_type STRING
  device_conn_type STRING
  C14 STRING
  C15 STRING
  C16 STRING
  C17 STRING
  C18 STRING
  C19 STRING
  C20 STRING
  C21 STRING
  write_time TIMESTAMP
  event_time TIMESTAMP
  is_deleted BOOLEAN
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
LOCATION 's3://ctr-prediction/ctr-prediction-feature-store/547760918250/sagemaker/ap-northeast-1/offline-store/ctr-prediction-group'


In [222]:
query_string = """
SELECT id, hour, click 
FROM "{}" LIMIT 5
""".format(
    feature_store_table
)

In [223]:
feature_store_query.run(query_string=query_string, output_location="s3://" + bucket + "/" + prefix + "/query_results/")

feature_store_query.wait()

In [224]:
dataset = pd.DataFrame()

dataset = feature_store_query.as_dataframe()

dataset

Unnamed: 0,id,hour,click
0,1.10805e+19,1410211400,0
1,1.149118e+19,1410211400,0
2,1.164484e+19,1410211400,1
3,1.253413e+19,1410211400,0
4,1.254452e+19,1410211400,1


In [35]:
processing_instance_type = "ml.t3.medium"
processing_instance_count = 1
train_valid_split_percentage = 0.8
processing_job_name = "ctr-prediction-sklearn-preprocessor"

processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    max_runtime_in_seconds=7200,
)

In [36]:
processor.run(
    code="ctr-prediction-preprocessor.py",
    inputs=[ProcessingInput(
                    source="s3://ctr-prediction/input/",
                    destination="/opt/ml/processing/input"),
           ],
    outputs=[
        ProcessingOutput(source="/opt/ml/processing/output/train",
                                    destination="s3://ctr-prediction/output/"
                        ),
        ProcessingOutput(source="/opt/ml/processing/output/validation",
                                    destination="s3://ctr-prediction/output/"
                        ),
    ],
     arguments=[
        "--train_valid_split_percentage",
        str(0.9)],
    wait=True,
    logs=True,
    job_name=processing_job_name,
    experiment_config=None    
)


Job Name:  ctr-prediction-sklearn-preprocessor
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://ctr-prediction/input/', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-1-547760918250/ctr-prediction-sklearn-preprocessor/input/code/ctr-prediction-preprocessor.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://ctr-prediction/output/', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'output-2', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://ctr-prediction/output/', 'LocalPath': '/opt/ml/process

In [39]:
sm = boto3.client('sagemaker')
jobs = sm.list_processing_jobs()
pd.DataFrame(jobs['ProcessingJobSummaries'])[:1]

Unnamed: 0,ProcessingJobName,ProcessingJobArn,CreationTime,ProcessingEndTime,LastModifiedTime,ProcessingJobStatus,FailureReason
0,ctr-prediction-sklearn-preprocessor,arn:aws:sagemaker:ap-northeast-1:547760918250:...,2022-05-02 10:29:33.437000+09:00,2022-05-02 10:41:48.720000+09:00,2022-05-02 10:41:49.209000+09:00,Completed,


In [33]:
processor.run(
    code="ctr-prediction-preprocessor.py",
    inputs=[ProcessingInput(
                    source="./train_partial",
                    destination="/opt/ml/processing/input"),
           ],
    outputs=[
        ProcessingOutput(source="/opt/ml/processing/output/train",
                                    destination="s3://ctr-prediction/output/"
                        ),
        ProcessingOutput(source="/opt/ml/processing/output/validation",
                                    destination="s3://ctr-prediction/output/"
                        ),
    ],
     arguments=[
        "--train_valid_split_percentage",
        str(0.9)],
    wait=True,
    logs=True,
    job_name=processing_job_name,
    experiment_config=None    
)


Job Name:  sagemaker-scikit-learn-2022-05-02-00-58-15-584
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-1-547760918250/sagemaker-scikit-learn-2022-05-02-00-58-15-584/input/input-1/train_partial', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-1-547760918250/sagemaker-scikit-learn-2022-05-02-00-58-15-584/input/code/ctr-prediction-preprocessor.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://ctr-prediction/output/', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'o

In [50]:
processor_description = processor.jobs[-1].describe()
processor_description

{'ProcessingInputs': [{'InputName': 'input-1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://ctr-prediction/input/',
    'LocalPath': '/opt/ml/processing/input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-1-547760918250/ctr-prediction-sklearn-preprocessor/input/code/ctr-prediction-preprocessor.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'output-1',
    'S3Output': {'S3Uri': 's3://ctr-prediction/output/',
     'LocalPath': '/opt/ml/processing/output/train',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False},
   {'OutputName': 'output-2',
    'S3Output': {'S3Uri': 's3://ct

In [6]:

account_id = boto3.client('sts').get_caller_identity().get('Account')

ecr_repository = f'ctr-preprocessor-custom:latest'
image_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/{ecr_repository}'
!docker build . -t $image_uri

!aws ecr get-login-password --region $region | docker login --username AWS --password-stdin $account_id.dkr.ecr.$region.amazonaws.com
 

!aws ecr create-repository --repository-name $ecr_repository
 
!docker build -t {ecr_repository} .
!docker tag {ecr_repository} $image_uri
!docker push $image_uri


[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                                         
[?25h[1A[0G[?25l[+] Building 0.1s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 43B                                        0.0s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 2B                                            0.0s
[0m => [internal] load metadata for docker.io/library/python:3.8-slim-buster  0.1s
[?25h[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (3/4)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 43B                                        0.0s
[0m[34m => [internal] load .dockerignore                           

[0m[?25h[1A[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 2.1s (4/4)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 43B                                        0.0s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 2B                                            0.0s
[0m[34m => [internal] load metadata for docker.io/library/python:3.8-slim-buster  2.0s
[0m[34m => [auth] library/python:pull token for registry-1.docker.io              0.0s
[0m[?25h[1A[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 2.1s (7/7) FINISHED                                                
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 43B                                        0.0s
[0m[34m => [internal] load .dockerig

In [17]:


processing_instance_type = "ml.m5.xlarge"
processing_instance_count = 1
train_valid_split_percentage = 0.8
processing_job_name = "ctr-prediction-custom-preprocessor"

script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=image_uri,
    role=role,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type
)

In [18]:
script_processor.run(
    code="custom-preprocessor.py",
    inputs=[ProcessingInput(
                    source="s3://ctr-prediction/input/",
                    destination="/opt/ml/processing/input"),
           ],
    outputs=[
        ProcessingOutput(source="/opt/ml/processing/output/train",
                                    destination="s3://ctr-prediction/output/"
                        ),
        ProcessingOutput(source="/opt/ml/processing/output/validation",
                                    destination="s3://ctr-prediction/output/"
                        ),
    ],
     arguments=[
        "--train_valid_split_percentage",
        str(0.9)],
    wait=True,
    logs=True,
    job_name=processing_job_name,
    experiment_config=None    
)
script_processor_job_description = script_processor.jobs[-1].describe()
print(script_processor_job_description)


Job Name:  ctr-prediction-custom-preprocessor
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://ctr-prediction/input/', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-1-547760918250/ctr-prediction-custom-preprocessor/input/code/custom-preprocessor.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://ctr-prediction/output/', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'output-2', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://ctr-prediction/output/', 'LocalPath': '/opt/ml/processing/output

UnexpectedStatusException: Error for Processing job ctr-prediction-custom-preprocessor: Failed. Reason: AlgorithmError: See job logs for more information