## Importing Libraries

###### This project analyzes data from a bank's prior marketing campaigns and aims to predict whether a customer will subscribe to the fixed-term deposit product the bank is offering.

In [2]:
import boto3
import pandas as pd
from sagemaker import get_execution_role

## Reading and combining into one file

In [3]:
# IAM role attached to this notebook and the bucket holding the raw extracts.
role = get_execution_role()
bucket = 'sagemakerstudio12'
s = boto3.client('s3')

# The source files are addressed as s3://<bucket>/1.csv ... 5.csv.
# (The earlier list_objects_v2 call was dropped: its response was never read.)
keys = [f"s3://{bucket}/{i}.csv" for i in range(1, 6)]

# ignore_index=True gives the combined frame one continuous 0..N-1 index
# instead of repeating each file's own row numbers.
data = pd.concat([pd.read_csv(k) for k in keys], ignore_index=True)

In [27]:
# Sanity check: print the (rows, columns) shape of every source file.
for csv_uri in keys:
    print(pd.read_csv(csv_uri).shape)

(1000, 21)
(1000, 21)
(1000, 21)
(1000, 21)
(1000, 21)


In [29]:
# Persist the combined frame once, without the index column, then reload it so
# `df` holds exactly what a consumer of combine.csv will read. Writing with
# index=False up front replaces the previous write / re-read / drop-first-column
# / re-write sequence, which depended on the index landing in column 0.
data.to_csv('combine.csv', index=False)
df = pd.read_csv('combine.csv')

## Uploading the combined dataset into the bucket

In [5]:
# Upload the local combine.csv to the bucket root under the same name.
s.upload_file(
    Filename='combine.csv',
    Bucket=bucket,
    Key='combine.csv',
)

In [6]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no


In [30]:
# (rows, columns) of the combined frame.
df.shape

(5000, 21)

### Checking for null values
#### There are no null values

In [7]:
# Count missing values per column.
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

## Importing SageMaker modules and dividing the dataset into training, validation, and test sets

In [12]:
import sagemaker
import os
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
import numpy as np
from time import gmtime, strftime

# Session-level handles reused by later cells.
region = boto3.Session().region_name
smclient = boto3.Session().client("sagemaker")
role = sagemaker.get_execution_role()
pr = 'coder'  # S3 key prefix for everything this notebook uploads

# Indicator features: 1 where pdays holds the value 999, and 1 where the job
# is one of student / retired / unemployed.
df["no_previous_contact"] = np.where(df["pdays"] == 999, 1, 0)
# Series.isin replaces np.in1d, which is deprecated in NumPy 2.0.
df["not_working"] = np.where(df["job"].isin(["student", "retired", "unemployed"]), 1, 0)

# One-hot encode the categorical columns (the target `y` becomes y_no / y_yes).
# dtype=int pins the indicators to 0/1: pandas 2.x defaults to bool, which would
# be written as True/False in the headerless CSVs produced downstream.
model_data = pd.get_dummies(df, dtype=int)
model_data = model_data.drop(
    ["euribor3m", "cons.price.idx", "cons.conf.idx", "nr.employed", "duration", "emp.var.rate"],
    axis=1,
)

# Shuffle with a fixed seed, then cut 70% / 20% / 10% into train / validation /
# test. Positional slices give the same partitions as np.split with these
# boundaries, without relying on DataFrame.swapaxes (deprecated in pandas 2.1).
shuffled = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(model_data))
validation_end = int(0.9 * len(model_data))
train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

In [13]:
# Write each split as a headerless, index-free CSV with the target (y_yes) in
# the first column, followed by every feature column.
for csv_name, split_frame in {
    "train.csv": train_data,
    "validation.csv": validation_data,
    "test.csv": test_data,
}.items():
    target = split_frame["y_yes"]
    features = split_frame.drop(["y_no", "y_yes"], axis=1)
    pd.concat([target, features], axis=1).to_csv(csv_name, index=False, header=False)

### Uploading the training and validation sets to the S3 bucket and defining the training inputs

In [15]:
# Upload the train and validation CSVs to <pr>/<channel>/<file> in the bucket.
# S3 keys always use "/" as the separator, so the key is built explicitly
# rather than with os.path.join (which follows the local OS convention).
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
for channel, local_file in (("train", "train.csv"), ("validation", "validation.csv")):
    s3_bucket.Object(f"{pr}/{channel}/{local_file}").upload_file(local_file)

In [16]:
from sagemaker.inputs import TrainingInput

# Both channels live under s3://<bucket>/<pr>/ and are read as CSV.
s3_prefix = f"s3://{bucket}/{pr}"

s3_input_train = TrainingInput(s3_data=f"{s3_prefix}/train", content_type="csv")
s3_input_validation = TrainingInput(s3_data=f"{s3_prefix}/validation", content_type="csv")

### Performing Hyperparameter Tuning

In [17]:
# NOTE(review): get_image_uri is imported but not referenced in this cell;
# the image is resolved through sagemaker.image_uris.retrieve instead.
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.image_uris import retrieve

sess = sagemaker.Session()

# Resolve the XGBoost training image for the current region.
container = retrieve(framework="xgboost", region=region, version="latest")

# Metric the tuners will optimise.
objective_metric_name = "validation:auc"

# Single-instance estimator; output is written under <pr>/output in the bucket.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/{pr}/output",
    base_job_name="xgboost-random-search",
    sagemaker_session=sess,
)

# Hyperparameters held fixed on the estimator.
xgb.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="auc",
    num_round=10,
    rate_drop=0.3,
    tweedie_variance_power=1.4,
)

## Performing Logarithmic Scaling

In [18]:
# alpha and lambda share the same search range, sampled on a logarithmic scale.
hyperparameter_ranges = {
    name: ContinuousParameter(min_value=0.01, max_value=10, scaling_type="Logarithmic")
    for name in ("alpha", "lambda")
}

In [19]:
# Random search over the log-scaled ranges: 5 training jobs, up to 5 in parallel.
tuner_log = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Random",
    max_jobs=5,
    max_parallel_jobs=5,
)

# The job name carries a UTC timestamp, so each run gets a distinct name.
tuner_log.fit(
    inputs={"train": s3_input_train, "validation": s3_input_validation},
    job_name=f"xgb-randsearch-{strftime('%Y%m%d-%H-%M-%S', gmtime())}",
    include_cls_metadata=False,
)

.....................................................!


In [20]:
# Fetch the description of the log-scaling tuning job and show its status.
log_tuning_job = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner_log.latest_tuning_job.job_name
)
log_tuning_job["HyperParameterTuningJobStatus"]

'Completed'

### Performing Linear Scaling

In [21]:
# Same alpha / lambda range as the logarithmic search, sampled on a linear scale.
hyperparameter_ranges_linear = {
    name: ContinuousParameter(min_value=0.01, max_value=10, scaling_type="Linear")
    for name in ("alpha", "lambda")
}

In [22]:
# Random search over the linearly scaled ranges: 5 training jobs, up to 5 in parallel.
tuner_linear = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges_linear,
    strategy="Random",
    max_jobs=5,
    max_parallel_jobs=5,
)

# The job name carries a UTC timestamp, so each run gets a distinct name.
tuner_linear.fit(
    inputs={"train": s3_input_train, "validation": s3_input_validation},
    job_name=f"xgb-linsearch-{strftime('%Y%m%d-%H-%M-%S', gmtime())}",
    include_cls_metadata=False,
)

......................................................!


In [23]:
# Fetch the description of the linear-scaling tuning job and show its status.
linear_tuning_job = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner_linear.latest_tuning_job.job_name
)
linear_tuning_job["HyperParameterTuningJobStatus"]

'Completed'

## Comparing them

In [24]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt


def _tuning_job_status(tuner):
    """Return the status string of the latest tuning job started by ``tuner``."""
    return boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
    )["HyperParameterTuningJobStatus"]


def _tuning_results(tuner, scaling):
    """Return the analytics frame for ``tuner``'s latest job, tagged with ``scaling``."""
    results = sagemaker.HyperparameterTuningJobAnalytics(
        tuner.latest_tuning_job.job_name
    ).dataframe()
    results["scaling"] = scaling
    return results


# Both searches must have finished before their results can be compared.
status_log = _tuning_job_status(tuner_log)
status_linear = _tuning_job_status(tuner_linear)
assert status_log == "Completed", "First must be completed, was {}".format(status_log)
assert status_linear == "Completed", "Second must be completed, was {}".format(status_linear)

# One frame per search, each labelled with its scaling type, stacked into one.
df_log = _tuning_results(tuner_log, "log")
df_linear = _tuning_results(tuner_linear, "linear")
# NOTE(review): this rebinds `df`, which earlier cells use for the campaign
# data; re-running those cells after this one will see the tuning results.
df = pd.concat([df_log, df_linear], ignore_index=True)

## Deploying

In [26]:
# Deploy from the linear-scaling tuner onto a single ml.m4.xlarge instance.
# NOTE(review): tuner_linear is used regardless of how it compared with
# tuner_log in the previous cell — confirm that is intended.
predictor = tuner_linear.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")


2022-06-15 16:37:38 Starting - Preparing the instances for training
2022-06-15 16:37:38 Downloading - Downloading input data
2022-06-15 16:37:38 Training - Training image download completed. Training in progress.
2022-06-15 16:37:38 Uploading - Uploading generated training model
2022-06-15 16:37:38 Completed - Training job completed
------!

##### The model trained using AutoML selected XGBoost as the best algorithm, and the training sets are saved in the bucket, as shown in the screenshots in the attached document.

### Deleting the Endpoint

In [31]:
# Delete the endpoint that backs `predictor`.
sess.delete_endpoint(endpoint_name=predictor.endpoint_name)