### Necessary Imports

In [1]:
%pip install s3fs

Collecting s3fs
  Downloading s3fs-2024.3.1-py3-none-any.whl.metadata (1.6 kB)
Collecting fsspec==2024.3.1 (from s3fs)
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Downloading s3fs-2024.3.1-py3-none-any.whl (29 kB)
Downloading fsspec-2024.3.1-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, s3fs
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.6.0
    Uninstalling fsspec-2023.6.0:
      Successfully uninstalled fsspec-2023.6.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-ai 2.11.0 requires faiss-cpu, which is not installed.
datasets 2.18.0 requires fsspec[http]<=2024.2.0,>=2023.1.0, but you have fsspec 2024.3.1 which is incompatible.
jupyter-scheduler 2.5.1 r

In [2]:
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri

import numpy as np
import io
import pandas as pd
from sklearn.model_selection import train_test_split

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


### Data loading

Read the CSV file **from an S3 bucket into a pandas DataFrame**. 

In [4]:
# Location of the raw churn dataset (reading s3:// paths relies on the s3fs
# package installed in the first cell)
DATA_URI = 's3://team-4-fp/Bank Customer Churn Prediction.csv'

df = pd.read_csv(DATA_URI)

# Preview the first five rows
df.head()


Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# Drop rows with missing values
df = df.dropna()

In [6]:
# customer_id is not needed as a feature, so remove that column
df = df.drop(columns=['customer_id'])

# Preview the frame without the identifier column
df.head()

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
# Show the structure of the data: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   credit_score      10000 non-null  int64  
 1   country           10000 non-null  object 
 2   gender            10000 non-null  object 
 3   age               10000 non-null  int64  
 4   tenure            10000 non-null  int64  
 5   balance           10000 non-null  float64
 6   products_number   10000 non-null  int64  
 7   credit_card       10000 non-null  int64  
 8   active_member     10000 non-null  int64  
 9   estimated_salary  10000 non-null  float64
 10  churn             10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


### Data Preprocessing

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Column groups: which features get one-hot encoded and which get scaled
cat_cols = ['country', 'gender', 'credit_card', 'active_member']
num_cols = ['credit_score', 'age', 'tenure', 'balance', 'products_number', 'estimated_salary']

# Separate the target label from the features
y = df['churn']
X = df.drop(columns='churn')

# Numerical pipeline: fill missing values with the column mean, then standardise
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent value,
# then one-hot encode (categories unseen during fit are ignored at transform time)
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# ColumnTransformer routing each column group through its pipeline;
# any column not listed in either group is passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
    ],
    remainder='passthrough',
)


### Train / Val / Test split

Splitting the data into train (80%), validation (10%) and test (10%) sets. 

In [9]:
# A single seed shared by both splits keeps the partition reproducible
SPLIT_SEED = 1200

# Stage 1: 80% of the rows go to training, the remaining 20% to a temporary hold-out
train_X, temp_X, train_y, temp_y = train_test_split(
    X, y, train_size=0.8, random_state=SPLIT_SEED
)

# Stage 2: halve the hold-out, giving validation and test sets of 10% of the total each
valid_X, test_X, valid_y, test_y = train_test_split(
    temp_X, temp_y, train_size=0.5, random_state=SPLIT_SEED
)

# Fit the preprocessor on the training features only, then transform them
train_X = preprocessor.fit_transform(train_X)

# Apply the already-fitted preprocessor to the validation and test features
valid_X, test_X = (preprocessor.transform(part) for part in (valid_X, test_X))

### Transforming the preprocessed data

In [10]:
from scipy.sparse import hstack, csr_matrix
import numpy as np


# train_y, valid_y and test_y are pandas Series: reshape each one into a
# single-column sparse matrix so it can be stacked next to its features
train_y_sparse, valid_y_sparse, test_y_sparse = (
    csr_matrix(labels.values.reshape(-1, 1))
    for labels in (train_y, valid_y, test_y)
)

# Stack horizontally with the label as the first column, followed by the
# preprocessed feature columns of the same split
train_set, valid_set, test_set = (
    hstack([label_col, features])
    for label_col, features in (
        (train_y_sparse, train_X),
        (valid_y_sparse, valid_X),
        (test_y_sparse, test_X),
    )
)


### Defining the S3 upload function

In [11]:
# Notebook-level boto3 S3 resource; upload_to_s3 below uses it to write objects
s3 = boto3.resource('s3')

def upload_to_s3(matrix, bucket, filename, s3_resource=None):
    """Serialise a matrix as header-less CSV text and store it as an S3 object.

    Args:
        matrix: 2-D data to upload. Objects exposing ``toarray()`` (SciPy
            sparse matrices) are densified first; anything else (e.g. a NumPy
            array) is handed to pandas unchanged.
        bucket: Name of the destination S3 bucket.
        filename: Object key to write inside ``bucket``.
        s3_resource: Optional object exposing ``Object(bucket, key)`` whose
            result has ``put(Body=...)``. When omitted, the notebook-level
            ``s3`` resource is used.

    Returns:
        None. The CSV text is written to ``bucket``/``filename``.
    """
    # Sparse matrices must be densified before pandas can tabulate them;
    # dense input has no toarray() and is used as-is.
    dense = matrix.toarray() if hasattr(matrix, 'toarray') else matrix
    frame = pd.DataFrame(dense)

    # Render into an in-memory text buffer, without header row or index column
    csv_buffer = io.StringIO()
    frame.to_csv(csv_buffer, header=False, index=False)

    # The global is only looked up when no resource was passed explicitly
    target = s3 if s3_resource is None else s3_resource

    # getvalue() returns the whole buffer regardless of the cursor position,
    # so no rewind is needed before uploading
    target.Object(bucket, filename).put(Body=csv_buffer.getvalue())

With the helper defined, we upload the training and validation splits. 

In [12]:
# Training split -> s3://team-4-fp/train.csv
upload_to_s3(matrix=train_set, bucket='team-4-fp', filename='train.csv')

In [13]:
# Validation split -> s3://team-4-fp/valid.csv
upload_to_s3(matrix=valid_set, bucket='team-4-fp', filename='valid.csv')

## Setting up the model


We utilize the Estimator class from the sagemaker.estimator module to establish the environment for running training jobs for our model. Key configurations include:

Container Name: We're leveraging a pre-existing Docker container for XGBoost, specified with sagemaker.image_uris.retrieve, indicating the algorithm version and AWS region.

Role Name: The execution role fetched by sagemaker.get_execution_role(), granting necessary permissions for the training job, akin to roles used in Lambda functions.

Instance Count: We're starting with a single instance (instance_count=1) for training, keeping scalability in mind for larger jobs.
Instance Type: Selected as 'ml.m4.xlarge', a type included in the SageMaker Free Tier, to balance cost and performance.

Output Path: Designated as s3://team-4-fp/sagemaker-output/, where the model artifacts and related information will be stored.

Hyperparameters: Configured for a binary classification task, including parameters such as 'objective', 'eval_metric', and 'num_round', among others, to tune the XGBoost model effectively.

Current Session: Utilizes sagemaker.Session(), necessary for internal management within SageMaker's environment.

This setup meticulously specifies the training environment, data locations, model parameters, and AWS resources, ensuring a streamlined and efficient model training process on SageMaker.

In [14]:
# Execution role of this SageMaker session and the AWS region it runs in
role = sagemaker.get_execution_role()
region_name = boto3.Session().region_name

# Docker image of the XGBoost container for that region and algorithm version
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region_name,
    version='0.90-1',
)

# S3 location that receives the trained model artifacts
output_location = 's3://team-4-fp/sagemaker-output/'

# XGBoost hyperparameters for the binary classification task
hyperparams = {
    'objective': 'binary:logistic',  # learning task: binary classification
    'eval_metric': 'auc',            # evaluation metric: area under the curve
    'num_round': '20',               # number of boosting rounds
    'eta': 0.1,                      # learning rate, to control overfitting
    'max_depth': 6,                  # maximum tree depth, for complexity control
    'min_child_weight': 1,           # minimum sum of instance weight (hessian) needed in a child
    'subsample': 0.8,                # row subsampling rate, to prevent overfitting
    'colsample_bytree': 0.8,         # feature subsampling rate, to prevent overfitting
    'gamma': 0.1,                    # minimum loss reduction required to make a further split
    'lambda': 1,                     # L2 regularization term on weights
    'alpha': 0,                      # L1 regularization term on weights
}

# Everything the training job needs, collected in one place
estimator_config = dict(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=output_location,
    hyperparameters=hyperparams,
    sagemaker_session=sagemaker.Session(),
)

# Build the Estimator from that configuration
estimator = sagemaker.estimator.Estimator(**estimator_config)


Now we have to create what SageMaker calls "channels": a dictionary that specifies, for each channel, where the data is stored and in which format.

In [17]:
# sagemaker.session.s3_input is the pre-v2 name and emits a deprecation warning
# pointing at the v2 migration guide on every call; TrainingInput is the v2
# class, and it is what the tuning cell later in this notebook already uses.
from sagemaker.inputs import TrainingInput

# Each channel records where its data lives in S3 and how it is encoded
train_channel = TrainingInput(
    s3_data='s3://team-4-fp/train.csv',
    content_type='text/csv'
)
val_channel = TrainingInput(
    s3_data='s3://team-4-fp/valid.csv',
    content_type='text/csv'
)

# Channel name -> input definition, as passed to estimator.fit
channels_for_training = {
    'train': train_channel,
    'validation': val_channel
}



See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


### Training the model

In [18]:
# Launch the training job on the train/validation channels, with log streaming turned off
estimator.fit(
    inputs=channels_for_training,
    logs=False,
)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-04-02-13-16-28-584



2024-04-02 13:16:30 Starting - Starting the training job..
2024-04-02 13:16:44 Starting - Preparing the instances for training..........
2024-04-02 13:17:41 Downloading - Downloading input data.......
2024-04-02 13:18:21 Downloading - Downloading the training image....
2024-04-02 13:18:47 Training - Training image download completed. Training in progress....
2024-04-02 13:19:07 Uploading - Uploading generated training model..
2024-04-02 13:19:23 Completed - Training job completed


### Checking metrics of the first model

In [19]:
# Metrics of interest: AUC on the training and on the validation channel
auc_metric_names = ['train:auc', 'validation:auc']

# Query the metrics recorded for the estimator's latest training job
model_metrics = sagemaker.analytics.TrainingJobAnalytics(
    training_job_name=estimator.latest_training_job.name,
    metric_names=auc_metric_names,
)

# Tabulate the metrics as a DataFrame and display it
metrics_df = model_metrics.dataframe()
metrics_df


Unnamed: 0,timestamp,metric_name,value
0,0.0,train:auc,0.898326
1,0.0,validation:auc,0.851817


### Model tuning

In this setup, we configure and launch a hyperparameter tuning job for a binary classification model using XGBoost on Amazon SageMaker. We define a range of values for key model hyperparameters and set certain fixed parameters, including the objective and evaluation metrics. The aim is to find the optimal hyperparameter values that maximize the area under the ROC curve (AUC) on the validation dataset. We use SageMaker's hyperparameter tuning functionality to automate this process, running multiple training jobs in parallel to explore the defined hyperparameter space efficiently. The training and validation data are provided from specified S3 locations, and the tuning job's progress can be monitored through SageMaker's console.

In [22]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.inputs import TrainingInput
import sagemaker

# Settings held fixed for every tuning trial
estimator.set_hyperparameters(
    objective='binary:logistic',
    eval_metric='auc',
    num_round=1000,  # deliberately high: early stopping is expected to end training
    early_stopping_rounds=10,
)

# Search space explored by the tuner, one range per tuned hyperparameter
hyperparameter_ranges = {
    'eta': ContinuousParameter(0.01, 0.2),
    'min_child_weight': ContinuousParameter(1, 10),
    'alpha': ContinuousParameter(0, 2),
    'max_depth': IntegerParameter(3, 10),
    'subsample': ContinuousParameter(0.5, 1),
    'colsample_bytree': ContinuousParameter(0.5, 1),
    'gamma': ContinuousParameter(0, 5),
    'lambda': ContinuousParameter(1e-5, 10),
}

# Metric the tuning job optimises
objective_metric_name = 'validation:auc'

# Tuner: 30 training jobs in total, at most 10 running at the same time
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=30,
    max_parallel_jobs=10,
)

# Data channels for training and validation
train_input = TrainingInput('s3://team-4-fp/train.csv', content_type='text/csv')
validation_input = TrainingInput('s3://team-4-fp/valid.csv', content_type='text/csv')
tuning_channels = {'train': train_input, 'validation': validation_input}

# Start the hyperparameter tuning job
tuner.fit(tuning_channels, logs=False)


INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-xgboost-240402-1329


.....................................................................!


### Evaluating the tuned Model

In [24]:
# Name of the most recent tuning job and a low-level SageMaker client
tuning_job_name = tuner.latest_tuning_job.job_name
sage_client = sagemaker.Session().sagemaker_client

# Full description of the tuning job
# NOTE(review): tuning_job_result is not read again anywhere in this cell
tuning_job_result = sage_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

# Ask for exactly one training job: the one ranked first when sorting by final
# objective metric in descending order.
# NOTE(review): despite its name, best_hyperparameters holds the response of the
# list-training-jobs call, not a set of hyperparameters
best_job_query = dict(
    HyperParameterTuningJobName=tuning_job_name,
    SortBy='FinalObjectiveMetricValue',
    SortOrder='Descending',
    MaxResults=1,
)
best_hyperparameters = sage_client.list_training_jobs_for_hyper_parameter_tuning_job(**best_job_query)

# Resolve that summary to the full training job description
top_summary = best_hyperparameters['TrainingJobSummaries'][0]
best_job_name = top_summary['TrainingJobName']
best_job = sage_client.describe_training_job(TrainingJobName=best_job_name)

# Final metric values recorded for the best job
best_job_metrics = best_job['FinalMetricDataList']

# Print one "<name>: <value>" line per metric
for metric in best_job_metrics:
    print(metric['MetricName'], metric['Value'], sep=': ')


validation:auc: 0.8537300229072571
train:auc: 0.9084439873695374
ObjectiveMetric: 0.8537300229072571


### Deploying the tuned model

In [26]:
# Name of the best training job found by the tuner
best_training_job_name = tuner.best_training_job()

# Build an estimator object attached to that already-finished training job
best_estimator = sagemaker.estimator.Estimator.attach(training_job_name=best_training_job_name)

# Endpoint settings: one ml.m4.xlarge instance, requests serialised as CSV
deploy_config = {
    'initial_instance_count': 1,
    'instance_type': 'ml.m4.xlarge',
    'serializer': sagemaker.serializers.CSVSerializer(),
}

# Deploy the best estimator to an endpoint
predictor = best_estimator.deploy(**deploy_config)



2024-04-02 13:33:58 Starting - Found matching resource for reuse
2024-04-02 13:33:58 Downloading - Downloading the training image
2024-04-02 13:33:58 Training - Training image download completed. Training in progress.
2024-04-02 13:33:58 Uploading - Uploading generated training model
2024-04-02 13:33:58 Completed - Resource reused by training job: sagemaker-xgboost-240402-1329-021-aded26cb

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-04-02-14-06-51-127





INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-04-02-14-06-51-127
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-04-02-14-06-51-127


------!

### Metrics of the deployed model

In [35]:
from scipy import sparse
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Make sure the predictor sends its payload as CSV
predictor.serializer = sagemaker.serializers.CSVSerializer()

# Render the test features as CSV text; densify first only if they are sparse
test_features = test_X.toarray() if sparse.issparse(test_X) else test_X
buffer = io.StringIO()
np.savetxt(buffer, test_features, delimiter=",", fmt='%g')

# Request predictions for the whole test set in one call
predictions_csv = predictor.predict(buffer.getvalue()).decode('utf-8')

# Parse the returned scores. The endpoint used here answers comma-separated;
# newlines are accepted as separators too and empty tokens are skipped, so a
# trailing newline or a newline-separated response (presumably what other
# container versions return - TODO confirm) does not break float().
predictions_floats = [
    float(token)
    for token in predictions_csv.replace('\n', ',').split(',')
    if token.strip()
]

# Every test row must have exactly one score, otherwise the metrics below
# would compare misaligned labels and predictions
assert len(predictions_floats) == len(test_y), (
    f"expected {len(test_y)} predictions, got {len(predictions_floats)}"
)

# Convert probabilities to binary predictions: strictly above 0.5 counts as churn
threshold = 0.5
binary_predictions = [1 if prob > threshold else 0 for prob in predictions_floats]

# Classification metrics against the true test labels
accuracy = accuracy_score(test_y, binary_predictions)
recall = recall_score(test_y, binary_predictions)
precision = precision_score(test_y, binary_predictions)
f1 = f1_score(test_y, binary_predictions)

# Print metrics
print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")


Accuracy: 0.84
Recall: 0.4131455399061033
Precision: 0.7154471544715447
F1 Score: 0.5238095238095238


The metrics obtained from the model's predictions on the test dataset showcase its performance in identifying churn. The model achieves an accuracy of 84%, indicating a strong overall ability to correctly classify both churn and non-churn cases. However, when looking deeper into the metrics that provide insight into its performance on the positive class (churn), we see a mixed picture. The recall of approximately 41.31% suggests that the model identifies just over 41% of the actual churn cases, highlighting a potential area for improvement in capturing more true positives. In contrast, the precision of 71.54% is relatively high, indicating that when the model predicts churn, it is correct about 72% of the time. The F1 score, which balances precision and recall, stands at 52.38%, reflecting the need to improve recall to better the model's performance in identifying churn cases. This analysis suggests while the model is quite reliable in its predictions, further tuning or exploring different modeling approaches might enhance its ability to detect more true churn cases without significantly sacrificing precision.

### Predicting churn for customers

In [28]:
# First ten rows of the preprocessed test features
subset_test_X = test_X[:10]

# Densify only when the slice is sparse, mirroring the guard in the metrics
# cell; np.savetxt needs a dense array
subset_dense = subset_test_X.toarray() if hasattr(subset_test_X, 'toarray') else subset_test_X

# Render the rows as CSV text (getvalue() returns the whole buffer, so no
# rewind is needed)
csv_buffer = io.StringIO()
np.savetxt(csv_buffer, subset_dense, delimiter=",", fmt='%g')

# Use the deployed predictor to score the rows
predictor.serializer = sagemaker.serializers.CSVSerializer()
predictions = predictor.predict(csv_buffer.getvalue()).decode('utf-8')

# Print the predictions
print(predictions)


0.061128076165914536,0.06451965868473053,0.0803229957818985,0.17671948671340942,0.12232694029808044,0.07389398664236069,0.11089379340410233,0.6179166436195374,0.2312091737985611,0.06720710545778275


The prediction results from the deployed model give the estimated churn probability for each of the ten customers scored above. Nine of the ten probabilities lie below the 0.5 threshold used earlier, indicating a low likelihood of churn according to the model. One prediction stands out with a probability of about 0.62, above the threshold, suggesting a higher risk of churn for that customer. These results are valuable as they help prioritize follow-up actions and customer retention strategies. For this small sample, the low probabilities suggest that most of these customers are unlikely to churn, while the one above the threshold points to a need for targeted intervention to address potential concerns leading to churn.