In [None]:
from datetime import datetime
from os import environ, makedirs

from boto3 import client
from numpy import mean, number
from numpy.random import seed
from pandas import read_csv, to_numeric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import DMatrix, train

In [None]:
def normalize_s3_endpoint(raw_endpoint):
    """Turn the raw ``AWS_S3_ENDPOINT`` value into a usable endpoint URL.

    Args:
        raw_endpoint: A bare host (``"s3.example.com"``), a full URL
            (``"https://s3.example.com"``), or ``None``/blank when the
            variable is not set.

    Returns:
        The endpoint with an explicit scheme. A value that already starts
        with ``http://`` or ``https://`` is returned unchanged (apart from
        surrounding whitespace); a bare host gets ``https://`` prepended.
        ``None`` is returned when nothing was configured, instead of the
        literal string ``"https://None"``; passing ``None`` as
        ``endpoint_url`` lets boto3 construct its default endpoint.
    """
    if raw_endpoint is None:
        return None
    endpoint = raw_endpoint.strip()
    if not endpoint:
        return None
    # Do not prepend a second scheme when the variable already has one.
    if endpoint.lower().startswith(('http://', 'https://')):
        return endpoint
    return f"https://{endpoint}"


# S3 connection settings, all read from the environment.
# Each value is None when its variable is not set.
s3_endpoint_url = normalize_s3_endpoint(environ.get('AWS_S3_ENDPOINT'))
s3_access_key = environ.get('AWS_ACCESS_KEY_ID')
s3_secret_key = environ.get('AWS_SECRET_ACCESS_KEY')
s3_bucket_name = environ.get('AWS_S3_BUCKET')

In [None]:
# Key of the raw dataset inside the bucket, and where it is stored locally.
raw_data_object_name = 'BreastCancerWisconsinDataSet.csv'
raw_data_path = './data/raw_data.csv'

# Log the object that is actually fetched, so the message matches the download.
print(f'Downloading data "{raw_data_object_name}" '
      f'from bucket "{s3_bucket_name}" '
      f'from S3 storage at {s3_endpoint_url}')

# S3 client built from the connection settings read from the environment.
# It is reused later in the notebook to upload the trained model.
s3_client = client(
    's3', endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key, aws_secret_access_key=s3_secret_key
)

# Make sure the target directory exists before downloading into it, so the
# notebook also runs from a fresh checkout.
makedirs('./data', exist_ok=True)

s3_client.download_file(
    s3_bucket_name,
    raw_data_object_name,
    raw_data_path
)

In [None]:
print("Loading raw data.")

# Load the raw data written by the download step
data = read_csv("data/raw_data.csv")

# Print summary statistics for every column (numeric and non-numeric)
print(data.describe(include='all'))

print("Cleaning data...")

# Keep only the first row for each value of the 'id' column
data = data.drop_duplicates(subset='id')

# The label ('diagnosis') plus the feature columns kept for training
columns_to_keep = [
    'diagnosis',
    'radius_mean',
    'area_mean',
    'radius_worst',
    'area_worst',
    'perimeter_worst',
    'perimeter_mean'
]
# .copy() yields an independent frame, so the column assignment below does not
# write into a selection of the de-duplicated frame (SettingWithCopyWarning).
data = data[columns_to_keep].copy()

# Convert all columns except 'diagnosis' to numeric; values that cannot be
# parsed become NaN (errors='coerce') instead of raising.
cols_to_convert = data.columns.difference(['diagnosis'])
data[cols_to_convert] = data[cols_to_convert].apply(
    to_numeric, errors='coerce'
)

print("Data cleaning done, saving file.")

# Save the cleaned data to a new CSV file (without the index column)
data.to_csv("data/cleaned_data.csv", index=False)

In [None]:
# Set the random seed
seed(1)
print("Set seed 1")

print("Load cleaned data")
data_cleaned = read_csv("data/cleaned_data.csv")

print("Creating train and testing files")
# Use 70% of dataset as training set and remaining 30% as testing set.
# The split happens BEFORE scaling so that the scaler never sees test rows.
train_set, test_set = train_test_split(
    data_cleaned, test_size=0.3, random_state=1
)
# Independent copies, so the column assignments below do not write into
# selections of data_cleaned.
train_set = train_set.copy()
test_set = test_set.copy()

# Standardize the numeric columns: the mean/std are learned from the training
# rows only and then applied unchanged to the test rows. Fitting on the full
# dataset would leak test-set statistics into training.
# Note: data_cleaned itself is left unscaled; only the two splits are scaled.
scaler = StandardScaler()
numeric_cols = data_cleaned.select_dtypes(include=number).columns
train_set[numeric_cols] = scaler.fit_transform(train_set[numeric_cols])
test_set[numeric_cols] = scaler.transform(test_set[numeric_cols])

print("Writing to csv files.")
train_set.to_csv("data/train.csv", index=False)
test_set.to_csv("data/test.csv", index=False)
print("Done.")

In [None]:
# Read hyperparameters from environment variables, falling back to defaults
# when a variable is not set.
max_depth = int(environ.get('max_depth', '10'))
n_round = int(environ.get('n_round', '21'))
early_stopping_rounds = int(environ.get('early_stopping_rounds', '3'))

print('Loading training and test data')
train_set = read_csv("data/train.csv")
test_set = read_csv("data/test.csv")

# Remove labels: everything except 'diagnosis' is a feature
train_data = train_set.drop(columns=['diagnosis'])
test_data = test_set.drop(columns=['diagnosis'])

# Create targets: 1 where diagnosis is "M", 0 for any other value
train_label = (train_set['diagnosis'] == "M").astype(int).values
test_label = (test_set['diagnosis'] == "M").astype(int).values

# Convert to matrices
train_matrix = train_data.values
test_matrix = test_data.values

# Create DMatrix
dtrain = DMatrix(data=train_matrix, label=train_label)
dtest = DMatrix(data=test_matrix, label=test_label)

print('Training model...')
# early_stopping_rounds is an argument of train(), not a booster parameter.
# Inside `params` it is not applied, so it is passed as a keyword here.
# Early stopping monitors the metric on the `evals` entry.
# NOTE(review): the same test set drives early stopping and the error reported
# below, so that error is likely optimistic - consider a separate validation
# split.
breastcancer_model = train(
    params={
        'max_depth': max_depth,
        'objective': 'binary:logistic',
        'verbosity': 2
    },
    dtrain=dtrain,
    num_boost_round=n_round,
    evals=[(dtest, 'test')],
    early_stopping_rounds=early_stopping_rounds
)

# Predict and evaluate: fraction of test rows whose thresholded prediction
# (probability > 0.5) differs from the label
pred = breastcancer_model.predict(dtest)
err = mean((pred > 0.5).astype(int) != test_label)
print(f"test-error= {err}")

print('Saving model')
breastcancer_model.save_model('model.bst')

In [None]:
# The object name carries the current local time formatted as yymmddHHMM.
timestamp = f'{datetime.now():%y%m%d%H%M}'
model_object_name = f'model-{timestamp}.bst'

# Upload the saved model file with the S3 client created earlier in the
# notebook. On failure, report the target bucket/endpoint and re-raise so the
# run still fails; the success message is printed only when the upload worked.
try:
    s3_client.upload_file(
        'model.bst',
        s3_bucket_name,
        model_object_name,
    )
except Exception:
    print(f'S3 upload to bucket {s3_bucket_name} at {s3_endpoint_url} failed!')
    raise
else:
    print(f'model uploaded and available as "{model_object_name}"')