<a href="https://colab.research.google.com/github/nisha1365/TECHNICAL_TRAINING_CTS/blob/main/Nisha_2211566_Sagemaker_Clinical_record_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install s3fs
 
import boto3
import sagemaker
from sagemaker.session import Session

In [None]:
# Resolve the AWS region from the default boto3 session.
region = boto3.Session().region_name
 
# Region-pinned boto3 session from which the clients below are created.
boto_session = boto3.Session(region_name=region)
 
# SageMaker API client (used later to list feature groups) and Feature Store runtime
# client (used later to fetch individual records).
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)
 
# SageMaker session bundling the boto session and both clients; it is handed to the
# FeatureGroup object further down.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

Set up S3 Bucket for the OfflineStore

In [None]:
# Uses the session's default bucket; assign a different bucket name here if the data
# should be stored elsewhere.
default_s3_bucket_name = feature_store_session.default_bucket()
# Key prefix under which this notebook keeps its objects in the bucket.
prefix = 'feature-store'
 
print(default_s3_bucket_name)

Set up IAM Role

In [None]:
from sagemaker import get_execution_role

# Execution role of the current environment; passed as role_arn when the feature group is created.
role = get_execution_role()
print(role)

## Inspect *Dataset*

In [None]:
import pandas as pd
from IPython.display import display

In [None]:
# Copy the local CSV to s3://<bucket>/<prefix>/data/ so that it can be read back from S3 below.
!aws s3 cp  ./clinical_records_dataset.csv s3://$default_s3_bucket_name/$prefix/data/

In [None]:
# Build the S3 URI of the uploaded CSV and load it into a DataFrame.
clinical_data_file_name = 'clinical_records_dataset.csv'
clinical_data_path = f"s3://{default_s3_bucket_name}/{prefix}/data/{clinical_data_file_name}"
clinical = pd.read_csv(clinical_data_path)

# Let wide frames (up to 500 columns) render without truncation.
pd.set_option('display.max_columns', 500)
clinical.head()

In [None]:
# The label announces a percentage, so report the share of nulls per column on a 0-100
# scale rather than as a 0-1 fraction.
print ('percentage of the value missing in each column is: ')
clinical.isnull().mean() * 100

## Prepare data for feature store

Create a unique ID for each patient

In [None]:
#### Add an id for each patient
# The row index becomes the 'patient_id' column. The guard makes the cell safe to re-run:
# an unconditional reset_index() would add another 'index' column on every extra run and
# leave duplicate 'patient_id' columns behind after the rename.
if 'patient_id' not in clinical.columns:
    clinical = clinical.reset_index().rename(columns={'index': 'patient_id'})

In [None]:
# Confirm that patient_id now appears as a column.
clinical.head()

In [None]:
# Inspect the last rows as well, to see the highest patient_id values.
clinical.tail()

In [None]:
# Column dtypes before any casting is applied.
clinical.dtypes

In [None]:
# Store patient_id as object dtype so that cast_object_to_string (defined further down)
# converts it to a string column together with the other object columns.
clinical['patient_id'] = clinical['patient_id'].astype(object)

In [None]:
# Check the dtype and non-null count of every column after the patient_id cast.
clinical.info()

Create a TimeStamp for each record

In [None]:
import time
 
# One shared timestamp (Unix epoch seconds) for all records of this ingestion.
current_time_sec = int(round(time.time()))
# append EventTime feature
# A scalar is broadcast to every row. A separately built Series would be aligned on the
# index during assignment and would leave NaN in any row whose label is not 0..len-1.
clinical['EventTime'] = float(current_time_sec)


Cast object columns to string and check the data type of each column

In [None]:
def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to the pandas string dtype.

    The frame is modified in place (each affected column is reassigned) and nothing
    is returned. Columns with any other dtype are left untouched.
    """
    for column_name in list(data_frame.columns):
        is_object_column = data_frame.dtypes[column_name] == 'object'
        if not is_object_column:
            continue
        # Two-step cast: plain Python str first, then the dedicated "string" dtype.
        as_text = data_frame[column_name].astype("str")
        data_frame[column_name] = as_text.astype("string")
 
# cast object dtype to string. The SageMaker Feature Store Python SDK will then map the string dtype to
# String feature type.
# The helper reassigns the columns on the frame it receives, so `clinical` itself is updated.
cast_object_to_string(clinical)

In [None]:
# Re-check the dtypes: columns that were object before the cast should now be string.
clinical.dtypes

Create Features

Assign a feature group name

In [None]:
from time import gmtime, strftime, sleep

# Suffix the name with the current UTC day-hour-minute-second so that each run gets a fresh
# name. The suffix carries no month or year, so the same name can recur a month later.
clinical_feature_group_name = 'clinical-feature-group-' + strftime('%d-%H-%M-%S', gmtime())

Create a FeatureGroup

In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Handle bound to the chosen name and session; the group itself is created by the
# .create() call further down.
clinical_feature_group = FeatureGroup(name=clinical_feature_group_name, sagemaker_session=feature_store_session)

Define Identifier

In [None]:
# record identifier and event time feature names
# Both must match columns of `clinical`: 'patient_id' and 'EventTime' were added above.
record_identifier_feature_name = "patient_id"
event_time_feature_name = "EventTime"

Load feature definitions into the feature group

In [None]:
# Derive the feature definitions from the columns of `clinical`.
clinical_feature_group.load_feature_definitions(data_frame=clinical); # output is suppressed

Create Feature Group

In [None]:
def wait_for_feature_group_creation_complete(feature_group, poll_interval_sec=5):
    """Block until ``feature_group`` is no longer in the "Creating" status.

    Args:
        feature_group: Object exposing ``describe()`` (returning a mapping that holds a
            "FeatureGroupStatus" entry) and a ``name`` attribute.
        poll_interval_sec: Seconds to sleep between two status checks.

    Raises:
        RuntimeError: If the status reached after waiting is anything other than "Created".
    """
    # Imported locally so the helper does not depend on an `import time` in another cell.
    import time

    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval_sec)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        # Include the observed status so that a failure can be diagnosed from the message.
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name} (status: {status})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")
 
# Create the feature group with the offline store under s3://<bucket>/<prefix> and the
# online store enabled, then block until creation has finished.
clinical_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}", #offline feature store bucket
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True
)
wait_for_feature_group_creation_complete(feature_group=clinical_feature_group)

Work with your FeatureGroup

In [None]:
# Show the feature group's description, which includes its status and offline store configuration.
clinical_feature_group.describe()

In [None]:
# Goes through the low-level client created at the top of the notebook, not the FeatureGroup object.
sagemaker_client.list_feature_groups() # use boto client to list FeatureGroups

Put Records into the Feature Store

In [None]:
# Ingest the rows of `clinical` with 3 parallel workers; wait=True blocks until ingestion is done.
clinical_feature_group.ingest(
    data_frame=clinical, max_workers=3, wait=True
)

Get Records from a Feature Group

In [None]:
# Fetch a single record by its patient_id; the identifier is passed as a string.
record_identifier_value = str(200)
 
featurestore_runtime.get_record(FeatureGroupName=clinical_feature_group_name, RecordIdentifierValueAsString=record_identifier_value)

Generate Hive DDL Commands

In [None]:
# Print the Hive DDL statement generated for this feature group.
print(clinical_feature_group.as_hive_ddl())

In [None]:
%%time
s3_client = boto3.client('s3', region_name=region)
 
account_id = boto3.client('sts').get_caller_identity()["Account"]
 
clinical_feature_group_table_name = clinical_feature_group.describe().get('OfflineStoreConfig').get('DataCatalogConfig').get('TableName')
 
print(account_id)
print(clinical_feature_group_table_name)
 
clinical_feature_group_s3_prefix = prefix + '/' + account_id + '/sagemaker/' + region + '/offline-store/' + clinical_feature_group_table_name + '/data'
 
offline_store_contents = None
while (offline_store_contents is None):
    objects_in_bucket = s3_client.list_objects(Bucket=default_s3_bucket_name, Prefix=clinical_feature_group_s3_prefix)
    if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) >= 1):
        offline_store_contents = objects_in_bucket['Contents']
    else:
        print('Waiting for data in offline store...\n')
        sleep(60)

print('Data available.')

BUILD A TRAINING DATASET

In [None]:
# Athena query object for this feature group and the name of the table it queries.
clinical_query = clinical_feature_group.athena_query()
clinical_table = clinical_query.table_name

In [None]:
# Athena query
# LIMIT 290 leaves some patient ids out of the result; the following cell collects the
# ids that were not returned and keeps them aside for testing.
query_string = 'SELECT * FROM "'+clinical_table+'" LIMIT 290'
 
# run Athena query. The output is loaded to a Pandas dataframe.
# Results are written under the notebook's prefix, next to its other artifacts
# ('<prefix>/data', '<prefix>/training_input').
clinical_query.run(query_string=query_string, output_location='s3://'+default_s3_bucket_name+'/'+prefix+'/query_results/')
clinical_query.wait()
dataset = clinical_query.as_dataframe()

In [None]:
# Patient ids that the Athena query did not return; they are kept aside for testing.
# patient_id was created from the row index of `clinical`, so the full id population is
# 0..len(clinical)-1. The returned ids are collected once, up front, instead of being
# recomputed for every candidate id.
train_ids = set(dataset['patient_id'].unique())
id_for_test = [i for i in range(len(clinical)) if i not in train_ids]

Preparing dataset for training

In [None]:
# Prepare query results for training.
query_execution = clinical_query.get_query_execution()
# Read the result location back from Athena instead of re-assembling it from bucket, prefix
# and execution id, so that it always agrees with the output_location passed to run().
query_result = query_execution['QueryExecution']['ResultConfiguration']['OutputLocation']
print(query_result)

In [None]:
# Select useful columns for training with target column as the first.
training_columns = [
    'death_event',
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
]
dataset = dataset[training_columns]

# Write to csv in S3 without headers and index column.
dataset.to_csv('dataset.csv', header=False, index=False)
s3_client.upload_file('dataset.csv', default_s3_bucket_name, f'{prefix}/training_input/dataset.csv')
dataset_uri_prefix = f's3://{default_s3_bucket_name}/{prefix}/training_input/'

In [None]:
# Spot-check the first two training rows; the target column death_event comes first.
dataset.head(2)

## Train and Deploy the Model