## <a name="setup"></a> Global configuration and setup

In [53]:
# Define IAM role
import re

import boto3
import pandas as pd
import yaml
from sagemaker import get_execution_role

# Look up the execution role via the SageMaker SDK helper (presumably the IAM
# role attached to this notebook environment - confirm against the SDK docs).
# The recorded output of this cell shows a NoCredentialsError, so this cell
# cannot complete when AWS credentials are not discoverable.
role = get_execution_role()
# Module-level boto3 S3 resource; copy_to_bucket() reads this global name.
s3 = boto3.resource('s3')

NoCredentialsError: Unable to locate credentials

### Set up connection to AWS S3

Let's start by specifying:

- The S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the Notebook Instance, training, and hosting.
- The IAM role ARN used to give training and hosting access to your data. See the documentation for how to create these. Note: if more than one role is required for notebook instances, training, and/or hosting, please replace the boto regexp with the appropriate full IAM role ARN string(s).

In [1]:
def copy_to_bucket(source_bucket_name, source_file_path, target_bucket_name, target_file_path):
    """Copy a single S3 object from one bucket into another.

    Uses the module-level ``s3`` resource. The copy is issued on the target
    bucket and always passes ``ServerSideEncryption: aws:kms`` as extra
    arguments.

    Args:
        source_bucket_name: Name of the bucket holding the object to copy.
        source_file_path: Key of the object inside the source bucket.
        target_bucket_name: Name of the bucket that receives the copy.
        target_file_path: Key to write the copy under in the target bucket.

    Returns:
        None.
    """
    # Describe the object to copy as a Bucket/Key mapping.
    copy_source = dict(Bucket=source_bucket_name, Key=source_file_path)
    encryption_args = {"ServerSideEncryption": "aws:kms"}
    s3.Bucket(target_bucket_name).copy(
        copy_source,
        target_file_path,
        ExtraArgs=encryption_args,
    )

### Copy source file to working S3 bucket

In [None]:
# Location of the raw file (replace the placeholders before running).
source_bucket_name = '<your_s3_source_bucket_name_here>'
source_file_key = '<source_file_key_here>'
# Working location for the copy; the catalog cell further down reads these two names.
target_bucket_name = '<your_s3_target_bucket_name_here>'
target_file_key = '<target_file_key_here>'

copy_to_bucket(
    source_bucket_name=source_bucket_name,
    source_file_path=source_file_key,
    target_bucket_name=target_bucket_name,
    target_file_path=target_file_key,
)

### Create data catalog from source file

In [64]:
# Build the S3 URI once from the configured bucket/key so the file that is read
# and the path recorded in the catalog are always the same. (The catalog entry
# previously formatted the literal strings 'target_bucket_name' and
# 'target_file_key' instead of the variables.)
data_file_path = 's3://{}/{}'.format(target_bucket_name, target_file_key)
# NOTE(review): pandas needs the optional s3fs package to read s3:// URLs -
# confirm it is installed in this kernel's environment.
df = pd.read_csv(data_file_path)

cols = df.columns.values
# NOTE(review): _get_numeric_data() is a private pandas API; it is kept so the
# numeric/categorical split is unchanged. The public select_dtypes() may
# classify some dtypes (e.g. bool) differently - verify before switching.
num_cols = df._get_numeric_data().columns.values
numeric_names = set(num_cols)
# Keep the file's own column order; a set difference yields an order that can
# change from one kernel session to the next.
cat_cols = [name for name in cols if name not in numeric_names]

print('Numerical Columns: ' + ', '.join(num_cols))
print('Categorical Columns: ' + ', '.join(cat_cols))
target = input('Insert the name of the Target column: ')
# Fail early rather than writing a catalog that names a column the data lacks.
if target not in set(cols):
    raise ValueError(
        'Target column {!r} not found; available columns: {}'.format(target, ', '.join(cols))
    )

catalog = {
    'DataFilePath': [data_file_path],
    'Target': [target],
    'NumericalColumns': list(num_cols),
    'CategoricalColumns': list(cat_cols),
}
yaml_catalog = yaml.dump(catalog, sort_keys=False)
# Indent only the list-item lines (those that start with '-'). A blanket
# str.replace('-', ...) would also rewrite hyphens inside bucket names, keys
# and column names, corrupting the values written to the catalog.
yaml_catalog = ''.join(
    '    ' + line if line.startswith('-') else line
    for line in yaml_catalog.splitlines(keepends=True)
)

with open('data-catalog.yml', 'w') as f:
    f.write(yaml_catalog)

Numerical Columns: Cost
Categorical Columns: Item, Name
