# Getting the dataset prepared in Lab `1-DataPrep`

In [None]:
import pandas as pd
import boto3
import sagemaker

# One boto3 session is shared by every AWS client created in this notebook
sess = boto3.Session()
# Low-level SageMaker API client built from that session
sm = sess.client(service_name='sagemaker')
# IAM role returned by the SageMaker SDK for the current execution environment
role = sagemaker.get_execution_role()

In [None]:
# Configure the pandas display first so the preview below is rendered with these settings
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 10)         # Keep the output on one page

# Set the paths for the datasets saved locally
local_train_path = 'train.csv'
# header=None: the file is read without a header row, so columns get integer labels
train_df = pd.read_csv(local_train_path, header=None)

# Only the last expression of a cell is displayed, so preview the frame exactly once, here
train_df.head()

In [None]:
# Load the validation split the same way as the training split
local_validation_path = 'validation.csv'
# header=None: read the file without a header row (columns get integer labels)
validation_df = pd.read_csv(filepath_or_buffer=local_validation_path, header=None)
# Preview the first five rows
validation_df.head(n=5)

If you remember from the previous lab, we saved the CSV files without headers. The CSV with headers is stored in `config/training-dataset-with-header.csv`.

To see our train set with headers:

In [None]:
# Preview the first five rows of the copy of the training data that kept its header row.
# NOTE(review): this reads from the working directory, while the text above mentions
# `config/training-dataset-with-header.csv` - confirm which location is correct.
pd.read_csv(filepath_or_buffer="training-dataset-with-header.csv").head(n=5)

Now we'll upload the files to S3 for training, but first we will create an S3 bucket for the data if one does not already exist.

In [None]:
# Bucket name is derived from the session's region and the caller's account id;
# the train/validation CSVs are uploaded under separate key prefixes.
region = sess.region_name
account_id = sess.client('sts', region_name=region).get_caller_identity()["Account"]
bucket = 'sagemaker-studio-{}-{}'.format(region, account_id)
prefix = 'xgboost-churn'
train_dir = f"{prefix}/train"
val_dir = f"{prefix}/validation"

# Create the bucket if needed. Reuse a single S3 client for the call and its exception classes.
s3 = sess.client('s3')
try:
    if region == "us-east-1":
        # us-east-1 is the default location; S3 rejects it as an explicit LocationConstraint
        s3.create_bucket(Bucket=bucket)
    else:
        s3.create_bucket(Bucket=bucket,
                         CreateBucketConfiguration={'LocationConstraint': region})
except s3.exceptions.BucketAlreadyOwnedByYou:
    # The only failure that really means "the bucket is already there and it is ours"
    print("Looks like you already have a bucket of this name. That's good. Uploading the data files...")
except s3.exceptions.ClientError as e:
    # Any other service error (access denied, name owned by another account, ...) is reported
    # with its real cause instead of being mistaken for an existing bucket. The notebook stays
    # best-effort: the upload below is still attempted and fails loudly if the bucket is unusable.
    print(f"Could not create bucket '{bucket}': {e}. Trying to upload the data files anyway...")

# Keep the S3 URLs of the uploaded files, so they can be reviewed or used elsewhere
s3url_train = sagemaker.s3.S3Uploader.upload(local_train_path, 's3://{}/{}'.format(bucket, train_dir))
s3url_validation = sagemaker.s3.S3Uploader.upload(local_validation_path, 's3://{}/{}'.format(bucket, val_dir))

If you want to see them in the console, go to S3 and verify that the two CSV files are there:

In [None]:
# `display` and `HTML` belong to the public `IPython.display` API; importing `display`
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# S3 console link template; the two `{}` slots are filled with (bucket name, key prefix)
s3_url_placeholder = "https://s3.console.aws.amazon.com/s3/buckets/{}?&prefix={}/"

In [None]:
# Render a clickable link to the training prefix in the S3 console.
# The href value is quoted because the URL contains `=`, which is not valid in an unquoted HTML attribute.
display(HTML(f'<a href="{s3_url_placeholder.format(bucket, train_dir)}">S3 Train object</a>'))

In [None]:
# Render a clickable link to the validation prefix in the S3 console.
# The href value is quoted because the URL contains `=`, which is not valid in an unquoted HTML attribute.
display(HTML(f'<a href="{s3_url_placeholder.format(bucket, val_dir)}">S3 Validation object</a>'))

### Saving variables to use in the main notebook for this lab

In [None]:
# Persist `bucket` with IPython's %store magic so another notebook can restore it with `%store -r bucket`
%store bucket

In [None]:
# Persist `prefix` with IPython's %store magic so another notebook can restore it with `%store -r prefix`
%store prefix

In [None]:
# Persist `train_dir` with IPython's %store magic so another notebook can restore it with `%store -r train_dir`
%store train_dir

In [None]:
# Persist `val_dir` with IPython's %store magic so another notebook can restore it with `%store -r val_dir`
%store val_dir

[You can now go back to modeling.ipynb](../modeling.ipynb)