In [None]:
import boto3
import sagemaker
import os

In [None]:
# Global bucket name used throughout the notebook
bucket = "data14group1-ml"
# Keep a handle on the current SageMaker session
session = sagemaker.Session()

### Download the trainval Parquet files from the S3 bucket

In [None]:
# Create a Boto3 client
s3_client = boto3.client('s3')

# Define S3 bucket and folder path
bucket_name = "data14group1-ml"
s3_folder_path = 'data/trainval/'

# Define the local directory where files will be downloaded
local_dir = "data/"

# Ensure the local directory exists (no-op when it is already there)
os.makedirs(local_dir, exist_ok=True)

# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# page of the listing instead of reading only the first response.
paginator = s3_client.get_paginator('list_objects_v2')

for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_folder_path):
    # 'Contents' may be absent from a page (e.g. nothing matches the prefix)
    for obj in page.get('Contents', []):
        # Get the file path
        s3_file_path = obj['Key']
        file_name = os.path.basename(s3_file_path)

        # Keys ending in "/" (folder placeholders) have an empty basename;
        # downloading them would target the directory itself and fail.
        if not file_name:
            continue

        # Files are flattened into local_dir by basename only, so objects in
        # different sub-folders that share a name overwrite each other.
        local_file_path = os.path.join(local_dir, file_name)

        # Download the file from S3
        s3_client.download_file(bucket_name, s3_file_path, local_file_path)
        print(f'Downloaded {s3_file_path} to {local_file_path}')

print("\ndata download complete")

### Split data into train and validation sets

In [None]:
from pyspark.sql import SparkSession

# Start a Spark session, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("ReadParquetFromS3")
    .getOrCreate()
)

# Local folder that holds the Parquet files
file_path = "data/"

# Load the Parquet data under that folder into a single DataFrame
data = spark.read.parquet(file_path)

print("data loading complete")

### Check data integrity

In [None]:
# Fill missing values with zero.
# Spark DataFrames are immutable: fillna() returns a NEW DataFrame, so the
# result has to be re-bound to `data`; calling it without keeping the result
# leaves `data` (and everything derived from it later) unchanged.
# NOTE(review): per the PySpark docs a numeric fill value is applied to
# numeric columns only; nulls in string columns are presumably left as-is —
# confirm this is acceptable for the training schema.
data = data.fillna(0)

# Show the total row count as the cell output
data.count()

In [None]:
# Randomly partition the rows into training (80%) and validation (20%) sets.
# The fixed seed is passed to randomSplit so the selection can be repeated.
seed = 42
train, validation = data.randomSplit(weights=[0.8, 0.2], seed=seed)
print("train and validation sets are randomly selected")

# Report the size of each split: training first, then validation
for split_df in (train, validation):
    print(split_df.count())

### Save train and validation sets to local folders

In [None]:
# Write each split to its own local Parquet folder, replacing any previous
# output at that location.
for file_path, split_df, split_name in (
    ("train/", train, "train"),
    ("validation/", validation, "validation"),
):
    split_df.write.mode("overwrite").parquet(file_path)
    print(f"{split_name} file saving complete")

### Save train and validation sets to local folders

In [None]:
# NOTE(review): this cell performs the same local save as the cell before it
# (same folders, same overwrite mode) — confirm whether it is still needed.

# Training split -> local "train/" folder
file_path = "train/"
train_writer = train.write.mode("overwrite")
train_writer.parquet(file_path)
print("train file saving complete")

# Validation split -> local "validation/" folder
file_path = "validation/"
validation_writer = validation.write.mode("overwrite")
validation_writer.parquet(file_path)
print("validation file saving complete")

### Upload to S3 bucket

In [None]:
### Make sure to delete the hidden files in the train and validation folders in S3 before training

Model training won't work if we don't delete them!!!

In [None]:
# Bucket whose train/validation prefixes are cleaned below
bucket = "data14group1-ml"
# The trailing "/" scopes each prefix to its own folder. An S3 prefix is a
# plain string match on the key, so "data/train" (no slash) would also match
# the sibling source folder "data/trainval/" and clean it as well.
prefix_train = "data/train/"
prefix_validation = "data/validation/"

def delete_crc(bucket_name, prefix, client=None):
    """Delete every object whose key ends in ``.crc`` under an S3 prefix.

    Args:
        bucket_name: Name of the S3 bucket to scan.
        prefix: Key prefix to list; only keys starting with it are examined.
        client: Optional S3 client exposing ``get_paginator`` and
            ``delete_object``. When omitted, the notebook-level ``s3_client``
            is used, which keeps existing two-argument calls working.

    Returns:
        None. Each key is printed just before it is deleted.
    """
    # Fall back to the notebook-wide client only when none is injected, so the
    # function can also be exercised with a stand-in client.
    s3 = s3_client if client is None else client

    # Paginate so listings of more than one page are fully covered
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # 'Contents' may be absent from a page (e.g. nothing matches)
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('.crc'):
                print(f'Deleting {key}')
                s3.delete_object(Bucket=bucket_name, Key=key)

# Run the .crc clean-up on the train prefix first, then the validation prefix
for dataset_prefix in (prefix_train, prefix_validation):
    delete_crc(bucket, dataset_prefix)