In [56]:
import boto3
import pandas as pd
from io import StringIO
from sklearn.preprocessing import MinMaxScaler
import json

In [27]:
BUCKET_NAME = "team1-index-predictor-bucket"

raw_data_filename = "data/raw/sp500.csv"

s3 = boto3.client('s3')

In [28]:
s3_object = s3.get_object(Bucket=BUCKET_NAME, Key=raw_data_filename)

data = s3_object['Body'].read().decode('utf-8')
df = pd.read_csv(StringIO(data))
df.head()

Unnamed: 0,Datetime,Open,High,Low,Close,Adj Close,Volume
0,2024-05-15 09:30:00-04:00,5263.259766,5271.049805,5263.259766,5270.939941,5270.939941,0
1,2024-05-15 09:31:00-04:00,5271.25,5273.240234,5271.149902,5271.439941,5271.439941,12862437
2,2024-05-15 09:32:00-04:00,5271.779785,5275.759766,5271.779785,5274.950195,5274.950195,11405230
3,2024-05-15 09:33:00-04:00,5275.160156,5277.259766,5273.870117,5276.120117,5276.120117,8963509
4,2024-05-15 09:34:00-04:00,5275.97998,5276.200195,5274.560059,5276.109863,5276.109863,7163548


In [29]:
df = df[["Datetime", "Close"]]
df.head()

Unnamed: 0,Datetime,Close
0,2024-05-15 09:30:00-04:00,5270.939941
1,2024-05-15 09:31:00-04:00,5271.439941
2,2024-05-15 09:32:00-04:00,5274.950195
3,2024-05-15 09:33:00-04:00,5276.120117
4,2024-05-15 09:34:00-04:00,5276.109863


In [30]:
# Ensure data is sorted by datetime
df = df.sort_values(by=["Datetime"])

In [31]:
total_size = len(df)
test_size = int(0.1 * total_size)
validation_size = int(0.1 * total_size)
train_size = total_size - test_size - validation_size

test_start_idx = total_size - test_size
validation_start_idx = test_start_idx - validation_size

train_df = df.iloc[:validation_start_idx]
validation_df = df.iloc[validation_start_idx:test_start_idx]
test_df = df.iloc[test_start_idx:]

In [32]:
print("Training set size:", len(train_df))
print("Validation set size:", len(validation_df))
print("Test set size:", len(test_df))

# Verify the splits
print("\nTrain DataFrame:")
print(train_df.head())
print("\nValidation DataFrame:")
print(validation_df.head())
print("\nTest DataFrame:")
print(test_df.head())

Training set size: 6484
Validation set size: 810
Test set size: 810

Train DataFrame:
                    Datetime        Close
0  2024-05-15 09:30:00-04:00  5270.939941
1  2024-05-15 09:31:00-04:00  5271.439941
2  2024-05-15 09:32:00-04:00  5274.950195
3  2024-05-15 09:33:00-04:00  5276.120117
4  2024-05-15 09:34:00-04:00  5276.109863

Validation DataFrame:
                       Datetime        Close
6484  2024-06-07 14:59:00-04:00  5357.490234
6485  2024-06-07 15:00:00-04:00  5354.959961
6486  2024-06-07 15:01:00-04:00  5355.169922
6487  2024-06-07 15:02:00-04:00  5358.359863
6488  2024-06-07 15:03:00-04:00  5359.020020

Test DataFrame:
                       Datetime        Close
7294  2024-06-11 15:30:00-04:00  5365.189941
7295  2024-06-11 15:31:00-04:00  5364.990234
7296  2024-06-11 15:32:00-04:00  5364.220215
7297  2024-06-11 15:33:00-04:00  5364.859863
7298  2024-06-11 15:34:00-04:00  5365.580078


In [33]:
scaler = MinMaxScaler()

train_df.loc[:, 'Close'] = scaler.fit_transform(train_df[['Close']])

validation_df.loc[:, 'Close'] = scaler.transform(validation_df[['Close']])

test_df.loc[:, 'Close'] = scaler.transform(test_df[['Close']])

train_df.head()

Unnamed: 0,Datetime,Close
0,2024-05-15 09:30:00-04:00,0.429912
1,2024-05-15 09:31:00-04:00,0.43267
2,2024-05-15 09:32:00-04:00,0.452035
3,2024-05-15 09:33:00-04:00,0.458489
4,2024-05-15 09:34:00-04:00,0.458433


In [58]:
# store later as metadata in feature store
scaler_params = {
    'min': scaler.data_min_.tolist(),
    'max': scaler.data_max_.tolist(),
    'range': scaler.data_range_.tolist(),
    'scale': scaler.scale_.tolist(),
    'min': scaler.min_.tolist(),
}
scaler_params

{'min': [-28.647924124759925],
 'max': [5374.27978515625],
 'range': [181.27001953125],
 'scale': [0.005516632053032935]}

In [58]:
# REVISIT: Maybe prepare the dataset in a format that needs to be passed to the model

{'min': [-28.647924124759925],
 'max': [5374.27978515625],
 'range': [181.27001953125],
 'scale': [0.005516632053032935]}

In [53]:
# REVISIT: use feature store later instead of plain s3, version things and store 
# scaler params as metadata in feature store

def upload_df_to_s3(df, bucket_name, object_name):
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)
    s3.put_object(Bucket=bucket_name, Key=object_name, Body=csv_buffer.getvalue())
    
root_folder = "data/processed"

upload_df_to_s3(train_df, BUCKET_NAME, f"{root_folder}/train.csv")
upload_df_to_s3(validation_df, BUCKET_NAME, f"{root_folder}/validation.csv")
upload_df_to_s3(test_df, BUCKET_NAME, f"{root_folder}/test.csv")

In [59]:
scaler_params_json = json.dumps(scaler_params)

s3.put_object(Bucket=BUCKET_NAME, Key=f"{root_folder}/scaler_params.json", Body=scaler_params_json)

{'ResponseMetadata': {'RequestId': 'ZQF23Z0WFZ3B9GKG',
  'HostId': 'mCDKcUIqSm7YocRgCn+at3h2ojChzibqcWZbssSZPr1c6QYBX2JuH0GE8NyKjMSKTzss5aC+ToS8x7YDVe1DEQ==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'mCDKcUIqSm7YocRgCn+at3h2ojChzibqcWZbssSZPr1c6QYBX2JuH0GE8NyKjMSKTzss5aC+ToS8x7YDVe1DEQ==',
   'x-amz-request-id': 'ZQF23Z0WFZ3B9GKG',
   'date': 'Fri, 14 Jun 2024 10:00:17 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"3ca7802d62b31cccc7b07fa80d4d8d3d"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"3ca7802d62b31cccc7b07fa80d4d8d3d"',
 'ServerSideEncryption': 'AES256'}

In [55]:
s3.list_objects(Bucket=BUCKET_NAME)['Contents']

[{'Key': 'data/processed/test.csv',
  'LastModified': datetime.datetime(2024, 6, 14, 9, 57, 52, tzinfo=tzlocal()),
  'ETag': '"4d1370a95a9260cfba866dcc4f51791c"',
  'Size': 36263,
  'StorageClass': 'STANDARD',
  'Owner': {'ID': 'ec1ed018b5c0ee51b7177c58aeefc36b88bc97ed583ad096499121407c50d423'}},
 {'Key': 'data/processed/train.csv',
  'LastModified': datetime.datetime(2024, 6, 14, 9, 57, 52, tzinfo=tzlocal()),
  'ETag': '"1e087f7f9658fc8c539bcfb3ce575d20"',
  'Size': 292145,
  'StorageClass': 'STANDARD',
  'Owner': {'ID': 'ec1ed018b5c0ee51b7177c58aeefc36b88bc97ed583ad096499121407c50d423'}},
 {'Key': 'data/processed/validation.csv',
  'LastModified': datetime.datetime(2024, 6, 14, 9, 57, 52, tzinfo=tzlocal()),
  'ETag': '"9767fe903678ae5cb4bf72847b456203"',
  'Size': 36364,
  'StorageClass': 'STANDARD',
  'Owner': {'ID': 'ec1ed018b5c0ee51b7177c58aeefc36b88bc97ed583ad096499121407c50d423'}},
 {'Key': 'data/raw/sp500.csv',
  'LastModified': datetime.datetime(2024, 6, 14, 7, 19, 40, tzinfo=

In [60]:
s3_object = s3.get_object(Bucket=BUCKET_NAME, Key=f"{root_folder}/train.csv")

loaded_data = s3_object['Body'].read().decode('utf-8')
loaded_df = pd.read_csv(StringIO(loaded_data))
loaded_df.head()

Unnamed: 0,Datetime,Close
0,2024-05-15 09:30:00-04:00,0.429912
1,2024-05-15 09:31:00-04:00,0.43267
2,2024-05-15 09:32:00-04:00,0.452035
3,2024-05-15 09:33:00-04:00,0.458489
4,2024-05-15 09:34:00-04:00,0.458433
