In [1]:
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Make the parent directory importable so the `src.*` imports in this cell resolve.
# The membership check keeps the cell idempotent: re-running it does not pile up
# duplicate entries on sys.path.
_project_root = os.path.abspath("..")
if _project_root not in sys.path:
    sys.path.append(_project_root)

from src.features import create_moving_average
from src.data_loader import load_data
from src.config import DATA_PATH, TARGET_COLUMN, RANDOM_STATE, TEST_SIZE, PROCESSED_DATA_PATH

# Create the output directory for the processed CSVs up front; exist_ok=True
# means an already-existing directory is not an error.
os.makedirs(name=PROCESSED_DATA_PATH, exist_ok=True)

In [2]:
# Read the raw dataset and convert its 'Date' column to pandas datetimes.
# .assign returns a new frame, so the object handed back by load_data is not
# modified in place.
df = load_data(DATA_PATH).assign(Date=lambda frame: pd.to_datetime(frame['Date']))

In [3]:
# Add moving-average features, one per window length.
# NOTE(review): the 'MA_10' / 'MA_50' columns selected as features further down
# presumably come from this call — confirm against src.features.
# NOTE(review): if the helper uses a plain rolling mean, the leading rows will be
# NaN until a full window is available — confirm how it handles them.
ma_windows = [10, 50]
df = create_moving_average(df, window_size=ma_windows)

In [4]:
# Pick the model inputs, the prediction target, and the date series that is
# carried through the split so each row can still be matched to its date.
features = ['Open', 'High', 'Low', 'Volume', 'MA_10', 'MA_50']
target = TARGET_COLUMN

X, y = df[features], df[target]
index = df['Date']

In [5]:
# Hold-out split without shuffling: row order is kept, so the test set is the
# last block of rows (the most recent ones, assuming the data is in date order —
# TODO confirm). The date series is split alongside X and y so it stays aligned
# with them row for row.
# NOTE: random_state only controls shuffling, so it has no effect while
# shuffle=False; it is passed through unchanged.
(
    X_train, X_test,
    y_train, y_test,
    X_train_index, X_test_index,
) = train_test_split(
    X,
    y,
    index,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=False,
)

In [6]:
# Fit the min-max scaler on the training rows only, then apply that same fitted
# mapping to both splits, so nothing from the test set influences the scaling.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# The scaler returned plain arrays; wrap them back into DataFrames that carry
# the feature column names.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)

# Replace the row labels inherited from the split with a fresh 0..n-1 index on
# the targets and on the date series (the dates themselves are the values and
# are kept).
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)
X_train_index = X_train_index.reset_index(drop=True)
X_test_index = X_test_index.reset_index(drop=True)

# Write every artifact to PROCESSED_DATA_PATH as CSV without the index column.
processed_outputs = {
    'X_train.csv': X_train_scaled,
    'X_test.csv': X_test_scaled,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
    'X_train_index.csv': X_train_index,
    'X_test_index.csv': X_test_index,
}
for out_name, out_table in processed_outputs.items():
    out_table.to_csv(os.path.join(PROCESSED_DATA_PATH, out_name), index=False)