In [1]:
from sklearn.preprocessing import MinMaxScaler



In [2]:
import numpy as np
import pandas as pd

In [3]:
# Min-max scaler configured via set_output(transform="pandas") so that its
# transform output is a pandas object.
# NOTE(review): not referenced in the visible part of the notebook — presumably
# used by later cells; confirm.
mm_scaler = MinMaxScaler().set_output(transform="pandas")

In [4]:
# Load raw_data/AAPL.csv. This frame is passed to create_x_y as df_data, where
# its columns are relabelled datetime/open/high/low/close/volume.
df_data = pd.read_csv("raw_data/AAPL.csv")

In [5]:
# Load raw_data/macro_economic_indicators.csv. This frame is passed to
# create_x_y, where its columns are relabelled datetime/interest_rate/GDP/inflation.
df_macroeconomic = pd.read_csv("raw_data/macro_economic_indicators.csv")

In [6]:
# Load raw_data/AAPL_technical_analysis.csv. This frame is passed to
# create_x_y, where its "Unnamed: 0" column is renamed to "datetime".
df_technical = pd.read_csv("raw_data/AAPL_technical_analysis.csv")

# Supporting functions

In [7]:
def rename_macroeconomic(df_macroeconomic):
    """Relabel the macro-economic frame's columns in place and return it.

    The columns are renamed positionally to ``datetime``, ``interest_rate``,
    ``GDP`` and ``inflation``, so the frame is expected to have exactly four
    columns in that order.

    Parameters
    ----------
    df_macroeconomic : pandas.DataFrame
        Frame to relabel. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new column labels.
    """
    # Direct assignment on purpose: callers may rely on the input itself changing.
    df_macroeconomic.columns = ["datetime", "interest_rate", "GDP", "inflation"]
    return df_macroeconomic

In [8]:
def merge_macroeconomic(df_technical, df_macroeconomic):
    """Inner-join the technical frame with the macro-economic frame on ``datetime``.

    Parameters
    ----------
    df_technical : pandas.DataFrame
        Left frame; must contain a ``datetime`` column.
    df_macroeconomic : pandas.DataFrame
        Right frame; must contain a ``datetime`` column.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the rows whose ``datetime`` value occurs in
        both inputs.
    """
    return pd.merge(df_technical, df_macroeconomic, how="inner", on="datetime")

In [9]:
def rename_columns(df):
    """Relabel a six-column price frame in place and return it.

    The columns are renamed positionally to ``datetime``, ``open``, ``high``,
    ``low``, ``close`` and ``volume``, so the frame is expected to have exactly
    six columns in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to relabel. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new column labels.
    """
    # The caller's frame is mutated: the labels are assigned directly on the input.
    df.columns = ["datetime", "open", "high", "low", "close", "volume"]
    return df

In [10]:
def rename_technical(df_technical):
    """Return a copy of the technical frame with ``Unnamed: 0`` renamed to ``datetime``.

    Parameters
    ----------
    df_technical : pandas.DataFrame
        Technical-indicator frame. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame. Columns other than ``Unnamed: 0`` keep their labels; if that
        column is absent nothing is renamed.
    """
    column_mapping = {"Unnamed: 0": "datetime"}
    return df_technical.rename(columns=column_mapping)

In [11]:
def convert_datetime(df):
    """Return a copy of ``df`` whose ``datetime`` column is parsed with ``pd.to_datetime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``datetime`` column. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; only the
        ``datetime`` column is replaced by its parsed version.
    """
    parsed = pd.to_datetime(df["datetime"])
    # assign() works on a copy, so the caller's frame keeps its original values.
    return df.assign(datetime=parsed)

In [12]:
def merge_columns(df_values, df_technical):
    """Inner-join the price frame with the technical frame on ``datetime``.

    Parameters
    ----------
    df_values : pandas.DataFrame
        Left frame; must contain a ``datetime`` column.
    df_technical : pandas.DataFrame
        Right frame; must contain a ``datetime`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the rows whose ``datetime`` value appears in both inputs.
    """
    return df_values.merge(right=df_technical, on="datetime", how="inner")

In [13]:
def clean_data(df):
    """Fill gaps in the numeric columns by linear interpolation, then drop incomplete rows.

    Only numeric columns are interpolated. Non-numeric columns (for example a
    ``datetime`` column that is still stored as strings) are passed through
    unchanged: interpolating them is meaningless, and recent pandas versions
    deprecate calling ``interpolate`` on object-dtype data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame with interpolated numeric columns, without any row that
        still contains a missing value. With pandas' default forward
        direction, leading gaps cannot be filled, so those rows are dropped.
    """
    df_inter = df.copy()
    numeric_cols = df_inter.select_dtypes(include="number").columns
    # Guard against frames without numeric columns so we never interpolate an
    # empty column selection.
    if len(numeric_cols) > 0:
        df_inter[numeric_cols] = df_inter[numeric_cols].interpolate(method='linear')
    df_clean = df_inter.dropna()
    return df_clean

In [14]:
def create_target(df, column_name, new_column_name='Target'):
    """Add a binary "does the next row go up?" label column to ``df``.

    For every row except the last, the label is ``1`` when the value of
    ``column_name`` in the *following* row is strictly greater than the value
    in the current row, and ``0`` otherwise (including when either value is
    missing). The last row has no successor and is labelled ``NaN``.

    The earlier loop compared each row with the *previous* one and, for the
    first row, wrapped around to the last row through ``iloc[-1]``. That did
    not match the NaN placed on the final row. The comparison is now
    next-versus-current and vectorised, which also makes empty frames work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is modified in place (the new column is added).
    column_name : str
        Column whose consecutive values are compared.
    new_column_name : str, default 'Target'
        Name of the label column to create or overwrite.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the label column added as
        floats (so it can hold the trailing ``NaN``).
    """
    current = df[column_name]
    following = current.shift(-1)
    # Comparisons against NaN are False, so gaps yield 0 like the old loop did.
    target = (following > current).astype(float)
    if len(target) > 0:
        # The last row has nothing to be compared with.
        target.iloc[-1] = float('nan')
    # Assign positionally so the labels never get re-aligned on the index.
    df[new_column_name] = target.to_numpy()
    return df

In [15]:
def target_drop(df_final):
    """Return ``df_final`` without the rows that contain any missing value.

    Note that every column is checked, not only the target column.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Frame to filter. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the fully populated rows, with their original
        index labels.
    """
    return df_final.dropna(axis=0, how="any")

In [16]:
def convert_time_sin_cos(df):
    """Replace the ``datetime`` column by cyclical sine/cosine features.

    Two cycles are encoded from the ``datetime`` column, which must be
    datetime-like because the ``.dt`` accessor is used:

    * time of day: seconds since midnight divided by 86400;
    * day of year: ``dayofyear`` divided by 365.0.

    Each fraction is turned into an angle (``2 * pi * fraction``) and stored
    as ``cos_time_of_day`` / ``sin_time_of_day`` and ``cos_day_of_year`` /
    ``sin_day_of_year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``datetime`` column. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the four new columns appended and with the
        ``datetime`` column and the intermediate helper columns removed.
    """
    data = df.copy()
    stamp = data['datetime'].dt

    # Intermediate columns; they are dropped again before returning.
    data['day_of_year'] = stamp.dayofyear
    data['time_of_day'] = stamp.hour * 3600 + stamp.minute * 60 + stamp.second
    data['day_of_year_norm'] = data['day_of_year'] / 365.0
    data['time_of_day_norm'] = data['time_of_day'] / 86400.0

    time_angle = 2 * np.pi * data['time_of_day_norm']
    data['cos_time_of_day'] = np.cos(time_angle)
    data['sin_time_of_day'] = np.sin(time_angle)

    year_angle = 2 * np.pi * data['day_of_year_norm']
    data["cos_day_of_year"] = np.cos(year_angle)
    data["sin_day_of_year"] = np.sin(year_angle)

    scratch_columns = ["day_of_year", "time_of_day", "day_of_year_norm", "time_of_day_norm", "datetime"]
    return data.drop(columns=scratch_columns)

In [17]:
def log_divide_next(df):
    """Return the natural log of each value divided by the value that follows it.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Values to transform; the division is element-wise against the data
        shifted up by one row.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        ``log(value[i] / value[i + 1])`` for every row. The last row has no
        successor and is therefore ``NaN``.
    """
    # shift(-1) lines every row up with its successor.
    return np.log(df / df.shift(-1))

# Final function that prepares all the data

In [20]:
def create_x_y(df_data, df_technical, df_macroeconomic):
    """Run the full preparation pipeline and return features, target and log ratios.

    Steps, in order: relabel the three frames, interpolate/clean the technical
    indicators, parse the ``datetime`` columns, inner-join the technical and
    macro-economic data and then the price data on ``datetime``, add the
    ``Target`` column derived from ``close``, drop incomplete rows and replace
    ``datetime`` by cyclical time features.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Price frame with six columns; relabelled in place by ``rename_columns``.
    df_technical : pandas.DataFrame
        Technical-indicator frame whose date column is called ``Unnamed: 0``.
    df_macroeconomic : pandas.DataFrame
        Macro-economic frame with four columns; relabelled in place by
        ``rename_macroeconomic``.

    Returns
    -------
    X : pandas.DataFrame
        All prepared columns except ``Target``.
    y : pandas.DataFrame
        Single-column frame holding ``Target``.
    log_df : pandas.Series
        ``log_divide_next`` applied to the ``close`` column of the price frame.
        NOTE(review): this is computed from the full price frame *before* the
        merges and row drops, so its length and index need not line up with
        ``X`` and ``y`` — confirm that this is intended.
    """
    # Bind the renamed frame to the name used below. The result used to be
    # stored in an unused, misspelt variable, so the pipeline only worked
    # because rename_macroeconomic mutates its argument.
    df_macroeconomic = rename_macroeconomic(df_macroeconomic)
    df_technical = rename_technical(df_technical)
    df_data = rename_columns(df_data)
    df_technical = clean_data(df_technical)
    df_data = convert_datetime(df_data)
    df_technical = convert_datetime(df_technical)
    df_macroeconomic = convert_datetime(df_macroeconomic)
    df_technical = merge_macroeconomic(df_technical, df_macroeconomic)
    df_merged = merge_columns(df_data, df_technical)
    df_target = create_target(df_merged, "close")
    df_final = target_drop(df_target)
    df_time = convert_time_sin_cos(df_final)
    X = df_time.drop(columns = "Target")
    y = df_time[["Target"]]
    log_df = log_divide_next(df_data["close"])
    return X, y, log_df

In [21]:
# Run the preparation pipeline on the three loaded frames: X holds the feature
# columns, y the Target column, and log_df the log ratios of consecutive closes.
X, y, log_df = create_x_y(df_data, df_technical, df_macroeconomic)