## **This code aims to generate horizontal dataframes**


In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
project_dir = "/home/jupyter-tfg2425paula/prediction_project_v3"
os.chdir(project_dir)

clean_data_dir = os.path.join(project_dir, "00_data/clean")
horizontal_structure_data_dir = os.path.join(project_dir, "00_data/horizontal_structure")
pca_data_dir = os.path.join(project_dir, "00_data/pca")

In [3]:
def scale_data(df, selected_scale_cols, scaling_method):
    """
    Scales specified columns in a DataFrame using the specified scaling method.
    
    Args:
        df (pd.DataFrame): The input DataFrame.
        selected_cols (list): A list of column names to scale.
        scaling_method (str): The scaling method to use ("standard" or "minmax"). Default is "standard".
    
    Returns:
        pd.DataFrame: The DataFrame with specified columns scaled.
    """
    
    if scaling_method is not None:
        if scaling_method == "standard":
            scaler = StandardScaler()
        elif scaling_method == "minmax":
            scaler = MinMaxScaler()
        else:
            raise ValueError("Invalid scaling method. Choose 'standard' or 'minmax'.")

        df[selected_scale_cols] = df[selected_scale_cols].apply(pd.to_numeric, errors="coerce")
        
        # Scale only the selected columns
        df_scaled = df.copy()
        # Replace infinite values with NaN
        df[selected_scale_cols].replace([np.inf, -np.inf], np.nan, inplace=True)

        # Optionally fill NaN with column mean or median
        df[selected_scale_cols].fillna(df[selected_scale_cols].mean(), inplace=True)

        df_scaled[selected_scale_cols] = scaler.fit_transform(df[selected_scale_cols])
    
    else:
        df_scaled = df
        
    return df_scaled

In [4]:
def split_dataframe(df, target_column, window_size):
    """
    Splits the DataFrame into sequential portions of size `window_size`.

    Parameters:
    df (pd.DataFrame): The input DataFrame to be split.
    target_column (str): Name of the target column that indicates future changes.
    window_size (int): The size of each sequential portion.

    Returns:
    list: A list of DataFrames, each of size `window_size`.
    list: Corresponding targets for each sequential portion.
    """
    sequential_data = []
    targets = []

    for i in range(len(df) - window_size + 1):
        window = df.iloc[i:i + window_size].copy()  # Selects a window of size `window_size`
        target = df.iloc[i + window_size - 1][target_column]  # Target is the last value in the window
        sequential_data.append(window)
        targets.append(target)

    return sequential_data, targets

In [5]:
def create_sequential_dataframe(sequential_data, targets):
    """
    Creates a reshaped DataFrame where each row contains sequential data for each feature
    and a corresponding target value.

    Parameters:
    sequential_data (list): List of DataFrames representing sequential data portions.
    targets (list): List of target values corresponding to each sequence.

    Returns:
    pd.DataFrame: Reshaped DataFrame with each row containing sequential data for each feature
                  and the corresponding target value.
    """
    reshaped_rows = []

    for i, window_df in enumerate(sequential_data):
        row_data = {}
        # Iterate over columns (features) in the window
        for col in window_df.columns:
            # Create a new column for each feature across the window size
            row_data[col] = pd.Series(window_df[col].values)

        # Add the corresponding target for the sequence
        row_data['Target'] = targets[i]

        reshaped_rows.append(row_data)

    # Convert to DataFrame
    reshaped_df = pd.DataFrame(reshaped_rows)
    return reshaped_df

In [11]:
types_securities = ["technical"]
stocks = ['SPX']
stocks = ['AAPL', 'MSFT', 'AMZN', 'NVDA']
years = ["10y"]

In [12]:
window_sizes = [500, 1000]
for security_type in types_securities:
    for stock in stocks:
        for period in years:
            input_df = pd.read_csv(os.path.join(clean_data_dir, f"{security_type}/{stock}/{period}_data.csv"))
            input_df = input_df.drop(columns = ["Date"])
            
            # SCALE DATA FIRST
            scaling_method = "standard"
            selected_scale_cols = list(input_df.drop(columns=["Target"]).columns)
            scaled_df = scale_data(input_df, selected_scale_cols, scaling_method)
            
            for window_size in window_sizes:
                print(f"{security_type}, {stock}, {period}, {window_size}")
                sequential_data, targets = split_dataframe(scaled_df, target_column='Target', window_size=window_size)
                reshaped_df = create_sequential_dataframe(sequential_data, targets)
                
                pkl_filename = f"clean/{security_type}/{stock}/{period}_{window_size}_data.pkl"
                
                output_filepath = os.path.join(horizontal_structure_data_dir, pkl_filename)
                os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
                
                print(output_filepath)
                reshaped_df.to_pickle(output_filepath) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[selected_scale_cols].replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[selected_scale_cols].fillna(df[selected_scale_cols].mean(), inplace=True)


technical, AAPL, 10y, 500
/home/jupyter-tfg2425paula/prediction_project_v3/00_data/horizontal_structure/clean/technical/AAPL/10y_500_data.pkl
technical, AAPL, 10y, 1000
/home/jupyter-tfg2425paula/prediction_project_v3/00_data/horizontal_structure/clean/technical/AAPL/10y_1000_data.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[selected_scale_cols].replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[selected_scale_cols].fillna(df[selected_scale_cols].mean(), inplace=True)


technical, MSFT, 10y, 500
/home/jupyter-tfg2425paula/prediction_project_v3/00_data/horizontal_structure/clean/technical/MSFT/10y_500_data.pkl
technical, MSFT, 10y, 1000
/home/jupyter-tfg2425paula/prediction_project_v3/00_data/horizontal_structure/clean/technical/MSFT/10y_1000_data.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[selected_scale_cols].replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[selected_scale_cols].fillna(df[selected_scale_cols].mean(), inplace=True)


technical, AMZN, 10y, 500
/home/jupyter-tfg2425paula/prediction_project_v3/00_data/horizontal_structure/clean/technical/AMZN/10y_500_data.pkl
technical, AMZN, 10y, 1000
/home/jupyter-tfg2425paula/prediction_project_v3/00_data/horizontal_structure/clean/technical/AMZN/10y_1000_data.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[selected_scale_cols].replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[selected_scale_cols].fillna(df[selected_scale_cols].mean(), inplace=True)


technical, NVDA, 10y, 500
/home/jupyter-tfg2425paula/prediction_project_v3/00_data/horizontal_structure/clean/technical/NVDA/10y_500_data.pkl
technical, NVDA, 10y, 1000
/home/jupyter-tfg2425paula/prediction_project_v3/00_data/horizontal_structure/clean/technical/NVDA/10y_1000_data.pkl
