In [1]:
import pandas as pd
import os

In [2]:
# Root folder of the project; it is also made the working directory.
project_dir = "/home/jupyter-tfg2425paula"
os.chdir(project_dir)

# Input locations: <project_dir>/raw_data and two of its subfolders.
data_dir = os.path.join(project_dir, "raw_data")
options_dir, pca_dir = (
    os.path.join(data_dir, subfolder)
    for subfolder in ("options_and_combinations", "pca")
)

In [3]:
import sklearn.preprocessing

def normalize_data(df, copy=False):
    """
    Min-max scale every float64/int64 column of `df`.

    Each selected column is scaled independently with scikit-learn's
    MinMaxScaler (default feature range); columns of any other dtype are
    left untouched.

    Parameters:
    df (pd.DataFrame): The DataFrame whose numeric columns are rescaled.
    copy (bool): If False (default), `df` itself is modified and returned.
                 If True, the caller's DataFrame is left unchanged and a
                 rescaled copy is returned instead.

    Returns:
    pd.DataFrame: The DataFrame with scaled numeric columns (`df` itself
                  unless `copy=True`).
    """
    result = df.copy() if copy else df
    numeric_columns = result.select_dtypes(include=['float64', 'int64']).columns

    # Nothing to scale: return early instead of handing the scaler an array
    # with zero samples, which scikit-learn rejects.
    if len(result) == 0 or len(numeric_columns) == 0:
        return result

    min_max_scaler = sklearn.preprocessing.MinMaxScaler()
    for column in numeric_columns:
        # The scaler expects a 2-D (n_samples, 1) array; flatten its output
        # back to 1-D before storing it as a column.
        scaled = min_max_scaler.fit_transform(result[column].values.reshape(-1, 1))
        result[column] = scaled.ravel()
    return result

In [8]:
# Underscore-joined ticker list; it is embedded in the input and output file names.
stocks = 'AAPL_MSFT_AMZN_NVDA_SPX'
filename = f'rotated_{stocks}_options.csv'

# Read the CSV from the PCA folder and preview its first rows.
csv_path = os.path.join(pca_dir, filename)
normalized_df = pd.read_csv(csv_path)
normalized_df.head()

Unnamed: 0,Rotated_PC1,Target
0,0.77,1.0
1,0.77,0.0
2,0.65,0.0
3,0.66,1.0
4,0.52,1.0


In [9]:
def split_dataframe(df, target_column, window_size):
    """
    Splits the DataFrame into overlapping sequential windows of `window_size` rows.

    Consecutive windows start one row apart, so a DataFrame with N rows
    yields N - window_size + 1 windows (none if it has fewer rows than
    `window_size`). Each window is an independent copy that keeps all
    columns (including `target_column`) and the original index labels.

    Parameters:
    df (pd.DataFrame): The input DataFrame to be split.
    target_column (str): Name of the column the per-window target is read from.
    window_size (int): Number of rows in each window; must be at least 1.

    Returns:
    list: A list of DataFrames, each with `window_size` rows.
    list: For each window, the value of `target_column` in its last row.

    Raises:
    ValueError: If `window_size` is smaller than 1.
    KeyError: If `target_column` is not a column of `df`.
    """
    if window_size < 1:
        # A non-positive size would yield empty windows paired with targets
        # taken from unrelated rows (negative positions wrap around).
        raise ValueError(f"window_size must be at least 1, got {window_size}")

    # Read targets from the column itself rather than from a whole row:
    # selecting a row first coerces all its values to one common dtype
    # (e.g. an integer target becomes float next to float features).
    target_values = df[target_column]

    sequential_data = []
    targets = []
    for start in range(len(df) - window_size + 1):
        end = start + window_size
        sequential_data.append(df.iloc[start:end].copy())
        targets.append(target_values.iloc[end - 1])  # last row of the window

    return sequential_data, targets

In [10]:
import pandas as pd

# Number of consecutive rows in each window.
window_size = 200
sequential_data, targets = split_dataframe(
    normalized_df,
    target_column='Target',
    window_size=window_size,
)

# Sanity check: how many windows were produced, plus the first window and its target.
print(f"Number of sequential portions: {len(sequential_data)}")
print("Example of one sequential portion:")
print(sequential_data[0])
print("Corresponding target:", targets[0])


Number of sequential portions: 3829
Example of one sequential portion:
     Rotated_PC1  Target
0           0.77     1.0
1           0.77     0.0
2           0.65     0.0
3           0.66     1.0
4           0.52     1.0
..           ...     ...
195         0.88     0.0
196         0.89     1.0
197         0.82     1.0
198         0.69     1.0
199         0.78     0.0

[200 rows x 2 columns]
Corresponding target: 0.0


In [11]:
import pandas as pd

def create_sequential_dataframe(sequential_data, targets):
    """
    Collapse a list of window DataFrames into one DataFrame with a row per window.

    For every window, each column's values are wrapped in a fresh pd.Series
    (re-indexed from 0) and stored as a single cell, so each feature cell of
    the result holds the whole sequence for that window. The 'Target' cell
    is then set to the window's scalar target, replacing any 'Target'
    sequence taken from the window itself.

    Parameters:
    sequential_data (list): Window DataFrames, one per output row.
    targets (list): Target value for each window, aligned by position.

    Returns:
    pd.DataFrame: One row per window, with one sequence-valued column per
                  window column and a scalar 'Target' column.
    """
    def _window_to_record(position, window):
        # One cell per column, holding that column's values as a Series.
        record = {name: pd.Series(window[name].values) for name in window.columns}
        # The scalar target takes the place of any 'Target' sequence.
        record['Target'] = targets[position]
        return record

    records = [
        _window_to_record(position, window)
        for position, window in enumerate(sequential_data)
    ]
    return pd.DataFrame(records)

In [12]:
# Rebuild the windows, then collapse them into one row per window.
sequential_data, targets = split_dataframe(
    normalized_df,
    window_size=window_size,
    target_column='Target',
)
reshaped_df = create_sequential_dataframe(sequential_data, targets)
reshaped_df

Unnamed: 0,Rotated_PC1,Target
0,0 0.77 1 0.77 2 0.65 3 0.6...,0.0
1,0 0.77 1 0.65 2 0.66 3 0.5...,1.0
2,0 0.65 1 0.66 2 0.52 3 0.7...,1.0
3,0 0.66 1 0.52 2 0.78 3 0.4...,1.0
4,0 0.52 1 0.78 2 0.48 3 0.7...,0.0
...,...,...
3824,0 9.57 1 9.35 2 10.15 3 ...,1.0
3825,0 9.35 1 10.15 2 10.15 3 ...,0.0
3826,0 10.15 1 10.15 2 7.62 3 ...,1.0
3827,0 10.15 1 7.62 2 7.90 3 ...,1.0


In [15]:
# The output name records both the ticker set and the window size used.
output_filename = f'rotated_{stocks}_{window_size}.pkl'
output_data_folder = os.path.join(project_dir, 'processed_data/options_and_combinations')

# Make sure the destination folder exists before writing; without it the
# write fails on a fresh checkout.
os.makedirs(output_data_folder, exist_ok=True)
reshaped_df.to_pickle(os.path.join(output_data_folder, output_filename))