In [1]:
import pandas as pd
import os

In [25]:
project_dir = "/home/jupyter-tfg2425paula"
os.chdir(project_dir)
data_dir = os.path.join(project_dir, "raw_data")
options_dir = os.path.join(data_dir, "options_and_combinations")
pca_dir = os.path.join(data_dir, "pca")

In [22]:
import sklearn.preprocessing

def normalize_data(df):
    min_max_scaler = sklearn.preprocessing.MinMaxScaler()
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    
    
    for column in numeric_columns:  # Use 'df' instead of 'normalized_df' to iterate over its columns
        df[column] = min_max_scaler.fit_transform(df[column].values.reshape(-1, 1))  # Correct access to columns
    return df

In [12]:
from openpyxl import load_workbook

stocks = 'AAPL_MSFT_AMZN_NVDA_SPX'
filename = f'{stocks}_options.xlsx'
workbook = load_workbook(os.path.join(options_dir, filename))
sheet = workbook.active
data = []
for row in sheet.iter_rows(values_only=True):  # Iterate through rows, extract values
    data.append(row)

# Convert to pandas DataFrame
df = pd.DataFrame(data[1:], columns=data[0])
df.head()

Unnamed: 0,Date,AAPL_Close,AAPL_CALL_OM,AAPL_CALL_O1,AAPL_CALL_OY,AAPL_CALL_OI,AAPL_CALL_VM,AAPL_PUT_OM,AAPL_PUT_O1,AAPL_PUT_OY,...,SPX_CALL_OM,SPX_CALL_O1,SPX_CALL_OY,SPX_CALL_OI,SPX_CALL_VM,SPX_PUT_OM,SPX_PUT_O1,SPX_PUT_OY,SPX_PUT_OI,SPX_PUT_VM
0,2009-06-18,135.88,6.8,0.4063,0.4254,920448,31936.0,5.7,0.3943,0.4314,...,24.3,0.2568,0.2698,3816626,360148.0,28.16,0.2656,0.2939,5607238,468262.0
1,2009-06-19,139.48,5.4,0.3606,0.4244,687080,40040.0,5.96,0.3654,0.4145,...,23.0,0.2335,0.2635,3889631,211146.0,25.0,0.2524,0.295,5775618,296620.0
2,2009-06-22,137.37,6.9,0.3852,0.4156,708547,25673.0,4.45,0.3845,0.4333,...,24.4,0.288,0.2804,3806131,195176.0,26.9,0.2799,0.3056,5685968,312092.0
3,2009-06-23,134.01,4.52,0.3653,0.418,725825,24184.0,5.6,0.3747,0.4259,...,24.0,0.2734,0.2752,3755289,117641.0,25.0,0.2757,0.3059,5287312,312995.0
4,2009-06-24,136.22,5.45,0.3513,0.4088,732078,16474.0,4.1,0.3464,0.4138,...,24.0,0.2707,0.2763,4028277,225367.0,22.6,0.2595,0.2936,6065984,255662.0


In [26]:
filename = f'rotated_{stocks}_options.csv'
df = pd.read_csv(os.path.join(pca_dir, filename))
df.head()

Unnamed: 0,Rotated_PC1,Target
0,0.77,1.0
1,0.77,0.0
2,0.65,0.0
3,0.66,1.0
4,0.52,1.0


In [13]:
security = 'AAPL'

df[f'RETURN_{security}'] = (df[f'{security}_Close'] - df[f'{security}_Close'].shift(1)) / df[f'{security}_Close'].shift(1)
df[f'TARGET_{security}'] = (df[f'{security}_Close'].shift(-1) - df[f'{security}_Close'] > 0).astype(float)

df = df.drop(columns = ['Date'])
    
df = df.fillna(0)
df = df.rename(columns={f'TARGET_{security}': 'Target'})

normalized_df = normalize_data(df)

normalized_df.head()

Unnamed: 0,AAPL_Close,AAPL_CALL_OM,AAPL_CALL_O1,AAPL_CALL_OY,AAPL_CALL_OI,AAPL_CALL_VM,AAPL_PUT_OM,AAPL_PUT_O1,AAPL_PUT_OY,AAPL_PUT_OI,...,SPX_CALL_OY,SPX_CALL_OI,SPX_CALL_VM,SPX_PUT_OM,SPX_PUT_O1,SPX_PUT_OY,SPX_PUT_OI,SPX_PUT_VM,RETURN_AAPL,Target
0,0.074532,0.166161,0.083655,0.145124,0.109092,0.007441,0.136778,0.122097,0.05268,0.130542,...,0.448234,0.495566,0.381835,0.095962,0.282048,0.312108,0.471325,0.202485,0.877078,1.0
1,0.080416,0.123711,0.070093,0.144516,0.081433,0.009329,0.144681,0.109138,0.048656,0.094416,...,0.429218,0.505046,0.223861,0.085193,0.263833,0.314138,0.485479,0.128264,0.90426,0.0
2,0.076967,0.169193,0.077393,0.139166,0.083977,0.005982,0.098784,0.117702,0.053133,0.098452,...,0.480229,0.494204,0.206929,0.091668,0.30178,0.333702,0.477943,0.134954,0.861557,0.0
3,0.071475,0.097029,0.071488,0.140625,0.086025,0.005635,0.133739,0.113308,0.051371,0.106272,...,0.464534,0.487602,0.124725,0.085193,0.295985,0.334256,0.444433,0.135345,0.851982,1.0
4,0.075087,0.125227,0.067333,0.135032,0.086766,0.003838,0.088146,0.100619,0.048489,0.10755,...,0.467854,0.523048,0.238938,0.077015,0.27363,0.311554,0.509886,0.110553,0.893998,1.0


In [14]:
def split_dataframe(df, target_column, window_size):
    """
    Splits the DataFrame into sequential portions of size `window_size`.

    Parameters:
    df (pd.DataFrame): The input DataFrame to be split.
    target_column (str): Name of the target column that indicates future changes.
    window_size (int): The size of each sequential portion.

    Returns:
    list: A list of DataFrames, each of size `window_size`.
    list: Corresponding targets for each sequential portion.
    """
    sequential_data = []
    targets = []

    for i in range(len(df) - window_size + 1):
        window = df.iloc[i:i + window_size].copy()  # Selects a window of size `window_size`
        target = df.iloc[i + window_size - 1][target_column]  # Target is the last value in the window
        sequential_data.append(window)
        targets.append(target)

    return sequential_data, targets

In [15]:
import pandas as pd

window_size = 200
sequential_data, targets = split_dataframe(normalized_df, target_column='Target', window_size=window_size)

print(f"Number of sequential portions: {len(sequential_data)}")
print("Example of one sequential portion:")
print(sequential_data[0])  # Show the first sequential portion
print("Corresponding target:", targets[0])


Number of sequential portions: 3829
Example of one sequential portion:
     AAPL_Close  AAPL_CALL_OM  AAPL_CALL_O1  AAPL_CALL_OY  AAPL_CALL_OI  \
0      0.074532      0.166161      0.083655      0.145124      0.109092   
1      0.080416      0.123711      0.070093      0.144516      0.081433   
2      0.076967      0.169193      0.077393      0.139166      0.083977   
3      0.071475      0.097029      0.071488      0.140625      0.086025   
4      0.075087      0.125227      0.067333      0.135032      0.086766   
..          ...           ...           ...           ...           ...   
195    0.219623      0.241662      0.040180      0.086150      0.071253   
196    0.215701      0.196483      0.041575      0.086393      0.055799   
197    0.219787      0.231352      0.042020      0.096790      0.058315   
198    0.225687      0.123711      0.041842      0.087853      0.059276   
199    0.227338      0.138872      0.043623      0.091196      0.060848   

     AAPL_CALL_VM  AAPL_PUT_

In [16]:
import pandas as pd

def create_sequential_dataframe(sequential_data, targets):
    """
    Creates a reshaped DataFrame where each row contains sequential data for each feature
    and a corresponding target value.

    Parameters:
    sequential_data (list): List of DataFrames representing sequential data portions.
    targets (list): List of target values corresponding to each sequence.

    Returns:
    pd.DataFrame: Reshaped DataFrame with each row containing sequential data for each feature
                  and the corresponding target value.
    """
    reshaped_rows = []

    for i, window_df in enumerate(sequential_data):
        row_data = {}
        # Iterate over columns (features) in the window
        for col in window_df.columns:
            # Create a new column for each feature across the window size
            row_data[col] = pd.Series(window_df[col].values)

        # Add the corresponding target for the sequence
        row_data['Target'] = targets[i]

        reshaped_rows.append(row_data)

    # Convert to DataFrame
    reshaped_df = pd.DataFrame(reshaped_rows)
    return reshaped_df

In [17]:
sequential_data, targets = split_dataframe(normalized_df, target_column='Target', window_size=window_size)
reshaped_df = create_sequential_dataframe(sequential_data, targets)

reshaped_df

Unnamed: 0,AAPL_Close,AAPL_CALL_OM,AAPL_CALL_O1,AAPL_CALL_OY,AAPL_CALL_OI,AAPL_CALL_VM,AAPL_PUT_OM,AAPL_PUT_O1,AAPL_PUT_OY,AAPL_PUT_OI,...,SPX_CALL_OY,SPX_CALL_OI,SPX_CALL_VM,SPX_PUT_OM,SPX_PUT_O1,SPX_PUT_OY,SPX_PUT_OI,SPX_PUT_VM,RETURN_AAPL,Target
0,0 0.074532 1 0.080416 2 0.07696...,0 0.166161 1 0.123711 2 0.16919...,0 0.083655 1 0.070093 2 0.07739...,0 0.145124 1 0.144516 2 0.13916...,0 0.109092 1 0.081433 2 0.08397...,0 0.007441 1 0.009329 2 0.00598...,0 0.136778 1 0.144681 2 0.09878...,0 0.122097 1 0.109138 2 0.11770...,0 0.052680 1 0.048656 2 0.05313...,0 0.130542 1 0.094416 2 0.09845...,...,0 0.448234 1 0.429218 2 0.48022...,0 0.495566 1 0.505046 2 0.49420...,0 0.381835 1 0.223861 2 0.20692...,0 0.095962 1 0.085193 2 0.09166...,0 0.282048 1 0.263833 2 0.30178...,0 0.312108 1 0.314138 2 0.33370...,0 0.471325 1 0.485479 2 0.47794...,0 0.202485 1 0.128264 2 0.13495...,0 0.877078 1 0.904260 2 0.86155...,0.0
1,0 0.080416 1 0.076967 2 0.07147...,0 0.123711 1 0.169193 2 0.09702...,0 0.070093 1 0.077393 2 0.07148...,0 0.144516 1 0.139166 2 0.14062...,0 0.081433 1 0.083977 2 0.08602...,0 0.009329 1 0.005982 2 0.00563...,0 0.144681 1 0.098784 2 0.13373...,0 0.109138 1 0.117702 2 0.11330...,0 0.048656 1 0.053133 2 0.05137...,0 0.094416 1 0.098452 2 0.10627...,...,0 0.429218 1 0.480229 2 0.46453...,0 0.505046 1 0.494204 2 0.48760...,0 0.223861 1 0.206929 2 0.12472...,0 0.085193 1 0.091668 2 0.08519...,0 0.263833 1 0.301780 2 0.29598...,0 0.314138 1 0.333702 2 0.33425...,0 0.485479 1 0.477943 2 0.44443...,0 0.128264 1 0.134954 2 0.13534...,0 0.904260 1 0.861557 2 0.85198...,1.0
2,0 0.076967 1 0.071475 2 0.07508...,0 0.169193 1 0.097029 2 0.12522...,0 0.077393 1 0.071488 2 0.06733...,0 0.139166 1 0.140625 2 0.13503...,0 0.083977 1 0.086025 2 0.08676...,0 0.005982 1 0.005635 2 0.00383...,0 0.098784 1 0.133739 2 0.08814...,0 0.117702 1 0.113308 2 0.10061...,0 0.053133 1 0.051371 2 0.04848...,0 0.098452 1 0.106272 2 0.10755...,...,0 0.480229 1 0.464534 2 0.46785...,0 0.494204 1 0.487602 2 0.52304...,0 0.206929 1 0.124725 2 0.23893...,0 0.091668 1 0.085193 2 0.07701...,0 0.301780 1 0.295985 2 0.27363...,0 0.333702 1 0.334256 2 0.31155...,0 0.477943 1 0.444433 2 0.50988...,0 0.134954 1 0.135345 2 0.11055...,0 0.861557 1 0.851982 2 0.89399...,1.0
3,0 0.071475 1 0.075087 2 0.08103...,0 0.097029 1 0.125227 2 0.09642...,0 0.071488 1 0.067333 2 0.06314...,0 0.140625 1 0.135032 2 0.12372...,0 0.086025 1 0.086766 2 0.08688...,0 0.005635 1 0.003838 2 0.00454...,0 0.133739 1 0.088146 2 0.10030...,0 0.113308 1 0.100619 2 0.09313...,0 0.051371 1 0.048489 2 0.04801...,0 0.106272 1 0.107550 2 0.10888...,...,0 0.464534 1 0.467854 2 0.43767...,0 0.487602 1 0.523048 2 0.53778...,0 0.124725 1 0.238938 2 0.25547...,0 0.085193 1 0.077015 2 0.07190...,0 0.295985 1 0.273630 2 0.24396...,0 0.334256 1 0.311554 2 0.30435...,0 0.444433 1 0.509886 2 0.51444...,0 0.135345 1 0.110553 2 0.13102...,0 0.851982 1 0.893998 2 0.90449...,1.0
4,0 0.075087 1 0.081037 2 0.08525...,0 0.125227 1 0.096422 2 0.12219...,0 0.067333 1 0.063149 2 0.05130...,0 0.135032 1 0.123723 2 0.11861...,0 0.086766 1 0.086887 2 0.08772...,0 0.003838 1 0.004543 2 0.00414...,0 0.088146 1 0.100304 2 0.05531...,0 0.100619 1 0.093131 2 0.08367...,0 0.048489 1 0.048013 2 0.04415...,0 0.107550 1 0.108889 2 0.11069...,...,0 0.467854 1 0.437670 2 0.42378...,0 0.523048 1 0.537780 2 0.55145...,0 0.238938 1 0.255477 2 0.20065...,0 0.077015 1 0.071903 2 0.07190...,0 0.273630 1 0.243963 2 0.24492...,0 0.311554 1 0.304356 2 0.30084...,0 0.509886 1 0.514445 2 0.52058...,0 0.110553 1 0.131024 2 0.07958...,0 0.893998 1 0.904494 2 0.89600...,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3824,0 0.153427 1 0.152953 2 0.15042...,0 0.112189 1 0.108551 2 0.16919...,0 0.019467 1 0.023325 2 0.02412...,0 0.026629 1 0.028818 2 0.02906...,0 0.263325 1 0.266434 2 0.02276...,0 0.026366 1 0.027544 2 0.02841...,0 0.106383 1 0.103343 2 0.06990...,0 0.027711 1 0.027935 2 0.03327...,0 0.004001 1 0.004311 2 0.00493...,0 0.280257 1 0.280613 2 0.05861...,...,0 0.064896 1 0.062179 2 0.05735...,0 0.654511 1 0.662201 2 0.22097...,0 0.276654 1 0.197751 2 0.19560...,0 0.190833 1 0.188789 2 0.19816...,0 0.077273 1 0.076169 2 0.08679...,0 0.074013 1 0.072905 2 0.07622...,0 0.754191 1 0.764281 2 0.19753...,0 0.266159 1 0.132977 2 0.11947...,0 0.872143 1 0.875462 2 0.86842...,1.0
3825,0 0.152953 1 0.150420 2 0.15042...,0 0.108551 1 0.169193 2 0.16919...,0 0.023325 1 0.024126 2 0.02412...,0 0.028818 1 0.029061 2 0.02906...,0 0.266434 1 0.022767 2 0.02276...,0 0.027544 1 0.028418 2 0.00000...,0 0.103343 1 0.069909 2 0.06990...,0 0.027935 1 0.033271 2 0.03327...,0 0.004311 1 0.004930 2 0.00493...,0 0.280613 1 0.058610 2 0.05861...,...,0 0.062179 1 0.057350 2 0.05735...,0 0.662201 1 0.220977 2 0.22097...,0 0.197751 1 0.195602 2 0.00000...,0 0.188789 1 0.198160 2 0.19816...,0 0.076169 1 0.086795 2 0.08679...,0 0.072905 1 0.076227 2 0.07622...,0 0.764281 1 0.197530 2 0.19753...,0 0.132977 1 0.119472 2 0.00000...,0 0.875462 1 0.868428 2 0.87707...,0.0
3826,0 0.150420 1 0.150420 2 0.14919...,0 0.169193 1 0.169193 2 0.07580...,0 0.024126 1 0.024126 2 0.02498...,0 0.029061 1 0.029061 2 0.03039...,0 0.022767 1 0.022767 2 0.26324...,0 0.028418 1 0.000000 2 0.04094...,0 0.069909 1 0.069909 2 0.08510...,0 0.033271 1 0.033271 2 0.03546...,0 0.004930 1 0.004930 2 0.00576...,0 0.058610 1 0.058610 2 0.28828...,...,0 0.057350 1 0.057350 2 0.07153...,0 0.220977 1 0.220977 2 0.67553...,0 0.195602 1 0.000000 2 0.19134...,0 0.198160 1 0.198160 2 0.19492...,0 0.086795 1 0.086795 2 0.09507...,0 0.076227 1 0.076227 2 0.07881...,0 0.197530 1 0.197530 2 0.77644...,0 0.119472 1 0.000000 2 0.14508...,0 0.868428 1 0.877078 2 0.87285...,1.0
3827,0 0.150420 1 0.149194 2 0.15043...,0 0.169193 1 0.075804 2 0.08277...,0 0.024126 1 0.024987 2 0.02418...,0 0.029061 1 0.030399 2 0.02906...,0 0.022767 1 0.263241 2 0.26498...,0 0.000000 1 0.040942 2 0.01568...,0 0.069909 1 0.085106 2 0.07051...,0 0.033271 1 0.035468 2 0.03524...,0 0.004930 1 0.005763 2 0.00526...,0 0.058610 1 0.288284 2 0.29109...,...,0 0.057350 1 0.071536 2 0.06731...,0 0.220977 1 0.675534 2 0.67970...,0 0.000000 1 0.191341 2 0.18697...,0 0.198160 1 0.194922 2 0.18589...,0 0.086795 1 0.095074 2 0.09190...,0 0.076227 1 0.078811 2 0.07567...,0 0.197530 1 0.776441 2 0.77972...,0 0.000000 1 0.145080 2 0.13685...,0 0.877078 1 0.872857 2 0.88137...,1.0


In [18]:
stocks = 'AAPL_MSFT_AMZN_NVDA_SPX'

In [19]:
output_filename = f'{stocks}_{window_size}.csv'
output_data_folder = os.path.join(project_dir, 'processed_data/options_and_combinations')
reshaped_df.to_csv(os.path.join(output_data_folder, output_filename))

In [20]:
pkl_filename = f'{stocks}_{window_size}.pkl'
reshaped_df.to_pickle(os.path.join(output_data_folder, pkl_filename))