In [1]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the table that later cells slice per ticker through its 'symbol' column.
df = pd.read_csv(filepath_or_buffer='data/21-25.csv')

In [2]:
import time
from getData import get_stock_data, sp_list

# Time one single-ticker fetch to gauge the cost of pulling the whole list.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock, which can be adjusted
# while the call is running.
start_time = time.perf_counter()
data = get_stock_data('AAPL', '2021-01-01', '2025-01-01')
end_time = time.perf_counter()

execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")


Execution time: 0.28008174896240234 seconds


In [3]:
def get_df(symbol):
    """
    Fetch the 2021-01-01 to 2025-01-01 history for `symbol` via
    get_stock_data and wrap the result in a DataFrame.
    """
    return pd.DataFrame(get_stock_data(symbol, '2021-01-01', '2025-01-01'))

# One DataFrame per ticker in sp_list, keyed by the ticker symbol.
df_dict = dict((ticker, get_df(ticker)) for ticker in sp_list)

In [4]:
# Display the symbol -> DataFrame mapping built in the previous cell.
# NOTE(review): this renders every frame in the mapping; previewing a single
# entry (e.g. df_dict['AAPL'].head()) would keep the output readable.
df_dict

{'APO':             date    open    high     low   close  adjClose   volume  \
 0     2024-12-31  167.00  167.60  164.93  165.16    164.69  2817031   
 1     2024-12-30  167.95  168.87  165.18  166.51    166.04  2597815   
 2     2024-12-27  172.68  173.41  169.70  170.28    169.80  2859065   
 3     2024-12-26  173.63  175.08  172.51  174.37    173.87  1348439   
 4     2024-12-24  172.51  174.85  171.80  174.85    174.35  1023900   
 ...          ...     ...     ...     ...     ...       ...      ...   
 1000  2021-01-08   47.50   48.26   46.37   46.96     42.40  1248004   
 1001  2021-01-07   47.96   48.08   46.45   46.90     42.34  1283731   
 1002  2021-01-06   47.94   48.79   46.45   47.50     42.88  1782015   
 1003  2021-01-05   48.08   48.76   47.78   48.14     43.46   655500   
 1004  2021-01-04   48.98   49.09   47.61   47.77     43.13  1064700   
 
       unadjustedVolume  change  changePercent      vwap            label  \
 0              2817031   -1.84       -1.10000  16

In [5]:
def calculate_std_dev(df, N):
    """
    Return the standard deviation of the last N daily returns in `df`.

    Daily returns are the row-over-row percentage change of the 'close'
    column. They are also written back onto `df` as a 'daily_return'
    column, so the caller's frame is modified. The value comes from the
    final row of an N-row rolling window, so rows are expected to run from
    oldest to newest, and the result is NaN when fewer than N returns exist.
    """
    returns = df['close'].pct_change()
    df['daily_return'] = returns

    rolling_std = df['daily_return'].rolling(window=N).std()
    return rolling_std.iloc[-1]

In [6]:
def log_transform_close(df):
    """
    Add a 'log_return' column to `df` and return the same frame.

    Each entry is the natural log of the ratio between a row's 'close' and
    the previous row's 'close'; the first row has no predecessor and is NaN.
    """
    previous_close = df['close'].shift(1)
    df['log_return'] = np.log(df['close'] / previous_close)
    return df

In [17]:
def calculate_moving_average(df, indicator, N):
    """
    Return the mean of the `indicator` column over the last N rows of `df`.

    The value is read from the final row of an N-row rolling mean, so it is
    NaN when the frame has fewer than N rows or the window contains a
    missing value.
    """
    rolling_mean = df[indicator].rolling(window=N).mean()
    return rolling_mean.iloc[-1]

In [None]:
# NOTE(review): scratch assignment from a cell with no execution count; the
# aggregation loop further down rebinds `sub_df` per symbol before reading it.
sub_df = df_dict['AAPL']

In [8]:
def calculate_slope_pe_movement(symbol, df):
    """
    Return the slope of a straight-line fit through one symbol's
    standardized 'priceEarningsRatio' values.

    Rows of `df` whose 'symbol' equals `symbol` are copied, so the input
    frame is left untouched. The fit is against the row position
    (0, 1, 2, ...) in the frame's existing row order.
    """
    rows = df.loc[df['symbol'] == symbol].copy()

    # Standardize the P/E values before fitting.
    pe_column = rows['priceEarningsRatio'].values.reshape(-1, 1)
    rows.loc[:, 'pe_scaled'] = StandardScaler().fit_transform(pe_column)

    # Evenly spaced positions act as the x axis of the fit.
    n_rows = len(rows)
    rows.loc[:, 'x_range'] = np.linspace(0, n_rows - 1, n_rows)

    fit_inputs = rows[['x_range', 'pe_scaled']]

    # polyfit returns the highest-degree coefficient first: index 0 is the slope.
    coefficients = np.polyfit(
        fit_inputs['x_range'],
        fit_inputs['pe_scaled'].values.flatten(),
        1,
    )
    return coefficients[0]

In [9]:
# Columns retained from the loaded table: identifiers first, then the ratios.
columns_to_keep = [
    # identifiers
    'symbol',
    'calendarYear',
    'period',
    # ratios
    'currentRatio',
    'quickRatio',
    'returnOnEquity',
    'returnOnAssets',
    'netProfitMargin',
    'priceEarningsRatio',
    'priceBookValueRatio',
    'priceToSalesRatio',
    'freeCashFlowPerShare',
    'operatingCashFlowPerShare',
    'cashFlowToDebtRatio',
    'debtEquityRatio',
    'longTermDebtToCapitalization',
    'assetTurnover',
    'inventoryTurnover',
]

# Keep only the requested columns that are actually present in df, in the
# order they are listed in columns_to_keep.
available = df.columns
existing_columns = [name for name in columns_to_keep if name in available]

filtered_df = df[existing_columns]

In [10]:
def calculate_column_slope(column, sub_df):
    """
    Fit a straight line through the standardized values of `column` and
    return its slope together with the column's full-window moving average.

    Parameters
    ----------
    column : label of the column in `sub_df` to analyze.
    sub_df : DataFrame holding that column. Rows are used in their existing
        order and the frame is not modified.

    Returns
    -------
    (slope, moving_average)
        slope : first-degree coefficient of the fit of the standardized
            values against row position (0, 1, 2, ...).
        moving_average : mean of the raw (unstandardized) values over a
            rolling window as long as the frame; NaN if any value is missing.
    """
    values = sub_df[column]

    # Standardization is used only for the slope; the moving average below is
    # taken on the raw values.
    scaled = StandardScaler().fit_transform(values.values.reshape(-1, 1)).flatten()

    # Row positions act as the x axis of the fit.
    x_range = np.arange(len(scaled))
    slope = np.polyfit(x_range, scaled, 1)[0]

    moving_average = values.rolling(window=len(sub_df)).mean().iloc[-1]

    return slope, moving_average

def ma_slope_column(sub_df, column):
    """
    Return (moving_average, slope) for `column` of `sub_df`.

    moving_average is the raw column's mean over a rolling window as long as
    the frame. slope is the first-degree coefficient of a line fitted through
    the standardized values against their row position.
    """
    series = sub_df[column]

    # The window spans every row, so the last rolling value is the mean over
    # the whole frame (NaN if any entry is missing); row order does not
    # affect it.
    moving_average = series.rolling(window=len(sub_df)).mean().iloc[-1]

    # Standardize only for the trend fit; the average above stays in raw units.
    z_scores = StandardScaler().fit_transform(series.values.reshape(-1, 1)).flatten()

    # Fit against row position. Unlike the mean, the slope's sign follows the
    # row order of sub_df.
    positions = np.arange(len(z_scores))
    coefficients = np.polyfit(positions, z_scores, 1)
    slope = coefficients[0]

    return moving_average, slope

# Ratio columns for which a moving average and a trend slope are computed.
columns_to_analyze = [
    'currentRatio',
    'quickRatio',
    'returnOnEquity',
    'returnOnAssets',
    'netProfitMargin',
    'priceEarningsRatio',
    'priceBookValueRatio',
    'priceToSalesRatio',
    'freeCashFlowPerShare',
    'operatingCashFlowPerShare',
    'cashFlowToDebtRatio',
    'debtEquityRatio',
    'longTermDebtToCapitalization',
    'assetTurnover',
    'inventoryTurnover',
]

# Goal: iterate through the symbols and, for each column, calculate its moving average and the direction (slope) of its movement.
# NOTE: the moving average must NOT be standardized; standardization is used only to calculate the slope.
# The final DataFrame should contain those values, plus the moving averages of close price and volume, and other useful features.
# Possibly also include the standard deviation of each of these values.

In [36]:
# Build one feature record per symbol: fundamentals (mean + trend slope per
# ratio) combined with price/volume moving averages and 90-day volatility.
data = {}

for symbol in df['symbol'].unique():
    sub_df = df.loc[df['symbol'] == symbol].copy()

    # Fundamentals: raw full-window mean and standardized slope per column.
    column_data = {}
    for column in columns_to_analyze:
        ma, slope = ma_slope_column(sub_df, column)
        column_data[f'{column}_ma'] = ma
        column_data[f'{column}_slope'] = slope

    # The fetched price history is ordered newest-first, while the helpers
    # below read the LAST row of a rolling window. Sort oldest-first so that
    # last row is the most recent trading day. sort_values returns a new
    # frame, so the frames stored in df_dict are left untouched.
    sub_df_dict = df_dict[symbol].sort_values('date').reset_index(drop=True)

    # Daily-return volatility needs the daily price rows, not the
    # fundamentals rows in sub_df.
    # NOTE(review): assumes df holds per-period fundamentals rather than daily
    # closes -- confirm against data/21-25.csv.
    volatility_90 = calculate_std_dev(sub_df_dict, 90)

    close_ma_30 = calculate_moving_average(sub_df_dict, 'close', 30)
    close_ma_90 = calculate_moving_average(sub_df_dict, 'close', 90)
    close_ma_200 = calculate_moving_average(sub_df_dict, 'close', 200)

    volume_ma_30 = calculate_moving_average(sub_df_dict, 'volume', 30)
    volume_ma_90 = calculate_moving_average(sub_df_dict, 'volume', 90)
    volume_ma_200 = calculate_moving_average(sub_df_dict, 'volume', 200)

    data[symbol] = {
        # The key must be the string label; using the variable itself made
        # the float value its own key, giving a different key per symbol.
        'volatility_90': volatility_90,
        'close_ma_30': close_ma_30,
        'close_ma_90': close_ma_90,
        'close_ma_200': close_ma_200,
        'volume_ma_30': volume_ma_30,
        'volume_ma_90': volume_ma_90,
        'volume_ma_200': volume_ma_200,
        **column_data
    }
    

In [40]:
# Assemble the per-symbol records into a table (one row per symbol, one
# column per feature), drop features that are empty for every symbol, then
# forward fill the remaining gaps.
# NOTE(review): rows are symbols, so the forward fill copies the previous
# symbol's value into a gap -- confirm this is the intended imputation.
finaldf = (
    pd.DataFrame(data).T
    .dropna(axis=1, how='all')
    .ffill()
)

finaldf.to_csv('finaldf.csv')