In [41]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the stock data CSV into `df`; a later cell reads `df.columns` to pick the columns to keep.
df = pd.read_csv('../stock_data_21_25.csv')

In [10]:
import time
from getData import get_stock_data, sp_list

# Time a single fetch of AAPL data.
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() follows the wall clock, which can be adjusted mid-measurement.
start_time = time.perf_counter()
data = get_stock_data('AAPL', '2021-01-01', '2025-01-01')
end_time = time.perf_counter()

execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")


Execution time: 0.32764506340026855 seconds


In [11]:
def get_df(symbol, start_date='2021-01-01', end_date='2025-01-01'):
    """
    Fetch data for one symbol and wrap it in a DataFrame.

    Parameters
    ----------
    symbol : passed through to get_stock_data as its first argument.
    start_date, end_date : passed through as the second and third arguments.
        The defaults reproduce the date range that used to be hard-coded here,
        so existing calls of the form get_df(symbol) behave as before.

    Returns
    -------
    pd.DataFrame built from whatever get_stock_data returns.
    """
    data = get_stock_data(symbol, start_date, end_date)
    return pd.DataFrame(data)

# Build one DataFrame per entry of sp_list, keyed by that entry.
df_dict = {
    ticker: get_df(ticker)
    for ticker in sp_list
}

In [16]:
def calculate_std_dev(df, N):
    """
    Calculate the standard deviation of daily returns over the past N days.

    Parameters
    ----------
    df : DataFrame with a 'close' column.
    N : rolling window size, in rows.

    Returns
    -------
    The last value of the N-row rolling standard deviation of the daily
    percentage change of 'close'. NaN is returned for an empty frame, the same
    value pandas yields when the last window is incomplete.

    Side effect
    -----------
    Writes the daily percentage change into a 'daily_return' column on `df`.
    """
    if len(df) == 0:
        # No rows means there is no "last" rolling value to read; .iloc[-1] would
        # raise IndexError. Still create the (empty) column so the side effect is
        # the same as for a populated frame.
        df['daily_return'] = df['close'].astype('float64')
        return np.nan

    df['daily_return'] = df['close'].pct_change()

    std_dev = df['daily_return'].rolling(window=N).std().iloc[-1]

    return std_dev

In [26]:
def log_transform_close(df):
    """
    Add a 'log_return' column to `df` and return the same frame.

    Each value is the natural log of the ratio between a row's 'close' and the
    previous row's 'close'; the first row has no predecessor and is NaN.
    The input frame is modified in place.
    """
    closes = df['close']
    previous_closes = closes.shift(1)
    df['log_return'] = np.log(closes / previous_closes)
    return df

# Add the log-return column to every symbol's frame.
# Iterate over the keys instead of unpacking into a loop variable named `df`: that
# name is also the notebook-level frame loaded from the CSV, and rebinding it here
# would make later cells that read `df` operate on the last symbol's price frame.
for symbol in df_dict:
    df_dict[symbol] = log_transform_close(df_dict[symbol])

In [28]:
def calculate_moving_average(df, N):
    """
    Return the rolling mean of the 'close' column over a window of N rows.

    The result is a Series aligned with `df`; `df` itself is not modified.
    """
    closes = df['close']
    return closes.rolling(window=N).mean()

In [71]:
def calculate_slope_pe_movement(symbol, df):
    """
    Return the slope of a straight-line fit through one symbol's scaled P/E ratios.

    Rows of `df` whose 'symbol' equals `symbol` are selected, their
    'priceEarningsRatio' values are passed through a StandardScaler, and a
    first-degree polynomial is fitted against the row positions 0..n-1.
    `df` is not modified.
    """
    # Only the rows belonging to the requested symbol take part in the fit.
    symbol_rows = df[df['symbol'] == symbol]

    # The scaler expects a 2-D array; flatten back to 1-D afterwards for polyfit.
    pe_values = symbol_rows['priceEarningsRatio'].values.reshape(-1, 1)
    pe_scaled = StandardScaler().fit_transform(pe_values).flatten()

    # Evenly spaced positions stand in for the x axis.
    n_rows = len(symbol_rows)
    positions = np.linspace(0, n_rows - 1, n_rows)

    # polyfit returns coefficients highest degree first: (slope, intercept).
    slope, _intercept = np.polyfit(positions, pe_scaled, 1)
    return slope

In [86]:
# Columns to retain from the loaded frame, in the order they should appear.
columns_to_keep = [
    'symbol',
    'calendarYear',
    'period',
    'currentRatio',
    'quickRatio',
    'returnOnEquity',
    'returnOnAssets',
    'netProfitMargin',
    'priceEarningsRatio',
    'priceBookValueRatio',
    'priceToSalesRatio',
    'freeCashFlowPerShare',
    'operatingCashFlowPerShare',
    'cashFlowToDebtRatio',
    'debtEquityRatio',
    'longTermDebtToCapitalization',
    'assetTurnover',
    'inventoryTurnover',
]

# Skip any requested column the frame does not actually have, so the selection
# below cannot fail on a missing label.
existing_columns = [name for name in columns_to_keep if name in df.columns]

filtered_df = df[existing_columns]

In [None]:
def calculate_column_slope(column, sub_df):
    """
    Return the slope of a straight-line fit through the scaled values of one column.

    Parameters
    ----------
    column : label of the column in `sub_df` to analyse.
    sub_df : DataFrame containing that column. Rows are used in their existing
        order, and the frame is left unmodified.

    Returns
    -------
    The first-degree coefficient of np.polyfit fitted to the StandardScaler
    output of the column against the row positions 0..n-1.
    """
    scaler = StandardScaler()

    # scale column (the scaler expects a 2-D array; flatten back to 1-D for polyfit)
    var_scaled = scaler.fit_transform(sub_df[column].values.reshape(-1, 1)).flatten()

    # create a dummy x range (row positions) for the corresponding values
    x_range = np.linspace(0, len(sub_df) - 1, len(sub_df))

    # Fit against the values scaled above. Working on local arrays avoids both
    # looking up a 'pe_scaled' column that this function never creates and
    # writing helper columns into the caller's frame.
    slope = np.polyfit(x_range, var_scaled, 1)[0]

    return slope

def ma_slope_columns(sub_df, columns):
    """
    Compute, for each requested column, the slope of a line fitted to its scaled values.

    Parameters
    ----------
    sub_df : DataFrame holding the columns to analyse; rows are used in their
        existing order.
    columns : iterable of column labels present in `sub_df`.

    Returns
    -------
    DataFrame indexed like `sub_df` with one '<column>_slope' column per entry
    of `columns`; each column repeats that column's single slope value on every row.

    Notes
    -----
    Only slopes are returned. TODO: the (unstandardized) moving average that the
    planning notes below ask for is not part of the result yet.
    """
    # Index and values both come from the argument, not from a notebook-level frame.
    result = pd.DataFrame(index=sub_df.index)
    scaler = StandardScaler()

    # Row positions serve as the x axis; they are the same for every column.
    x_range = np.arange(len(sub_df))

    for column in columns:
        # Standardize the column values (2-D in, flattened back to 1-D)
        standardized_values = scaler.fit_transform(sub_df[column].values.reshape(-1, 1)).flatten()

        # Calculate the slope of the line of best fit
        slope = np.polyfit(x_range, standardized_values, 1)[0]

        result[f'{column}_slope'] = slope

    return result

# Columns to run the slope analysis on.
columns_to_analyze = [
    'currentRatio',
    'quickRatio',
    'returnOnEquity',
    'returnOnAssets',
    'netProfitMargin',
    'priceEarningsRatio',
    'priceBookValueRatio',
    'priceToSalesRatio',
    'freeCashFlowPerShare',
    'operatingCashFlowPerShare',
    'cashFlowToDebtRatio',
    'debtEquityRatio',
    'longTermDebtToCapitalization',
    'assetTurnover',
    'inventoryTurnover',
]

# TODO: iterate through the symbols and, for each column, calculate its moving average and its
# direction of movement (slope). The MOVING AVERAGE SHOULD NOT BE STANDARDIZED; standardization
# is used only to calculate the slope.
# The final df should contain those values, plus the moving average of close prices, plus volume
# and other useful features.
# Also, maybe, the standard deviation of all of these values.