In [62]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler

# Raw fundamentals export; the relative path is resolved against the current working directory.
RAW_FUNDAMENTALS_CSV = '../data/raw/fundamentals_raw_21-25.csv'
df = pd.read_csv(RAW_FUNDAMENTALS_CSV)

In [77]:
# Show the loaded frame as the cell output (the notebook renders a truncated head/tail preview).
df

Unnamed: 0,symbol,reportedCurrency,cik,fillingDate,acceptedDate,calendarYear,period,cashAndCashEquivalents,shortTermInvestments,cashAndShortTermInvestments,...,priceToSalesRatio,priceEarningsRatio,priceToFreeCashFlowsRatio,priceToOperatingCashFlowsRatio,priceCashFlowRatio,priceEarningsToGrowthRatio,priceSalesRatio,dividendYield,enterpriseValueMultiple,priceFairValue
0,APO,USD,1858681.0,2025-02-24,2025-02-24 16:40:05,2024,Q4,1.616600e+10,1.898140e+11,2.059800e+11,...,18.322975,16.285376,-24200.069000,-24200.069000,-24200.069000,-0.338934,18.322975,0.003109,43.192366,5.610634
1,APO,USD,1858681.0,2024-11-06,2024-11-06 17:02:40,2024,Q3,1.666000e+10,4.090000e+08,1.496000e+10,...,9.407219,22.540787,42.912156,42.912156,42.912156,0.111881,9.407219,0.004116,31.998703,4.093507
2,APO,USD,1858681.0,2024-08-08,2024-08-07 18:17:49,2024,Q2,1.798300e+10,1.492000e+09,1.947500e+10,...,11.583957,20.431493,82.892098,82.892098,82.892098,-0.496808,11.583957,0.004361,29.875851,4.564411
3,APO,USD,1858681.0,2024-05-07,2024-05-07 16:03:22,2024,Q1,1.971600e+10,8.960000e+08,2.061200e+10,...,9.395723,11.588278,92.901536,92.901536,92.901536,-0.225591,9.395723,0.004278,21.133871,4.473246
4,APO,USD,1858681.0,2024-02-27,2024-02-27 16:08:55,2023,Q4,1.769100e+10,2.235000e+09,1.992600e+10,...,4.926938,4.936771,-12.982576,39.899531,39.899531,0.008248,4.926938,0.005163,18.692227,3.875175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7843,SPGI,USD,64040.0,2022-05-04,2022-05-03 19:32:44,2022,Q1,4.407000e+09,0.000000e+00,4.407000e+09,...,47.250538,22.850513,547.968621,508.475387,508.475387,0.378588,47.250538,0.001648,156.569527,2.837574
7844,SPGI,USD,64040.0,2022-02-08,2022-02-08 17:17:01,2021,Q4,6.505000e+09,1.100000e+07,6.505000e+09,...,54.470848,42.124122,121.252804,120.994819,120.994819,-2.733938,54.470848,0.001635,104.324748,53.979654
7845,SPGI,USD,64040.0,2021-10-26,2021-10-26 17:24:45,2021,Q3,5.899000e+09,0.000000e+00,5.899000e+09,...,49.044562,32.106650,106.732014,105.849019,105.849019,0.000000,49.044562,0.001817,88.277730,55.477507
7846,SPGI,USD,64040.0,2021-07-29,2021-07-29 16:23:11,2021,Q2,5.213000e+09,0.000000e+00,5.213000e+09,...,46.930845,30.963772,107.899956,107.081647,107.081647,5.719191,46.930845,0.001872,77.776215,78.131510


In [59]:
# Columns retained for the per-quarter feature set: three identifier columns
# followed by the financial ratios that are scaled later in the notebook.
columns_to_keep = [
    'symbol', 'calendarYear', 'period',
    'currentRatio', 'quickRatio',
    'returnOnEquity', 'returnOnAssets', 'netProfitMargin',
    'priceEarningsRatio', 'priceBookValueRatio', 'priceToSalesRatio',
    'freeCashFlowPerShare', 'operatingCashFlowPerShare', 'cashFlowToDebtRatio',
    'debtEquityRatio', 'longTermDebtToCapitalization',
    'assetTurnover', 'inventoryTurnover'
]

df = df[columns_to_keep]
# NOTE(review): this drops the first four rows purely by position. The reason is
# not evident from the notebook, and because `df` is overwritten, re-running this
# cell without reloading drops four more rows each time -- confirm the intent.
df = df.iloc[4:]

# Build one DataFrame per (year, quarter), ordered oldest -> newest.
#
# `Series.unique()` returns values in order of first appearance, and the raw file
# lists each symbol's filings newest-first, so iterating the unsorted years would
# yield a non-chronological list. Later cells treat this list positionally
# (labels 2021_Q1 ... 2024_Q4 and a trailing slice), so the years are sorted
# explicitly here. Missing years are dropped; they could never match a row anyway.
quarters_dfs = []
for year in sorted(df['calendarYear'].dropna().unique()):
    for quarter in ['Q1', 'Q2', 'Q3', 'Q4']:
        quarter_df = df[(df['calendarYear'] == year) & (df['period'] == quarter)]
        # Skip (year, quarter) combinations that have no filings at all.
        if not quarter_df.empty:
            quarters_dfs.append(quarter_df)

In [None]:
# Trial run of the log + robust-scaling transform on the first quarter only.
scaler = RobustScaler()

initial = quarters_dfs[0]
q1 = initial.copy()  # work on a copy so the entry in `quarters_dfs` stays untouched

# Ratio columns that get transformed; the identifier columns
# (symbol / calendarYear / period) are deliberately left out.
scaling_columns = [
    'currentRatio', 'quickRatio',
    'returnOnEquity', 'returnOnAssets', 'netProfitMargin',
    'priceEarningsRatio', 'priceBookValueRatio', 'priceToSalesRatio',
    'freeCashFlowPerShare', 'operatingCashFlowPerShare', 'cashFlowToDebtRatio',
    'debtEquityRatio', 'longTermDebtToCapitalization',
    'assetTurnover', 'inventoryTurnover'
]

# Step 1: log1p for strictly positive values, 0 for everything else.
# Vectorized replacement for the element-wise `applymap` (deprecated in
# pandas >= 2.1). `where(> 0, 0)` zeroes non-positive AND missing values
# (NaN > 0 is False), and log1p(0) == 0, so results match the per-element rule.
q1_features = q1[scaling_columns]
q1[scaling_columns] = np.log1p(q1_features.where(q1_features > 0, 0))

# Step 2: fit the scaler on this quarter alone and replace the columns with the scaled values.
q1[scaling_columns] = scaler.fit_transform(q1[scaling_columns])


In [None]:
# Apply the log + robust-scaling transform to every quarter independently.
scaled_quarters = []

# One scaler instance is reused; `fit_transform` refits it on each quarter,
# so no statistics carry over from one quarter to the next.
scaler = RobustScaler()

for quarter_df in quarters_dfs:
    if quarter_df.empty:
        # Nothing to fit on; skip rather than let the scaler fail on an empty frame.
        continue

    scaled_df = quarter_df.copy()  # keep the unscaled frame in `quarters_dfs` intact

    # log1p for strictly positive values, 0 for everything else.
    # Vectorized replacement for the element-wise `applymap` (deprecated in
    # pandas >= 2.1). `where(> 0, 0)` zeroes non-positive AND missing values
    # (NaN > 0 is False), and log1p(0) == 0, so results match the per-element rule.
    features = scaled_df[scaling_columns]
    scaled_df[scaling_columns] = np.log1p(features.where(features > 0, 0))

    scaled_df[scaling_columns] = scaler.fit_transform(scaled_df[scaling_columns])
    scaled_quarters.append(scaled_df)

In [71]:
# Quarter labels of the form "<year>_<quarter>", year-major and oldest first
# ("2021_Q1", "2021_Q2", ..., "2024_Q4"); used as output file stems.
label_years = (2021, 2022, 2023, 2024)
label_quarters = ('Q1', 'Q2', 'Q3', 'Q4')
idx = [f"{yr}_{qtr}" for yr in label_years for qtr in label_quarters]

In [89]:
# Keep every scaled quarter except the last three entries of the list.
# NOTE(review): the name implies exactly 16 quarters remain, which only holds if
# `scaled_quarters` has 19 entries in chronological order -- confirm against the data.
past16 = scaled_quarters[:-3]

In [92]:
# Write each retained quarter to its own CSV (row index omitted).
# The file stem is taken from `idx` by position, so `past16` must be in the
# same order as the labels in `idx` for the names to match the contents.
for i, quarter_frame in enumerate(past16):
    out_path = f'../data/processed/quarterly/{idx[i]}.csv'
    quarter_frame.to_csv(out_path, index=False)