# Pobranie Danych

Dane zostały pobrane ze strony [macrotrends.net](https://www.macrotrends.net/stocks) przy pomocy skryptu `scrape.py` i umieszczone odpowiednio w folderach `balance-sheet`, `cash-flow-statement`, `financial-ratios`, `income-statement`.
Każdy folder zawiera pliki CSV z odpowiednim raportem finansowym dla każdej z firm indeksu S&P 500. Lista firm z indeksu S&P 500 została pobrana z publicznego repozytorium GitHub - [link](https://github.com/datasets/s-and-p-500-companies/blob/main/data/constituents.csv)

# Oczyszczanie danych

Zbiorcze dane z raportów finansowych (`balance-sheet`, `cash-flow-statement`, `financial-ratios`, `income-statement`) zostały umieszczone w oddzielnych plikach dla danego typu raportu i danej firmy. 
Dlatego przed przystąpieniem do analizy należy połączyć dane w zbiorcze pliki CSV, które będą zawierać raporty dla wszystkich firm z indeksu.

In [659]:
import pandas as pd
import glob
import os
from datetime import datetime


# Symbols dropped from the constituent list before any report is loaded.
NOT_INCLUDE = [
    'BF.B',
    'BRK.B',
    'GOOG',
    'FOX',
    'NWS'
]

constituents = pd.read_csv('sp500/constituents.csv')

constituents['Date added'] = pd.to_datetime(constituents['Date added'])
constituents = constituents.loc[~constituents['Symbol'].isin(NOT_INCLUDE)]  # remove classes other than A
selected_companies = set(constituents['Symbol'].to_list())

# One sub-folder of `sp500/` per report type; each holds `<ticker>.csv` files.
folders = [
    "financial-ratios",
    "balance-sheet",
    "income-statement",
    "cash-flow-statement"
]

# Per-ticker frames are collected here and concatenated once at the end,
# instead of re-copying the accumulated frame on every iteration.
ticker_frames = []

# Iterating in sorted order makes the row order of `df` reproducible;
# plain set iteration order can differ between interpreter runs.
for ticker in sorted(selected_companies):
    file_paths = [f"sp500/{folder}/{ticker}.csv" for folder in folders]

    # A ticker is used only when all four report files are present.
    if not all(os.path.isfile(file_path) for file_path in file_paths):
        continue

    ticker_df = None

    for file_path in file_paths:
        new_df = pd.read_csv(file_path)
        # The first (unnamed) CSV column is parsed as the report date.
        new_df = new_df.rename(columns={'Unnamed: 0': 'date'})
        new_df['date'] = pd.to_datetime(new_df['date'])
        new_df = new_df.fillna(0)
        # NOTE: a filter keeping only December rows used to be toggled here:
        # new_df = new_df[(new_df['date'].dt.month.eq(12))]
        new_df['ticker'] = ticker

        new_df = new_df.set_index(['ticker', 'date'])

        if ticker_df is None:
            ticker_df = new_df
            continue

        # Inner join on (ticker, date): only dates present in every report survive.
        ticker_df = ticker_df.merge(new_df, left_index=True, right_on=['ticker', 'date'])

    ticker_frames.append(ticker_df)

# `df` stays None when no ticker had a complete set of report files.
df = pd.concat(ticker_frames) if ticker_frames else None


In [612]:
# Keep only the columns that have a value in every row.
df = df.dropna(axis='columns', how='any')

In [None]:
import numpy as np

def clean_ratio(values):
    """Return the finite, strictly positive entries of ``values``.

    Args:
        values: Any array-like of numbers. ``None`` entries are treated as
            missing (NaN) instead of raising.

    Returns:
        A 1-D float ``numpy`` array holding the entries that are neither
        NaN nor infinite and are greater than zero, in their original order.
    """
    ratios = np.asarray(values, dtype=float)
    # NaN never compares greater than zero; silence the floating-point
    # "invalid" flag that such comparisons can raise on some builds.
    with np.errstate(invalid="ignore"):
        keep = np.isfinite(ratios) & (ratios > 0)
    return ratios[keep]


def analyze_pe_percentile(clean_ratios, target_pe):
    """Place ``target_pe`` within the distribution of ``clean_ratios``.

    The ratio value at each whole percentile 0..100 is computed, and the
    percentile of ``target_pe`` is linearly interpolated from that table.

    Args:
        clean_ratios: Array-like of ratios forming the reference distribution.
        target_pe: The ratio to locate.

    Returns:
        A dict with keys ``percentile`` (interpolated percentile, rounded to
        two decimals), ``larger_than_percent`` (100 minus the percentile,
        rounded to two decimals), ``median_pe``, ``mean_pe``,
        ``total_companies`` (length of ``clean_ratios``) and ``summary``
        (a one-sentence description).
    """
    percent_grid = np.linspace(0, 100, 101)
    ratio_at_percent = np.percentile(clean_ratios, percent_grid)
    target_percentile = np.interp(target_pe, ratio_at_percent, percent_grid)
    rounded_percentile = round(target_percentile, 2)

    summary = (
        f"P/E ratio of {target_pe:.2f} is larger than {rounded_percentile}% "
        f"of companies in the dataset"
    )

    return {
        'percentile': rounded_percentile,
        'larger_than_percent': round(100 - target_percentile, 2),
        'median_pe': np.median(clean_ratios),
        'mean_pe': np.mean(clean_ratios),
        'total_companies': len(clean_ratios),
        'summary': summary,
    }

In [614]:
# Persist the combined frame; the index is written out as leading columns.
df.to_csv('sp500/data.csv', index=True)

In [615]:
import yfinance as yf
# Closing prices at a "3mo" interval for every selected company, plus the
# ^GSPC index series fetched with the same date window.
start_date = '2009-12-31'
end_date = '2024-12-18'
history_window = {"start": start_date, "end": end_date, "interval": "3mo"}

ticks = yf.Tickers(list(selected_companies))
sp500_close = ticks.history(**history_window)["Close"]

index_ticks = yf.Tickers('^GSPC')
sp500_close_index = index_ticks.history(**history_window)["Close"]

# Attach the index column to the company columns, matching rows on Date.
sp500_close = sp500_close.merge(sp500_close_index, left_index=True, right_on='Date')
sp500_close.tail()

[*********************100%***********************]  498 of 498 completed
[*********************100%***********************]  1 of 1 completed


Ticker,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS,^GSPC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-12-01,136.43486,179.868805,,157.470001,116.318733,83.2892,368.850586,560.280029,187.735748,51.260288,...,33.37772,104.338028,50.733646,101.902802,,135.719101,123.330635,279.480011,,5096.27002
2024-03-01,129.750748,191.55687,,144.929993,100.674599,97.590714,278.838379,444.76001,230.578552,60.835281,...,29.278637,94.32563,53.840923,115.386284,,135.422745,114.42466,312.339996,,5277.509766
2024-06-01,142.428513,228.483871,,117.309998,112.143715,107.537109,339.076752,574.409973,232.010849,59.908264,...,30.102736,76.628067,60.092632,116.99807,,133.602753,114.948456,345.380005,,5648.399902
2024-09-01,137.750702,237.069183,,136.110001,118.212868,95.774498,360.901001,515.929993,216.279999,54.089909,...,32.060555,94.379997,71.937637,117.959999,,138.270004,111.851395,407.0,,6032.379883
2024-12-01,136.460007,253.479996,,132.240005,113.290001,90.529999,357.299988,455.230011,212.918472,51.490002,...,29.379999,91.959999,68.080002,108.010002,,134.080292,108.059998,395.0,,6050.609863


In [657]:
def calculate_log_returns(df):
    """Compute row-over-row log returns for every column of ``df``.

    Each value is ``log(value / previous row's value)`` within its column.
    The input frame is not modified.

    Args:
        df: Frame of prices, one row per period and one column per series.

    Returns:
        A frame with the same columns as ``df``. Rows in which every column
        is NaN (such as the first row, which has no predecessor) are dropped;
        rows with at least one valid return are kept.
    """
    returns_df = np.log(df / df.shift(1))
    return returns_df.dropna(how='all')


def _to_long(wide, value_name):
    """Reshape a Date-indexed, one-column-per-ticker frame into long form.

    Returns a frame indexed by (date, ticker), sorted by that index, with a
    single column named ``value_name``.
    """
    long_frame = pd.melt(
        wide.reset_index(),
        id_vars=['Date'],
        value_vars=list(wide.columns),
        var_name='ticker',
        value_name=value_name,
    )
    return long_frame.rename(columns={"Date": 'date'}).set_index(['date', 'ticker']).sort_index()


log_returns = _to_long(calculate_log_returns(sp500_close), 'log_return')
target = _to_long(sp500_close, 'price')

# Index-aligned assignment: (date, ticker) pairs missing from `log_returns`
# get NaN here.
target['log_return'] = log_returns['log_return']

# class_1: 1 when the period's log return is positive, else 0.
# A NaN return compares False and is therefore labelled 0.
target['class_1'] = (target['log_return'] > 0) * 1

# class_2: 1 when the ticker's log return beats the ^GSPC log return for the
# same date, else 0. The index return is looked up once per row by date and
# compared in a single vectorised step; NaN on either side yields 0.
spx_returns = target.xs('^GSPC', level='ticker')['log_return']
row_dates = target.index.get_level_values('date')
benchmark_returns = spx_returns.reindex(row_dates).to_numpy()
target['class_2'] = (target['log_return'] > benchmark_returns).astype('int')

target.to_csv('sp500/target.csv')