# Correlation and utility analysis of different data in predicting stock market value

The purpose of this notebook is to determine how the different data fields provided by `yfinance` can be used to predict upcoming stock prices.

In [14]:
# 1) Install yfinance if not already installed
import sys
import pandas as pd
from IPython.display import display, Image, HTML

try:
    import yfinance as yf
except ImportError:
    %pip install yfinance --quiet
    import yfinance as yf

print("yfinance version:", getattr(yf, "__version__", "unknown"))

yfinance version: 0.2.66


In [15]:
# Ticker universes queried through yfinance, one list per market / asset class.
# Symbols are kept in their original order; only the layout is compact.

# USA
US_names = [
    "AAPL", "MSFT", "NVDA", "AMZN", "GOOGL", "GOOG", "META", "AVGO", "LLY", "TSLA",
    "JPM", "V", "XOM", "UNH", "JNJ", "WMT", "MA", "PG", "ORCL", "COST",
    "MRK", "HD", "KO", "PEP", "BAC", "ADBE", "CRM", "NFLX", "CSCO", "AMD",
]

# China
CN_names = [
    "600519.SS", "601318.SS", "601398.SS", "601288.SS", "601988.SS",
    "601857.SS", "600028.SS", "600036.SS", "601166.SS", "600900.SS",
    "601888.SS", "601012.SS", "600104.SS", "600030.SS", "600585.SS",
    "600000.SS", "601601.SS", "601939.SS", "600019.SS", "600276.SS",
    "601766.SS", "600309.SS", "601633.SS", "600887.SS", "601668.SS",
    "601658.SS", "601728.SS", "601628.SS", "688981.SS",
]

# Europe
EU_names = [
    "MC.PA", "ASML.AS", "OR.PA", "TTE.PA", "SAN.PA",
    "RMS.PA", "AIR.PA", "BNP.PA", "SU.PA", "KER.PA",
]

# Japan
JP_names = [
    "7203.T", "6758.T", "9984.T", "6861.T", "8035.T",
    "9983.T", "9432.T", "6098.T", "4502.T", "8316.T",
]

# Saudi Arabia
SA_names = [
    "2222.SR", "1120.SR", "2010.SR", "1180.SR", "1211.SR",
    "7010.SR", "2280.SR", "1050.SR", "1060.SR", "4013.SR",
]

# Cryptocurrency
CRYPTO_names = [
    "BTC-USD", "ETH-USD", "USDT-USD", "BNB-USD", "SOL-USD",
    "XRP-USD", "USDC-USD", "DOGE-USD", "ADA-USD", "TRX-USD",
    "TON-USD", "AVAX-USD", "SHIB-USD", "DOT-USD", "DAI-USD",
    "LTC-USD", "WBTC-USD", "BCH-USD", "LINK-USD", "NEAR-USD",
]

# Forex
FX_names = [
    "EURUSD=X", "USDJPY=X", "GBPUSD=X", "USDCNY=X", "USDCHF=X",
    "AUDUSD=X", "USDCAD=X", "NZDUSD=X", "EURJPY=X", "GBPJPY=X",
]

# Commodities & Energy
COM_names = [
    "GC=F", "SI=F", "PL=F", "CL=F", "BZ=F",
    "NG=F", "ZC=F", "ZW=F", "ZS=F", "KC=F",
]

# Indices
IDX_names = [
    "^GSPC", "^DJI", "^IXIC", "^RUT", "^GSPTSE",
    "^FTSE", "^GDAXI", "^FCHI", "^STOXX50E", "FTSEMIB.MI",
    "SMIN.SW", "^IBEX", "^AEX", "^BFX", "^OMX",
    "^N100", "^N225", "TOPIX100.T", "^HSI", "000001.SS",
    "399001.SZ", "^KS11", "^TWII", "^STI", "^BSESN",
    "^AXJO", "^NZ50", "^JKSE", "^KLSE", "^BVSP",
]


In [16]:
# Market label -> list of symbols belonging to that market. The insertion
# order of this dict is the order in which markets are fetched.
ticker_groups = {
    "USA": US_names,
    "China": CN_names,
    "EU": EU_names,
    "JP": JP_names,
    "SA": SA_names,
    "CRYPTO": CRYPTO_names,
    "FX": FX_names,
    "COM": COM_names,
    "IDX": IDX_names,
}


def _fetch_ticker_info(name):
    """Fetch the yfinance ``info`` payload for one symbol.

    Tries ``Ticker.get_info()`` when the installed yfinance exposes it and
    falls back to the ``Ticker.info`` property when that call raises (or when
    ``get_info`` does not exist). Failures are reported on stdout rather than
    raised so that one bad symbol does not abort the whole download loop.

    Parameters
    ----------
    name : str
        Symbol passed to ``yf.Ticker``.

    Returns
    -------
    tuple
        ``(symbol, info)`` where ``symbol`` is ``Ticker.ticker`` and ``info``
        is whatever yfinance returned, or ``None`` if every attempt raised.
    """
    ticker = yf.Ticker(name)
    symbol = ticker.ticker

    if hasattr(ticker, "get_info"):
        try:
            return symbol, ticker.get_info()
        except Exception as e:  # best-effort: report and try the property
            print("get_info() failed for ", symbol, ":", e)

    try:
        return symbol, ticker.info
    except Exception as e:  # best-effort: report and let the caller skip
        print(".info failed for ", symbol, ":", e)

    return symbol, None


# symbol -> raw info payload, and symbol -> market label it was fetched under.
data = {}
ticker_markets = {}

for market, names in ticker_groups.items():
    for name in names:
        symbol, info = _fetch_ticker_info(name)
        if info is None:
            print(f"No info retrieved for {symbol}; skipping.")
            continue
        data[symbol] = info
        ticker_markets[symbol] = market


In [17]:
# Keys of the yfinance info payload whose availability is measured below.
# The order of this list is the row order of the resulting fill-rate table.
features = [
    "allTimeHigh", "allTimeLow", "auditRisk", "averageAnalystRating", "ask",
    "beta", "bid", "boardRisk", "bookValue",
    "currentPrice",
    "dayHigh", "dayLow", "debtToEquity", "displayName", "dividendDate", "dividendRate",
    "earningsGrowth", "earningsQuarterlyGrowth", "ebitda", "ebitdaMargins",
    "enterpriseToEbitda", "enterpriseToRevenue", "enterpriseValue",
    "epsCurrentYear", "epsForward", "epsTrailingTwelveMonths", "exDividendDate",
    "fiftyDayAverage", "fiftyDayAverageChange", "fiftyDayAverageChangePercent",
    "fiftyTwoWeekChangePercent",
    "fiftyTwoWeekHigh", "fiftyTwoWeekHighChange", "fiftyTwoWeekHighChangePercent",
    "fiftyTwoWeekLow", "fiftyTwoWeekLowChange", "fiftyTwoWeekLowChangePercent",
    "floatShares", "forwardEps", "freeCashflow",
    "grossMargins", "grossProfits",
    "hasPrePostMarketData",
    "impliedSharesOutstanding", "industry", "industryDisp", "irWebsite",
    "longBusinessSummary", "longName",
    "marketCap",
    "netIncomeToCommon",
    "open", "operatingCashflow", "operatingMargins", "overallRisk",
    "postMarketChange", "postMarketChangePercent", "postMarketPrice", "postMarketTime",
    "previousClose", "priceEpsCurrentYear", "priceToBook", "profitMargins",
    "quickRatio",
    "recommendationKey", "recommendationMean",
    "regularMarketChange", "regularMarketChangePercent",
    "regularMarketDayHigh", "regularMarketDayLow", "regularMarketOpen",
    "regularMarketPreviousClose", "regularMarketPrice", "regularMarketTime",
    "returnOnAssets", "returnOnEquity", "revenueGrowth", "revenuePerShare",
    "sector", "sectorDisp", "shareHolderRightsRisk", "sharesPercentSharesOut",
    "sharesShort", "shortName", "symbol",
    "targetHighPrice", "targetLowPrice", "targetMeanPrice", "targetMedianPrice",
    "totalCash", "totalCashPerShare", "totalDebt", "totalRevenue",
    "trailingEps", "trailingPE", "trailingPegRatio",
    "twoHundredDayAverage", "twoHundredDayAverageChange", "twoHundredDayAverageChangePercent",
    "volume",
    "website",
]

def compute_fill_rate_series(frame):
    """Return the percentage of non-null entries for every column of ``frame``.

    The result is indexed by column label, expressed on a 0-100 scale and
    rounded to two decimal places.
    """
    present_share = frame.notna().mean()
    return (present_share * 100).round(2)

# Compute, per market and overall, how often each requested feature is present
# in the downloaded info payloads, and show the result as one wide table.
if not data:
    print("No ticker info retrieved; cannot compute feature fill rates.")
else:
    # One row per ticker; reindex keeps exactly the requested features and
    # adds an all-NaN column for any feature no ticker reported.
    info_df = pd.DataFrame.from_dict(data, orient="index")
    feature_df = info_df.reindex(columns=features)

    # Take the market labels from ticker_groups instead of repeating them, so
    # a group added or renamed there cannot be silently left out of the table.
    markets = list(ticker_groups)

    group_fill_rates = {}
    group_sizes = {}

    for market in markets:
        # The membership test guards against tickers that did not yield a row
        # in feature_df; .loc with a missing label would raise KeyError.
        market_tickers = [
            ticker
            for ticker, origin in ticker_markets.items()
            if origin == market and ticker in feature_df.index
        ]
        subset = feature_df.loc[market_tickers]
        if subset.empty:
            print(f"No data available for {market}; skipping fill-rate computation.")
            continue
        group_fill_rates[market] = compute_fill_rate_series(subset)
        group_sizes[market] = len(subset)

    if feature_df.empty:
        print("No data available for combined dataset; skipping fill-rate computation.")
    else:
        group_fill_rates["Combined"] = compute_fill_rate_series(feature_df)
        group_sizes["Combined"] = len(feature_df)

    if not group_fill_rates:
        print("No fill-rate results to display.")
    else:
        combined_df = (
            pd.DataFrame(group_fill_rates)
            .reindex(features)
            .rename_axis("feature")
            .reset_index()
        )

        # Column naming: fill_rate_pct_<market>, with the all-ticker aggregate
        # published as fill_rate_pct_Total.
        rename_map = {market: f"fill_rate_pct_{market}" for market in markets}
        rename_map["Combined"] = "fill_rate_pct_Total"
        combined_df = combined_df.rename(columns=rename_map)

        # Markets in ticker_groups order, aggregate last; groups that were
        # skipped above have no column and are filtered out here.
        ordered_columns = ["feature", *rename_map.values()]
        combined_df = combined_df[[col for col in ordered_columns if col in combined_df.columns]]

        for label, size in group_sizes.items():
            print(f"Computed fill rates across {size} {label.lower()} tickers.")

        with pd.option_context("display.max_rows", None, "display.max_columns", None):
            display(combined_df)


Computed fill rates across 30 usa tickers.
Computed fill rates across 29 china tickers.
Computed fill rates across 10 eu tickers.
Computed fill rates across 10 jp tickers.
Computed fill rates across 10 sa tickers.
Computed fill rates across 20 crypto tickers.
Computed fill rates across 10 fx tickers.
Computed fill rates across 10 com tickers.
Computed fill rates across 30 idx tickers.
Computed fill rates across 159 combined tickers.


Unnamed: 0,feature,fill_rate_pct_USA,fill_rate_pct_China,fill_rate_pct_EU,fill_rate_pct_JP,fill_rate_pct_SA,fill_rate_pct_CRYPTO,fill_rate_pct_FX,fill_rate_pct_COM,fill_rate_pct_IDX,fill_rate_pct_Total
0,allTimeHigh,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,93.33,98.74
1,allTimeLow,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,93.33,98.74
2,auditRisk,96.67,34.48,100.0,100.0,0.0,0.0,0.0,0.0,0.0,37.11
3,averageAnalystRating,100.0,0.0,100.0,90.0,70.0,0.0,0.0,0.0,0.0,35.22
4,ask,100.0,100.0,90.0,100.0,100.0,0.0,100.0,100.0,93.33,85.53
5,beta,100.0,100.0,100.0,80.0,100.0,0.0,0.0,0.0,0.0,54.72
6,bid,100.0,100.0,90.0,100.0,100.0,0.0,100.0,100.0,93.33,85.53
7,boardRisk,96.67,34.48,100.0,100.0,0.0,0.0,0.0,0.0,0.0,37.11
8,bookValue,100.0,100.0,100.0,100.0,100.0,0.0,0.0,0.0,0.0,55.97
9,currentPrice,100.0,100.0,100.0,100.0,100.0,0.0,0.0,0.0,0.0,55.97


In [18]:
# Build long-form dataframe of info key/value pairs per ticker
#
# Produces:
#   ticker_dataframes / ticker_numeric_dataframes : symbol -> long-form frame
#   all_ticker_info_df / all_ticker_numeric_info_df : all symbols concatenated
if not data:
    print("No ticker info retrieved; cannot build per-ticker dataframes.")
else:
    ticker_dataframes = {}
    ticker_numeric_dataframes = {}
    combined_frames = []
    combined_numeric_frames = []
    for ticker, info in data.items():
        if not isinstance(info, dict) or not info:
            print(f"No structured info for {ticker}; skipping.")
            continue

        # One row per info key: (ticker, attribute, value).
        ticker_df = pd.DataFrame(info.items(), columns=["attribute", "value"])
        ticker_df.insert(0, "ticker", ticker)
        ticker_dataframes[ticker] = ticker_df
        combined_frames.append(ticker_df)

        # Anything that cannot be parsed as a number becomes NaN and is dropped.
        # NOTE(review): numeric-looking strings are parsed too, so attributes
        # such as "zip" end up in the numeric frame - confirm that is wanted.
        numeric_values = pd.to_numeric(ticker_df["value"], errors="coerce")
        numeric_mask = numeric_values.notna()
        if numeric_mask.any():
            # assign() replaces the column, giving a real float64 dtype.
            # Writing through .loc[:, "value"] sets values in place on
            # pandas >= 2.0 and would leave the column as object dtype.
            ticker_numeric_df = ticker_df.loc[numeric_mask].assign(
                value=numeric_values.loc[numeric_mask].astype("float64")
            )
            ticker_numeric_dataframes[ticker] = ticker_numeric_df
            combined_numeric_frames.append(ticker_numeric_df)
        else:
            print(f"No numeric values for {ticker}; numeric dataframe skipped.")

    if not combined_frames:
        print("No per-ticker dataframes were created.")
    else:
        all_ticker_info_df = pd.concat(combined_frames, ignore_index=True)
        print(f"Built long-form dataframe with {len(all_ticker_info_df)} rows across {len(combined_frames)} tickers.")
        with pd.option_context("display.max_rows", 20, "display.max_columns", None):
            display(all_ticker_info_df)

        if combined_numeric_frames:
            all_ticker_numeric_info_df = pd.concat(combined_numeric_frames, ignore_index=True)
            print("Numeric-only dataframe copy:")
            print(f"Contains {len(all_ticker_numeric_info_df)} numeric rows across {len(combined_numeric_frames)} tickers.")
            with pd.option_context("display.max_rows", 20, "display.max_columns", None):
                display(all_ticker_numeric_info_df)
        else:
            print("No numeric-only dataframes were created.")


Built long-form dataframe with 19620 rows across 159 tickers.


Unnamed: 0,ticker,attribute,value
0,AAPL,address1,One Apple Park Way
1,AAPL,city,Cupertino
2,AAPL,state,CA
3,AAPL,zip,95014
4,AAPL,country,United States
...,...,...,...
19615,^BVSP,regularMarketChangePercent,-0.031442
19616,^BVSP,regularMarketPrice,143904.38
19617,^BVSP,shortName,IBOVESPA
19618,^BVSP,longName,IBOVESPA


Numeric-only dataframe copy:
Contains 14362 numeric rows across 159 tickers.


Unnamed: 0,ticker,attribute,value
0,AAPL,zip,95014.0
1,AAPL,fullTimeEmployees,150000.0
2,AAPL,auditRisk,7.0
3,AAPL,boardRisk,1.0
4,AAPL,compensationRisk,3.0
...,...,...,...
14357,^BVSP,fiftyTwoWeekChangePercent,9.225142
14358,^BVSP,fiftyDayAverageChange,4268.3906
14359,^BVSP,fiftyDayAverageChangePercent,0.030568
14360,^BVSP,regularMarketChangePercent,-0.031442


In [19]:
# Build per-ticker dataframes (full + numeric-only) and store them in a container DataFrame


def _trailing_close_averages(ticker, close_series, window_specs):
    """Average the close price over several trailing windows.

    Each window runs from ``latest - offset`` (inclusive) up to the last
    timestamp of ``close_series``.

    Parameters
    ----------
    ticker : str
        Symbol written into the ``ticker`` field of every returned row.
    close_series : pandas.Series
        Non-empty close prices indexed by timestamp.
    window_specs : list of (str, offset)
        Attribute label and the ``pd.Timedelta`` / ``pd.DateOffset`` that is
        subtracted from the latest timestamp to obtain the window start.

    Returns
    -------
    list of dict
        Rows with keys ``ticker``, ``attribute`` and ``value``, one per window
        that contains at least one observation.
    """
    latest_ts = close_series.index[-1]
    rows = []
    for label, offset in window_specs:
        # Timedelta and DateOffset both support timestamp subtraction, so no
        # type dispatch is needed here.
        cutoff = latest_ts - offset
        window_series = close_series[close_series.index >= cutoff]
        if window_series.empty:  # defensive; the latest bar normally qualifies
            continue
        rows.append({
            "ticker": ticker,
            "attribute": label,
            "value": window_series.mean(),
        })
    return rows


if not data:
    print("No ticker info retrieved; cannot build per-ticker dataframes.")
else:
    ticker_dataframes = {}
    ticker_numeric_dataframes = {}
    ticker_rows = []

    # Loop-invariant, so defined once rather than per ticker.
    # NOTE(review): history is requested at a daily interval, so the "24h"
    # window can only ever cover the latest bar plus a bar exactly one day
    # earlier - confirm this is the intended meaning.
    window_specs = [
        ("avg_price_last_24h", pd.Timedelta(days=1)),
        ("avg_price_last_7d", pd.Timedelta(days=7)),
        ("avg_price_last_15d", pd.Timedelta(days=15)),
        ("avg_price_last_30d", pd.Timedelta(days=30)),
        ("avg_price_last_6m", pd.DateOffset(months=6)),
        ("avg_price_last_1y", pd.DateOffset(years=1)),
    ]

    for ticker, info in data.items():
        if not isinstance(info, dict) or not info:
            print(f"No structured info for {ticker}; skipping.")
            continue

        # One row per info key: (ticker, attribute, value).
        ticker_df = pd.DataFrame(info.items(), columns=["attribute", "value"])
        ticker_df.insert(0, "ticker", ticker)

        # One year of daily bars; a failed download degrades to "no history"
        # instead of aborting the loop.
        ticker_obj = yf.Ticker(ticker)
        try:
            hist_daily = ticker_obj.history(period="1y", interval="1d")
        except Exception as exc:
            print(f"history() failed for {ticker}: {exc}")
            hist_daily = pd.DataFrame()

        extra_rows = []
        if hist_daily.empty:
            print(f"No historical daily data for {ticker}; window averages skipped.")
        elif "Close" not in hist_daily.columns:
            print(f"No close price series for {ticker}; window averages skipped.")
        else:
            close_series = hist_daily["Close"].dropna()
            if close_series.empty:
                print(f"No close price values for {ticker}; window averages skipped.")
            else:
                extra_rows = _trailing_close_averages(ticker, close_series, window_specs)

        if extra_rows:
            ticker_df = pd.concat([ticker_df, pd.DataFrame(extra_rows)], ignore_index=True)

        ticker_dataframes[ticker] = ticker_df

        # Anything that cannot be parsed as a number becomes NaN and is dropped.
        numeric_values = pd.to_numeric(ticker_df["value"], errors="coerce")
        numeric_mask = numeric_values.notna()
        numeric_df = None
        if numeric_mask.any():
            # assign() replaces the column, giving a real float64 dtype.
            # Writing through .loc[:, "value"] sets values in place on
            # pandas >= 2.0 and would leave the column as object dtype.
            numeric_df = ticker_df.loc[numeric_mask].assign(
                value=numeric_values.loc[numeric_mask].astype("float64")
            )
            ticker_numeric_dataframes[ticker] = numeric_df

        ticker_rows.append({
            "ticker": ticker,
            "info_dataframe": ticker_df,
            "numeric_dataframe": numeric_df,
            "rows_total": len(ticker_df),
            "rows_numeric": len(numeric_df) if numeric_df is not None else 0,
        })

    if not ticker_rows:
        print("No per-ticker dataframes were created.")
    else:
        # Container frame: one row per ticker, with the per-ticker frames
        # stored as cell values.
        per_ticker_df = pd.DataFrame(ticker_rows)
        print(f"Created dataframe container for {len(per_ticker_df)} tickers.")
        with pd.option_context("display.max_rows", None, "display.max_columns", None):
            display(per_ticker_df)


Created dataframe container for 159 tickers.


Unnamed: 0,ticker,info_dataframe,numeric_dataframe,rows_total,rows_numeric
0,AAPL,ticker attribute ...,ticker attribute ...,184,143
1,MSFT,ticker attribute ...,ticker attribute ...,184,142
2,NVDA,ticker attribute ...,ticker attribute ...,184,143
3,AMZN,ticker attribute ...,ticker attribute ...,177,135
4,GOOGL,ticker attribute ...,ticker attribute ...,181,142
5,GOOG,ticker attribute ...,ticker attribute ...,175,135
6,META,ticker attribute ...,ticker attribute ...,181,141
7,AVGO,ticker attribute ...,ticker attribute ...,183,143
8,LLY,ticker attribute ...,ticker attribute ...,183,143
9,TSLA,ticker attribute ...,ticker attribute ...,176,136


In [22]:
# Preview the stored frames for the first few tickers of the container.
preview_limit = 3
for shown, (_, entry) in enumerate(per_ticker_df.iterrows(), start=1):
    print(f"Ticker: {entry['ticker']}")
    print(f"Total rows in info dataframe: {entry['rows_total']}")
    print(f"Total rows in numeric dataframe: {entry['rows_numeric']}")

    print("Info DataFrame preview:")
    display(entry['info_dataframe'].head())

    numeric_preview = entry['numeric_dataframe']
    if numeric_preview is None:
        print("No numeric dataframe available.")
    else:
        print("Numeric DataFrame preview:")
        display(numeric_preview.head())

    print("-" * 40)
    # Stop once the limit has been reached.
    if shown >= preview_limit:
        print("... Output truncated for brevity ...")
        break

Ticker: AAPL
Total rows in info dataframe: 184
Total rows in numeric dataframe: 143
Info DataFrame preview:


Unnamed: 0,ticker,attribute,value
0,AAPL,address1,One Apple Park Way
1,AAPL,city,Cupertino
2,AAPL,state,CA
3,AAPL,zip,95014
4,AAPL,country,United States


Numeric DataFrame preview:


Unnamed: 0,ticker,attribute,value
3,AAPL,zip,95014.0
14,AAPL,fullTimeEmployees,150000.0
16,AAPL,auditRisk,7.0
17,AAPL,boardRisk,1.0
18,AAPL,compensationRisk,3.0


----------------------------------------
Ticker: MSFT
Total rows in info dataframe: 184
Total rows in numeric dataframe: 142
Info DataFrame preview:


Unnamed: 0,ticker,attribute,value
0,MSFT,address1,One Microsoft Way
1,MSFT,city,Redmond
2,MSFT,state,WA
3,MSFT,zip,98052-6399
4,MSFT,country,United States


Numeric DataFrame preview:


Unnamed: 0,ticker,attribute,value
14,MSFT,fullTimeEmployees,228000.0
16,MSFT,auditRisk,9.0
17,MSFT,boardRisk,5.0
18,MSFT,compensationRisk,4.0
19,MSFT,shareHolderRightsRisk,2.0


----------------------------------------
Ticker: NVDA
Total rows in info dataframe: 184
Total rows in numeric dataframe: 143
Info DataFrame preview:


Unnamed: 0,ticker,attribute,value
0,NVDA,address1,2788 San Tomas Expressway
1,NVDA,city,Santa Clara
2,NVDA,state,CA
3,NVDA,zip,95051
4,NVDA,country,United States


Numeric DataFrame preview:


Unnamed: 0,ticker,attribute,value
3,NVDA,zip,95051.0
14,NVDA,fullTimeEmployees,36000.0
16,NVDA,auditRisk,5.0
17,NVDA,boardRisk,10.0
18,NVDA,compensationRisk,4.0


----------------------------------------
... Output truncated for brevity ...
