# Correlation and utility analysis of different data in predicting stock market value

The purpose of this notebook is to determine how the different data fields provided by yfinance can be used to predict future stock market values.

In [17]:
# 1) Notebook-wide imports, then yfinance (installed on demand if missing).
import sys
import pandas as pd
from IPython.display import display, Image, HTML

# Try the import first so an existing installation is used as-is; only a
# failed import triggers an install.
try:
    import yfinance as yf
except ImportError:
    # %pip is an IPython magic, so this cell only runs inside a notebook kernel.
    # NOTE(review): the install is not version-pinned, so a fresh environment
    # may pull a different yfinance release than the one shown in the output.
    %pip install yfinance --quiet
    import yfinance as yf

# getattr guards against a module object without a __version__ attribute.
print("yfinance version:", getattr(yf, "__version__", "unknown"))

yfinance version: 0.2.66


In [18]:
# Ticker symbols grouped by market. Each list is consumed as-is by the
# download loop; the order inside a list is the order the symbols are queried.

# USA (symbols carry no exchange suffix)
US_names = [
    "AAPL", "MSFT", "NVDA", "AMZN", "GOOGL",
    "GOOG", "META", "AVGO", "LLY", "TSLA",
    "JPM", "V", "XOM", "UNH", "JNJ",
    "WMT", "MA", "PG", "ORCL", "COST",
    "MRK", "HD", "KO", "PEP", "BAC",
    "ADBE", "CRM", "NFLX", "CSCO", "AMD",
]

# China (every symbol carries the ".SS" suffix)
CN_names = [
    "600519.SS", "601318.SS", "601398.SS", "601288.SS", "601988.SS",
    "601857.SS", "600028.SS", "600036.SS", "601166.SS", "600900.SS",
    "601888.SS", "601012.SS", "600104.SS", "600030.SS", "600585.SS",
    "600000.SS", "601601.SS", "601939.SS", "600019.SS", "600276.SS",
    "601766.SS", "600309.SS", "601633.SS", "600887.SS", "601668.SS",
    "601658.SS", "601728.SS", "601628.SS", "688981.SS",
]

# Europe (".PA" and ".AS" suffixed symbols)
EU_names = [
    "MC.PA", "ASML.AS", "OR.PA", "TTE.PA", "SAN.PA",
    "RMS.PA", "AIR.PA", "BNP.PA", "SU.PA", "KER.PA",
]

# Japan (every symbol carries the ".T" suffix)
JP_names = [
    "7203.T", "6758.T", "9984.T", "6861.T", "8035.T",
    "9983.T", "9432.T", "6098.T", "4502.T", "8316.T",
]

# Saudi Arabia (every symbol carries the ".SR" suffix)
SA_names = [
    "2222.SR", "1120.SR", "2010.SR", "1180.SR", "1211.SR",
    "7010.SR", "2280.SR", "1050.SR", "1060.SR", "4013.SR",
]


In [19]:
# Market label -> list of ticker symbols to download.
ticker_groups = {
    "USA": US_names,
    "China": CN_names,
    "EU": EU_names,
    "JP": JP_names,
    "SA": SA_names,
}


def _fetch_ticker_info(ticker):
    """Return the info payload of ``ticker``, or ``None`` if every accessor raised.

    ``get_info()`` is tried first when the object exposes it, with the ``.info``
    attribute as fallback; when ``get_info`` is absent, ``.info`` is the only
    attempt. Each failure is printed instead of raised so that one bad symbol
    does not abort the whole download loop.

    Parameters
    ----------
    ticker : object
        A ticker object exposing ``.ticker`` and ``.info`` (and optionally
        ``get_info()``), as created by ``yf.Ticker`` in the loop below.

    Returns
    -------
    The value returned by the first accessor that did not raise, else ``None``.
    """
    accessors = []
    if hasattr(ticker, "get_info"):
        accessors.append(("get_info()", ticker.get_info))
    # Wrapped in a lambda because .info is an attribute access, not a call.
    accessors.append((".info", lambda: ticker.info))

    for label, read_info in accessors:
        try:
            return read_info()
        except Exception as exc:  # deliberate best-effort: report and try the next accessor
            print(f"{label} failed for ", ticker.ticker, ":", exc)
    return None


# data:           ticker symbol -> info payload
# ticker_markets: ticker symbol -> market label (same keys as ``data``)
data = {}
ticker_markets = {}

for market, names in ticker_groups.items():
    for name in names:
        ticker = yf.Ticker(name)
        info = _fetch_ticker_info(ticker)
        # Treat an empty payload like a failed download: it has no fields, and
        # keeping it would add an all-missing row that lowers every fill rate.
        if not info:
            print(f"No info retrieved for {ticker.ticker}; skipping.")
            continue
        data[ticker.ticker] = info
        ticker_markets[ticker.ticker] = market


In [20]:
# Candidate keys looked up in each ticker's info payload.
# The list is used to reindex the columns of the info table and then the rows
# of the fill-rate table, so its order is the row order of the displayed
# result and a key missing from every payload still gets a row.
features = [
    "allTimeHigh",
    "allTimeLow",
    "auditRisk",
    "averageAnalystRating",
    "ask",
    "beta",
    "bid",
    "boardRisk",
    "bookValue",
    "currentPrice",
    "dayHigh",
    "dayLow",
    "debtToEquity",
    "displayName",
    "dividendDate",
    "dividendRate",
    "earningsGrowth",
    "earningsQuarterlyGrowth",
    "ebitda",
    "ebitdaMargins",
    "enterpriseToEbitda",
    "enterpriseToRevenue",
    "enterpriseValue",
    "epsCurrentYear",
    "epsForward",
    "epsTrailingTwelveMonths",
    "exDividendDate",
    "fiftyDayAverage",
    "fiftyDayAverageChange",
    "fiftyDayAverageChangePercent",
    "fiftyTwoWeekChangePercent",
    "fiftyTwoWeekHigh",
    "fiftyTwoWeekHighChange",
    "fiftyTwoWeekHighChangePercent",
    "fiftyTwoWeekLow",
    "fiftyTwoWeekLowChange",
    "fiftyTwoWeekLowChangePercent",
    "floatShares",
    "forwardEps",
    "freeCashflow",
    "grossMargins",
    "grossProfits",
    "hasPrePostMarketData",
    "impliedSharesOutstanding",
    "industry",
    "industryDisp",
    "irWebsite",
    "longBusinessSummary",
    "longName",
    "marketCap",
    "netIncomeToCommon",
    "open",
    "operatingCashflow",
    "operatingMargins",
    "overallRisk",
    "postMarketChange",
    "postMarketChangePercent",
    "postMarketPrice",
    "postMarketTime",
    "previousClose",
    "priceEpsCurrentYear",
    "priceToBook",
    "profitMargins",
    "quickRatio",
    "recommendationKey",
    "recommendationMean",
    "regularMarketChange",
    "regularMarketChangePercent",
    "regularMarketDayHigh",
    "regularMarketDayLow",
    "regularMarketOpen",
    "regularMarketPreviousClose",
    "regularMarketPrice",
    "regularMarketTime",
    "returnOnAssets",
    "returnOnEquity",
    "revenueGrowth",
    "revenuePerShare",
    "sector",
    "sectorDisp",
    "shareHolderRightsRisk",
    "sharesPercentSharesOut",
    "sharesShort",
    "shortName",
    "symbol",
    "targetHighPrice",
    "targetLowPrice",
    "targetMeanPrice",
    "targetMedianPrice",
    "totalCash",
    "totalCashPerShare",
    "totalDebt",
    "totalRevenue",
    "trailingEps",
    "trailingPE",
    "trailingPegRatio",
    "twoHundredDayAverage",
    "twoHundredDayAverageChange",
    "twoHundredDayAverageChangePercent",
    "volume",
    "website",
]

def compute_fill_rate_series(frame):
    """Return, per column of ``frame``, the percentage of non-missing values.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose columns are to be summarised.

    Returns
    -------
    pandas.Series
        Indexed by the column labels of ``frame``; each value is the share of
        rows where that column is not NA, expressed in percent and rounded to
        two decimals.
    """
    # Mean of a boolean mask = fraction of rows that hold a value.
    present_fraction = frame.notna().mean()
    percent_filled = present_fraction * 100
    return percent_filled.round(2)

# Build and display a table giving, for each candidate feature, the percentage
# of tickers with a non-missing value: one column per market plus an overall
# column.
if not data:
    print("No ticker info retrieved; cannot compute feature fill rates.")
else:
    info_df = pd.DataFrame.from_dict(data, orient="index")
    # reindex (rather than plain column selection) so a feature absent from
    # every payload becomes an all-NaN column and reports 0% instead of
    # raising KeyError.
    feature_df = info_df.reindex(columns=features)

    # Market labels are taken from ticker_groups so that the loop, the column
    # renaming and the column ordering all follow the configured markets
    # instead of three hand-maintained copies of the same list.
    markets = list(ticker_groups)
    # Market label of each row of feature_df, aligned on the ticker index.
    market_by_ticker = pd.Series(ticker_markets, dtype="object").reindex(feature_df.index)

    group_fill_rates = {}
    group_sizes = {}

    for market in markets:
        subset = feature_df.loc[market_by_ticker == market]
        if subset.empty:
            print(f"No data available for {market}; skipping fill-rate computation.")
            continue
        group_fill_rates[market] = compute_fill_rate_series(subset)
        group_sizes[market] = len(subset)

    if feature_df.empty:
        print("No data available for combined dataset; skipping fill-rate computation.")
    else:
        group_fill_rates["Combined"] = compute_fill_rate_series(feature_df)
        group_sizes["Combined"] = len(feature_df)

    if not group_fill_rates:
        print("No fill-rate results to display.")
    else:
        # One row per feature (in the order of ``features``), one column per group.
        combined_df = (
            pd.DataFrame(group_fill_rates)
            .reindex(features)
            .rename_axis("feature")
            .reset_index()
        )

        # Group label -> displayed column name; the overall column is shown as "Total".
        rename_map = {market: f"fill_rate_pct_{market}" for market in markets}
        rename_map["Combined"] = "fill_rate_pct_Total"
        combined_df = combined_df.rename(columns=rename_map)

        # Feature name first, then markets in configured order, then the total.
        # Columns of markets that were skipped above are simply left out.
        ordered_columns = ["feature", *rename_map.values()]
        combined_df = combined_df[[col for col in ordered_columns if col in combined_df.columns]]

        for label, size in group_sizes.items():
            print(f"Computed fill rates across {size} {label.lower()} tickers.")

        # Lift the display limits so every feature row is shown.
        with pd.option_context("display.max_rows", None, "display.max_columns", None):
            display(combined_df)


Computed fill rates across 30 usa tickers.
Computed fill rates across 29 china tickers.
Computed fill rates across 10 eu tickers.
Computed fill rates across 10 jp tickers.
Computed fill rates across 10 sa tickers.
Computed fill rates across 89 combined tickers.


Unnamed: 0,feature,fill_rate_pct_USA,fill_rate_pct_China,fill_rate_pct_EU,fill_rate_pct_JP,fill_rate_pct_SA,fill_rate_pct_Total
0,allTimeHigh,100.0,100.0,100.0,100.0,100.0,100.0
1,allTimeLow,100.0,100.0,100.0,100.0,100.0,100.0
2,auditRisk,96.67,34.48,100.0,100.0,0.0,66.29
3,averageAnalystRating,100.0,0.0,100.0,90.0,70.0,62.92
4,ask,100.0,100.0,90.0,100.0,100.0,98.88
5,beta,100.0,100.0,100.0,80.0,100.0,97.75
6,bid,100.0,100.0,90.0,100.0,100.0,98.88
7,boardRisk,96.67,34.48,100.0,100.0,0.0,66.29
8,bookValue,100.0,100.0,100.0,100.0,100.0,100.0
9,currentPrice,100.0,100.0,100.0,100.0,100.0,100.0
