# Correlation and utility analysis of different data in predicting stock market value

This notebook's purpose is to determine how we can use different data provided by yfinance to predict the next stock market rates.

In [25]:
# 1) Install yfinance if not already installed
import sys
import pandas as pd
from IPython.display import display, Image, HTML

try:
    import yfinance as yf
except ImportError:
    %pip install yfinance --quiet
    import yfinance as yf

print("yfinance version:", getattr(yf, "__version__", "unknown"))

yfinance version: 0.2.66


In [26]:
names = [
    "AAPL",
    "MSFT",
    "NVDA",
    "AMZN",
    "GOOGL",
    "GOOG",
    "META",
    "AVGO",
    "LLY",
    "TSLA",
    "JPM",
    "V",
    "XOM",
    "UNH",
    "JNJ",
    "WMT",
    "MA",
    "PG",
    "ORCL",
    "COST",
    "MRK",
    "HD",
    "KO",
    "PEP",
    "BAC",
    "ADBE",
    "CRM",
    "NFLX",
    "CSCO",
    "AMD",
]


In [27]:
tickers = (yf.Ticker(name) for name in names)
data={ }
for ticker in tickers:
    if hasattr(ticker, "get_info"):
        try:
            info = ticker.get_info()
        except Exception as e:
            print("get_info() failed for ", ticker.ticker, ":", e)
            try:
                info = ticker.info
            except Exception as e2:
                print(".info failed for ", ticker.ticker, ":", e2)
    else:
        try:
            info = ticker.info
        except Exception as e3:
            print(".info failed for ", ticker.ticker, ":", e3)
    data[ticker.ticker] = info

In [28]:
features = [
    "allTimeHigh",
    "allTimeLow",
    "auditRisk",
    "averageAnalystRating",
    "ask",
    "beta",
    "bid",
    "boardRisk",
    "bookValue",
    "currentPrice",
    "dayHigh",
    "dayLow",
    "debtToEquity",
    "displayName",
    "dividendDate",
    "dividendRate",
    "earningsGrowth",
    "earningsQuarterlyGrowth",
    "ebitda",
    "ebitdaMargins",
    "enterpriseToEbitda",
    "enterpriseToRevenue",
    "enterpriseValue",
    "epsCurrentYear",
    "epsForward",
    "epsTrailingTwelveMonths",
    "exDividendDate",
    "fiftyDayAverage",
    "fiftyDayAverageChange",
    "fiftyDayAverageChangePercent",
    "fiftyTwoWeekChangePercent",
    "fiftyTwoWeekHigh",
    "fiftyTwoWeekHighChange",
    "fiftyTwoWeekHighChangePercent",
    "fiftyTwoWeekLow",
    "fiftyTwoWeekLowChange",
    "fiftyTwoWeekLowChangePercent",
    "floatShares",
    "forwardEps",
    "freeCashflow",
    "grossMargins",
    "grossProfits",
    "hasPrePostMarketData",
    "impliedSharesOutstanding",
    "industry",
    "industryDisp",
    "irWebsite",
    "longBusinessSummary",
    "longName",
    "marketCap",
    "netIncomeToCommon",
    "open",
    "operatingCashflow",
    "operatingMargins",
    "overallRisk",
    "postMarketChange",
    "postMarketChangePercent",
    "postMarketPrice",
    "postMarketTime",
    "previousClose",
    "priceEpsCurrentYear",
    "priceToBook",
    "profitMargins",
    "quickRatio",
    "recommendationKey",
    "recommendationMean",
    "regularMarketChange",
    "regularMarketChangePercent",
    "regularMarketDayHigh",
    "regularMarketDayLow",
    "regularMarketOpen",
    "regularMarketPreviousClose",
    "regularMarketPrice",
    "regularMarketTime",
    "returnOnAssets",
    "returnOnEquity",
    "revenueGrowth",
    "revenuePerShare",
    "sector",
    "sectorDisp",
    "shareHolderRightsRisk",
    "sharesPercentSharesOut",
    "sharesShort",
    "shortName",
    "symbol",
    "targetHighPrice",
    "targetLowPrice",
    "targetMeanPrice",
    "targetMedianPrice",
    "totalCash",
    "totalCashPerShare",
    "totalDebt",
    "totalRevenue",
    "trailingEps",
    "trailingPE",
    "trailingPegRatio",
    "twoHundredDayAverage",
    "twoHundredDayAverageChange",
    "twoHundredDayAverageChangePercent",
    "volume",
    "website"
]

# Calculate feature fill rates across tickers
if not data:
    print("No ticker info retrieved; cannot compute feature fill rates.")
else:
    info_df = pd.DataFrame.from_dict(data, orient="index")
    feature_df = info_df.reindex(columns=features)
    fill_rates = (
        feature_df.notna()
        .mean()
        .mul(100)
        .round(2)
        .rename("fill_rate_pct")
    )
    fill_rates_df = (
        fill_rates.to_frame()
        .reset_index()
        .rename(columns={"index": "feature"})
        .sort_values(by="fill_rate_pct", ascending=False)
        .reset_index(drop=True)
    )
    print(f"Computed fill rates across {len(feature_df)} tickers.")
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        display(fill_rates_df)


Computed fill rates across 30 tickers.


Unnamed: 0,feature,fill_rate_pct
0,allTimeHigh,100.0
1,allTimeLow,100.0
2,averageAnalystRating,100.0
3,ask,100.0
4,beta,100.0
5,fiftyTwoWeekHighChangePercent,100.0
6,bid,100.0
7,bookValue,100.0
8,currentPrice,100.0
9,dayLow,100.0
