In [19]:
import pandas as pd
import yfinance as yf
from itertools import combinations
from numpy import std, log, polyfit, isnan
from statsmodels.tsa.stattools import adfuller

def get_sector_tickers(sectors_to_include=None, sectors_to_exclude=None):
    sp500_df = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
    sp500_df['Symbol'] = sp500_df['Symbol'].str.replace('.', '-', regex=False)
    sp500_sectors = sp500_df[['Symbol', 'GICS Sector']]

    if sectors_to_include:
        sp500_sectors = sp500_sectors[sp500_sectors['GICS Sector'].isin(sectors_to_include)]
    if sectors_to_exclude:
        sp500_sectors = sp500_sectors[~sp500_sectors['GICS Sector'].isin(sectors_to_exclude)]

    return sp500_sectors.groupby('GICS Sector')['Symbol'].apply(list).to_dict()

def download_sector_prices(tickers, start_date, end_date):
    print(f"Downloading data for {len(tickers)} tickers...")
    return yf.download(tickers, start=start_date, end=end_date)['Adj Close']

def compute_average_correlation(returns):
    correlations = []
    for stock1, stock2 in combinations(returns.columns, 2):
        corr = returns[stock1].corr(returns[stock2])  
        if not pd.isna(corr):
            correlations.append(corr)
    return sum(correlations) / len(correlations) if correlations else 0

def hurst_exponent(ts):
    """
    Computes the Hurst Exponent of a time series.
    H < 0.5 indicates mean reversion.
    H = 0.5 indicates a random walk.
    H > 0.5 indicates persistence.
    """
    if ts.isnull().any():
        ts = ts.dropna()

    # Rescale the time series
    lags = range(2, 100)
    tau = []
    for lag in lags:
        diff_series = ts.diff(lag).dropna()
        if len(diff_series) > 0:
            tau.append(std(diff_series))

    if len(tau) > 1:
        log_lags = log(lags[:len(tau)])  
        log_tau = log(tau)
        poly = polyfit(log_lags, log_tau, 1)
        return abs(poly[0]) * 2  # Hurst exponent is derived as slope * 2
    else:
        return float('nan')  

def compute_dispersion(returns):
    spreads = []
    for stock1, stock2 in combinations(returns.columns, 2):
        spread = returns[stock1] - returns[stock2] 
        spreads.append(spread.std())
    return sum(spreads) / len(spreads) if spreads else 0

def analyze_sectors(start_date="2024-01-01", end_date="2024-12-01", sectors_to_include=None):
    sector_tickers = get_sector_tickers(sectors_to_include)
    results = []

    for sector, tickers in sector_tickers.items():
        print(f"Analyzing sector: {sector}")
        prices = download_sector_prices(tickers, start_date, end_date)

        if prices.empty:
            print(f"No data for sector: {sector}")
            continue

        returns = prices.pct_change().dropna()

        average_correlation = compute_average_correlation(returns)
        hurst = hurst_exponent(returns.mean(axis=1))  
        dispersion = compute_dispersion(returns)

        results.append({
            'Sector': sector,
            'Average Correlation': average_correlation,
            'Hurst Exponent': hurst,
            'Dispersion': dispersion,
        })

    results_df = pd.DataFrame(results)
    print("\nSector Analysis Results:")
    print(results_df)
    return results_df

results = analyze_sectors(start_date="2024-01-01", end_date="2024-12-01")

Analyzing sector: Communication Services
Downloading data for 22 tickers...


[*********************100%***********************]  22 of 22 completed
[                       0%                       ]

Analyzing sector: Consumer Discretionary
Downloading data for 50 tickers...


[*********************100%***********************]  50 of 50 completed
[****                   8%                       ]  3 of 38 completed

Analyzing sector: Consumer Staples
Downloading data for 38 tickers...


[*********************100%***********************]  38 of 38 completed
[*******               14%                       ]  3 of 22 completed

Analyzing sector: Energy
Downloading data for 22 tickers...


[*********************100%***********************]  22 of 22 completed
[                       0%                       ]

Analyzing sector: Financials
Downloading data for 72 tickers...


[*********************100%***********************]  72 of 72 completed
[**                     5%                       ]  3 of 62 completed

Analyzing sector: Health Care
Downloading data for 62 tickers...


[*********************100%***********************]  62 of 62 completed
[*                      3%                       ]  2 of 78 completed

Analyzing sector: Industrials
Downloading data for 78 tickers...


[*********************100%***********************]  78 of 78 completed
  log_tau = log(tau)
[**                     4%                       ]  3 of 69 completed

Analyzing sector: Information Technology
Downloading data for 69 tickers...


[*********************100%***********************]  69 of 69 completed
[***                    7%                       ]  2 of 28 completed

Analyzing sector: Materials
Downloading data for 28 tickers...


[*********************100%***********************]  28 of 28 completed
[*****                 10%                       ]  3 of 31 completed

Analyzing sector: Real Estate
Downloading data for 31 tickers...


[*********************100%***********************]  31 of 31 completed
[*****                 10%                       ]  3 of 31 completed

Analyzing sector: Utilities
Downloading data for 31 tickers...


[*********************100%***********************]  31 of 31 completed



Sector Analysis Results:
                    Sector  Average Correlation  Hurst Exponent  Dispersion
0   Communication Services             0.197907        0.022309    0.022572
1   Consumer Discretionary             0.248120        0.005277    0.024633
2         Consumer Staples             0.209593        0.006135    0.020315
3                   Energy             0.531601        0.006003    0.016164
4               Financials             0.337698        0.016064    0.017808
5              Health Care             0.182197        0.009455    0.024025
6              Industrials             0.270163             NaN    0.022886
7   Information Technology             0.304175        0.020536    0.028185
8                Materials             0.304050        0.012036    0.022770
9              Real Estate             0.486539        0.013862    0.014497
10               Utilities             0.526968        0.011807    0.014958
