In [None]:
import math
import pickle
import random
import warnings
from collections import defaultdict
from itertools import combinations

import dill
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from tqdm import tqdm

from ite.cost.x_factory import co_factory

In [None]:
def shuffle_data(data, shuffle_indices):
    """Return a copy of ``data`` with groups of columns row-permuted.

    Parameters
    ----------
    data : array-like supporting ``.copy()``, ``.shape`` and ``[rows, col]`` indexing
        The sample to shuffle; it is never modified.
    shuffle_indices : iterable of iterables of int
        Each inner iterable is one group of column indices. One random row
        permutation (from NumPy's global ``np.random`` state) is drawn per
        group and applied to every column in that group, so columns in the
        same group stay aligned with each other while their link to all
        other columns is broken.

    Returns
    -------
    The shuffled copy, same shape as ``data``.
    """
    result = data.copy()

    for column_group in shuffle_indices:
        # One permutation per group keeps the group's columns row-aligned.
        row_order = np.random.permutation(result.shape[0])

        for col in column_group:
            result[:, col] = result[row_order, col]

    return result

def Streitberg_4(X, div_func):
    """Combine divergence estimates into a 4-variable interaction score.

    Parameters
    ----------
    X : 2-D array with (at least) four columns, one per variable
    div_func : callable
        Called as ``div_func(sample_a, sample_b)``; its return values are
        combined arithmetically, so it is expected to return a number.

    Returns
    -------
    ``joint - sum(3-column terms) - sum(pair|pair terms) + 2 * sum(2-column terms)``
    where every term is a ``div_func`` value measured against the same
    reference sample.
    """
    # Reference sample: columns 0, 1 and 2 are each permuted on their own,
    # column 3 is left in place.
    reference = shuffle_data(X, [[0], [1], [2]])

    def on_columns(cols):
        # Divergence restricted to the same column subset of both samples.
        return div_func(X[:, cols], reference[:, cols])

    def with_pair_shuffled(first, second):
        # Permute two columns together (breaking their link to the other two)
        # and compare the result with the reference sample.
        return div_func(shuffle_data(X, [[first, second]]), reference)

    joint = div_func(X, reference)

    # One term per variable left out.
    d_234, d_134, d_124, d_123 = (
        on_columns(cols)
        for cols in ([1, 2, 3], [0, 2, 3], [0, 1, 3], [0, 1, 2])
    )

    # One term per split of the four variables into two pairs.
    d_12_34 = with_pair_shuffled(0, 1)
    d_13_24 = with_pair_shuffled(0, 2)
    d_14_23 = with_pair_shuffled(0, 3)

    # One term per pair of variables.
    d_34, d_24, d_23, d_14, d_13, d_12 = (
        on_columns(cols)
        for cols in ([2, 3], [1, 3], [1, 2], [0, 3], [0, 2], [0, 1])
    )

    return (joint - (d_234 + d_134 + d_124 + d_123) - (d_12_34 + d_13_24 + d_14_23)
            + 2 * (d_34 + d_24 + d_23 + d_14 + d_13 + d_12))

def Streitberg_3(X, div_func):
    """Combine divergence estimates into a 3-variable interaction score.

    Parameters
    ----------
    X : 2-D array with (at least) three columns, one per variable
    div_func : callable
        Called as ``div_func(sample_a, sample_b)``; expected to return a number.

    Returns
    -------
    The divergence of the full sample minus the sum of the three 2-column
    divergences, all measured against the same reference sample.
    """
    # Reference sample: columns 0 and 1 are each permuted on their own,
    # column 2 is left in place.
    reference = shuffle_data(X, [[0], [1]])

    joint = div_func(X, reference)

    # One pairwise term per variable left out: columns (1,2), (0,2), (0,1).
    pair_terms = [
        div_func(X[:, cols], reference[:, cols])
        for cols in ([1, 2], [0, 2], [0, 1])
    ]
    d_23, d_13, d_12 = pair_terms

    return joint - (d_23 + d_13 + d_12)

def Streitberg_2(X, div_func):
    """Pairwise dependence score for a two-column sample.

    Parameters
    ----------
    X : 2-D array with (at least) two columns, one per variable
    div_func : callable
        Called as ``div_func(sample_a, sample_b)``.

    Returns
    -------
    ``div_func`` evaluated on ``X`` against a copy of ``X`` whose column 0 has
    been randomly permuted.
    """
    # Permuting column 0 alone breaks its row alignment with the other column.
    reference = shuffle_data(X, [[0]])
    return div_func(X, reference)

# Stock info

In [None]:
# Company metadata table; the cells below read its 'Symbol', 'Sector' and
# 'Industry' columns.
info = pd.read_csv('data/sp500/sp500_companies.csv')

In [None]:
# Nested lookup: sector -> {industry: number of companies in that industry}.
# Each sector's rows are counted by 'Industry' and the counts stored as a dict.
category_dict = {
    sector: members['Industry'].value_counts().to_dict()
    for sector, members in info.groupby('Sector')
}

In [None]:
# Display the metadata row(s) whose ticker symbol is 'STT'.
info[info['Symbol']=='STT']

# Stock data

In [None]:
# Long-format price table: one row per (Date, Symbol).
df = pd.read_csv('data/sp500/sp500_stocks.csv')
# 'return' is the close-minus-open price difference for the row (an absolute
# change, not a percentage); 'Date' is parsed into datetimes.
df = df.assign(**{
    'return': df['Close'] - df['Open'],
    'Date': pd.to_datetime(df['Date']),
})

In [None]:
# Wide table: one row per date, one column per symbol, values taken from
# 'return' (pivot_table's default aggregation is used for duplicate cells).
pivot_df = pd.pivot_table(df, index='Date', columns='Symbol', values='return')

In [None]:
# pivot_table leaves 'Date' as the index; move it back into the first column.
# The guard makes this cell safe to re-run: resetting an already-reset frame
# would insert a spurious 'index' column and shift every `columns[1:]` slice
# used further down.
if pivot_df.index.name == 'Date':
    pivot_df = pivot_df.reset_index()
# Drop the 'Symbol' label that pivot_table attaches to the column axis.
pivot_df.columns.name = None

In [None]:
# Number of missing entries in each column of the pivoted table.
missing_values_count = pivot_df.isnull().sum()

# How many columns contain at least one missing entry.
columns_with_missing_values = (missing_values_count > 0).sum()

In [None]:
# Columns to discard: those with more than 720 missing entries.
# NOTE(review): 720 is presumably 20% of the row count — confirm against len(pivot_df).
drop_company = missing_values_count.index[missing_values_count > 720].tolist()

In [None]:
# Remove the columns flagged in drop_company (too many missing values).
pivot_df_drop = pivot_df.drop(columns=drop_company)

In [None]:
# Keep only the rows (dates) with no missing value in any remaining column.
pivot_df_dropna = pivot_df_drop.dropna()

In [None]:
# Symbol/sector pairs for every company that was not dropped for missingness.
sector_dict = info.loc[~info['Symbol'].isin(drop_company), ['Symbol', 'Sector']]

In [None]:
# Turn the Symbol/Sector frame into {sector: [symbols]}.
# NOTE: this rebinds `sector_dict` from a DataFrame to a dict, so the cell
# cannot be re-run without first re-running the cell above.
sector_dict = sector_dict.groupby('Sector')['Symbol'].apply(list).to_dict()

In [None]:
# Extra catch-all entry holding every column except the first one
# (the first column is 'Date' once the index has been reset).
sector_dict['random'] = list(pivot_df_dropna.columns[1:])

In [None]:
# with open('data/sector_dict.pkl', 'wb') as f:
#     pickle.dump(sector_dict, f)
# with open('data/pivot_df_dropna.pkl', 'wb') as f:
#     pickle.dump(pivot_df_dropna, f)

In [None]:
# Display (rows, columns) of the cleaned table.
pivot_df_dropna.shape

In [None]:
# Fetch the S&P 500 constituents page (requires network access) and keep the
# first HTML table found on it; later cells read its 'Symbol' and
# 'GICS Sector' columns.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
wiki = pd.read_html(url)[0]

In [None]:
# Symbols that are (a) listed on the Wikipedia table, (b) not dropped for
# missingness and (c) present as a column of the cleaned return table.
companies = set(pivot_df_dropna.columns[1:]) & (set(wiki['Symbol']) - set(drop_company))
# Symbol/sector pairs restricted to those symbols (replaces the earlier sector_dict).
sector_dict = wiki.loc[wiki['Symbol'].isin(companies), ['Symbol', 'GICS Sector']]

In [None]:
# Turn the Symbol/GICS Sector frame into {sector: [symbols]}.
# NOTE: rebinds `sector_dict` from a DataFrame to a dict; re-run the cell above first.
sector_dict = sector_dict.groupby('GICS Sector')['Symbol'].apply(list).to_dict()

In [None]:
# Catch-all entry with every column except the first ('Date').
# Note the key is capitalised ('Random'), unlike the 'random' key used for the
# CSV-based mapping built earlier.
sector_dict['Random'] = list(pivot_df_dropna.columns[1:])

In [None]:
# with open('data/sp500/sector_dict_wiki.pkl', 'wb') as f:
#     pickle.dump(sector_dict, f)

# Run experiments (seed=331)

In [None]:
# Reload the cached sector mapping and cleaned return table from disk; these
# assignments replace whatever `sector_dict` / `pivot_df_dropna` were built above.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open('data/sector_dict.pkl', 'rb') as f:
    sector_dict = pickle.load(f)
with open('data/pivot_df_dropna.pkl', 'rb') as f:
    pivot_df_dropna = pickle.load(f)

In [None]:
# Remove the catch-all 'random' entry from sector_dict; the removed symbol
# list is kept in `sector_dictq`.
# NOTE(review): pop() has no default, so re-running this cell without
# reloading sector_dict raises KeyError.
sector_dictq = sector_dict.pop('random')

In [None]:
# Build the cost object from the ITE factory; its `estimation` method is what
# gets passed as `div_func` to the Streitberg_* helpers further down.
cost_name = 'BDTsallis_KnnK'  # dim >= 1
co = co_factory(cost_name, mult=True, alpha=0.5, k=30)  # cost object

# COVID samples

In [None]:
# Load the Wikipedia-based sector mapping and the cleaned return table.
# NOTE(review): the commented-out dump earlier in the notebook writes the table to
# 'data/pivot_df_dropna.pkl', while this cell reads 'data/sp500/pivot_df_dropna.pkl'
# — confirm the file exists at this path.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open('data/sp500/sector_dict_wiki.pkl', 'rb') as f:
    sector_dict = pickle.load(f)
with open('data/sp500/pivot_df_dropna.pkl', 'rb') as f:
    pivot_df_dropna = pickle.load(f)

In [None]:
# Rows whose 'Date' is strictly after the '2020' cutoff.
# NOTE(review): presumably pandas reads '2020' as 2020-01-01 — confirm the cutoff is intended.
covid_sample = pivot_df_dropna[pivot_df_dropna['Date'] > '2020']

In [None]:
# Rows whose 'Date' is strictly before the same '2020' cutoff; a row dated
# exactly at the cutoff would fall in neither sample.
precovid_sample = pivot_df_dropna[pivot_df_dropna['Date'] < '2020']

In [None]:
# Cost object used by the COVID / pre-COVID runs below (same settings as the
# earlier definition); `co.estimation` is passed as `div_func`.
cost_name = 'BDTsallis_KnnK'  # dim >= 1
co = co_factory(cost_name, mult=True, alpha=0.5, k=30)  # cost object

In [None]:
def sampledata(my_list, samples=500, k=2, seed=331):
    """Draw up to ``samples`` distinct ``k``-element combinations from ``my_list``.

    Combinations are drawn with a private ``random.Random(seed)`` generator
    (the module-level ``random`` state is untouched) and returned in the order
    in which they were first drawn, so the output is identical for the same
    arguments regardless of string-hash randomisation.

    Parameters
    ----------
    my_list : iterable of hashable items
        Population to sample from. Repeated entries are collapsed (first
        occurrence wins) so that every combination has ``k`` distinct members.
    samples : int, default 500
        Maximum number of combinations to return.
    k : int, default 2
        Size of each combination.
    seed : default 331
        Seed for the private generator.

    Returns
    -------
    list of list
        ``min(samples, C(n, k))`` combinations, where ``n`` is the number of
        distinct items; empty when ``k`` is negative or larger than ``n``.
    """
    if k < 0:
        return []

    # Collapse duplicates, otherwise the number of reachable combinations is
    # smaller than C(len(my_list), k) and the loop below could never finish.
    population = list(dict.fromkeys(my_list))

    # Exact integer count; math.comb returns 0 when k > len(population).
    total_combinations = math.comb(len(population), k)
    target = min(samples, total_combinations)

    local_random = random.Random(seed)

    seen = set()        # order-insensitive membership test
    unique_sets = []    # combinations in first-drawn order
    while len(unique_sets) < target:
        draw = local_random.sample(population, k=k)
        key = frozenset(draw)
        if key not in seen:
            seen.add(key)
            unique_sets.append(draw)

    return unique_sets

In [None]:
def covid_sample_hoi(seed):
    """Estimate 2-, 3- and 4-way interaction scores per sector on ``covid_sample``.

    For every ``(sector, symbols)`` entry of the global ``sector_dict`` this
    draws up to 500 symbol combinations of each size with ``sampledata`` and
    evaluates the matching ``Streitberg_k`` function on the corresponding
    columns of the global ``covid_sample``, using ``co.estimation`` as the
    divergence.

    Parameters
    ----------
    seed
        Seed forwarded to ``sampledata`` and also used to seed NumPy's global
        generator for the duration of the call, so the row permutations drawn
        inside the ``Streitberg_*`` functions follow the seed as well. The
        previous NumPy state is restored on exit. Full run-to-run
        reproducibility additionally requires ``sampledata`` to return its
        combinations in a deterministic order.

    Returns
    -------
    defaultdict
        ``hoi_dict[sector][tuple_of_symbols] -> score``; tuples of length 2, 3
        and 4 share the same inner dictionary.
    """
    # Interaction order -> function that evaluates it.
    estimators = ((2, Streitberg_2), (3, Streitberg_3), (4, Streitberg_4))
    hoi_dict = defaultdict(lambda: defaultdict(int))

    saved_state = np.random.get_state()
    # Derive a 32-bit NumPy seed through `random` so every seed value that
    # sampledata accepts is accepted here too.
    np.random.seed(random.Random(seed).getrandbits(32))
    try:
        for sector, symbol in tqdm(sector_dict.items()):
            for order, estimator in estimators:
                unique_sets = sampledata(symbol, samples=500, k=order, seed=seed)
                for sample_ind in unique_sets:
                    X = np.array(covid_sample[sample_ind])
                    hoi_dict[sector][tuple(sample_ind)] = estimator(X, co.estimation)
    finally:
        # Leave the global generator exactly as the caller had it.
        np.random.set_state(saved_state)

    return hoi_dict

In [None]:
# Run the COVID-period estimation with seed 331.
covid_hoi = covid_sample_hoi(331)

In [None]:
# Persist the COVID-period results.
# dill is used instead of pickle — presumably because the result is a
# defaultdict whose default factory is a lambda, which pickle cannot serialise.
with open('data/sp500/covid_hoi_sp500_wiki.pkl', 'wb') as f:
     dill.dump(covid_hoi, f)

In [None]:
def precovid_sample_hoi(seed):
    """Estimate 2-, 3- and 4-way interaction scores per sector on ``precovid_sample``.

    For every ``(sector, symbols)`` entry of the global ``sector_dict`` this
    draws up to 500 symbol combinations of each size with ``sampledata`` and
    evaluates the matching ``Streitberg_k`` function on the corresponding
    columns of the global ``precovid_sample``, using ``co.estimation`` as the
    divergence.

    Parameters
    ----------
    seed
        Seed forwarded to ``sampledata`` and also used to seed NumPy's global
        generator for the duration of the call, so the row permutations drawn
        inside the ``Streitberg_*`` functions follow the seed as well. The
        previous NumPy state is restored on exit. Full run-to-run
        reproducibility additionally requires ``sampledata`` to return its
        combinations in a deterministic order.

    Returns
    -------
    defaultdict
        ``hoi_dict[sector][tuple_of_symbols] -> score``; tuples of length 2, 3
        and 4 share the same inner dictionary.
    """
    # Interaction order -> function that evaluates it.
    estimators = ((2, Streitberg_2), (3, Streitberg_3), (4, Streitberg_4))
    hoi_dict = defaultdict(lambda: defaultdict(int))

    saved_state = np.random.get_state()
    # Derive a 32-bit NumPy seed through `random` so every seed value that
    # sampledata accepts is accepted here too.
    np.random.seed(random.Random(seed).getrandbits(32))
    try:
        for sector, symbol in tqdm(sector_dict.items()):
            for order, estimator in estimators:
                unique_sets = sampledata(symbol, samples=500, k=order, seed=seed)
                for sample_ind in unique_sets:
                    X = np.array(precovid_sample[sample_ind])
                    hoi_dict[sector][tuple(sample_ind)] = estimator(X, co.estimation)
    finally:
        # Leave the global generator exactly as the caller had it.
        np.random.set_state(saved_state)

    return hoi_dict

In [None]:
# Run the pre-COVID estimation with the same seed as the COVID run.
precovid_hoi = precovid_sample_hoi(331)