In [None]:
pip install rdt



In [None]:

import pandas as pd
# Read censusData.csv from the working directory into the DataFrame `X`.
X = pd.read_csv("censusData.csv")

In [None]:
class DataTransformer(object):
    """Data Transformer.

    Model continuous columns with a BayesianGMM and normalize them to a scalar between [-1, 1]
    and a vector. Discrete columns are encoded using a OneHotEncoder.

    The class relies on module-level names that must exist when its methods
    run: ``ClusterBasedNormalizer``, ``OneHotEncoder``, ``SpanInfo``,
    ``ColumnTransformInfo``, ``np`` and ``pd``.

    NOTE(review): ``_parallel_transform`` also uses ``Parallel`` and
    ``delayed``, which are not imported anywhere in the visible part of this
    notebook (presumably ``joblib`` - confirm). ``transform`` takes that path
    for inputs with 500 rows or more.
    """

    def __init__(self, max_clusters=10, weight_threshold=0.005):
        """Create a data transformer.

        Args:
            max_clusters (int):
                Maximum number of Gaussian distributions in Bayesian GMM.
            weight_threshold (float):
                Weight threshold for a Gaussian distribution to be kept.
        """
        self._max_clusters = max_clusters
        self._weight_threshold = weight_threshold

    def _fit_continuous(self, data):
        """Train Bayesian GMM for continuous columns.

        Args:
            data (pd.DataFrame):
                A dataframe containing a column.

        Returns:
            namedtuple:
                A ``ColumnTransformInfo`` object.
        """
        column_name = data.columns[0]
        gm = ClusterBasedNormalizer(
            missing_value_generation='from_column',
            # Never ask for more clusters than there are rows.
            max_clusters=min(len(data), self._max_clusters),
            weight_threshold=self._weight_threshold,
        )
        gm.fit(data, column_name)
        # Count of the components flagged as valid by the fitted normalizer.
        num_components = sum(gm.valid_component_indicator)

        # Output layout: one normalized scalar followed by a one-hot block
        # with one slot per valid component.
        return ColumnTransformInfo(
            column_name=column_name,
            column_type='continuous',
            transform=gm,
            output_info=[SpanInfo(1, 'tanh'), SpanInfo(num_components, 'softmax')],
            output_dimensions=1 + num_components,
        )

    def _fit_discrete(self, data):
        """Fit one hot encoder for discrete column.

        Args:
            data (pd.DataFrame):
                A dataframe containing a column.

        Returns:
            namedtuple:
                A ``ColumnTransformInfo`` object.
        """
        column_name = data.columns[0]
        # NOTE(review): this must resolve to the encoder that accepts
        # ``fit(data, column_name)`` and exposes ``dummies`` (the rdt one),
        # not scikit-learn's class of the same name - confirm import order.
        ohe = OneHotEncoder()
        ohe.fit(data, column_name)
        num_categories = len(ohe.dummies)

        return ColumnTransformInfo(
            column_name=column_name,
            column_type='discrete',
            transform=ohe,
            output_info=[SpanInfo(num_categories, 'softmax')],
            output_dimensions=num_categories,
        )

    def fit(self, raw_data, discrete_columns=()):
        """Fit the ``DataTransformer``.

        Fits a ``ClusterBasedNormalizer`` for continuous columns and a
        ``OneHotEncoder`` for discrete columns.

        This step also counts the #columns in matrix data and span information.

        Args:
            raw_data (pd.DataFrame or array-like with ``shape``):
                Data to fit on. Non-DataFrame input is wrapped in a DataFrame
                whose column names are the stringified positions.
            discrete_columns (iterable):
                Names of the columns to treat as discrete; every other column
                is treated as continuous.
        """
        self.output_info_list = []
        self.output_dimensions = 0
        # Remembers the input type so ``inverse_transform`` can return the same kind.
        self.dataframe = True

        if not isinstance(raw_data, pd.DataFrame):
            self.dataframe = False
            # work around for RDT issue #328 Fitting with numerical column names fails
            discrete_columns = [str(column) for column in discrete_columns]
            column_names = [str(num) for num in range(raw_data.shape[1])]
            raw_data = pd.DataFrame(raw_data, columns=column_names)

        # Stored so ``inverse_transform`` can cast recovered columns back.
        self._column_raw_dtypes = raw_data.infer_objects().dtypes
        self._column_transform_info_list = []
        for column_name in raw_data.columns:
            if column_name in discrete_columns:
                column_transform_info = self._fit_discrete(raw_data[[column_name]])
            else:
                column_transform_info = self._fit_continuous(raw_data[[column_name]])

            self.output_info_list.append(column_transform_info.output_info)
            self.output_dimensions += column_transform_info.output_dimensions
            self._column_transform_info_list.append(column_transform_info)

    def _transform_continuous(self, column_transform_info, data):
        """Transform one continuous column into its matrix representation.

        Args:
            column_transform_info (ColumnTransformInfo):
                Fit result for this column; its ``transform`` is applied.
            data (pd.DataFrame):
                A dataframe containing the column.

        Returns:
            np.ndarray:
                Array with ``output_dimensions`` columns: column 0 holds the
                normalized value, the remaining columns one-hot encode the
                selected component.
        """
        column_name = data.columns[0]
        # Replace the column with its flattened NumPy values before transforming.
        flattened_column = data[column_name].to_numpy().flatten()
        data = data.assign(**{column_name: flattened_column})
        gm = column_transform_info.transform
        transformed = gm.transform(data)

        #  Converts the transformed data to the appropriate output format.
        #  The first column (ending in '.normalized') stays the same,
        #  but the label encoded column (ending in '.component') is one hot encoded.
        output = np.zeros((len(transformed), column_transform_info.output_dimensions))
        output[:, 0] = transformed[f'{column_name}.normalized'].to_numpy()
        index = transformed[f'{column_name}.component'].to_numpy().astype(int)
        # ``index + 1`` skips column 0, which is reserved for the normalized value.
        output[np.arange(index.size), index + 1] = 1.0

        return output

    def _transform_discrete(self, column_transform_info, data):
        """Apply the fitted encoder to one discrete column and return a NumPy array."""
        ohe = column_transform_info.transform
        return ohe.transform(data).to_numpy()

    def _synchronous_transform(self, raw_data, column_transform_info_list):
        """Take a Pandas DataFrame and transform columns synchronous.

        Outputs a list with Numpy arrays.
        """
        column_data_list = []
        for column_transform_info in column_transform_info_list:
            column_name = column_transform_info.column_name
            data = raw_data[[column_name]]
            if column_transform_info.column_type == 'continuous':
                column_data_list.append(self._transform_continuous(column_transform_info, data))
            else:
                column_data_list.append(self._transform_discrete(column_transform_info, data))

        return column_data_list

    def _parallel_transform(self, raw_data, column_transform_info_list):
        """Take a Pandas DataFrame and transform columns in parallel.

        Outputs a list with Numpy arrays.
        """
        # NOTE(review): ``delayed`` / ``Parallel`` are not imported in the
        # visible notebook; presumably ``from joblib import Parallel, delayed`` - confirm.
        processes = []
        for column_transform_info in column_transform_info_list:
            column_name = column_transform_info.column_name
            data = raw_data[[column_name]]
            process = None
            if column_transform_info.column_type == 'continuous':
                process = delayed(self._transform_continuous)(column_transform_info, data)
            else:
                process = delayed(self._transform_discrete)(column_transform_info, data)
            processes.append(process)

        return Parallel(n_jobs=-1)(processes)

    def transform(self, raw_data):
        """Take raw data and output a matrix data.

        Non-DataFrame input is wrapped in a DataFrame with stringified
        positional column names, mirroring ``fit``. The per-column arrays are
        concatenated along axis 1 and returned as a float array.
        """
        if not isinstance(raw_data, pd.DataFrame):
            column_names = [str(num) for num in range(raw_data.shape[1])]
            raw_data = pd.DataFrame(raw_data, columns=column_names)

        # Only use parallelization with larger data sizes.
        # Otherwise, the transformation will be slower.
        if raw_data.shape[0] < 500:
            column_data_list = self._synchronous_transform(
                raw_data, self._column_transform_info_list
            )
        else:
            column_data_list = self._parallel_transform(raw_data, self._column_transform_info_list)

        return np.concatenate(column_data_list, axis=1).astype(float)

    def _inverse_transform_continuous(self, column_transform_info, column_data, sigmas, st):
        """Recover one continuous column from its slice of the matrix.

        Args:
            column_transform_info (ColumnTransformInfo):
                Fit result for this column.
            column_data (np.ndarray):
                The columns of the matrix that belong to this column.
            sigmas (indexable or None):
                When given, the normalized value is resampled from a normal
                distribution centred on it with scale ``sigmas[st]``.
            st (int):
                Start offset of this column's slice in the full matrix.
        """
        gm = column_transform_info.transform
        # Two-column frame named after the transformer's output columns.
        data = pd.DataFrame(column_data[:, :2], columns=list(gm.get_output_sdtypes())).astype(float)
        # Second column becomes the component index: argmax over the one-hot part.
        data[data.columns[1]] = np.argmax(column_data[:, 1:], axis=1)
        if sigmas is not None:
            selected_normalized_value = np.random.normal(data.iloc[:, 0], sigmas[st])
            data.iloc[:, 0] = selected_normalized_value

        return gm.reverse_transform(data)

    def _inverse_transform_discrete(self, column_transform_info, column_data):
        """Recover one discrete column from its one-hot slice of the matrix."""
        ohe = column_transform_info.transform
        data = pd.DataFrame(column_data, columns=list(ohe.get_output_sdtypes()))
        return ohe.reverse_transform(data)[column_transform_info.column_name]

    def inverse_transform(self, data, sigmas=None):
        """Take matrix data and output raw data.

        Output uses the same type as the input given to ``fit``
        (tracked in ``self.dataframe``): either np array or pd dataframe.

        Args:
            data (np.ndarray):
                Matrix whose columns follow the layout produced by ``transform``.
            sigmas (indexable or None):
                Optional scales forwarded to the continuous inverse transform.
        """
        # ``st`` walks through the matrix, one column slice per fitted column.
        st = 0
        recovered_column_data_list = []
        column_names = []
        for column_transform_info in self._column_transform_info_list:
            dim = column_transform_info.output_dimensions
            column_data = data[:, st : st + dim]
            if column_transform_info.column_type == 'continuous':
                recovered_column_data = self._inverse_transform_continuous(
                    column_transform_info, column_data, sigmas, st
                )
            else:
                recovered_column_data = self._inverse_transform_discrete(
                    column_transform_info, column_data
                )

            recovered_column_data_list.append(recovered_column_data)
            column_names.append(column_transform_info.column_name)
            st += dim

        recovered_data = np.column_stack(recovered_column_data_list)
        # Cast back to the dtypes recorded during ``fit``.
        recovered_data = pd.DataFrame(recovered_data, columns=column_names).astype(
            self._column_raw_dtypes
        )
        if not self.dataframe:
            recovered_data = recovered_data.to_numpy()

        return recovered_data

    def convert_column_name_value_to_id(self, column_name, value):
        """Get the ids of the given `column_name`.

        Returns:
            dict:
                ``discrete_column_id`` (number of discrete columns before the
                match), ``column_id`` (position among all columns) and
                ``value_id`` (argmax of the encoded value).

        Raises:
            ValueError:
                If ``column_name`` was not fitted, or the encoded ``value``
                sums to zero.
        """
        discrete_counter = 0
        column_id = 0
        for column_transform_info in self._column_transform_info_list:
            if column_transform_info.column_name == column_name:
                break
            if column_transform_info.column_type == 'discrete':
                discrete_counter += 1

            column_id += 1

        # for/else: reached only when the loop ended without ``break``.
        else:
            raise ValueError(f"The column_name `{column_name}` doesn't exist in the data.")

        # NOTE(review): nothing checks that the matched column is discrete;
        # for a continuous column ``transform`` is not a one-hot encoder - confirm callers.
        ohe = column_transform_info.transform
        data = pd.DataFrame([value], columns=[column_transform_info.column_name])
        one_hot = ohe.transform(data).to_numpy()[0]
        if sum(one_hot) == 0:
            raise ValueError(f"The value `{value}` doesn't exist in the column `{column_name}`.")

        return {
            'discrete_column_id': discrete_counter,
            'column_id': column_id,
            'value_id': np.argmax(one_hot),
        }


# Preprocessing cell: load the data, one-hot encode the categorical columns,
# transform the continuous columns with DataTransformer, and assemble the
# combined tensor `X_encoded` used by the later cells.
#
# Statement order matters here: `OneHotEncoder` is first bound to
# scikit-learn's class (used for `encoder` below) and later rebound to rdt's
# class, which is the one DataTransformer looks up when it runs.
import pandas as pd
import torch
import torch.nn as nn
from google.colab import drive
from sklearn.preprocessing import OneHotEncoder
X = pd.read_csv("censusData.csv")
drive.mount('/content/drive')

# Columns listed here are one-hot encoded; every other column is continuous.
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country','income']
print("Categorical: ",categorical_columns)
continuous_columns = [col for col in X.columns if col not in categorical_columns]
print("Continuous: ",continuous_columns)

# Add two indicator columns and treat them as categorical too. They are
# appended after `continuous_columns` was computed, so they are not in that list.
X['capital-gain-binary'] = (X['capital-gain'] != 0).astype(int)  # 1 if gain exists, 0 otherwise
categorical_columns.append('capital-gain-binary')
X['capital-loss-binary'] = (X['capital-loss'] != 0).astype(int)  # 1 if loss exists, 0 otherwise
categorical_columns.append('capital-loss-binary')

# scikit-learn encoder (the rdt class of the same name is imported further down).
# NOTE(review): the `sparse` keyword was renamed `sparse_output` in newer
# scikit-learn releases - confirm against the installed version.
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, drop='if_binary')
X_encoded_cat = encoder.fit_transform(X[categorical_columns])
print(encoder.get_feature_names_out())
from collections import namedtuple

# From here on `OneHotEncoder` refers to rdt's transformer.
from rdt.transformers import ClusterBasedNormalizer, OneHotEncoder
# Record types used by DataTransformer to describe each fitted column.
SpanInfo = namedtuple('SpanInfo', ['dim', 'activation_fn'])
ColumnTransformInfo = namedtuple(
    'ColumnTransformInfo',
    ['column_name', 'column_type', 'transform', 'output_info', 'output_dimensions'],
)
import numpy as np
dataTrans = DataTransformer()
print(X[continuous_columns].values)
# Fit a ClusterBasedNormalizer on each continuous column and overwrite the
# column in X with the transformed values.
# NOTE(review): `_transform_continuous` returns a 2-D array with
# 1 + num_components columns; verify what ends up in X[column] after this
# single-column assignment.
for column in continuous_columns:
  CTI = dataTrans._fit_continuous(pd.DataFrame(X[column],columns=[column]))
  temp = dataTrans._transform_continuous(CTI, pd.DataFrame(X[column],columns=[column]))
  X[column] = temp
# Only the ColumnTransformInfo of the last column processed is printed.
print('cti; ',CTI)
X_continuous = X[continuous_columns].values
print(X_continuous)
# Persist the combined matrix (categorical block first, continuous block last) as CSV.
temp = np.concatenate((X_encoded_cat,np.array(X_continuous)),axis=1)
np.savetxt("original_datav8.csv", temp, delimiter=",")
print(torch.tensor(X_encoded_cat).shape)
# Same column layout as the CSV above, as a single tensor.
X_encoded = torch.cat((torch.tensor(X_encoded_cat),torch.tensor(X_continuous)),dim=1)
# Iterating a 2-D tensor yields its rows, so `dataset` is a list of row tensors.
dataset = list(X_encoded)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(X_encoded.shape)
# Total number of encoded columns; used as a loop bound by later cells.
num_variables = X_encoded_cat.shape[1] + X_continuous.shape[1]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Categorical:  ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']
Continuous:  ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']




['workclass_?' 'workclass_Federal-gov' 'workclass_Local-gov'
 'workclass_Never-worked' 'workclass_Private' 'workclass_Self-emp-inc'
 'workclass_Self-emp-not-inc' 'workclass_State-gov'
 'workclass_Without-pay' 'workclass_nan' 'education_10th' 'education_11th'
 'education_12th' 'education_1st-4th' 'education_5th-6th'
 'education_7th-8th' 'education_9th' 'education_Assoc-acdm'
 'education_Assoc-voc' 'education_Bachelors' 'education_Doctorate'
 'education_HS-grad' 'education_Masters' 'education_Preschool'
 'education_Prof-school' 'education_Some-college'
 'marital-status_Divorced' 'marital-status_Married-AF-spouse'
 'marital-status_Married-civ-spouse'
 'marital-status_Married-spouse-absent' 'marital-status_Never-married'
 'marital-status_Separated' 'marital-status_Widowed' 'occupation_?'
 'occupation_Adm-clerical' 'occupation_Armed-Forces'
 'occupation_Craft-repair' 'occupation_Exec-managerial'
 'occupation_Farming-fishing' 'occupation_Handlers-cleaners'
 'occupation_Machine-op-inspct' 'oc

In [None]:
# Load the previously saved mutual-information matrix and keep a separate copy of it.
mi_matrix = torch.load('mutual_information_matrix_full.pt')
temp = mi_matrix.clone()
def clearData(mi_matrix):
    """Re-index a 2-D pairwise matrix and blank the entries that must be recomputed.

    The same index mapping is applied to rows and to columns. For a
    destination index ``d`` the value is read from source index:

    * ``d``      when ``d < 60`` or ``d >= 107``
    * ``d + 1``  when ``60 <= d < 105``
    * nothing    when ``105 <= d < 107`` (the row/column is left at zero)

    Source indices 60 and 106 are therefore never read.

    NOTE(review): the constants 60, 105 and 107 are hard-coded; presumably
    they reflect a change in the one-hot column layout between the stored
    matrices and the current encoding - confirm against the data.

    The original implementation computed the shifted column index but read
    ``mi_matrix[k, j]`` (unshifted column), so columns were not re-indexed
    and symmetric inputs produced asymmetric outputs. This version reads
    ``mi_matrix[k, l]``.

    Args:
        mi_matrix (torch.Tensor): 2-D matrix to re-index. It is not modified.

    Returns:
        torch.Tensor: New tensor with the same shape, dtype and device.
    """
    shift_start = 60    # first destination index that reads from d + 1
    blank_start = 105   # first destination index left at zero
    block_end = 107     # indices from here on map to themselves

    def _index_map(size):
        # Destination/source index pairs for one axis, skipping blanked positions.
        destinations, sources = [], []
        for dst in range(size):
            src = dst
            if shift_start <= dst < block_end:
                src += 1
                if dst >= blank_start:
                    src += 1
            if src >= block_end and dst < block_end:
                continue
            destinations.append(dst)
            sources.append(src)
        as_index = lambda values: torch.tensor(
            values, dtype=torch.long, device=mi_matrix.device
        )
        return as_index(destinations), as_index(sources)

    row_dst, row_src = _index_map(mi_matrix.shape[0])
    col_dst, col_src = _index_map(mi_matrix.shape[1])

    temp = torch.zeros_like(mi_matrix)
    # Broadcast (rows, 1) against (1, cols) to copy the whole sub-grid at once.
    temp[row_dst.unsqueeze(1), col_dst.unsqueeze(0)] = mi_matrix[
        row_src.unsqueeze(1), col_src.unsqueeze(0)
    ]
    return temp



In [None]:
import numpy as np
from sklearn.metrics import mutual_info_score
import time
# Reload the stored matrix; calculate_mutual_information_matrix reads this
# module-level name and writes into it in place.
mutual_information_matrix = torch.load('mutual_information_matrix_full.pt')

def calculate_mutual_information_matrix(X_encoded):
    """Refresh the stored mutual-information matrix for columns 105 and 106.

    The leading 107x107 block of the module-level
    ``mutual_information_matrix`` is re-indexed with ``clearData``; then the
    mutual information is recomputed for every column pair ``(i, j)``,
    ``i < j``, in which at least one index is 105 or 106. Progress and
    timing are printed per pair.

    The loop bound is taken from ``X_encoded`` itself (the original computed
    ``n_features`` but then looped over the unrelated global
    ``num_variables``).

    Args:
        X_encoded: 2-D data whose columns are compared pairwise
            (indexed as ``X_encoded[:, i]``).

    Returns:
        The module-level ``mutual_information_matrix`` object, modified in
        place (not a copy).
    """
    n_features = X_encoded.shape[1]
    # Same object as the global: the updates below are visible to every holder of it.
    mi_matrix = mutual_information_matrix
    # The right-hand side is fully evaluated before the block is overwritten.
    mi_matrix[:107, :107] = clearData(mutual_information_matrix[:107, :107])
    for i in range(n_features):
        for j in range(i + 1, n_features):
            # Only pairs that involve column 105 or 106 are recomputed.
            if not (105 <= i < 107 or 105 <= j < 107):
                continue
            start_time = time.time()
            X_col_i = X_encoded[:, i]
            X_col_j = X_encoded[:, j]
            mi_matrix[i, j] = mutual_info_score(X_col_i, X_col_j)
            mi_matrix[j, i] = mi_matrix[i, j]  # MI is symmetric
            end_time = time.time()
            processed = end_time-start_time
            print(i, " - ", j, " : ", mi_matrix[i, j], f" - {processed:4f} s")

    return mi_matrix

# Calculate mutual information
mi_matrix = torch.tensor(calculate_mutual_information_matrix(X_encoded)).float()

torch.save(mi_matrix, 'mutual_information_matrix_fullv3.pt')
!cp 'mutual_information_matrix_fullv3.pt' /content/drive/MyDrive


0  -  105  :  tensor(6.6708e-05)  - 0.028211 s
0  -  106  :  tensor(0.0001)  - 0.015138 s
1  -  105  :  tensor(9.6461e-05)  - 0.014884 s
1  -  106  :  tensor(5.0161e-05)  - 0.014739 s
2  -  105  :  tensor(2.5689e-05)  - 0.015042 s
2  -  106  :  tensor(6.0200e-05)  - 0.015033 s
3  -  105  :  tensor(1.7656e-05)  - 0.015339 s
3  -  106  :  tensor(9.7977e-06)  - 0.015207 s
4  -  105  :  tensor(0.0006)  - 0.011549 s
4  -  106  :  tensor(0.0003)  - 0.010646 s
5  -  105  :  tensor(0.0017)  - 0.015034 s
5  -  106  :  tensor(0.0004)  - 0.015103 s
6  -  105  :  tensor(0.0002)  - 0.014920 s
6  -  106  :  tensor(8.6182e-05)  - 0.015317 s
7  -  105  :  tensor(5.4102e-06)  - 0.015172 s
7  -  106  :  tensor(2.5491e-06)  - 0.015076 s
8  -  105  :  tensor(4.3283e-07)  - 0.015786 s
8  -  106  :  tensor(3.8616e-09)  - 0.014853 s
9  -  105  :  tensor(0.0001)  - 0.015497 s
9  -  106  :  tensor(9.8107e-07)  - 0.015178 s
10  -  105  :  tensor(0.0005)  - 0.015317 s
10  -  106  :  tensor(0.0001)  - 0.015286 s


  mi_matrix = torch.tensor(calculate_mutual_information_matrix(X_encoded)).float()


105  -  108  :  tensor(0.0087)  - 0.013429 s
105  -  109  :  tensor(0.2851)  - 0.014081 s
105  -  110  :  tensor(0.0041)  - 0.013791 s
105  -  111  :  tensor(0.0056)  - 0.014162 s
106  -  107  :  tensor(0.0048)  - 0.013983 s
106  -  108  :  tensor(0.0037)  - 0.012660 s
106  -  109  :  tensor(0.0041)  - 0.014259 s
106  -  110  :  tensor(0.1887)  - 0.013669 s
106  -  111  :  tensor(0.0033)  - 0.014741 s


In [None]:
pip install dcor

Collecting dcor
  Downloading dcor-0.6-py3-none-any.whl (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dcor
Successfully installed dcor-0.6


In [None]:
import dcor
# Initialize distance correlation matrix (consider symmetry)
distance_correlation_matrix = clearData(torch.load('distance_correlation_matrix_full.pt'))
dcor_matrix = distance_correlation_matrix
for i in range(X_encoded.shape[1]):
    X_col_i = X_encoded[:, i]
    for j in range(i + 1, X_encoded.shape[1]):
        if not ((i>=105 and i<107) or (j>=105 and j<107)):
          continue
        X_col_j = X_encoded[:, j]

        # Calculate distance correlation
        start_time = time.time()
        dcor_value = dcor.distance_correlation(X_col_i, X_col_j)
        dcor_matrix[i, j] = dcor_value
        dcor_matrix[j, i] = dcor_value  # Distance correlation is symmetric

        end_time = time.time()
        processed = end_time-start_time
        print(i, " - ", j, " : ", dcor_value, f" - {processed:4f} s")

# Save the matrix
torch.save(dcor_matrix, 'distance_correlation_matrix_fullv3.pt')
!cp 'distance_correlation_matrix_fullv3.pt' /content/drive/MyDrive

0  -  105  :  0.011213355182485247  - 0.080725 s
0  -  106  :  0.014170189919922577  - 0.125271 s
1  -  105  :  0.014414836111897781  - 0.122015 s
1  -  106  :  0.010405349416430187  - 0.123041 s
2  -  105  :  0.007258870393968656  - 0.122225 s
2  -  106  :  0.01127165182096131  - 0.125784 s
3  -  105  :  0.00429434152049267  - 0.121508 s
3  -  106  :  0.003168101436037406  - 0.127950 s
4  -  105  :  0.0356867719368177  - 0.127882 s
4  -  106  :  0.0229843961089595  - 0.121410 s
5  -  105  :  0.06581464568080314  - 0.126580 s
5  -  106  :  0.03170023667570466  - 0.168638 s
6  -  105  :  0.01900204726937539  - 0.123196 s
6  -  106  :  0.013503020321858007  - 0.122241 s
7  -  105  :  0.00326375793418361  - 0.126648 s
7  -  106  :  0.0022408225852264788  - 0.125521 s
8  -  105  :  0.0009511051757710437  - 0.125435 s
8  -  106  :  8.814760694931098e-05  - 0.138649 s
9  -  105  :  0.01527636532455883  - 0.121690 s
9  -  106  :  0.001391011050757997  - 0.126257 s
10  -  105  :  0.02762816813

In [None]:
import time
import pandas as pd
from scipy.stats import chi2_contingency

# Re-index the stored chi-squared, Theil's U and Cramer's V matrices with
# clearData. No entries are recomputed in this cell: the rows/columns that
# clearData leaves at zero stay zero in the saved files.
chi2_matrix2 = clearData(torch.load('chi2_matrix_full.pt'))
theils_u_matrix2 = clearData(torch.load('theils_u_matrix_full.pt'))
cramers_v_matrix2 = clearData(torch.load('cramers_v_matrix_full.pt'))


# Save each matrix locally, then copy the file to the mounted Drive folder.
torch.save(chi2_matrix2, 'chi2_matrix_fullv3.pt')
!cp 'chi2_matrix_fullv3.pt' /content/drive/MyDrive

torch.save(theils_u_matrix2, 'theils_u_matrix_fullv3.pt')
!cp 'theils_u_matrix_fullv3.pt' /content/drive/MyDrive

torch.save(cramers_v_matrix2, 'cramers_v_matrix_fullv3.pt')
!cp 'cramers_v_matrix_fullv3.pt' /content/drive/MyDrive

In [None]:
pip install minepy

Collecting minepy
  Downloading minepy-1.2.6.tar.gz (496 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/497.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/497.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m497.0/497.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: minepy
  Building wheel for minepy (setup.py) ... [?25l[?25hdone
  Created wheel for minepy: filename=minepy-1.2.6-cp310-cp310-linux_x86_64.whl size=187095 sha256=7ac6c38d1725ea1a0ed10833bc63164a4db5fb5c4177736a768f1b886db4cc46
  Stored in directory: /root/.cache/pip/wheels/69/38/a6/825bb9b9ed81e6af43a0ef80c7cfe4cafcfdbc2f5cde2959d9
Successfully built minepy
Installing collected packages: minepy
Successfully installed minepy-1.2.6


In [None]:
import minepy
from scipy.stats import spearmanr
import torch
import time
def calculate_mic(X_col_i, X_col_j):
    """Return the MIC score minepy computes for the two given columns.

    A fresh ``minepy.MINE`` estimator is created for every call.
    """
    estimator = minepy.MINE()
    estimator.compute_score(X_col_i, X_col_j)
    mic_score = estimator.mic()
    return mic_score

def calculate_spearman(X_col_i, X_col_j):
    """Return the Spearman rank correlation of the two columns.

    Only the first element of scipy's ``spearmanr`` result (the correlation)
    is returned; the second element (the p-value) is discarded.
    """
    rho, _p_value = spearmanr(X_col_i, X_col_j)
    return rho


mic_matrix = clearData(torch.load('maximal_coefficient_matrix_full.pt'))
spearman_matrix = clearData(torch.load('spearmans_rank_matrix_full.pt'))

for i in range(X_encoded.shape[1]):
    X_col_i = X_encoded[:, i]
    for j in range(i + 1, X_encoded.shape[1]):
        if not ((i>=105 and i<107) or (j>=105 and j<107)):
          continue
        X_col_j = X_encoded[:, j]
        # MIC
        start_time = time.time()
        mic_value = calculate_mic(X_col_i, X_col_j)
        mic_matrix[i, j] = mic_value
        mic_matrix[j, i] = mic_value
        spearman_value = calculate_spearman(X_col_i, X_col_j)
        spearman_matrix[i, j] = spearman_value
        spearman_matrix[j, i] = spearman_value  # Symmetric
        end_time = time.time()
        processed = end_time-start_time
        print(i, "-", j, ": MIC:", mic_value, " Spr: ",spearman_value, f"- {processed:4f} s")


torch.save(mic_matrix, 'maximal_coefficient_matrix_fullv3.pt')
!cp 'maximal_coefficient_matrix_fullv3.pt' /content/drive/MyDrive

torch.save(spearman_matrix, 'spearmans_rank_matrix_fullv3.pt')
!cp 'spearmans_rank_matrix_fullv3.pt' /content/drive/MyDrive


pearsons_correlation_coefficient_matrix = torch.from_numpy(np.corrcoef(X_encoded.T)).float()
torch.save(pearsons_correlation_coefficient_matrix, 'pearsons_correlation_coefficient_matrix_fullv3.pt')
!cp 'pearsons_correlation_coefficient_matrix_fullv3.pt' /content/drive/MyDrive

0 - 105 : MIC: 9.623971703977778e-05  Spr:  -0.011213355182429503 - 3.329754 s
0 - 106 : MIC: 0.00016203395419357882  Spr:  -0.014170189919862757 - 3.379069 s
1 - 105 : MIC: 0.00013916323471166496  Spr:  0.014414836111962658 - 3.323823 s
1 - 106 : MIC: 7.236774707080304e-05  Spr:  0.010405349416486765 - 3.483836 s
2 - 105 : MIC: 3.706109308212788e-05  Spr:  0.007258870393958812 - 3.516222 s
2 - 106 : MIC: 8.684969565491538e-05  Spr:  0.011271651820968674 - 3.291816 s
3 - 105 : MIC: 2.5472225454523112e-05  Spr:  -0.004294341519496433 - 3.382299 s
3 - 106 : MIC: 1.4135059760521168e-05  Spr:  -0.0031681014353235094 - 3.511914 s
4 - 105 : MIC: 0.0008938486141713293  Spr:  -0.03568677193676292 - 2.672731 s
4 - 106 : MIC: 0.00037153602516418297  Spr:  -0.022984396108958003 - 2.789333 s
5 - 105 : MIC: 0.002439893843461072  Spr:  0.06581464568081996 - 3.292227 s
5 - 106 : MIC: 0.0006017351905059751  Spr:  0.03170023667575518 - 3.387356 s
6 - 105 : MIC: 0.0002463767293445326  Spr:  0.0190020472

In [None]:
import numpy as np
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import OneHotEncoder

import torch
import torch.nn as nn
from google.colab import drive
drive.mount('/content/drive')

#https://github.com/clovaai/rebias/tree/master
def to_numpy(x):
    """Return a NumPy copy of a PyTorch tensor.

    The tensor is copied, detached from the autograd graph and moved to the
    CPU before conversion, so the result does not share memory with ``x``.
    """
    detached_copy = x.clone().detach()
    on_cpu = detached_copy.cpu()
    return on_cpu.numpy()


class HSIC(nn.Module):
    r"""Base class for the finite sample estimator of Hilbert-Schmidt Independence Criterion (HSIC)
    ..math:: HSIC (X, Y) := || C_{x, y} ||^2_{HS}, where HSIC (X, Y) = 0 iff X and Y are independent.

    Empirically, we use the finite sample estimator of HSIC (with m observations) by,
    (1) biased estimator (HSIC_0)
        Gretton, Arthur, et al. "Measuring statistical dependence with Hilbert-Schmidt norms." 2005.
        :math: (m - 1)^{-2} tr KHLH.
        where K_{ij} = kernel_x (x_i, x_j), L_{ij} = kernel_y (y_i, y_j), H = 1 - m^{-1} 1 1 (Hence, K, L, H are m by m matrices).
    (2) unbiased estimator (HSIC_1)
        Song, Le, et al. "Feature selection via dependence maximization." 2012.
        :math: \frac{1}{m (m - 3)} \bigg[ tr (\tilde K \tilde L) + \frac{1^\top \tilde K 1 1^\top \tilde L 1}{(m-1)(m-2)} - \frac{2}{m-2} 1^\top \tilde K \tilde L 1 \bigg].
        where \tilde K and \tilde L are related to K and L by the diagonal entries of \tilde K_{ij} and \tilde L_{ij} are set to zero.

    Subclasses provide the kernels by implementing ``_kernel_x`` and
    ``_kernel_y``; each must return an m-by-m matrix.

    Parameters
    ----------
    sigma_x : float
        the kernel size of the kernel function for X.
    sigma_y : float
        the kernel size of the kernel function for Y. Defaults to ``sigma_x``.
    algorithm: str ('unbiased' / 'biased')
        the algorithm for the finite sample estimator. 'unbiased' is used for our paper.
    reduction: not used (for compatibility with other losses).

    Raises
    ------
    ValueError
        If ``algorithm`` is neither 'biased' nor 'unbiased'.
    """
    def __init__(self, sigma_x, sigma_y=None, algorithm='unbiased',
                 reduction=None):
        super().__init__()

        if sigma_y is None:
            sigma_y = sigma_x

        self.sigma_x = sigma_x
        self.sigma_y = sigma_y

        if algorithm == 'biased':
            self.estimator = self.biased_estimator
        elif algorithm == 'unbiased':
            self.estimator = self.unbiased_estimator
        else:
            raise ValueError('invalid estimator: {}'.format(algorithm))

    def _kernel_x(self, X):
        """Return the kernel matrix of ``X``; implemented by subclasses."""
        raise NotImplementedError

    def _kernel_y(self, Y):
        """Return the kernel matrix of ``Y``; implemented by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _zero_diagonal(kernel):
        """Return ``kernel`` with its diagonal entries set to zero.

        ``torch.diag`` of a matrix yields the diagonal as a vector; applying
        it a second time turns that vector back into a diagonal matrix.
        """
        return kernel - torch.diag(torch.diag(kernel))

    def biased_estimator(self, input1, input2):
        """Biased estimator of Hilbert-Schmidt Independence Criterion
        Gretton, Arthur, et al. "Measuring statistical dependence with Hilbert-Schmidt norms." 2005.
        """
        K = self._kernel_x(input1)
        L = self._kernel_y(input2)

        # Centre each kernel by subtracting its column means.
        KH = K - K.mean(0, keepdim=True)
        LH = L - L.mean(0, keepdim=True)

        N = len(input1)

        return torch.trace(KH @ LH / (N - 1) ** 2)

    def unbiased_estimator(self, input1, input2):
        """Unbiased estimator of Hilbert-Schmidt Independence Criterion
        Song, Le, et al. "Feature selection via dependence maximization." 2012.

        The formula divides by ``N * (N - 3)``, so it is only meaningful for
        more than three observations.
        """
        kernel_XX = self._kernel_x(input1)
        kernel_YY = self._kernel_y(input2)

        # \tilde K / \tilde L: the kernels with zeroed diagonals. Subtracting
        # torch.diag(kernel) directly would broadcast the diagonal *vector*
        # over every row and change the off-diagonal entries as well.
        tK = self._zero_diagonal(kernel_XX)
        tL = self._zero_diagonal(kernel_YY)

        N = len(input1)

        hsic = (
            torch.trace(tK @ tL)
            + (torch.sum(tK) * torch.sum(tL) / (N - 1) / (N - 2))
            - (2 * torch.sum(tK, 0).dot(torch.sum(tL, 0)) / (N - 2))
        )

        return hsic / (N * (N - 3))

    def forward(self, input1, input2, **kwargs):
        """Evaluate the estimator selected at construction time."""
        return self.estimator(input1, input2)


class RbfHSIC(HSIC):
    """HSIC estimator that uses a radial basis function (RBF) kernel.

    ``sigma_x`` and ``sigma_y`` are the kernel widths applied to the first
    and second input respectively.
    """
    def _kernel(self, X, sigma):
        """Return the RBF kernel matrix of the rows of ``X`` for width ``sigma``."""
        # One row per observation, all remaining dimensions flattened.
        rows = X.view(len(X), -1)
        gram = rows @ rows.t()
        sq_norms = torch.diag(gram)
        # Pairwise squared distances: -2<a, b> + |a|^2 + |b|^2.
        sq_distances = -2 * gram + sq_norms.unsqueeze(1) + sq_norms.unsqueeze(0)
        gamma = 1 / (2 * sigma ** 2)

        return torch.exp(-gamma * sq_distances)

    def _kernel_x(self, X):
        """Kernel of the first input, using ``sigma_x``."""
        return self._kernel(X, self.sigma_x)

    def _kernel_y(self, Y):
        """Kernel of the second input, using ``sigma_y``."""
        return self._kernel(Y, self.sigma_y)



class LinearHSIC(HSIC):
    """HSIC estimator that uses a linear kernel (the Gram matrix of the input).

    The ``sigma_x`` / ``sigma_y`` values accepted by the base class are not
    used by this kernel.
    """
    def _kernel(self, X):
        """Return the Gram matrix ``X X^T``."""
        gram = X @ X.t()
        return gram

    def _kernel_x(self, X):
        """Kernel of the first input."""
        return self._kernel(X)

    def _kernel_y(self, Y):
        """Kernel of the second input."""
        return self._kernel(Y)

class JaccardHSIC(HSIC):
    """Jaccard kernel HSIC implementation.

    Inputs are binarised (``value > 0``) and the kernel entry for
    observations ``a`` and ``b`` is ``|a AND b| / |a OR b|``, with 0 where
    both observations are all-zero.

    The previous implementation reduced row-wise and returned a vector of
    length N; the HSIC estimators need an N-by-N kernel matrix (they use
    ``@`` and ``torch.trace``), so the kernel is now computed pairwise.
    """

    @staticmethod
    def _as_binary_rows(values):
        """Return ``values`` as a 2-D 0/1 floating-point tensor, one row per observation."""
        if not torch.is_tensor(values):
            values = torch.as_tensor(values)
        # Keep the caller's floating dtype so both kernels of an estimator match.
        if values.is_floating_point():
            out_dtype = values.dtype
        else:
            out_dtype = torch.get_default_dtype()
        if values.dtype != torch.bool:
            values = values > 0
        if values.dim() == 1:
            values = values.view(-1, 1)
        return values.reshape(len(values), -1).to(out_dtype)

    def _kernel(self, X, Y):
        """Return the pairwise Jaccard similarity between rows of ``X`` and rows of ``Y``."""
        X = self._as_binary_rows(X)
        Y = self._as_binary_rows(Y)

        # For 0/1 rows the dot product counts the shared positive features.
        intersection = X @ Y.t()
        union = X.sum(dim=1).unsqueeze(1) + Y.sum(dim=1).unsqueeze(0) - intersection

        # Avoid dividing by zero: pairs with an empty union get similarity 0.
        has_union = union > 0
        safe_union = torch.where(has_union, union, torch.ones_like(union))
        return torch.where(has_union, intersection / safe_union, torch.zeros_like(union))

    def _kernel_x(self, X):
        """Kernel of the first input."""
        return self._kernel(X, X)

    def _kernel_y(self, Y):
        """Kernel of the second input."""
        return self._kernel(Y, Y)


import time

# Initialize HSIC matrix (consider symmetry when deciding size)
#hsic_matrix = clearData(torch.load('rbf_hsic_matrix_full.pt'))
linear_hsic_matrix = clearData(torch.load('linear_hsic_matrix_full.pt'))
# Hyperparameter for RbfHSIC
sigma = 1.0

for i in range(X_encoded.shape[1]):
    X_col_i = X_encoded[:, i].reshape(-1, 1)

    for j in range(i + 1, X_encoded.shape[1]):
        if not ((i>=105 and i<107) or (j>=105 and j<107)):
          continue
        X_col_j = X_encoded[:, j].reshape(-1, 1)


        # Create HSIC estimator with torch (on the GPU)
       # hsic_estimator = RbfHSIC(sigma_x=sigma, sigma_y=sigma).cuda()
        linear_hsic_estimator = LinearHSIC(sigma_x=sigma, sigma_y=sigma).cuda()

         # Random Sampling
        sample_indices = np.random.choice(X_col_i.shape[0], size=28500, replace=False)
        X_col_i_sampled = X_col_i[sample_indices]
        X_col_j_sampled = X_col_j[sample_indices]

        # Convert data to PyTorch tensors (on the GPU)
        X_torch = X_col_i_sampled.cuda()
        Y_torch = X_col_j_sampled.cuda()

        # Compute HSIC on the GPU
        start_time = time.time()
       # hsic_value = hsic_estimator(X_torch, Y_torch)
       # hsic_matrix[i, j] = hsic_value
      #  hsic_matrix[j, i] = hsic_value
        hsic_value = linear_hsic_estimator(X_torch, Y_torch)
        linear_hsic_matrix[i, j] = hsic_value
        linear_hsic_matrix[j, i] = hsic_value

        end_time = time.time()
        processed = end_time-start_time
        print(i, " - ", j, " : ", hsic_value, f" - {processed:4f} s")

#torch.save(hsic_matrix, 'rbf_hsic_matrix_fullv3.pt')
#!cp 'rbf_hsic_matrix_fullv3.pt' /content/drive/MyDrive

torch.save(linear_hsic_matrix, 'linear_hsic_matrix_fullv3.pt')
!cp 'linear_hsic_matrix_fullv3.pt' /content/drive/MyDrive


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
0  -  105  :  tensor(0.0007, device='cuda:0', dtype=torch.float64)  - 2.585455 s
0  -  106  :  tensor(0.0010, device='cuda:0', dtype=torch.float64)  - 2.444153 s
1  -  105  :  tensor(-0.0013, device='cuda:0', dtype=torch.float64)  - 2.444841 s
1  -  106  :  tensor(-0.0008, device='cuda:0', dtype=torch.float64)  - 2.444113 s
2  -  105  :  tensor(-0.0011, device='cuda:0', dtype=torch.float64)  - 2.443949 s
2  -  106  :  tensor(-0.0013, device='cuda:0', dtype=torch.float64)  - 2.444057 s
3  -  105  :  tensor(2.8012e-05, device='cuda:0', dtype=torch.float64)  - 2.444344 s
3  -  106  :  tensor(2.8786e-05, device='cuda:0', dtype=torch.float64)  - 2.443995 s
4  -  105  :  tensor(0.0056, device='cuda:0', dtype=torch.float64)  - 2.568030 s
4  -  106  :  tensor(0.0029, device='cuda:0', dtype=torch.float64)  - 2.444065 s
5  -  105  :  tensor(-0.0061, device='cuda:0', dt

In [None]:
pip install pypair



In [None]:
from pypair.contingency import (
    AgreementMixin,
    BinaryMixin,
    CategoricalMixin,
    ConfusionMixin
)

def get_measures(clazz):
    """
    Lists the public properties declared directly on a class.

    Only the entries of ``clazz.__dict__`` are inspected, so members inherited
    from base classes are not reported.

    :param clazz: Class to inspect.
    :return: Sorted list of names that do not start with an underscore and
        whose value is a ``property``.
    """
    names = []
    for name, member in clazz.__dict__.items():
        if name.startswith('_'):
            continue
        if isinstance(member, property):
            names.append(name)
    names.sort()
    return names
# Print, for every pypair mixin, a heading followed by its public measures.
for label, mixin in (
    ("AgreementMixin: ", AgreementMixin),
    ("BinaryMixin: ", BinaryMixin),
    ("CategoricalMixin: ", CategoricalMixin),
    ("ConfusionMixin: ", ConfusionMixin),
):
    print(label)
    print(get_measures(mixin))

AgreementMixin: 
['chohen_k', 'cohen_light_k']
BinaryMixin: 
['ample', 'anderberg', 'baroni_urbani_buser_i', 'baroni_urbani_buser_ii', 'braun_banquet', 'chisq', 'chord', 'cole_i', 'cole_ii', 'contingency_coefficient', 'cosine', 'cramer_v', 'dennis', 'dice', 'disperson', 'driver_kroeber', 'euclid', 'eyraud', 'fager_mcgowan', 'faith', 'forbes_ii', 'forbesi', 'fossum', 'gilbert_wells', 'goodman_kruskal', 'gower', 'gower_legendre', 'hamann', 'hamming', 'hellinger', 'inner_product', 'intersection', 'jaccard', 'jaccard_3w', 'jaccard_distance', 'johnson', 'kulcyznski_ii', 'kulczynski_i', 'lance_williams', 'mcconnaughey', 'mcnemar_test', 'mean_manhattan', 'michael', 'mountford', 'ochia_i', 'ochia_ii', 'odds_ratio', 'pattern_difference', 'pearson_heron_i', 'pearson_heron_ii', 'pearson_i', 'peirce', 'person_ii', 'roger_tanimoto', 'russel_rao', 'shape_difference', 'simpson', 'size_difference', 'sokal_michener', 'sokal_sneath_i', 'sokal_sneath_ii', 'sokal_sneath_iii', 'sokal_sneath_iv', 'sokal_sne

In [None]:
from abc import ABC
from functools import lru_cache, reduce
from itertools import chain, product
from math import sqrt, log2, pi, log, cos

import pandas as pd
from scipy import stats
from scipy.special import binom

from pypair.decorator import timeit, similarity, distance
from pypair.util import MeasureMixin


class CategoricalMixin(object):
    """
    Categorical computations based off a contingency table.

    Every method reads its inputs from attributes of ``self``: ``_table`` (rows of observed
    cell counts), ``_r`` and ``_k`` (number of rows and columns), ``_r_margs`` and ``_k_margs``
    (row and column marginals) and ``_n`` (sum of all cells). These attributes are not set
    here; the class using this mixin must provide them.
    """

    @staticmethod
    def _entropy(marginals, n):
        """
        Computes the entropy :math:`-\\sum_x P(x) \\log P(x)` (natural logarithm) of marginal counts.

        Marginals equal to zero are skipped, i.e. :math:`0 \\log 0` is treated as 0, instead of
        failing inside ``log``.

        :param marginals: Iterable of marginal counts.
        :param n: Total count used to turn the counts into probabilities.
        :return: Entropy.
        """
        probs = (m / n for m in marginals)
        return -sum(p * log(p) for p in probs if p != 0)

    def chisq(self):
        """
        The `chi-square statistic <https://en.wikipedia.org/wiki/Chi-square_distribution>`_ :math:`\\chi^2`,
        is defined as follows.

        :math:`\\sum_i \\sum_j \\frac{(O_{ij} - E_{ij})^2}{E_{ij}}`

        In a contingency table, :math:`O_{ij}` is the observed cell count corresponding to the :math:`i` row
        and :math:`j` column. :math:`E_{ij}` is the expected cell count corresponding to the :math:`i` row and
        :math:`j` column.

        :math:`E_{ij} = \\frac{N_{i*} N_{*j}}{N}`

        Where :math:`N_{i*}` is the i-th row marginal, :math:`N_{*j}` is the j-th column marginal and
        :math:`N` is the sum of all the values in the contingency cells (or the total size of the data).

        References

        - `Chi-Square Statistic Definition <https://www.investopedia.com/terms/c/chi-square-statistic.asp>`_

        :return: Chi-square statistic.
        :raises ZeroDivisionError: If an expected cell count is 0 (a row or column marginal is 0).
        """
        n = self._n
        row_marginals = self._r_margs
        col_marginals = self._k_margs

        # Expected counts in row-major order, matching the flattened observed table below.
        expected = [
            [row_marginals[i] * col_marginals[j] / n for j in range(self._k)]
            for i in range(self._r)
        ]

        return sum((o - e) ** 2 / e for o, e in zip(chain(*self._table), chain(*expected)))

    def chisq_dof(self):
        """
        Returns the degrees of freedom for :math:`\\chi^2`, which is defined as :math:`(R - 1)(C - 1)`,
        where :math:`R` is the number of rows and :math:`C` is the number of columns in a contingency
        table induced by two categorical variables.

        :return: Degrees of freedom.
        """
        return (self._r - 1) * (self._k - 1)

    def phi(self):
        """
        Gets :math:`\\phi`.

        :math:`\\phi = \\sqrt{\\frac{\\chi^2}{N}}`

        :return: :math:`\\phi`.
        """
        return sqrt(self.chisq() / self._n)

    def uncertainty_coefficient(self):
        """
        The `uncertainty coefficient <https://en.wikipedia.org/wiki/Uncertainty_coefficient>`_ :math:`U(X|Y)`
        for two variables :math:`X` and :math:`Y` is defined as follows.

        :math:`U(X|Y) = \\frac{I(X;Y)}{H(X)}`

        Where,

        - :math:`H(X) = -\\sum_x P(x) \\log P(x)`
        - :math:`I(X;Y) = \\sum_y \\sum_x P(x, y) \\log \\frac{P(x, y)}{P(x) P(y)}`

        :math:`H(X)` is called the entropy of :math:`X` and :math:`I(X;Y)` is the mutual information
        between :math:`X` and :math:`Y`. Note that :math:`I(X;Y) < H(X)` and both values are positive.
        As such, the uncertainty coefficient may be viewed as the normalized mutual information
        between :math:`X` and :math:`Y` and in the range :math:`[0, 1]`.

        In this implementation the entropy in the denominator is computed from the column
        marginals. Column marginals equal to 0 contribute nothing to the entropy.

        :return: Uncertainty coefficient.
        :raises ZeroDivisionError: If the entropy of the column marginals is 0.
        """
        col_marginals = (self._k_margs[j] for j in range(self._k))
        h_b = self._entropy(col_marginals, self._n)

        return self.mutual_information() / h_b

    def uncertainty_coefficient_reversed(self):
        """
        `Uncertainty coefficient <https://en.wikipedia.org/wiki/Uncertainty_coefficient>`_
        with the entropy in the denominator computed from the row marginals instead of the
        column marginals. Row marginals equal to 0 contribute nothing to the entropy.

        :return: Uncertainty coefficient.
        :raises ZeroDivisionError: If the entropy of the row marginals is 0.
        """
        row_marginals = (self._r_margs[i] for i in range(self._r))
        h_a = self._entropy(row_marginals, self._n)

        return self.mutual_information() / h_a

    def mutual_information(self):
        """
        The `mutual information <https://en.wikipedia.org/wiki/Mutual_information>`_ between
        two variables :math:`X` and :math:`Y` is denoted as :math:`I(X;Y)`.  :math:`I(X;Y)` is
        unbounded and in the range :math:`[0, \\infty]`. A higher mutual information
        value implies strong association. The formula for :math:`I(X;Y)` is defined as follows.

        :math:`I(X;Y) = \\sum_y \\sum_x P(x, y) \\log \\frac{P(x, y)}{P(x) P(y)}`

        The natural logarithm is used. Cells whose observed count is 0 are skipped
        (:math:`0 \\log 0` is treated as 0) instead of failing inside ``log``.

        :return: Mutual information.
        """
        n = self._n

        mi = 0
        for i, j in product(range(self._r), range(self._k)):
            p_ab = self._table[i][j] / n
            if p_ab == 0:
                # An empty cell contributes nothing; log(0) would raise ValueError.
                continue
            p_a = self._r_margs[i] / n
            p_b = self._k_margs[j] / n
            mi += p_ab * log(p_ab / p_a / p_b)

        return mi

    def gk_lambda(self):
        """
        Goodman-Kruskal's lambda is the `proportional reduction in error`
        of predicting one variable `b` given another `a`: :math:`\\lambda_{B|A}`.

        - The probability of an error in predicting the column category: :math:`P_e = 1 - \\frac{\\max_{c} N_{* c}}{N}`
        - The probability of an error in predicting the column category given the row category: :math:`P_{e|r} = 1 - \\frac{\\sum_r \\max_{c} N_{r c}}{N}`

        Where,

        - :math:`\\max_{c} N_{* c}` is the maximum of the column marginals
        - :math:`\\sum_r \\max_{c} N_{r c}` is the sum over the maximum value per row
        - :math:`N` is the total

        Thus, :math:`\\lambda_{B|A} = \\frac{P_e - P_{e|r}}{P_e}`.

        The way the contingency table is setup by default is that `a` is on
        the rows and `b` is on the columns. Note that Goodman-Kruskal's lambda
        is not symmetric: :math:`\\lambda_{B|A}` does not necessarily equal
        :math:`\\lambda_{A|B}`. By default, :math:`\\lambda_{B|A}` is computed, but
        if you desire the reverse, use `gk_lambda_reversed()`.

        References

        - `Goodman-Kruskal's lambda <https://en.wikipedia.org/wiki/Goodman_and_Kruskal%27s_lambda>`_.
        - `Correlation <http://cda.psych.uiuc.edu/web_407_spring_2014/correlation_week4.pdf>`_.

        :return: Goodman-Kruskal's lambda.
        :raises ZeroDivisionError: If the largest column marginal equals the total.
        """
        n = self._n

        x = sum(max(self._table[i]) for i in range(self._r))
        y = max(self._k_margs)
        return (x - y) / (n - y)

    def gk_lambda_reversed(self):
        """
        Computes :math:`\\lambda_{A|B}`, i.e. the same quantity as `gk_lambda()` with the
        roles of rows and columns exchanged.

        :return: Goodman-Kruskal's lambda.
        :raises ZeroDivisionError: If the largest row marginal equals the total.
        """
        n = self._n
        r = self._r

        # Sum over the columns of the largest cell in each column.
        x = sum(max(self._table[i][j] for i in range(r)) for j in range(self._k))
        y = max(self._r_margs)
        return (x - y) / (n - y)

    def adjusted_rand_index(self):
        """
        The Adjusted Rand Index (ARI) should yield a value between
        [0, 1], however, negative values can also arise when the index
        is less than the expected value.

        The "choose 2" terms are computed with exact integer arithmetic
        (:math:`\\frac{m(m-1)}{2}`), so large counts do not lose precision or overflow.
        Counts are converted with ``int()`` before use.

        References

        - `Adjusted Rand Index <https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index>`_.

        :return: Adjusted Rand Index.
        :raises ZeroDivisionError: If the total is smaller than 2 or the denominator of the index is 0.
        """

        def pairs(count):
            # Exact number of unordered pairs that can be drawn from `count` items.
            count = int(count)
            return count * (count - 1) // 2

        a_i = sum(pairs(a) for a in self._r_margs)
        b_j = sum(pairs(b) for b in self._k_margs)
        n_ij = sum(pairs(cell) for cell in chain(*self._table))
        n = pairs(self._n)

        expected = (a_i * b_j) / n
        top = n_ij - expected
        bot = 0.5 * (a_i + b_j) - expected
        return top / bot


class BinaryMixin(object):
    """
    Binary computations based off of `a`, `b`, `c` and `d` from a 2x2 contingency table.

    Every method reads the cell counts from ``self._a``, ``self._b``, ``self._c``, ``self._d``
    and the total from ``self._n``. These attributes are not set here; the class using this
    mixin must provide them. Unless stated otherwise the measures do not guard against
    zero denominators, so degenerate tables raise ``ZeroDivisionError`` (or ``ValueError``
    from ``sqrt``/``log``).
    """

    def __abcdn(self):
        """
        Gets a, b, c, d, n.

        :returns: a, b, c, d, n
        """
        return self._a, self._b, self._c, self._d, self._n

    @timeit
    def __sigma(self):
        """
        Gets :math:`\\max(a, b) + \\max(c, d) + \\max(a, c) + \\max(b, d)`.

        :returns: :math:`\\max(a, b) + \\max(c, d) + \\max(a, c) + \\max(b, d)`.
        """
        a, b, c, d, n = self.__abcdn()
        return max(a, b) + max(c, d) + max(a, c) + max(b, d)

    @timeit
    def __sigma_prime(self):
        """
        Gets :math:`\\max(a + c, b + d) + \\max(a + b, c + d)`.

        :return: :math:`\\max(a + c, b + d) + \\max(a + b, c + d)`
        """
        a, b, c, d, n = self.__abcdn()
        return max(a + c, b + d) + max(a + b, c + d)

    @timeit
    @similarity
    def jaccard_3w(self):
        """
        3W-Jaccard

        :math:`\\frac{3a}{3a+b+c}`

        :return: 3W-Jaccard.
        """
        a, b, c, d, n = self.__abcdn()
        return 3 * a / (3 * a + b + c)

    @timeit
    @similarity
    def ample(self):
        """
        Ample

        :math:`\\left|\\frac{a(c+d)}{c(a+b)}\\right|`

        :return: Ample.
        """
        a, b, c, d, n = self.__abcdn()
        return abs((a * (c + d)) / (c * (a + b)))

    @timeit
    @similarity
    def anderberg(self):
        """
        Anderberg

        :math:`\\frac{\\sigma-\\sigma'}{2n}`

        :return: Anderberg.
        """
        *_, n = self.__abcdn()
        return (self.__sigma() - self.__sigma_prime()) / (2 * n)

    @timeit
    @similarity
    def baroni_urbani_buser_i(self):
        """
        Baroni-Urbani-Buser-I

        :math:`\\frac{\\sqrt{ad}+a}{\\sqrt{ad}+a+b+c}`

        :return: Baroni-Urbani-Buser-I.
        """
        a, b, c, d, n = self.__abcdn()
        return (sqrt(a * d) + a) / (sqrt(a * d) + a + b + c)

    @timeit
    @similarity
    def baroni_urbani_buser_ii(self):
        """
        Baroni-Urbani-Buser-II

        :math:`\\frac{\\sqrt{ad}+a-(b+c)}{\\sqrt{ad}+a+b+c}`

        :return: Baroni-Urbani-Buser-II.
        """
        a, b, c, d, n = self.__abcdn()
        return (sqrt(a * d) + a - (b + c)) / (sqrt(a * d) + a + b + c)

    @timeit
    @similarity
    def braun_banquet(self):
        """
        Braun-Banquet

        :math:`\\frac{a}{\\max(a+b,a+c)}`

        :return: Braun-Banquet.
        """
        a, b, c, d, n = self.__abcdn()
        return a / max(a + b, a + c)

    @timeit
    @similarity
    def cole_i(self):
        """
        Cole-I

        :math:`\\frac{2(ad-bc)^2}{(ad-bc)^2-(a+b)(a+c)(b+d)(c+d)}`

        This is the square of
        :math:`\\frac{\\sqrt{2}(ad-bc)}{\\sqrt{(ad-bc)^2-(a+b)(a+c)(b+d)(c+d)}}`.

        :return: Cole-I.
        """
        a, b, c, d, n = self.__abcdn()
        return (2 * (a * d - b * c) ** 2) / ((a * d - b * c) ** 2 - (a + b) * (a + c) * (b + d) * (c + d))

    @timeit
    @similarity
    def cole_ii(self):
        """
        Cole-II

        :math:`\\frac{ad-bc}{\\min((a+b)(a+c),(b+d)(c+d))}`

        :return: Cole-II.
        """
        a, b, c, d, n = self.__abcdn()
        # Previously this body duplicated cole_i; it now follows the formula above.
        return (a * d - b * c) / min((a + b) * (a + c), (b + d) * (c + d))

    @timeit
    @similarity
    def cosine(self):
        """
        Cosine

        :math:`\\frac{a}{(a+b)(a+c)}`

        :return: Cosine.
        """
        a, b, c, d, n = self.__abcdn()
        return a / ((a + b) * (a + c))

    @timeit
    @similarity
    def dennis(self):
        """
        Dennis

        :math:`\\frac{ad-bc}{\\sqrt{n(a+b)(a+c)}}`

        :return: Dennis.
        """
        a, b, c, d, n = self.__abcdn()
        return (a * d - b * c) / sqrt(n * (a + b) * (a + c))

    @timeit
    @similarity
    def dice(self):
        """
        Dice; Czekanowski; Nei-Li

        :math:`\\frac{2a}{2a+b+c}`

        :return: Dice.
        """
        a, b, c, d, n = self.__abcdn()
        return (2 * a) / (2 * a + b + c)

    @timeit
    @similarity
    def disperson(self):
        """
        Disperson

        :math:`\\frac{ad-bc}{(a+b+c+d)^2}`

        :return: Disperson.
        """
        a, b, c, d, n = self.__abcdn()
        return (a * d - b * c) / (a + b + c + d) ** 2

    @timeit
    @similarity
    def driver_kroeber(self):
        """
        Driver-Kroeber

        :math:`\\frac{a}{2}\\left(\\frac{1}{a+b}+\\frac{1}{a+c}\\right)`

        :return: Driver-Kroeber.
        """
        a, b, c, d, n = self.__abcdn()
        return (a / 2) * ((1 / (a + b)) + (1 / (a + c)))

    @timeit
    @similarity
    def eyraud(self):
        """
        Eyraud

        :math:`\\frac{n^2(na-(a+b)(a+c))}{(a+b)(a+c)(b+d)(c+d)}`

        :return: Eyraud.
        """
        a, b, c, d, n = self.__abcdn()
        return (n ** 2 * (n * a - (a + b) * (a + c))) / ((a + b) * (a + c) * (b + d) * (c + d))

    @timeit
    @similarity
    def fager_mcgowan(self):
        """
        Fager-McGowan

        :math:`\\frac{a}{\\sqrt{(a+b)(a+c)}}-\\frac{\\max(a+b,a+c)}{2}`

        :return: Fager-McGowan.
        """
        a, b, c, d, n = self.__abcdn()
        return a / sqrt((a + b) * (a + c)) - max(a + b, a + c) / 2

    @timeit
    @similarity
    def faith(self):
        """
        Faith

        :math:`\\frac{a+0.5d}{a+b+c+d}`

        :return: Faith.
        """
        a, b, c, d, n = self.__abcdn()
        return (a + 0.5 * d) / (a + b + c + d)

    @timeit
    @similarity
    def forbes_ii(self):
        """
        Forbes-II

        :math:`\\frac{na-(a+b)(a+c)}{n \\min(a+b,a+c) - (a+b)(a+c)}`

        :return: Forbes-II.
        """
        a, b, c, d, n = self.__abcdn()
        return (n * a - (a + b) * (a + c)) / (n * min(a + b, a + c) - (a + b) * (a + c))

    @timeit
    @similarity
    def forbesi(self):
        """
        Forbesi

        :math:`\\frac{na}{(a+b)(a+c)}`

        :return: Forbesi.
        """
        a, b, c, d, n = self.__abcdn()
        return (n * a) / ((a + b) * (a + c))

    @timeit
    @similarity
    def fossum(self):
        """
        Fossum

        :math:`\\frac{n(a-0.5)^2}{(a+b)(a+c)}`

        :return: Fossum.
        """
        a, b, c, d, n = self.__abcdn()
        return (n * (a - 0.5) ** 2) / ((a + b) * (a + c))

    @timeit
    @similarity
    def gilbert_wells(self):
        """
        Gilbert-Wells

        :math:`\\log a - \\log n - \\log \\frac{a+b}{n} - \\log \\frac{a+c}{n}`

        The natural logarithm is used.

        :return: Gilbert-Wells.
        """
        a, b, c, d, n = self.__abcdn()
        return log(a) - log(n) - log((a + b) / n) - log((a + c) / n)

    @timeit
    @similarity
    def goodman_kruskal(self):
        """
        Goodman-Kruskal

        :math:`\\frac{\\sigma - \\sigma'}{2n-\\sigma'}`

        :return: Goodman-Kruskal.
        """
        *_, n = self.__abcdn()
        return (self.__sigma() - self.__sigma_prime()) / (2 * n - self.__sigma_prime())

    @timeit
    @similarity
    def gower(self):
        """
        Gower

        :math:`\\frac{a+d}{\\sqrt{(a+b)(a+c)(b+d)(c+d)}}`

        :return: Gower.
        """
        a, b, c, d, n = self.__abcdn()
        return (a + d) / sqrt((a + b) * (a + c) * (b + d) * (c + d))

    @timeit
    @similarity
    def gower_legendre(self):
        """
        Gower-Legendre

        :math:`\\frac{a+d}{a+0.5b+0.5c+d}`

        :return: Gower-Legendre.
        """
        a, b, c, d, n = self.__abcdn()
        return (a + d) / (a + 0.5 * (b + c) + d)

    @timeit
    @similarity
    def hamann(self):
        """
        Hamann.

        :math:`\\frac{(a+d)-(b+c)}{a+b+c+d}`

        :return: Hamann.
        """
        a, b, c, d, n = self.__abcdn()
        return ((a + d) - (b + c)) / (a + b + c + d)

    @timeit
    @similarity
    def inner_product(self):
        """
        Inner-product.

        :math:`a+d`

        :return: Inner-product.
        """
        a, b, c, d, n = self.__abcdn()
        return a + d

    @timeit
    @similarity
    def intersection(self):
        """
        Intersection

        :math:`a`

        :return: Intersection.
        """
        a, *_ = self.__abcdn()
        return a

    @timeit
    @similarity
    def jaccard(self):
        """
        Jaccard

        :math:`\\frac{a}{a+b+c}`

        :return: Jaccard.
        """
        a, b, c, d, n = self.__abcdn()
        return a / (a + b + c)

    @timeit
    @similarity
    def johnson(self):
        """
        Johnson.

        :math:`\\frac{a}{a+b}+\\frac{a}{a+c}`

        :return: Johnson.
        """
        a, b, c, d, n = self.__abcdn()
        return a / (a + b) + a / (a + c)

    @timeit
    @similarity
    def kulczynski_i(self):
        """
        Kulczynski-I

        :math:`\\frac{a}{b+c}`

        :return: Kulczynski-I.
        """
        a, b, c, d, n = self.__abcdn()
        return a / (b + c)

    @timeit
    @similarity
    def kulcyznski_ii(self):
        """
        Kulczynski-II

        :math:`\\frac{0.5a(2a+b+c)}{(a+b)(a+c)}`

        The method name keeps its historical spelling so existing callers continue to work.

        :return: Kulczynski-II.
        """
        a, b, c, d, n = self.__abcdn()
        # a/(a+b) + a/(a+c) == a(2a+b+c) / ((a+b)(a+c)); the two ratios are summed, not multiplied.
        return 0.5 * ((a / (a + b)) + (a / (a + c)))

    @timeit
    @similarity
    def mcconnaughey(self):
        """
        McConnaughey

        :math:`\\frac{a^2 - bc}{(a+b)(a+c)}`

        :return: McConnaughey.
        """
        a, b, c, d, n = self.__abcdn()
        return (a ** 2 - b * c) / ((a + b) * (a + c))

    @timeit
    @similarity
    def michael(self):
        """
        Michael

        :math:`\\frac{4(ad-bc)}{(a+d)^2+(b+c)^2}`

        :return: Michael.
        """
        a, b, c, d, n = self.__abcdn()
        return (4 * (a * d - b * c)) / ((a + d) ** 2 + (b + c) ** 2)

    @timeit
    @similarity
    def mountford(self):
        """
        Mountford

        :math:`\\frac{a}{0.5(ab + ac) + bc}`

        :return: Mountford.
        """
        a, b, c, d, n = self.__abcdn()
        return a / (0.5 * (a * b + a * c) + b * c)

    @timeit
    @similarity
    def ochia_i(self):
        """
        Ochia-I

        Also known as `Fowlkes-Mallows Index <https://en.wikipedia.org/wiki/Fowlkes%E2%80%93Mallows_index>`_.
        This measure is typically used to judge the similarity between two clusters.
        A larger value indicates that the clusters are more similar.

        :math:`\\frac{a}{\\sqrt{(a+b)(a+c)}}`

        :return: Ochia-I.
        """
        a, b, c, d, n = self.__abcdn()
        return sqrt((a / (a + b)) * (a / (a + c)))

    @timeit
    @similarity
    def ochia_ii(self):
        """
        Ochia-II

        :math:`\\frac{ad}{\\sqrt{(a+b)(a+c)(b+d)(c+d)}}`

        :return: Ochia-II.
        """
        a, b, c, d, n = self.__abcdn()
        return (a * d) / sqrt((a + b) * (a + c) * (b + d) * (c + d))

    @timeit
    @similarity
    def pearson_heron_i(self):
        """
        Pearson-Heron-I

        :math:`\\frac{ad-bc}{\\sqrt{(a+b)(a+c)(b+d)(c+d)}}`

        :return: Pearson-Heron-I.
        """
        a, b, c, d, n = self.__abcdn()
        return (a * d - b * c) / sqrt((a + b) * (a + c) * (b + d) * (c + d))

    @timeit
    @similarity
    def pearson_heron_ii(self):
        """
        Pearson-Heron-II

        :math:`\\cos\\left(\\frac{\\pi\\sqrt{bc}}{\\sqrt{ad}+\\sqrt{bc}}\\right)`

        :return: Pearson-Heron-II.
        """
        a, b, c, d, n = self.__abcdn()
        return cos((pi * sqrt(b * c)) / (sqrt(a * d) + sqrt(b * c)))

    @timeit
    @similarity
    def pearson_i(self):
        """
        Pearson-I

        :math:`\\chi^2=\\frac{n(ad-bc)^2}{(a+b)(a+c)(c+d)(b+d)}`

        :return: Pearson-I.
        """
        a, b, c, d, n = self.__abcdn()
        return (n * (a * d - b * c) ** 2) / ((a + b) * (a + c) * (c + d) * (b + d))

    @timeit
    @similarity
    def chisq(self):
        """
        :math:`\\chi^2` (alias for Pearson-I)

        :return: :math:`\\chi^2`.
        """
        # pearson_i is a plain method here, so it has to be called; returning the bound
        # method would break every caller that does arithmetic on the result.
        return self.pearson_i()

    @timeit
    @similarity
    def person_ii(self):
        """
        Pearson-II

        :math:`\\sqrt{\\frac{\\chi^2}{n+\\chi^2}}`

        - :math:`\\chi^2` is the value returned by `chisq()` (Pearson-I).

        The method name keeps its historical spelling so existing callers continue to work.

        :return: Pearson-II.
        """
        *_, n = self.__abcdn()
        chisq = self.chisq()
        return sqrt(chisq / (n + chisq))

    @timeit
    @similarity
    def peirce(self):
        """
        Peirce

        :math:`\\frac{ab+bc}{ab+2bc+cd}`

        :return: Peirce.
        """
        a, b, c, d, n = self.__abcdn()
        return (a * b + b * c) / (a * b + 2 * b * c + c * d)

    @timeit
    @similarity
    def roger_tanimoto(self):
        """
        Roger-Tanimoto

        :math:`\\frac{a+d}{a+2b+2c+d}`

        :return: Roger-Tanimoto.
        """
        a, b, c, d, n = self.__abcdn()
        return (a + d) / (a + 2 * (b + c) + d)

    @timeit
    @similarity
    def russel_rao(self):
        """
        Russel-Rao

        :math:`\\frac{a}{a+b+c+d}`

        :return: Russel-Rao.
        """
        a, b, c, d, n = self.__abcdn()
        return a / (a + b + c + d)

    @timeit
    @similarity
    def simpson(self):
        """
        Simpson (or `Overlap <https://en.wikipedia.org/wiki/Overlap_coefficient>`_).

        :math:`\\frac{a}{\\min(a+b,a+c)}`

        :return: Simpson.
        """
        a, b, c, d, n = self.__abcdn()
        return a / min(a + b, a + c)

    @timeit
    @similarity
    def sokal_michener(self):
        """
        Sokal-Michener

        :math:`\\frac{a+d}{a+b+c+d}`

        :return: Sokal-Michener.
        """
        a, b, c, d, n = self.__abcdn()
        return (a + d) / (a + b + c + d)

    @timeit
    @similarity
    def sokal_sneath_i(self):
        """
        Sokal-Sneath-I

        :math:`\\frac{a}{a+2b+2c}`

        :return: Sokal-Sneath-I.
        """
        a, b, c, d, n = self.__abcdn()
        return a / (a + 2 * (b + c))

    @timeit
    @similarity
    def sokal_sneath_ii(self):
        """
        Sokal-Sneath-II

        :math:`\\frac{2a+2d}{2a+b+c+2d}`

        :return: Sokal-Sneath-II.
        """
        a, b, c, d, n = self.__abcdn()
        return 2 * (a + d) / (2 * (a + d) + b + c)

    @timeit
    @similarity
    def sokal_sneath_iii(self):
        """
        Sokal-Sneath-III

        :math:`\\frac{a+d}{b+c}`

        :return: Sokal-Sneath-III.
        """
        a, b, c, d, n = self.__abcdn()
        return (a + d) / (b + c)

    @timeit
    @similarity
    def sokal_sneath_iv(self):
        """
        Sokal-Sneath-IV

        :math:`\\frac{1}{4}\\left(\\frac{a}{a+b}+\\frac{a}{a+c}+\\frac{d}{b+d}+\\frac{d}{b+d}\\right)`

        :return: Sokal-Sneath-IV.
        """
        a, b, c, d, n = self.__abcdn()
        # NOTE(review): the term d/(b+d) appears twice; the last one may have been meant
        # to be d/(c+d). Left unchanged -- confirm against the intended definition.
        return 0.25 * ((a / (a + b)) + (a / (a + c)) + (d / (b + d)) + (d / (b + d)))

    @timeit
    @similarity
    def sokal_sneath_v(self):
        """
        Sokal-Sneath-V

        :math:`\\frac{ad}{(a+b)(a+c)(b+d)\\sqrt{c+d}}`

        :return: Sokal-Sneath-V.
        """
        a, b, c, d, n = self.__abcdn()
        return (a * d) / ((a + b) * (a + c) * (b + d) * sqrt(c + d))

    @timeit
    @similarity
    def sorensen_dice(self):
        """
        Sørensen–Dice

        :math:`\\frac{2(a + d)}{2(a + d) + b + c}`

        :return: Sørensen–Dice,
        """
        a, b, c, d, n = self.__abcdn()
        return 2 * (a + d) / (2 * (a + d) + b + c)

    @timeit
    @similarity
    def sorgenfrei(self):
        """
        Sorgenfrei

        :math:`\\frac{a^2}{(a+b)(a+c)}`

        :return: Sorgenfrei.
        """
        a, b, c, d, n = self.__abcdn()
        return a ** 2 / ((a + b) * (a + c))

    @timeit
    @similarity
    def stiles(self):
        """
        Stiles

        :math:`\\log_{10} \\frac{n\\left(|ad-bc|-\\frac{n}{2}\\right)^2}{(a+b)(a+c)(b+d)(c+d)}`

        :return: Stiles.
        """
        a, b, c, d, n = self.__abcdn()
        # The correction term is n/2 as in the formula above (it used to be the constant 0.5).
        return log((n * (abs(a * d - b * c) - n / 2) ** 2) / ((a + b) * (a + c) * (b + d) * (c + d)), 10)

    @timeit
    @similarity
    def tanimoto_i(self):
        """
        Tanimoto-I

        :math:`\\frac{a}{2a+b+c}`

        :return: Tanimoto-I.
        """
        a, b, c, d, n = self.__abcdn()
        return a / (2 * a + b + c)

    @timeit
    @similarity
    def tanimoto_ii(self):
        """
        Tanimoto-II

        :math:`\\frac{a}{b + c}`

        :return: Tanimoto-II.
        """
        a, b, c, d, n = self.__abcdn()
        return a / (b + c)

    @timeit
    @similarity
    def tarwid(self):
        """
        Tarwid

        :math:`\\frac{na - (a+b)(a+c)}{na + (a+b)(a+c)}`

        :return: Tarwid.
        """
        a, b, c, d, n = self.__abcdn()
        return (n * a - (a + b) * (a + c)) / (n * a + (a + b) * (a + c))

    @timeit
    @similarity
    def tarantula(self):
        """
        Tarantula

        :math:`\\frac{a(c+d)}{c(a+b)}`

        :return: Tarantula.
        """
        a, b, c, d, n = self.__abcdn()
        return a * (c + d) / (c * (a + b))

    @timeit
    @similarity
    def yule_q(self):
        """
        Yule's Q

        :math:`\\frac{ad-bc}{ad+bc}`

        Also, Yule's Q is based off of the odds ratio or cross-product ratio, :math:`\\alpha`.

        :math:`Q = \\frac{\\alpha - 1}{\\alpha + 1}`

        Yule's Q is the same as Goodman-Kruskal's :math:`\\lambda` for 2 x 2 contingency tables and is also
        a measure of proportional reduction in error (PRE).

        :return: Yule's Q.
        """
        a, b, c, d, n = self.__abcdn()
        return (a * d - b * c) / (a * d + b * c)

    @timeit
    @similarity
    def yule_w(self):
        """
        Yule's w

        :math:`\\frac{\\sqrt{ad}-\\sqrt{bc}}{\\sqrt{ad}+\\sqrt{bc}}`

        :return: Yule's w.
        """
        a, b, c, d, n = self.__abcdn()
        return (sqrt(a * d) - sqrt(b * c)) / (sqrt(a * d) + sqrt(b * c))

    @timeit
    @distance
    def chord(self):
        """
        Chord

        :math:`\\sqrt{2\\left(1 - \\frac{a}{\\sqrt{(a+b)(a+c)}}\\right)}`

        :return: Chord (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return sqrt(2 * (1 - a / sqrt((a + b) * (a + c))))

    @timeit
    @distance
    def euclid(self):
        """
        Euclid

        :math:`\\sqrt{b+c}`

        :return: Euclid (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return sqrt(b + c)

    @timeit
    @distance
    def hamming(self):
        """
        Hamming; Canberra; Manhattan; Cityblock; Minkowski

        :math:`b+c`

        :return: Hamming (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return b + c

    @timeit
    @distance
    def hellinger(self):
        """
        Hellinger

        :math:`2\\sqrt{1 - \\frac{a}{\\sqrt{(a+b)(a+c)}}}`

        :return: Hellinger (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return 2 * sqrt(1 - a / sqrt((a + b) * (a + c)))

    @timeit
    @distance
    def jaccard_distance(self):
        """
        Jaccard

        :math:`\\frac{b + c}{a + b + c}`

        :return: Jaccard (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return (b + c) / (a + b + c)

    @timeit
    @distance
    def lance_williams(self):
        """
        Lance-Williams; Bray-Curtis

        :math:`\\frac{b+c}{2a+b+c}`

        :return: Lance-Williams (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return (b + c) / (2 * a + b + c)

    @timeit
    @distance
    def mean_manhattan(self):
        """
        Mean-Manhattan

        :math:`\\frac{b+c}{a+b+c+d}`

        :return: Mean-Manhattan (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return (b + c) / (a + b + c + d)

    @timeit
    @distance
    def pattern_difference(self):
        """
        Pattern difference

        :math:`\\frac{4bc}{(a+b+c+d)^2}`

        :return: Pattern difference (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return (4 * b * c) / (a + b + c + d) ** 2

    @timeit
    @distance
    def shape_difference(self):
        """
        Shape difference

        :math:`\\frac{n(b+c)-(b-c)^2}{(a+b+c+d)^2}`

        :return: Shape difference (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return (n * (b + c) - (b - c) ** 2) / (a + b + c + d) ** 2

    @timeit
    @distance
    def size_difference(self):
        """
        Size difference

        :math:`\\frac{(b+c)^2}{(a+b+c+d)^2}`

        :return: Size difference (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return (b + c) ** 2 / (a + b + c + d) ** 2

    @timeit
    @distance
    def vari(self):
        """
        Vari

        :math:`\\frac{b+c}{4a+4b+4c+4d}`

        :return: Vari (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return (b + c) / (4 * (a + b + c + d))

    @timeit
    @distance
    def yule_q_difference(self):
        """
        Yule's q

        :math:`\\frac{2bc}{ad+bc}`

        :return: Yule's q (distance).
        """
        a, b, c, d, n = self.__abcdn()
        return 2 * b * c / (a * d + b * c)

    @timeit
    @distance
    def tanimoto_distance(self):
        """
        `Tanimoto similarity and distance <https://en.wikipedia.org/wiki/Jaccard_index#Tanimoto_similarity_and_distance>`_.

        Computed as :math:`-\\log_2` of the value returned by `roger_tanimoto()`.

        :return: Tanimoto distance.
        """
        return -log2(self.roger_tanimoto())

    @timeit
    def cramer_v(self):
        """
        `Cramer's V <https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V>`_.

        Computed as :math:`\\sqrt{\\frac{\\chi^2}{n}}`.

        :return: Cramer's V.
        """
        *_, n = self.__abcdn()
        return sqrt(self.chisq() / n)

    @timeit
    def contingency_coefficient(self):
        """
        `Contingency coefficient <https://en.wikipedia.org/wiki/Contingency_table#Cram%C3%A9r's_V_and_the_contingency_coefficient_C>`_.

        Computed as :math:`\\sqrt{\\frac{\\chi^2}{n+\\chi^2}}`.

        :return: Contingency coefficient.
        """
        *_, n = self.__abcdn()
        chisq = self.chisq()
        return sqrt(chisq / (n + chisq))

    @timeit
    def tschuprow_t(self):
        """
        `Tschuprow's T <https://en.wikipedia.org/wiki/Tschuprow%27s_T>`_.

        :math:`T = \\sqrt{\\frac{\\chi^2 / n}{\\sqrt{(R - 1)(C - 1)}}}`

        :return: Tschuprow's T.
        """
        *_, n = self.__abcdn()
        # For a 2 x 2 table sqrt((R - 1)(C - 1)) = 1, so T reduces to sqrt(chi^2 / n).
        return sqrt(self.chisq() / n)

    @timeit
    def mcnemar_test(self):
        """
        `McNemar's test <https://en.wikipedia.org/wiki/McNemar%27s_test>`_.

        The statistic is :math:`\\frac{(b-c)^2}{b+c}`; the p-value is taken from a
        chi-square distribution with 1 degree of freedom.

        :return: A tuple. First element is chi-square test statistics. Second element is p-value.
        """
        a, b, c, d, n = self.__abcdn()
        chisq = (b - c) ** 2 / (b + c)
        p = 1 - stats.chi2.cdf(chisq, 1)
        return chisq, p

    @timeit
    def odds_ratio(self):
        """
        `Odds ratio <https://en.wikipedia.org/wiki/Contingency_table#Odds_ratio>`_. The odds
        ratio is also referred to as the `cross-product ratio`.

        :return: Odds ratio.
        """
        a, b, c, d, n = self.__abcdn()

        p_11 = a / n
        p_10 = b / n
        p_01 = c / n
        p_00 = d / n

        return (p_11 * p_00) / (p_10 * p_01)

    @timeit
    def yule_y(self):
        """
        Yule's Y is based off of the odds ratio or cross-product ratio, :math:`\\alpha`.

        :math:`Y = \\frac{\\sqrt\\alpha - 1}{\\sqrt\\alpha + 1}`

        :return: Yule's Y.
        """
        root_alpha = sqrt(self.odds_ratio())
        return (root_alpha - 1) / (root_alpha + 1)

    @timeit
    def tetrachoric(self):
        """
        Tetrachoric correlation ranges from :math:`[-1, 1]`, where 0 indicates no agreement,
        1 indicates perfect agreement and -1 indicates perfect disagreement.

        - if :math:`b=0` or :math:`c=0`, 1.0
        - if :math:`a=0` or :math:`d=0`, -1.0
        - else, :math:`\\frac{y-1}{y+1}, y={\\left(\\frac{da}{bc}\\right)}^{\\frac{\\pi}{4}}`

        The conditions are checked in the order listed, so a table matching both of the
        first two yields 1.0.

        References

        - `Tetrachoric correlation <https://www.andrews.edu/~calkins/math/edrm611/edrm13.htm#TETRA>`_.
        - `Tetrachoric Correlation: Definition, Examples, Formula <https://www.statisticshowto.com/tetrachoric-correlation/>`_.
        - `Tetrachoric Correlation Estimation <https://www.real-statistics.com/correlation/polychoric-correlation/tetrachoric-correlation-estimation/>`_.

        :return: Tetrachoric correlation.
        """
        a, b, c, d, n = self.__abcdn()

        if b == 0 or c == 0:
            return 1.0
        if d == 0 or a == 0:
            return -1.0

        y = pow((d * a) / (b * c), pi / 4.0)
        return (y - 1) / (y + 1)

    @timeit
    @similarity
    def tversky_index(self, theta=1, phi=0):
        """
        Compute's Tversky's Index.

        :math:`\\frac{a}{a+\\theta b+\\phi c}`

        :math:`\\theta` and :math:`\\phi` are typically between :math:`[0,1]`
        and :math:`\\theta + \\phi = 1`.

        :param theta: Weight :math:`[0,1]` of how important match on row variable is. Default 1.
        :param phi: Weight :math:`[0,1]` of how important match on column variable is. Default 0.
        :return: Tversky's Index.
        """
        # Only a, b and c are needed; the helper is __abcdn (there is no __abcd attribute).
        a, b, c, *_ = self.__abcdn()
        return a / (a + theta * b + phi * c)


class ConfusionMixin(object):
    """
    Confusion matrix computations.

    The host class must provide the attributes ``_tp``, ``_fn``, ``_fp``, ``_tn`` and ``_n``.
    None of the ratios guards against a zero denominator.
    """

    def tp(self):
        """
        TP

        :return: TP.
        """
        return self._tp

    def fn(self):
        """
        FN

        :return: FN.
        """
        return self._fn

    def fp(self):
        """
        FP

        :return: FP.
        """
        return self._fp

    def tn(self):
        """
        TN

        :return: TN.
        """
        return self._tn

    def n(self):
        """
        :math:`N = TP + FN + FP + TN`

        :return: N.
        """
        return self._n

    def __counts(self):
        """
        Returns TP, FN, FP, TN, N.

        :return: TP, FN, FP, TN, N.
        """
        return self._tp, self._fn, self._fp, self._tn, self._n

    def tpr(self):
        """
        True positive rate.

        :math:`TPR = \\frac{TP}{TP + FN}`

        Aliases

        - sensitivity
        - recall
        - hit rate
        - power
        - probability of detection

        :return: TPR.
        """
        tp, fn, _, _, _ = self.__counts()
        return tp / (tp + fn)

    def tnr(self):
        """
        True negative rate.

        :math:`TNR = \\frac{TN}{TN + FP}`

        Aliases

        - specificity
        - selectivity

        :return: TNR.
        """
        _, _, fp, tn, _ = self.__counts()
        return tn / (tn + fp)

    def ppv(self):
        """
        Positive predictive value.

        :math:`PPV = \\frac{TP}{TP + FP}`

        Aliases

        - precision

        :return: PPV.
        """
        tp, _, fp, _, _ = self.__counts()
        return tp / (tp + fp)

    def npv(self):
        """
        Negative predictive value.

        :math:`NPV = \\frac{TN}{TN + FN}`

        :return: NPV.
        """
        _, fn, _, tn, _ = self.__counts()
        return tn / (tn + fn)

    def fnr(self):
        """
        False negative rate.

        :math:`FNR = \\frac{FN}{FN + TP}`

        Aliases

        - miss rate

        :return: FNR.
        """
        tp, fn, _, _, _ = self.__counts()
        return fn / (fn + tp)

    def fpr(self):
        """
        False positive rate.

        :math:`FPR = \\frac{FP}{FP + TN}`

        Aliases

        - fall-out
        - probability of false alarm

        :return: FPR.
        """
        _, _, fp, tn, _ = self.__counts()
        return fp / (fp + tn)

    def fdr(self):
        """
        False discovery rate.

        :math:`FDR = \\frac{FP}{FP + TP}`

        :return: FDR.
        """
        tp, _, fp, _, _ = self.__counts()
        return fp / (fp + tp)

    def fomr(self):
        """
        False omission rate.

        :math:`FOR = \\frac{FN}{FN + TN}`

        :return: FOR.
        """
        _, fn, _, tn, _ = self.__counts()
        return fn / (fn + tn)

    def pt(self):
        """
        Prevalence threshold.

        :math:`PT = \\frac{\\sqrt{TPR(-TNR + 1)} + TNR - 1}{TPR + TNR - 1}`

        :return: Prevalence threshold.
        """
        tpr = self.tpr()
        tnr = self.tnr()

        return (sqrt(tpr * (-tnr + 1)) + tnr - 1) / (tpr + tnr - 1)

    def ts(self):
        """
        Threat score.

        :math:`TS = \\frac{TP}{TP + FN + FP}`

        Aliases

        - critical success index (CSI).

        :return: TS.
        """
        tp, fn, fp, _, _ = self.__counts()
        return tp / (tp + fn + fp)

    def acc(self):
        """
        Accuracy.

        :math:`ACC = \\frac{TP + TN}{TP + TN + FP + FN}`

        :return: Accuracy.
        """
        tp, fn, fp, tn, _ = self.__counts()
        return (tp + tn) / (tp + tn + fp + fn)

    def ba(self):
        """
        Balanced accuracy.

        :math:`BA = \\frac{TPR + TNR}{2}`

        :return: Balanced accuracy.
        """
        return (self.tpr() + self.tnr()) / 2

    def f1(self):
        """
        F1 score: harmonic mean of precision and sensitivity.

        :math:`F1 = \\frac{2 \\times PPV \\times TPR}{PPV + TPR}`

        :return: F1.
        """
        ppv = self.ppv()
        tpr = self.tpr()
        return 2 * (ppv * tpr) / (ppv + tpr)

    def mcc(self):
        """
        Matthew's correlation coefficient.

        :math:`MCC = \\frac{TP \\times TN - FP \\times FN}{\\sqrt{(TP + FP)(TP + FN)(TN + FP)(TN + FN)}}`

        :return: MCC.
        """
        tp, fn, fp, tn, _ = self.__counts()

        # The numerator is the determinant TP*TN - FP*FN (a product, not a sum):
        # only then does a perfect classifier score 1 and a fully inverted one -1.
        return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    def bm(self):
        """
        Bookmaker informedness.

        :math:`BI = TPR + TNR - 1`

        :return: BM.
        """
        return self.tpr() + self.tnr() - 1

    def mk(self):
        """
        Markedness.

        :math:`MK = PPV + NPV - 1`

        Aliases

        - deltaP

        :return: Markedness.
        """
        return self.ppv() + self.npv() - 1

    def sensitivity(self):
        """
        Alias to TPR.

        :return: Sensitivity.
        """
        return self.tpr()

    def specificity(self):
        """
        Alias to TNR.

        :return: Specificity.
        """
        return self.tnr()

    def precision(self):
        """
        Alias to PPV.

        :return: PPV.
        """
        return self.ppv()

    def recall(self):
        """
        Alias to TPR.

        :return: TPR.
        """
        return self.tpr()

    def prevalence(self):
        """
        Prevalence.

        :math:`\\frac{TP + FN}{N}`

        :return: Prevalence.
        """
        tp, fn, _, _, n = self.__counts()
        return (tp + fn) / n

    def plr(self):
        """
        Positive likelihood ratio.

        :math:`PLR = \\frac{TPR}{FPR}`

        Aliases

        - LR+

        :return: PLR.
        """
        return self.tpr() / self.fpr()

    def nlr(self):
        """
        Negative likelihood ratio.

        :math:`NLR = \\frac{FNR}{TNR}`

        Aliases

        - LR-

        :return: NLR.
        """
        return self.fnr() / self.tnr()

    def dor(self):
        """
        Diagnostic odds ratio.

        :math:`\\frac{PLR}{NLR}`

        :return: DOR.
        """
        return self.plr() / self.nlr()


class AgreementMixin(object):
    """
    Agreement computations.

    The host class must provide ``_table`` (a square table), the row and column marginals
    ``_r_margs`` and ``_k_margs``, the number of rows ``_r`` and the grand total ``_n``.
    The table may hold raw counts or proportions: every quantity is normalised by ``_n``.
    """

    def __total(self):
        """
        Grand total used to turn cell values and marginals into proportions.

        :return: ``_n``, or 1 when ``_n`` is zero so that an all-zero table does not divide by zero.
        """
        total = self._n
        return total if total else 1

    def chohen_k(self):
        """
        Computes Cohen's :math:`\\kappa`.

        - :math:`\\kappa = \\frac{\\theta_1 - \\theta_2}{1 - \\theta_2}`
        - :math:`\\theta_1 = \\sum_i p_{ii}`
        - :math:`\\theta_2 = \\sum_i p_{i+}p_{+i}`

        :return: :math:`\\kappa`.
        """
        n = self.__total()
        # Cells scale with n and products of two marginals with n^2; dividing them out
        # yields the proportions the formula is defined on, whether the table stores
        # counts or already-normalised values.
        theta_1 = sum(self._table[i][i] for i in range(self._r)) / n
        theta_2 = sum(self._r_margs[i] * self._k_margs[i] for i in range(self._r)) / (n * n)
        return (theta_1 - theta_2) / (1 - theta_2)

    def cohen_light_k(self):
        """
        Cohen-Light :math:`\\kappa`. :math:`\\kappa` is a measure of conditional agreement.
        Several :math:`\\kappa`, one for each unique value, will be computed and returned.

        - :math:`\\kappa = \\frac{\\theta_1 - \\theta_2}{1 - \\theta_2}`
        - :math:`\\theta_1 = \\frac{p_{ii}}{p_{i+}}`
        - :math:`\\theta_2 = p_{+i}`

        :return: A list of :math:`\\kappa`.
        """
        n = self.__total()
        kappas = []
        for i in range(self._r):
            # A ratio of a cell to its row marginal is scale free; only the column
            # marginal needs to be converted into a proportion.
            theta_1 = self._table[i][i] / self._r_margs[i]
            theta_2 = self._k_margs[i] / n
            kappas.append((theta_1 - theta_2) / (1 - theta_2))
        return kappas


class ContingencyTable(MeasureMixin, ABC):
    """
    Abstract contingency table. All other tables inherit from this one.
    """

    def __init__(self, table):
        """
        ctor. Stores the table together with its row marginals (``_r_margs``), column
        marginals (``_k_margs``), grand total (``_n``) and dimensions (``_r`` rows, ``_k`` columns).

        :param table: A table of counts (list of lists).
        """
        n_rows = len(table)
        n_cols = len(table[0])

        self._r_margs = [sum(table[r]) for r in range(n_rows)]
        self._k_margs = [sum(table[r][c] for r in range(n_rows)) for c in range(n_cols)]
        self._n = sum(self._r_margs)
        self._r = len(self._r_margs)
        self._k = len(self._k_margs)
        self._table = table

    @staticmethod
    def _to_binary_counts(a, b, a_0=0, a_1=1, b_0=0, b_1=1):
        """
        Tallies the pairs of two binary variables into the four cells of a 2x2 table.

        Pairs in which either value is ``None`` are skipped. A pair matching none of the
        first three cells is counted in the last cell. Every cell starts at 1 (add-one
        pseudo-count), so the result is ``(1, 1, 1, 1)`` when there is no valid pair.

        :param a: Iterable of values.
        :param b: Iterable of values, paired with `a` by position.
        :param a_0: The zero value for a. Defaults to 0.
        :param a_1: The one value for a. Defaults to 1.
        :param b_0: The zero value for b. Defaults to 0.
        :param b_1: The one value for b. Defaults to 1.
        :return: Tuple of counts for (a=1, b=1), (a=1, b=0), (a=0, b=1) and everything else.
        """
        # Starting every cell at 1 keeps all cells strictly positive; a plain loop
        # (instead of reduce without an initial value) also copes with empty input.
        n_11 = n_10 = n_01 = n_00 = 1

        for x, y in zip(a, b):
            if x is None or y is None:
                continue
            if x == a_1 and y == b_1:
                n_11 += 1
            elif x == a_1 and y == b_0:
                n_10 += 1
            elif x == a_0 and y == b_1:
                n_01 += 1
            else:
                n_00 += 1

        return n_11, n_10, n_01, n_00

    @staticmethod
    def _to_categorical_counts(a, b, a_vals=None, b_vals=None):
        """
        Cross-tabulates two categorical variables.

        :param a: Iterable of values; becomes the rows.
        :param b: Iterable of values; becomes the columns.
        :param a_vals: Row values (and their order) to report. Defaults to `None`, meaning
            the values observed in `a`. Listed values that never occur get a row of zeros.
        :param b_vals: Column values (and their order) to report. Defaults to `None`, meaning
            the values observed in `b`. Listed values that never occur get a column of zeros.
        :return: Table of counts (list of lists).
        """
        df = pd.DataFrame({'a': a, 'b': b})
        contingency_table = pd.crosstab(df['a'], df['b'])

        # Honour the explicit value lists so that categories which never occur still
        # appear in the table (with zero counts) and rows/columns follow the given order.
        if a_vals is not None:
            contingency_table = contingency_table.reindex(index=list(a_vals), fill_value=0)
        if b_vals is not None:
            contingency_table = contingency_table.reindex(columns=list(b_vals), fill_value=0)

        return contingency_table.values.tolist()


class CategoricalTable(CategoricalMixin, ContingencyTable):
    """
    Contingency table built from two paired categorical variables.

    References

    - `Contingency table <https://en.wikipedia.org/wiki/Contingency_table>`_
    - `More Correlation Coefficients <https://www.andrews.edu/~calkins/math/edrm611/edrm13.htm#TETRA>`_
    """

    def __init__(self, a, b, a_vals=None, b_vals=None):
        """
        ctor. Cross-tabulates `a` against `b` and initialises the table from the result.
        `a_vals` and `b_vals` are handed to ``ContingencyTable._to_categorical_counts``.

        :param a: Iterable list.
        :param b: Iterable list.
        :param a_vals: All possible values in a. Defaults to `None`.
        :param b_vals: All possible values in b. Defaults to `None`.
        """
        super().__init__(
            ContingencyTable._to_categorical_counts(a, b, a_vals=a_vals, b_vals=b_vals)
        )


class BinaryTable(CategoricalMixin, BinaryMixin, ContingencyTable):
    """
    Contingency table (2x2) built from two paired binary variables.
    """

    def __init__(self, a, b, a_0=0, a_1=1, b_0=0, b_1=1):
        """
        ctor. Stores the four cell counts as ``_a`` (a=1, b=1), ``_b`` (a=1, b=0),
        ``_c`` (a=0, b=1) and ``_d`` (remaining pairs), and ``n`` as ``len(a)``.

        :param a: Iterable list.
        :param b: Iterable list.
        :param a_0: The zero value for a. Defaults to 0.
        :param a_1: The one value for a. Defaults to 1.
        :param b_0: The zero value for b. Defaults to 0.
        :param b_1: The one value for b. Defaults to 1.
        """
        # Raw length of the first input; taken before any pair is filtered out.
        self.n = len(a)
        cells = ContingencyTable._to_binary_counts(a, b, a_0=a_0, a_1=a_1, b_0=b_0, b_1=b_1)
        n_11, n_10, n_01, n_00 = cells
        super().__init__([[n_11, n_10], [n_01, n_00]])
        self._a, self._b, self._c, self._d = n_11, n_10, n_01, n_00


class ConfusionMatrix(ConfusionMixin, ContingencyTable):
    """
    Represents a `confusion matrix <https://en.wikipedia.org/wiki/Confusion_matrix>`_ for two
    binary variables `a` (ground truth, in the rows) and `b` (prediction, in the columns).
    Most of the statistics around performance come from the counts of `TP`, `FN`, `FP` and `TN`.
    The table is stored in the layout shown below.

    .. list-table:: Confusion Matrix
       :widths: 25 25 25

       * -
         - b=1
         - b=0
       * - a=1
         - TP
         - FN
       * - a=0
         - FP
         - TN
    """

    def __init__(self, a, b, a_0=0, a_1=1, b_0=0, b_1=1):
        """
        ctor. Note that `a` is the ground truth and `b` is the prediction.

        :param a: Binary variable (iterable). Ground truth.
        :param b: Binary variable (iterable). Prediction.
        :param a_0: The zero value for a. Defaults to 0.
        :param a_1: The one value for a. Defaults to 1.
        :param b_0: The zero value for b. Defaults to 0.
        :param b_1: The one value for b. Defaults to 1.
        """
        cells = ContingencyTable._to_binary_counts(a, b, a_0=a_0, a_1=a_1, b_0=b_0, b_1=b_1)
        # Cell order returned by the helper: (a=1, b=1), (a=1, b=0), (a=0, b=1), rest.
        tp, fn, fp, tn = cells
        super().__init__([[tp, fn], [fp, tn]])
        self._tp, self._fn, self._fp, self._tn = tp, fn, fp, tn

class AgreementTable(AgreementMixin, ContingencyTable):
    """
    Contingency table for agreement data on a single variable. The variable is typically a
    rating (e.g. dislike, neutral, like) and the data pairs up two ratings of the same items.
    The induced table must be square: construction fails when the number of rows and columns
    differ.
    """

    def __init__(self, a, b, a_vals=None, b_vals=None):
        """
        ctor.

        :param a: Categorical variable.
        :param b: Categorical variable.
        :param a_vals: Values in `a`; handed to ``ContingencyTable._to_categorical_counts``. Default `None`.
        :param b_vals: Values in `b`; handed to ``ContingencyTable._to_categorical_counts``. Default `None`.
        :raises ValueError: If the cross-tabulation is not square.
        """
        super().__init__(
            ContingencyTable._to_categorical_counts(a, b, a_vals=a_vals, b_vals=b_vals)
        )

        if self._r == self._k:
            return
        raise ValueError(f'Table not symmetric: rows={self._r}, cols={self._k}')


class CategoricalStats(CategoricalMixin, ContingencyTable):
    """
    Categorical statistics computed from a ready-made contingency table.
    """

    def __init__(self, table):
        """
        ctor. Passes the table straight to the parent initialiser.

        :param table: Contingency table.
        """
        super(CategoricalStats, self).__init__(table)


class BinaryStats(CategoricalMixin, BinaryMixin, ContingencyTable):
    """
    Binary statistics computed from a ready-made 2x2 contingency table.
    """

    def __init__(self, table):
        """
        ctor. The cells are read as ``[[a, b], [c, d]]``.

        :param table: Contingency table.
        """
        super().__init__(table)
        top_row = table[0]
        self._a, self._b = top_row[0], top_row[1]
        bottom_row = table[1]
        self._c, self._d = bottom_row[0], bottom_row[1]


class ConfusionStats(ConfusionMixin, ContingencyTable):
    """
    Confusion matrix statistics computed from a ready-made 2x2 table.
    """

    def __init__(self, table):
        """
        ctor. The cells are read as ``[[TP, FN], [FP, TN]]``.

        :param table: Contingency table.
        """
        super().__init__(table)
        top_row = table[0]
        self._tp, self._fn = top_row[0], top_row[1]
        bottom_row = table[1]
        self._fp, self._tn = bottom_row[0], bottom_row[1]


class AgreementStats(AgreementMixin, ContingencyTable):
    """
    Agreement statistics computed from a ready-made square table.
    """

    def __init__(self, table):
        """
        ctor.

        :param table: Contingency table.
        :raises ValueError: If the number of rows and columns differ.
        """
        super().__init__(table)
        if self._r == self._k:
            return
        raise ValueError(f'Table not symmetric: rows={self._r}, cols={self._k}')

In [None]:
from tempfile import tempdir
import time
def extract_measures_to_matrix(matrix, measure_class):
    """
    Computes pairwise association measures between encoded feature columns and stores
    them in `matrix`.

    Columns are read from the module-level ``X_encoded`` as ``X_encoded[:, i]``. Only pairs
    in which at least one feature has index 105 or 106 are processed. For each such pair a
    table is built and every measure of the family is called by name on it; each value is
    written to both ``matrix[i][j]`` and ``matrix[j][i]`` under the measure name, and a
    progress line with the elapsed time is printed. A measure that raises is reported on
    stdout and skipped.

    Args:
        matrix: Square, indexable container of dicts (one per feature pair). Updated in place.
        measure_class: A measure mixin (ConfusionMixin, AgreementMixin, BinaryMixin; anything
            else is treated as categorical). Selects the table type and the measure names.
    Returns:
        tuple:
            - matrix: The same `matrix` object that was passed in.
            - feature_names (list): Feature names in column order.
    """
    feature_names = [
        'workclass_?', 'workclass_Federal-gov', 'workclass_Local-gov',
        'workclass_Never-worked', 'workclass_Private', 'workclass_Self-emp-inc',
        'workclass_Self-emp-not-inc', 'workclass_State-gov',
        'workclass_Without-pay', 'workclass_nan', 'education_10th', 'education_11th',
        'education_12th', 'education_1st-4th', 'education_5th-6th',
        'education_7th-8th', 'education_9th', 'education_Assoc-acdm',
        'education_Assoc-voc', 'education_Bachelors', 'education_Doctorate',
        'education_HS-grad', 'education_Masters', 'education_Preschool',
        'education_Prof-school', 'education_Some-college',
        'marital-status_Divorced', 'marital-status_Married-AF-spouse',
        'marital-status_Married-civ-spouse',
        'marital-status_Married-spouse-absent', 'marital-status_Never-married',
        'marital-status_Separated', 'marital-status_Widowed', 'occupation_?',
        'occupation_Adm-clerical', 'occupation_Armed-Forces',
        'occupation_Craft-repair', 'occupation_Exec-managerial',
        'occupation_Farming-fishing', 'occupation_Handlers-cleaners',
        'occupation_Machine-op-inspct', 'occupation_Other-service',
        'occupation_Priv-house-serv', 'occupation_Prof-specialty',
        'occupation_Protective-serv', 'occupation_Sales', 'occupation_Tech-support',
        'occupation_Transport-moving', 'occupation_nan', 'relationship_Husband',
        'relationship_Not-in-family', 'relationship_Other-relative',
        'relationship_Own-child', 'relationship_Unmarried', 'relationship_Wife',
        'race_Amer-Indian-Eskimo', 'race_Asian-Pac-Islander', 'race_Black',
        'race_Other', 'race_White', 'sex_Male', 'native-country_?',
        'native-country_Cambodia', 'native-country_Canada', 'native-country_China',
        'native-country_Columbia', 'native-country_Cuba',
        'native-country_Dominican-Republic', 'native-country_Ecuador',
        'native-country_El-Salvador', 'native-country_England',
        'native-country_France', 'native-country_Germany', 'native-country_Greece',
        'native-country_Guatemala', 'native-country_Haiti',
        'native-country_Holand-Netherlands', 'native-country_Honduras',
        'native-country_Hong', 'native-country_Hungary', 'native-country_India',
        'native-country_Iran', 'native-country_Ireland', 'native-country_Italy',
        'native-country_Jamaica', 'native-country_Japan', 'native-country_Laos',
        'native-country_Mexico', 'native-country_Nicaragua',
        'native-country_Outlying-US(Guam-USVI-etc)', 'native-country_Peru',
        'native-country_Philippines', 'native-country_Poland',
        'native-country_Portugal', 'native-country_Puerto-Rico',
        'native-country_Scotland', 'native-country_South', 'native-country_Taiwan',
        'native-country_Thailand', 'native-country_Trinadad&Tobago',
        'native-country_United-States', 'native-country_Vietnam',
        'native-country_Yugoslavia', 'native-country_nan', 'income_>50K',
        'capital-gain-binary_1', 'capital-loss-binary_1',
    ]

    # The table type and the measure names depend only on measure_class, so they are
    # selected once instead of once per feature pair.
    if issubclass(measure_class, ConfusionMixin):
        table_class = ConfusionMatrix
        measures = ['acc', 'ba', 'bm', 'dor', 'f1', 'fdr', 'fn', 'fnr', 'fomr', 'fp', 'fpr', 'mcc', 'mk', 'n', 'nlr', 'npv', 'plr', 'ppv', 'precision', 'prevalence', 'pt', 'recall', 'sensitivity', 'specificity', 'tn', 'tnr', 'tp', 'tpr', 'ts']
    elif issubclass(measure_class, AgreementMixin):
        table_class = AgreementTable
        measures = ['chohen_k', 'cohen_light_k']
    elif issubclass(measure_class, BinaryMixin):
        table_class = BinaryTable
        measures = ['ample', 'anderberg', 'baroni_urbani_buser_i', 'baroni_urbani_buser_ii', 'braun_banquet', 'chisq', 'chord', 'cole_i', 'cole_ii', 'contingency_coefficient', 'cosine', 'cramer_v', 'dennis', 'dice', 'disperson', 'driver_kroeber', 'euclid', 'eyraud', 'fager_mcgowan', 'faith', 'forbes_ii', 'forbesi', 'fossum', 'gilbert_wells', 'goodman_kruskal', 'gower', 'gower_legendre', 'hamann', 'hamming', 'hellinger', 'inner_product', 'intersection', 'jaccard', 'jaccard_3w', 'jaccard_distance', 'johnson', 'kulcyznski_ii', 'kulczynski_i', 'lance_williams', 'mcconnaughey', 'mcnemar_test', 'mean_manhattan', 'michael', 'mountford', 'ochia_i', 'ochia_ii', 'odds_ratio', 'pattern_difference', 'pearson_heron_i', 'pearson_heron_ii', 'pearson_i', 'peirce', 'person_ii', 'roger_tanimoto', 'russel_rao', 'shape_difference', 'simpson', 'size_difference', 'sokal_michener', 'sokal_sneath_i', 'sokal_sneath_ii', 'sokal_sneath_iii', 'sokal_sneath_iv', 'sokal_sneath_v', 'sorensen_dice', 'sorgenfrei', 'stiles', 'tanimoto_distance', 'tanimoto_i', 'tanimoto_ii', 'tarantula', 'tarwid', 'tetrachoric', 'tschuprow_t', 'vari', 'yule_q', 'yule_q_difference', 'yule_w', 'yule_y']
    else:
        table_class = CategoricalTable
        measures = ['adjusted_rand_index', 'chisq', 'chisq_dof', 'gk_lambda', 'gk_lambda_reversed', 'mutual_information', 'phi', 'uncertainty_coefficient', 'uncertainty_coefficient_reversed']

    n_features = len(feature_names)

    # Calculate measures pairwise
    for i in range(n_features):
        X_col_i = X_encoded[:, i]
        for j in range(i + 1, n_features):
            # Restrict the work to pairs that involve feature 105 or 106.
            if not (105 <= i < 107 or 105 <= j < 107):
                continue
            X_col_j = X_encoded[:, j]
            table = table_class(X_col_i, X_col_j)

            for measure_name in measures:
                start_time = time.time()
                try:
                    # Look the measure up by name: every listed measure is computed and
                    # each name maps to its own method. A missing method surfaces as an
                    # AttributeError, which is reported below like any other failure.
                    measure_value = getattr(table, measure_name)()
                    matrix[i][j][measure_name] = measure_value
                    matrix[j][i][measure_name] = measure_value
                    processed = time.time() - start_time
                    print(feature_names[i], "-", feature_names[j]," - ",measure_name, ": ",measure_value, f"- {processed:4f} s")
                except Exception as e:
                    # Best effort: one failing measure must not abort the whole run.
                    print(f"Error calculating {measure_name}: {e}")

    return matrix, feature_names

import pickle



def load_measure_matrix(filename):
    """Read a pickled measure matrix together with its feature names.

    Args:
        filename (str):
            Path of a pickle file holding a mapping with the keys
            ``'matrix'`` and ``'feature_names'``.

    Returns:
        tuple:
            ``(matrix, feature_names)`` exactly as stored in the file.

    Raises:
        KeyError:
            If either expected key is missing from the unpickled mapping.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # that were produced by this notebook.
    with open(filename, 'rb') as handle:
        payload = pickle.load(handle)

    matrix = payload['matrix']
    feature_names = payload['feature_names']
    return matrix, feature_names


def process_matrix(filename):
    """Load a measure matrix and re-index its cells into a new square grid.

    The result is a ``len(matrix)`` x ``len(matrix)`` grid of dicts. Each
    target cell ``[row][col]`` is filled from a shifted source position:

    * positions below 60 or at/after 107 read from the same position,
    * positions 60..104 read from ``position + 1``,
    * positions 105..106 would read from ``position + 2`` (i.e. 107/108),
      which crosses the 107 boundary, so those cells stay as empty dicts.

    NOTE(review): the constants 60, 105 and 107 are hard-coded; presumably
    they realign the stored matrix with a feature list in which some columns
    were removed -- confirm against the cell that produced the pickle.

    Args:
        filename (str):
            Pickle file understood by ``load_measure_matrix``.

    Returns:
        list:
            The re-indexed grid (list of lists of dicts). The feature names
            stored in the file are loaded but not returned.
    """
    matrix, _ = load_measure_matrix(filename)
    size = len(matrix)

    def source_position(pos):
        # Only positions inside [60, 107) are shifted; the last two of that
        # window (105, 106) are shifted by two instead of one.
        if pos < 60 or pos >= 107:
            return pos
        return pos + 2 if pos >= 105 else pos + 1

    remapped = [[{} for _ in range(size)] for _ in range(size)]
    for row in range(size):
        src_row = source_position(row)
        for col in range(len(matrix[row])):
            src_col = source_position(col)
            # A position below 107 must not pull data from 107 or beyond.
            crosses_boundary = (row < 107 <= src_row) or (col < 107 <= src_col)
            if crosses_boundary:
                continue
            if src_row < size and src_col < len(matrix[src_row]):
                remapped[row][col] = matrix[src_row][src_col]
    return remapped

import time
# Re-extract every measure family from its stored matrix and save the result
# as '<family>_matrixv3.pkl', both locally and on the mounted Drive folder.
import pickle
import shutil

measure_classes = [AgreementMixin, BinaryMixin, CategoricalMixin, ConfusionMixin]

# Input pickle for each measure class, in the same order as measure_classes.
# The previous version selected the file with a counter that was never
# incremented, so every class was silently fed 'agreement_matrix.pkl'.
source_files = [
    'agreement_matrix.pkl',
    'binary_matrix.pkl',
    'categorical_matrix.pkl',
    'confusion_matrix.pkl',
]

for measure_class, source_file in zip(measure_classes, source_files):
    matrix, feature_names = extract_measures_to_matrix(process_matrix(source_file), measure_class)

    # e.g. AgreementMixin -> 'agreement_matrixv3.pkl'
    matrix_name = measure_class.__name__.lower().replace('mixin', '') + '_matrixv3.pkl'
    with open(matrix_name, 'wb') as f:
        pickle.dump({'matrix': matrix, 'feature_names': feature_names}, f)

    # Same destination as the former per-file `!cp` lines; unlike the shell
    # magic, a failed copy now raises instead of passing unnoticed.
    shutil.copy(matrix_name, '/content/drive/MyDrive')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
education_Bachelors - capital-gain-binary_1  -  ppv :  0.23160762942779292 - 0.000005 s
education_Bachelors - capital-gain-binary_1  -  precision :  0.23160762942779292 - 0.000005 s
education_Bachelors - capital-gain-binary_1  -  prevalence :  0.16433280104819228 - 0.000004 s
education_Bachelors - capital-gain-binary_1  -  pt :  0.4468168174297853 - 0.000008 s
education_Bachelors - capital-gain-binary_1  -  recall :  0.11648187367634234 - 0.000038 s
education_Bachelors - capital-gain-binary_1  -  sensitivity :  0.11648187367634234 - 0.000013 s
education_Bachelors - capital-gain-binary_1  -  specificity :  0.9240059776084667 - 0.000015 s
education_Bachelors - capital-gain-binary_1  -  tn :  37717 - 0.000005 s
education_Bachelors - capital-gain-binary_1  -  tnr :  0.9240059776084667 - 0.000004 s
education_Bachelors - capital-gain-binary_1  -  tp :  935 - 0.000005 s
education_Bachelors - capital-gain-binary_1  -  tpr :  0.11

In [None]:
import torch
import numpy as np
import pickle
# Load the ten matrices stored in the '*_fullv3.pt' files. Further down in
# this cell each one is indexed as matrix[i, j] for a pair of feature
# positions, and rbf_hsic_matrix.shape[0] is taken as the number of features.
# NOTE(review): presumably these are square pairwise dependence measures that
# all share one feature order -- confirm against the cells that saved them.
rbf_hsic_matrix = torch.load('rbf_hsic_matrix_fullv3.pt')
linear_hsic_matrix = torch.load('linear_hsic_matrix_fullv3.pt')
mutual_information_matrix = torch.load('mutual_information_matrix_fullv3.pt')
distance_correlation_matrix = torch.load('distance_correlation_matrix_fullv3.pt')
chi2_matrix = torch.load('chi2_matrix_fullv3.pt')
theils_u_matrix = torch.load('theils_u_matrix_fullv3.pt')
cramers_v_matrix = torch.load('cramers_v_matrix_fullv3.pt')

# Second group of matrices, loaded the same way and used the same way below.
maximal_coefficient_matrix = torch.load('maximal_coefficient_matrix_fullv3.pt')
spearsmans_matrix = torch.load('spearmans_rank_matrix_fullv3.pt')
pearsons_matrix = torch.load('pearsons_correlation_coefficient_matrix_fullv3.pt')

def load_measure_matrix(filename):
    """Unpickle ``filename`` and return its matrix and feature names.

    Args:
        filename (str):
            Path of a pickle file whose content is a mapping with the keys
            ``'matrix'`` and ``'feature_names'``.

    Returns:
        tuple:
            ``(matrix, feature_names)`` as stored in the file.

    Raises:
        KeyError:
            If the unpickled mapping lacks one of the two keys.
    """
    # NOTE: unpickling runs code embedded in the file; load trusted files only.
    with open(filename, 'rb') as source:
        stored = pickle.load(source)

    matrix = stored['matrix']
    feature_names = stored['feature_names']
    return matrix, feature_names

# Build one attribute row per unordered feature pair (i < j) and save the
# rows and the matching [i, j] index pairs as tensors.
import shutil

agreement_matrix, agreement_feature_names = load_measure_matrix('agreement_matrixv3.pkl')
binary_matrix, binary_feature_names = load_measure_matrix('binary_matrixv3.pkl')
categorical_matrix, categorical_feature_names = load_measure_matrix('categorical_matrixv3.pkl')
confusion_matrix, confusion_feature_names = load_measure_matrix('confusion_matrixv3.pkl')

num_features = rbf_hsic_matrix.shape[0]
from sklearn.preprocessing import OneHotEncoder

print(X_encoded.shape)

# Pairs whose two positions are both below this limit take their extra
# measures from the four pickled matrices; every other pair is zero-padded.
# NOTE(review): presumably the first 107 features are the ones the
# table-based measures were computed for -- confirm.
TABLE_FEATURE_LIMIT = 107
# Number of zero placeholders used when a pair has no table-based measures.
# NOTE(review): rows only stack into a tensor if the four dicts contribute
# exactly this many values for every table-backed pair.
NUM_TABLE_MEASURES = 115

index = []
attr = []

# The loop variables must be the same names the body reads. The previous
# version iterated over `a`/`b` while indexing with `i`/`j`, so every row was
# built from whatever stale `i`, `j` were left in the notebook namespace.
for i in range(num_features):
    for j in range(i + 1, num_features):
        index.append([i, j])

        list1 = [
            linear_hsic_matrix[i, j],
            rbf_hsic_matrix[i, j],
            mutual_information_matrix[i, j],
            distance_correlation_matrix[i, j],
            maximal_coefficient_matrix[i, j],
            spearsmans_matrix[i, j],
            pearsons_matrix[i, j],
            chi2_matrix[i, j],
            theils_u_matrix[i, j],
            cramers_v_matrix[i, j],
        ]

        if i < TABLE_FEATURE_LIMIT and j < TABLE_FEATURE_LIMIT:
            list1.extend(agreement_matrix[i][j].values())
            for measure, value in binary_matrix[i][j].items():
                # 'mcnemar_test' is stored as a sequence; only its first
                # element is used as a feature.
                list1.append(value[0] if measure == 'mcnemar_test' else value)
            list1.extend(categorical_matrix[i][j].values())
            list1.extend(confusion_matrix[i][j].values())
            pair_type = [1, 0, 0]
        else:
            list1.extend([0] * NUM_TABLE_MEASURES)
            if i >= TABLE_FEATURE_LIMIT and j >= TABLE_FEATURE_LIMIT:
                pair_type = [0, 0, 1]
            else:
                pair_type = [0, 1, 0]

        # Three-way one-hot flag: both below the limit / mixed / both at or above.
        list1.extend(pair_type)
        attr.append(list1)


attr = torch.tensor(attr, dtype=torch.float)
index = torch.tensor(index, dtype=torch.int)
print(attr.shape)
print(index.shape)

torch.save(attr, 'census_attr_finalv2.pt')
shutil.copy('census_attr_finalv2.pt', '/content/drive/MyDrive')

torch.save(index, 'census_index_finalv2.pt')
# Copy the index file; the previous version copied the attr file a second
# time, so the index tensor never reached Drive.
shutil.copy('census_index_finalv2.pt', '/content/drive/MyDrive')

torch.Size([48842, 112])
torch.Size([6441, 128])
torch.Size([6441, 2])
