Global

In [None]:
import numpy as np
from scipy.stats import entropy
import pandas as pd
from scipy.stats import ks_2samp
from scipy.spatial.distance import cosine
from statsmodels.stats.proportion import proportions_ztest

# KL

In [None]:
def calculate_kl_divergence(dist1, dist2):
    """Return the Kullback-Leibler divergence D(dist1 || dist2).

    Thin wrapper around ``scipy.stats.entropy``: when a second distribution
    is supplied, that routine returns the relative entropy of ``dist1`` with
    respect to ``dist2`` (natural logarithm), normalising each input to sum
    to 1 first.

    Parameters
    ----------
    dist1, dist2 : array-like
        Non-negative weights of equal length.

    Returns
    -------
    float
        The divergence; 0 when both distributions are identical.
    """
    divergence = entropy(dist1, qk=dist2)
    return divergence

def KL(df, epsilon=1e-6, n_splits=10, random_state=None):
    """Mean KL divergence between random halves of ``df`` (within-group stability).

    The rows of ``df`` are shuffled and cut into two halves ``n_splits``
    times. For every split the pattern columns of each half are summed,
    smoothed with ``epsilon``, normalised into a probability distribution and
    compared with the KL divergence D(half1 || half2).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'id'`` and ``'class_name'``; both are
        dropped and every remaining column is treated as a pattern column.
    epsilon : float, default 1e-6
        Added to every aggregated count so that no probability is exactly 0.
    n_splits : int, default 10
        Number of random half/half splits to average over.
    random_state : int or None, default None
        Seed for a private ``numpy.random.default_rng`` generator, making the
        result reproducible. ``None`` keeps the previous behaviour of drawing
        from NumPy's global random state.

    Returns
    -------
    numpy.float64
        Mean divergence over all splits, rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1 (the mean would be undefined).
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # Drop the identifier and label columns; only pattern columns are aggregated.
    pattern_data = df.drop(columns=['id', 'class_name'])

    # np.random (module) and Generator both expose .permutation().
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    n_rows = len(pattern_data)
    mid_index = n_rows // 2  # with an odd row count the second half gets the extra row

    kl_results = []
    for _ in range(n_splits):
        order = rng.permutation(n_rows)
        half1 = pattern_data.iloc[order[:mid_index]]
        half2 = pattern_data.iloc[order[mid_index:]]

        # Aggregate per pattern and smooth so no bin is exactly zero.
        counts1 = half1.sum(axis=0) + epsilon
        counts2 = half2.sum(axis=0) + epsilon

        # Normalise into probability distributions.
        dist1 = (counts1 / counts1.sum()).to_numpy()
        dist2 = (counts2 / counts2.sum()).to_numpy()

        # scipy's entropy(pk, qk) is the KL divergence D(pk || qk).
        kl_results.append(entropy(dist1, dist2))

    return np.round(np.mean(kl_results), 3)

In [None]:
def KL_diversity(group1, group2, epsilon=1e-6):
    """KL divergence D(group1 || group2) between two groups' pattern distributions.

    Each group's pattern columns are summed, smoothed with ``epsilon`` and
    normalised into a probability distribution before the divergence is
    computed.

    Parameters
    ----------
    group1, group2 : pandas.DataFrame
        Must each contain the columns ``'id'`` and ``'class_name'``; both are
        dropped and every remaining column is treated as a pattern column.
    epsilon : float, default 1e-6
        Added to every aggregated count so that no probability is exactly 0.

    Returns
    -------
    float
        The KL divergence of group1's distribution from group2's.
    """
    counts1 = group1.drop(columns=['id', 'class_name']).sum(axis=0)
    counts2 = group2.drop(columns=['id', 'class_name']).sum(axis=0)

    # entropy() pairs values purely by position. Align by column label first
    # so the same pattern is compared even if the two frames order their
    # columns differently; a pattern missing from one group counts as 0 there.
    # When both frames already share identical columns this is a no-op.
    counts1, counts2 = counts1.align(counts2, join='outer', fill_value=0)

    # Smooth so no bin is exactly zero, then normalise.
    counts1 = counts1 + epsilon
    counts2 = counts2 + epsilon
    dist1 = (counts1 / counts1.sum()).to_numpy()
    dist2 = (counts2 / counts2.sum()).to_numpy()

    # scipy's entropy(pk, qk) is the KL divergence D(pk || qk).
    return entropy(dist1, dist2)

# KS

In [None]:
def KS(df, epsilon=1e-6, n_splits=10, alpha=0.05, random_state=None):
    """Two-sample KS test between random halves of ``df``, repeated ``n_splits`` times.

    For every split the rows are shuffled and cut in half, the pattern columns
    of each half are summed, smoothed with ``epsilon`` and normalised, and the
    two resulting vectors are passed to ``scipy.stats.ks_2samp``.

    NOTE(review): ``ks_2samp`` treats each vector as an unordered sample of
    values, so it compares how the per-pattern probabilities are distributed,
    not the probabilities pattern by pattern. Confirm this is the intended
    comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'id'`` and ``'class_name'``; both are
        dropped and every remaining column is treated as a pattern column.
    epsilon : float, default 1e-6
        Added to every aggregated count before normalising.
    n_splits : int, default 10
        Number of random half/half splits (one output row each).
    alpha : float, default 0.05
        A split is labelled ``'similar'`` when its p-value is greater than
        ``alpha`` and ``'not similar'`` otherwise.
    random_state : int or None, default None
        Seed for a private ``numpy.random.default_rng`` generator. ``None``
        keeps the previous behaviour of drawing from NumPy's global state.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Fold'``, ``'KS Statistic'``, ``'P-Value'``, ``'Result'``.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # Drop the identifier and label columns; only pattern columns are aggregated.
    pattern_data = df.drop(columns=['id', 'class_name'])

    # np.random (module) and Generator both expose .permutation().
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    n_rows = len(pattern_data)
    mid_index = n_rows // 2  # with an odd row count the second half gets the extra row

    records = []
    for fold in range(n_splits):
        order = rng.permutation(n_rows)
        half1 = pattern_data.iloc[order[:mid_index]]
        half2 = pattern_data.iloc[order[mid_index:]]

        # Aggregate per pattern, smooth, and normalise so both halves are comparable.
        counts1 = half1.sum(axis=0) + epsilon
        counts2 = half2.sum(axis=0) + epsilon
        dist1 = (counts1 / counts1.sum()).to_numpy()
        dist2 = (counts2 / counts2.sum()).to_numpy()

        ks_statistic, p_value = ks_2samp(dist1, dist2)

        records.append({
            'Fold': fold,
            'KS Statistic': ks_statistic,
            'P-Value': p_value,
            'Result': 'similar' if p_value > alpha else 'not similar',
        })

    return pd.DataFrame(records, columns=['Fold', 'KS Statistic', 'P-Value', 'Result'])



In [None]:
def KS_test(group1, group2, epsilon=1e-6, alpha=0.05):
    """Two-sample KS test between the aggregated pattern distributions of two groups.

    Each group's pattern columns are summed, smoothed with ``epsilon`` and
    normalised, and the two resulting vectors are passed to
    ``scipy.stats.ks_2samp``.

    NOTE(review): ``ks_2samp`` treats each vector as an unordered sample of
    values, so it compares how the per-pattern probabilities are distributed,
    not the probabilities pattern by pattern. Confirm this is the intended
    comparison.

    Parameters
    ----------
    group1, group2 : pandas.DataFrame
        Must each contain the columns ``'id'`` and ``'class_name'``; both are
        dropped and every remaining column is treated as a pattern column.
    epsilon : float, default 1e-6
        Added to every aggregated count before normalising.
    alpha : float, default 0.05
        The groups are labelled ``"similar"`` when the p-value is greater
        than ``alpha`` and ``"not similar"`` otherwise.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``"KS Statistic"``, ``"P-Value"`` and
        ``"Result"``.
    """
    # Aggregate per pattern and smooth.
    counts1 = group1.drop(columns=['id', 'class_name']).sum(axis=0) + epsilon
    counts2 = group2.drop(columns=['id', 'class_name']).sum(axis=0) + epsilon

    # Normalise so groups of different size are comparable.
    dist1 = counts1 / counts1.sum()
    dist2 = counts2 / counts2.sum()

    ks_statistic, p_value = ks_2samp(dist1, dist2)

    verdict = "similar" if p_value > alpha else "not similar"

    return pd.DataFrame({
        "KS Statistic": [ks_statistic],
        "P-Value": [p_value],
        "Result": [verdict],
    })


Calculate

In [None]:
def in_groups(dfs):
    """Run the within-group KL and KS analyses for every labelled frame.

    Parameters
    ----------
    dfs : iterable of (str, pandas.DataFrame)
        Pairs of a label and the frame to analyse.

    For each pair the KL score is printed and the KS table is written to
    ``<label>_ks_within.csv`` in the current working directory.
    """
    for label, frame in dfs:
        within_kl = KL(frame)   # a single number
        within_ks = KS(frame)   # a DataFrame, one row per split
        print(f"KL within {label}: {within_kl}")
        within_ks.to_csv(f"{label}_ks_within.csv", index=False)


In [None]:
def between_groups(groups):
    """Run the between-group KL and KS analyses for every labelled pair of frames.

    Parameters
    ----------
    groups : iterable of (str, pandas.DataFrame, pandas.DataFrame)
        Triples of a label and the two frames to compare.

    For each triple the label and the KL divergence are printed and the KS
    table is written to ``<label>_ks_between.csv`` in the current working
    directory.
    """
    for label, first, second in groups:
        print(label)
        divergence = KL_diversity(first, second)   # a single number
        ks_table = KS_test(first, second)          # a one-row DataFrame
        print(f"KL between {label}: {divergence}")
        ks_table.to_csv(f"{label}_ks_between.csv", index=False)


In [None]:
# Load the data; missing values are treated as 0.
# NOTE(review): the path is absolute and looks Colab-specific; adjust when
# running elsewhere.
df = pd.read_csv('/content/train.csv').fillna(0)

# Numeric columns to rescale. The identifier is excluded by name rather than
# by position, so a feature column is never skipped when 'id' is not the
# first numeric column (or is not numeric at all).
numeric_cols = [
    col for col in df.select_dtypes(include=['number']).columns
    if col != 'id'
]

# Min-max normalisation to [0, 1]; a constant column has zero range and is
# set to 0 instead of dividing by zero.
for col in numeric_cols:
    min_col = df[col].min()
    max_col = df[col].max()
    if max_col - min_col == 0:
        df[col] = 0
    else:
        df[col] = (df[col] - min_col) / (max_col - min_col)

# Extract the relevant columns.
meta_cols = [col for col in ['id', 'class_name'] if col in df.columns]
# The name `hs_cols` is kept because later cells use it; it currently selects
# the '_Type:MeanDuration' columns. Alternative selection:
# hs_cols =  [col for col in df.columns if col.endswith('_Type:HorizontalSupport')]
hs_cols = [col for col in df.columns if col.endswith('_Type:MeanDuration')]

# Split into positive and negative class.
true_df = df[df['class_name'] == True]
false_df = df[df['class_name'] == False]

# Keep only the metadata and the selected pattern columns.
true_df = true_df.reindex(columns=meta_cols + hs_cols, fill_value=0)
false_df = false_df.reindex(columns=meta_cols + hs_cols, fill_value=0)

In [None]:
# Cohort labels for this run. Alternative positive / negative label pairs to
# swap in:
#   "Hypoglicemia"            / "not-hypoglicemia"
#   "hyperglycemia_over_180"  / "not_hyperglycemia_over_180"
#   "hyperglycemia_over_200"  / "not_hyperglycemia_over_200"
dfs = [
    ("Heart_Attack", true_df),
    ("not-Heart_Attack", false_df),
]

# Within-group analysis: each class is compared against itself.
in_groups(dfs)

# Between-group analysis: positive class vs. negative class.
groups = [
    ("Heart_Attack", true_df, false_df),
]

between_groups(groups)

KL within Heart_Attack: 0.006
KL within not-Heart_Attack: 0.001
Heart_Attack
KL between Heart_Attack: 0.010704278029007242


Proportion

between

In [None]:
import pandas as pd

# A pattern counts as "not similar" when its share of the group total differs
# between the two groups by more than this absolute amount. Defined once
# instead of being re-assigned for every column.
threshold = 0.0005

# NOTE(review): hs_cols currently selects the '_Type:MeanDuration' columns,
# while the label and file name below say "horizontal proportion"; confirm
# which is intended before relying on the output names.
for name, group1, group2 in groups:
    g1 = group1.drop(columns=['id', 'class_name'])
    g2 = group2.drop(columns=['id', 'class_name'])

    # Per-pattern sums and overall totals for each group.
    sums1 = g1[hs_cols].sum()
    sums2 = g2[hs_cols].sum()
    total1 = sums1.sum()
    total2 = sums2.sum()

    # Share of each pattern within its group; all zeros if the group total is not positive.
    ratio1 = sums1 / total1 if total1 > 0 else pd.Series(0.0, index=sums1.index)
    ratio2 = sums2 / total2 if total2 > 0 else pd.Series(0.0, index=sums2.index)
    diff = (ratio1 - ratio2).abs()

    results_df = pd.DataFrame({
        'Column': list(hs_cols),
        'Ratio1': ratio1.to_numpy(),
        'Ratio2': ratio2.to_numpy(),
        'AbsDiff': diff.to_numpy(),
        'Similarity': np.where(diff.to_numpy() <= threshold, 'similar', 'not similar'),
    })

    not_similar_count = int((results_df['Similarity'] == 'not similar').sum())
    total_count = results_df.shape[0]
    # Guard against an empty column selection.
    percentage_not_similar = (not_similar_count / total_count) * 100 if total_count > 0 else 0

    print(f"{name} (Horizontal Proportion): Not Similar Patterns: {percentage_not_similar:.1f}%")

    # Persist the ten patterns whose proportions differ the most.
    top_diff = results_df.sort_values(by='AbsDiff', ascending=False).head(10)
    top_diff[['Column', 'Ratio1', 'Ratio2', 'AbsDiff']].to_csv(f"top_10_significant_patterns_horizontal_proportion_{name}.csv", index=False)



Heart_Attack (Horizontal Proportion): Not Similar Patterns: 28.8%


within

In [None]:
import numpy as np
import pandas as pd

# Maximum absolute difference in a pattern's share for the two halves to be
# considered "similar" on that pattern.
THRESHOLD = 0.0005
# Number of random half/half splits per group.
N_FOLDS = 10

results = {"name": [], "Fold": [], "similar_percent": []}

# A dedicated loop variable is used on purpose: iterating with `df` would
# overwrite the notebook-level `df` that holds the full data set.
# NOTE(review): the splits use NumPy's global random state and no seed is set
# in this notebook, so the numbers change from run to run.
for name, cohort_df in dfs:
    g = cohort_df.drop(columns=['id', 'class_name'])
    mid_index = len(g) // 2  # with an odd row count the second half gets the extra row

    for i in range(N_FOLDS):
        random_indices = np.random.permutation(len(g))
        half1 = g.iloc[random_indices[:mid_index]]
        half2 = g.iloc[random_indices[mid_index:]]

        # Per-pattern sums and overall totals for each half.
        sums1 = half1[hs_cols].sum()
        sums2 = half2[hs_cols].sum()
        total1 = sums1.sum()
        total2 = sums2.sum()

        # Share of each pattern within its half; all zeros if the total is not positive.
        ratio1 = sums1 / total1 if total1 > 0 else pd.Series(0.0, index=sums1.index)
        ratio2 = sums2 / total2 if total2 > 0 else pd.Series(0.0, index=sums2.index)
        diff = (ratio1 - ratio2).abs()

        similar_count = int((diff <= THRESHOLD).sum())
        total_count = len(diff)
        similar_percent = (similar_count / total_count) * 100 if total_count > 0 else 0

        results['name'].append(name)
        results['Fold'].append(i)
        results['similar_percent'].append(similar_percent)

results_df = pd.DataFrame(results)
results_df.to_csv("proportion_within_meanDuration.csv", index=False)
