# Global Features EDA

In [None]:

import re
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import io, stats

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Directory that is searched for the *.mat feature files (relative to the
# notebook's working directory).
data_dir = Path('../data/processed/GlobalFeatures')
# Sorted so that the row order of the feature table built below is deterministic.
files = sorted(data_dir.glob('*.mat'))

def load_global(path):
    """Read the ``globalFeatures`` variable from a MATLAB ``.mat`` file.

    Parameters
    ----------
    path : path to the ``.mat`` file, passed straight to ``scipy.io.loadmat``.

    Returns
    -------
    The value stored under the ``globalFeatures`` key. Because the file is
    loaded with ``squeeze_me=True``, unit-length dimensions are removed
    (e.g. a 1 x N MATLAB row vector comes back as a 1-D array).

    Raises
    ------
    KeyError
        If the file contains no ``globalFeatures`` variable.
    """
    mat_contents = io.loadmat(path, squeeze_me=True)
    return mat_contents['globalFeatures']

# Load one feature vector per signature file; the DataFrame below requires
# each vector to supply exactly 40 values.
vectors = [load_global(p) for p in files]

# File stems are expected to contain "u<digits>s<digits>" (user id, session id).
pattern = re.compile(r'u(\d+)s(\d+)')
users, sessions = [], []
for p in files:
    match = pattern.search(p.stem)
    if match is None:
        # Name the offending file instead of failing with an opaque
        # AttributeError on ``None.groups()``.
        raise ValueError(
            f"Cannot parse user/session ids from file name {p.name!r}; "
            "expected a stem containing 'u<digits>s<digits>'"
        )
    user_id, session_id = match.groups()
    users.append(user_id)
    sessions.append(session_id)

# Feature columns f1..f40 come first; later cells select them with df.iloc[:, :40].
df = pd.DataFrame(vectors, columns=[f'f{i+1}' for i in range(40)])
# The ids are kept as the digit strings captured by the regex (not converted to int).
df['user'] = users
df['session'] = sessions


In [None]:

# One-row overview of the dataset: number of rows, distinct user labels,
# distinct session labels, and feature columns (all columns except the two
# id columns 'user' and 'session').
summary_columns = ['n_signatures', 'n_users', 'n_sessions', 'features_per_signature']
summary_row = [
    len(df),
    df['user'].nunique(),
    df['session'].nunique(),
    df.shape[1] - 2,
]
summary_df = pd.DataFrame([summary_row], columns=summary_columns)
print(summary_df.to_string(index=False))


In [None]:

# One histogram with a KDE overlay per feature, arranged on a 4 x 10 grid.
fig, axes = plt.subplots(4, 10, figsize=(20, 8))
# axes.flat walks the grid row by row, so panel i sits at row i // 10, column i % 10.
for i, ax in enumerate(axes.flat):
    sns.histplot(df[f'f{i+1}'], ax=ax, kde=True, bins=20, edgecolor='none')
    ax.set_title(f'F{i+1}')
plt.tight_layout()
plt.savefig('../figures/global_feature_histograms.png', dpi=300)


In [None]:

# Z-score outlier screen over the 40 feature columns (|z| > 3 counts as an outlier).
feature_values = df.iloc[:, :40].to_numpy(dtype=float)
# nan_policy='omit' only skips NaN. A single +/-inf would make the column's
# mean/std non-finite and turn every z-score in that column into NaN, which
# would silently report zero outliers. Treat all non-finite entries as missing.
feature_values = np.where(np.isfinite(feature_values), feature_values, np.nan)
z = stats.zscore(feature_values, nan_policy='omit')
# Comparisons against NaN are False, so missing values are never counted as outliers.
outliers = (np.abs(z) > 3)
outlier_counts = outliers.sum(axis=0)
# Proportion is relative to the total number of signatures (rows), missing or not.
outlier_prop = outlier_counts / len(df)
# Label the rows with the feature names rather than a bare 0..39 index.
print(pd.DataFrame({'count': outlier_counts, 'proportion': outlier_prop},
                   index=df.columns[:40]))
plt.figure(figsize=(8,4))
sns.barplot(x=np.arange(1,41), y=outlier_counts, color='C0')
plt.ylabel('Outlier count')
plt.xlabel('Feature')
plt.tight_layout()
plt.savefig('../figures/global_feature_outliers.png', dpi=300)


In [None]:

# Boolean frame that is True wherever a feature value is NaN or +/-inf.
feature_block = df.iloc[:, :40]
mask = ~np.isfinite(feature_block)

# Rows are signatures, columns are features; highlighted cells are non-finite values.
plt.figure(figsize=(10,6))
missing_ax = sns.heatmap(mask, cbar=False)
missing_ax.set_xlabel('Feature')
missing_ax.set_ylabel('Signature index')
plt.tight_layout()
plt.savefig('../figures/global_missing_heatmap.png', dpi=300)


In [None]:

# Pairwise feature correlations: linear (Pearson) and rank-based (Spearman).
features_only = df.iloc[:, :40]
pear = features_only.corr(method='pearson')
spear = features_only.corr(method='spearman')

# Draw a hierarchically clustered heatmap for each matrix and save it.
# clustermap opens its own figure, which is the one plt.savefig then writes.
for corr_matrix, figure_path in (
    (pear, '../figures/global_correlation_heatmap.png'),
    (spear, '../figures/global_correlation_spearman.png'),
):
    sns.clustermap(corr_matrix, cmap='coolwarm', center=0)
    plt.savefig(figure_path, dpi=300)


In [None]:

# Keep only the strict upper triangle so each feature pair appears once and
# the diagonal (self-correlation) is excluded.
upper_triangle = np.triu(np.ones(pear.shape), k=1).astype(bool)
# stack() turns the masked matrix into a Series indexed by (feature, feature) pairs.
high_corr = pear.abs().where(upper_triangle).stack()
# Pairs whose absolute Pearson correlation is strictly above 0.9, strongest first.
redundant = high_corr[high_corr > 0.9].sort_values(ascending=False)
print('Highly correlated feature pairs:')
# Only the five strongest pairs are printed.
print(redundant.head())
