# Global Features EDA

In [1]:
# --- imports -------------------------------------------------------------
import re
from pathlib import Path
import numpy as np
import pandas as pd

import scipy.io as sio
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults applied once for the whole notebook: seaborn's "notebook"
# context together with the "whitegrid" style.
sns.set_context("notebook")
sns.set_style("whitegrid")

In [2]:
# --- paths ---------------------------------------------------------------
# The data folder is resolved relative to the parent of the current working
# directory, i.e. this assumes the notebook is run from .../notebooks/.
# Change DATA_DIR if your layout differs.
DATA_DIR = Path.cwd().parent.joinpath("data", "raw", "GlobalFeatures").resolve()
print("Looking for .mat files in:", DATA_DIR)
files = list(DATA_DIR.glob("*.mat"))
files.sort()  # deterministic processing order
print(f"Found {len(files)} files.")

# --- regex for user/session/sample --------------------------------------
# Three 4-digit groups: user, session, sample.
# example filename: u1005s0001_sg0001.mat
FNAME_RE = re.compile(r"u(\d{4})s(\d{4})_sg(\d{4})")

def load_global(path: Path):
    """Load the global feature vector stored in one .mat file.

    Parameters
    ----------
    path : Path
        Location of a MATLAB .mat file holding a ``globalFeatures`` variable.

    Returns
    -------
    numpy.ndarray
        1-D float array of shape ``(40,)``.

    Raises
    ------
    KeyError
        If the file contains no ``globalFeatures`` variable.
    ValueError
        If, once singleton dimensions are dropped, the variable is not a flat
        vector of exactly 40 values.
    """
    mat = sio.loadmat(path, squeeze_me=True, struct_as_record=False)
    if "globalFeatures" not in mat:
        raise KeyError(f"'globalFeatures' not in {path.name}; keys={list(mat.keys())}")
    # squeeze() turns 1x40 / 40x1 MATLAB vectors into a flat (40,) array.
    arr = np.asarray(mat["globalFeatures"]).squeeze()
    # Checking .size alone would let e.g. a 5x8 matrix through; the result is
    # meant to be unpacked into 40 scalar feature values, so require 1-D too.
    if arr.ndim != 1 or arr.size != 40:
        raise ValueError(f"{path.name}: expected 40 features, got shape {arr.shape}")
    return arr.astype(float)

records = []
bad_files = []       # (path, exception) for every file that could not be loaded
unparsed_files = []  # paths whose name does not match FNAME_RE (ids stored as None)

for p in files:
    # Best-effort load: a broken file is reported and skipped so that one bad
    # .mat does not abort the whole pass.
    try:
        gf = load_global(p)
    except Exception as e:
        print(f"[SKIP] {p.name}: {e}")
        bad_files.append((p, e))
        continue

    # parse ids from the file name (without extension)
    id_match = FNAME_RE.match(p.stem)
    if id_match:
        uid, sess, samp = map(int, id_match.groups())
    else:
        # Keep the features but remember the file: missing ids are otherwise
        # invisible downstream (nunique() ignores them by default).
        uid = sess = samp = None
        unparsed_files.append(p)

    records.append((uid, sess, samp, *gf))

# build DataFrame: three id columns followed by f1..f40
cols = ["user", "session", "sample"] + [f"f{i+1}" for i in range(40)]
df = pd.DataFrame.from_records(records, columns=cols)

print(f"Loaded {len(df)} usable records; skipped {len(bad_files)} files.")
if unparsed_files:
    print(
        f"[WARN] {len(unparsed_files)} records have no user/session/sample id "
        "(file name did not match FNAME_RE); see unparsed_files."
    )
df.head()

Looking for .mat files in: C:\Users\mattt\Skripsie\Projects\DTW-project\data\raw\GlobalFeatures
Found 11200 files.
Loaded 11200 usable records; skipped 0 files.


Unnamed: 0,user,session,sample,f1,f2,f3,f4,f5,f6,f7,...,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40
0,1001,1,1,0.49795,0.50036,0.49531,0.49521,0.50643,0.49331,0.5019,...,0.49574,0.50276,0.50541,0.49614,0.49759,0.50879,0.49896,0.49794,0.5023,0.50061
1,1001,1,2,0.49795,0.50341,0.49556,0.49518,0.51022,0.49333,0.50151,...,0.49574,0.50288,0.50457,0.49596,0.49723,0.51281,0.49756,0.49694,0.50514,0.50064
2,1001,1,3,0.49795,0.4988,0.4949,0.49496,0.50958,0.49769,0.50437,...,0.49516,0.50299,0.5047,0.49571,0.49798,0.51194,0.50017,0.49758,0.50107,0.50385
3,1001,1,4,0.49795,0.4988,0.4949,0.49496,0.50958,0.49769,0.50437,...,0.49516,0.50299,0.5047,0.49571,0.49798,0.51194,0.50017,0.49758,0.50107,0.50385
4,1001,1,5,0.49795,0.4988,0.4949,0.49496,0.50958,0.49769,0.50437,...,0.49516,0.50299,0.5047,0.49571,0.49798,0.51194,0.50017,0.49758,0.50107,0.50385


In [3]:
# One-row overview of the loaded dataset.
summary_row = {
    "n_signatures": len(df),
    # nunique() skips missing ids (dropna=True is its default)
    "n_users": df["user"].nunique(),
    "n_sessions": df["session"].nunique(),
    "features_per_signature": 40,   # fixed by design
}
summary_df = pd.DataFrame([summary_row])
print(summary_df.to_string(index=False))


 n_signatures  n_users  n_sessions  features_per_signature
        11200      400           4                      40


In [4]:
# 4 x 10 grid of panels, one histogram (with KDE overlay) per global feature.
fig, axes = plt.subplots(4, 10, figsize=(20, 8), constrained_layout=True)
axes = axes.ravel()

for i, ax in enumerate(axes[:40]):
    column = f"f{i+1}"
    sns.histplot(df[column], ax=ax, kde=True, bins=20, edgecolor="none")
    ax.set_title(f"F{i+1}", fontsize=8)
    ax.tick_params(labelsize=6)

# remove unused axes just in case (shouldn't happen)
for spare_ax in axes[40:]:
    spare_ax.axis("off")

# Figures are written to <parent of cwd>/figures, which is created on demand.
figures_dir = Path.cwd().parent / "figures"
out_path = (figures_dir / "global_feature_histograms.png").resolve()
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out_path, dpi=300)
print("Saved:", out_path)
plt.close(fig)


Saved: C:\Users\mattt\Skripsie\Projects\DTW-project\figures\global_feature_histograms.png


In [5]:
# Z-score every feature column and count the values lying more than
# Z_THRESHOLD standard deviations away from the column mean.
Z_THRESHOLD = 3
feature_names = [f"f{i+1}" for i in range(40)]

X = df.loc[:, feature_names].to_numpy(dtype=float)
# nan_policy="omit": NaNs are left out of the mean/std and stay NaN in z
z = stats.zscore(X, nan_policy="omit", axis=0)

# A NaN never compares greater than the threshold, so missing values are
# not counted as outliers.
outliers = np.abs(z) > Z_THRESHOLD
outlier_counts = outliers.sum(axis=0)
outlier_prop   = outlier_counts / len(df)   # share of all signatures

outlier_df = pd.DataFrame({
    "feature": feature_names,
    "count": outlier_counts,
    "proportion": outlier_prop
})
print(outlier_df)

fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x="feature", y="count", data=outlier_df, color="C0", ax=ax)
ax.tick_params(axis="x", labelrotation=90, labelsize=7)
ax.set_ylabel(f"Outlier count (>{Z_THRESHOLD}σ)")
ax.set_xlabel("Feature")
fig.tight_layout()
out_path = (Path.cwd().parent / "figures" / "global_feature_outliers.png").resolve()
# Create the target folder here as well, so saving does not depend on an
# earlier cell having created it.
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out_path, dpi=300)
print("Saved:", out_path)
plt.close(fig)


   feature  count  proportion
0       f1    176    0.015714
1       f2    126    0.011250
2       f3    111    0.009911
3       f4    232    0.020714
4       f5    185    0.016518
5       f6    114    0.010179
6       f7     51    0.004554
7       f8      0    0.000000
8       f9      2    0.000179
9      f10    154    0.013750
10     f11    209    0.018661
11     f12    274    0.024464
12     f13    220    0.019643
13     f14      6    0.000536
14     f15     98    0.008750
15     f16      0    0.000000
16     f17    171    0.015268
17     f18    186    0.016607
18     f19    219    0.019554
19     f20    209    0.018661
20     f21     53    0.004732
21     f22     48    0.004286
22     f23     68    0.006071
23     f24    204    0.018214
24     f25      0    0.000000
25     f26     23    0.002054
26     f27    156    0.013929
27     f28    198    0.017679
28     f29     89    0.007946
29     f30    148    0.013214
30     f31    228    0.020357
31     f32      0    0.000000
32     f33

In [6]:
# X is the (signatures x features) float matrix built in the outlier cell.
mask = ~np.isfinite(X)  # True where NaN/inf
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(mask, cbar=False, ax=ax)
ax.set_xlabel("Feature")
ax.set_ylabel("Signature index")
ax.set_title("Missing / non-finite values in Global Features")
fig.tight_layout()
out_path = (Path.cwd().parent / "figures" / "global_missing_heatmap.png").resolve()
# Create the target folder here as well, so saving does not depend on an
# earlier cell having created it.
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out_path, dpi=300)
print("Saved:", out_path)
plt.close(fig)


Saved: C:\Users\mattt\Skripsie\Projects\DTW-project\figures\global_missing_heatmap.png


In [7]:
feat_cols = [f"f{i+1}" for i in range(40)]
pear  = df[feat_cols].corr(method="pearson")
spear = df[feat_cols].corr(method="spearman")


def _save_clustermap(corr, title, out_path):
    """Draw a clustered heatmap of `corr`, save it to `out_path`, close it.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square correlation matrix to cluster and plot.
    title : str
        Figure-level title.
    out_path : Path
        Destination image file; its parent folder is created if missing.

    Returns
    -------
    The grid object returned by ``sns.clustermap``.
    """
    grid = sns.clustermap(corr, cmap="coolwarm", center=0, figsize=(10, 10))
    # y > 1 places the title above the top dendrogram
    grid.fig.suptitle(title, y=1.02)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # bbox_inches="tight" keeps the raised title inside the saved image
    grid.fig.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.close(grid.fig)
    return grid


# Clustered heatmaps return a ClusterGrid; the helper saves via its fig
out_path = (Path.cwd().parent / "figures" / "global_correlation_heatmap.png").resolve()
cg1 = _save_clustermap(pear, "Pearson correlation (clustered)", out_path)

out_path = (Path.cwd().parent / "figures" / "global_correlation_spearman.png").resolve()
cg2 = _save_clustermap(spear, "Spearman correlation (clustered)", out_path)

pear.shape, spear.shape


((40, 40), (40, 40))

In [8]:
# Keep each unordered feature pair once: only the entries strictly above the
# diagonal (k=1 also excludes the diagonal itself).
tri = np.triu(np.ones(pear.shape, dtype=bool), k=1)
# where() blanks everything outside the triangle to NaN; stack() then
# flattens the matrix into a Series indexed by (row feature, column feature).
abs_upper = pear.where(tri).abs()
high_corr = abs_upper.stack().sort_values(ascending=False)

strong = high_corr > 0.9
redundant = high_corr[strong]
print("Highly correlated feature pairs (|r| > 0.9):")
print(redundant)


Highly correlated feature pairs (|r| > 0.9):
f4   f31    0.962012
f5   f36    0.960972
f17  f31    0.952444
f10  f18    0.946778
f4   f17    0.912168
f10  f19    0.905017
dtype: float64
