In [1]:
# 0.1) point your kernel at the project’s src/ folder
import sys
from pathlib import Path
# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to run from a sub-folder of the project.
project_root = Path.cwd().parent

# Put <project_root>/src at the front of the import path, but only once:
# re-running this cell must not pile up duplicate sys.path entries.
_src_dir = str(project_root / "src")
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

# 0.2) common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 0.3) data tree: the raw and processed folders under <project_root>/data
RAW_ROOT = project_root.joinpath("data", "raw")
PROC_ROOT = project_root.joinpath("data", "processed")


In [2]:
import re
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import io

%matplotlib inline

# Folder holding the local-function .mat files, relative to the notebook's cwd
local_dir = Path('..') / 'data' / 'raw' / 'LocalFunctions'
# Every .mat file directly inside that folder, in sorted order
files = sorted(local_dir.glob('*.mat'))

def load_local(path):
    """Load the ``localFunctions`` variable from one MATLAB ``.mat`` file.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the ``.mat`` file.

    Returns
    -------
    The value stored under ``localFunctions``. The file is read with
    ``squeeze_me=True``, so unit dimensions are squeezed out of the result.

    Raises
    ------
    KeyError
        If the file has no ``localFunctions`` variable; the message names the
        file and lists the variables it does contain.
    """
    contents = io.loadmat(path, squeeze_me=True)
    if 'localFunctions' not in contents:
        raise KeyError(f"'localFunctions' not in {path}; keys={list(contents.keys())}")
    return contents['localFunctions']


# Local Functions EDA

In [3]:
# File names are searched for "u<digits>s<digits>": the two digit groups are
# stored as the user and session identifiers (kept as strings).
pattern = re.compile(r'u(\d+)s(\d+)')
lengths = []
for fp in files:
    data = load_local(fp)
    # Length is the size of the first axis; it is 0 when there is no first axis
    # to measure (None, or a 0-d value left over after squeezing).
    n = data.shape[0] if getattr(data, 'ndim', 0) else 0
    # A file whose name does not follow the pattern is still recorded, with
    # unknown ids, instead of aborting the whole scan.
    match = pattern.search(fp.stem)
    user, session = match.groups() if match else (None, None)
    lengths.append({'user': user, 'session': session, 'length': n, 'file': fp.stem})
# Explicit columns keep the frame usable by later cells even when no file was found
length_df = pd.DataFrame(lengths, columns=['user', 'session', 'length', 'file'])

KeyboardInterrupt: 

In [None]:
# Headline numbers: rows in length_df and the number of distinct user ids
n_signatures = len(length_df)
n_users = length_df['user'].nunique()
print(f"Total signatures: {n_signatures} from {n_users} users")
# Descriptive statistics of the length column
print(length_df["length"].describe())

In [None]:
# Length distribution per user: one violin per user id, quartiles drawn inside
plt.figure(figsize=(10,6))
sns.violinplot(x='user', y='length', data=length_df, inner='quartile')
plt.tight_layout()
# savefig does not create missing folders, so make sure ../figures exists first
violin_path = Path('../figures/local_length_violin.png')
violin_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(violin_path, dpi=300)

In [None]:
# Per-user statistics of the length column, plus the spread between max and min
per_user_length = length_df.groupby('user')['length']
summary = per_user_length.agg(['mean','std','min','max'])
summary['range'] = summary['max'].sub(summary['min'])
print(summary)


In [None]:
# optional link to global feature (assumes matching filenames exist)
from scipy import io
from pathlib import Path

# NOTE(review): this reads from data/processed/GlobalFeatures, while the Global
# Features EDA further down reads from data/raw/GlobalFeatures — confirm which
# location is the intended one.
global_dir = Path('../data/processed/GlobalFeatures')

def get_global(stem):
    """Return the first entry of ``globalFeatures`` for ``stem``, or NaN.

    Looks for ``<global_dir>/<stem>.mat``. When that file does not exist the
    result is ``np.nan``, so the caller can still fill a complete column.
    """
    mat_path = global_dir / f'{stem}.mat'
    if not mat_path.exists():
        return np.nan
    return io.loadmat(mat_path, squeeze_me=True)['globalFeatures'][0]

# One value per row of length_df, looked up through the file stem
length_df['duration'] = [get_global(s) for s in length_df['file']]

# Scatter of the looked-up 'duration' value against the sequence length
plt.figure()
sns.scatterplot(x='duration', y='length', data=length_df)
plt.xlabel('Global feature 1 (duration)')
plt.ylabel('Sequence length')
plt.tight_layout()
# savefig does not create missing folders, so make sure ../figures exists first
scatter_path = Path('../figures/length_vs_duration.png')
scatter_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(scatter_path, dpi=300)

# Global Features EDA

In [None]:
# --- imports -------------------------------------------------------------
import re
from pathlib import Path
import numpy as np
import pandas as pd

import scipy.io as sio
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn appearance used by the plotting cells that follow:
# the "notebook" context together with the "whitegrid" style preset.
sns.set_context("notebook")
sns.set_style("whitegrid")

In [None]:
# --- paths ---------------------------------------------------------------
# DATA_DIR is built from the parent of the current working directory, which
# fits a notebook started from a sub-folder such as <project>/notebooks/;
# adjust it if your notebook lives somewhere else.
DATA_DIR = Path.cwd().parent.joinpath("data", "raw", "GlobalFeatures").resolve()
print("Looking for .mat files in:", DATA_DIR)
files = sorted(DATA_DIR.glob("*.mat"))
n_found = len(files)
print(f"Found {n_found} files.")

# --- regex for user/session/sample --------------------------------------
# example filename: u1005s0001_sg0001.mat
# three capture groups of four digits each: user, session, sample
FNAME_RE = re.compile(r"u(\d{4})s(\d{4})_sg(\d{4})")

def load_global(path: Path, n_features: int = 40):
    """Return the global-feature vector stored in one ``.mat`` file.

    Parameters
    ----------
    path : Path
        ``.mat`` file holding a ``globalFeatures`` variable.
    n_features : int, default 40
        Number of features the vector must contain.

    Returns
    -------
    numpy.ndarray
        1-D float array of shape ``(n_features,)``.

    Raises
    ------
    KeyError
        If the file has no ``globalFeatures`` variable.
    ValueError
        If the stored value is not a vector of exactly ``n_features``
        elements (wrong size, or more than one non-unit dimension).
    """
    m = sio.loadmat(path, squeeze_me=True, struct_as_record=False)
    if "globalFeatures" not in m:
        raise KeyError(f"'globalFeatures' not in {path.name}; keys={list(m.keys())}")
    # atleast_1d keeps a single-feature file (0-d after squeezing) a vector
    arr = np.atleast_1d(np.asarray(m["globalFeatures"]).squeeze())
    # Checking the size alone would let e.g. a 5x8 matrix through, which the
    # caller would then unpack row by row instead of feature by feature.
    if arr.ndim != 1 or arr.size != n_features:
        raise ValueError(f"{path.name}: expected {n_features} features, got shape {arr.shape}")
    return arr.astype(float)

# One tuple per readable file; files that fail to load are reported and remembered
records = []
bad_files = []

for mat_path in files:
    try:
        features = load_global(mat_path)
    except Exception as err:
        print(f"[SKIP] {mat_path.name}: {err}")
        bad_files.append((mat_path, err))
        continue

    # ids are parsed from the file name; they stay None when the name does not match
    id_match = FNAME_RE.match(mat_path.stem)
    uid = sess = samp = None
    if id_match:
        uid, sess, samp = (int(group) for group in id_match.groups())

    records.append((uid, sess, samp, *features))

# build DataFrame: three id columns followed by f1..f40
cols = ["user", "session", "sample"] + [f"f{k}" for k in range(1, 41)]
df = pd.DataFrame.from_records(records, columns=cols)

print(f"Loaded {len(df)} usable records; skipped {len(bad_files)} files.")
df.head()

In [None]:
# One-row overview of the loaded global-feature table
overview = {
    "n_signatures": len(df),
    "n_users": df["user"].nunique(dropna=True),
    "n_sessions": df["session"].nunique(dropna=True),
    "features_per_signature": 40,   # fixed by design
}
summary_df = pd.DataFrame([overview])
print(summary_df.to_string(index=False))


In [None]:
# 4 x 10 grid: one histogram with a KDE overlay per feature column f1..f40
fig, axes = plt.subplots(4, 10, figsize=(20, 8), constrained_layout=True)
axes = axes.ravel()

for idx, ax in enumerate(axes[:40], start=1):
    sns.histplot(df[f"f{idx}"], ax=ax, kde=True, bins=20, edgecolor="none")
    ax.set_title(f"F{idx}", fontsize=8)
    ax.tick_params(labelsize=6)

# remove unused axes just in case (shouldn't happen)
for spare_ax in axes[40:]:
    spare_ax.axis("off")

# the figures folder is created on demand before saving
out_path = Path.cwd().parent.joinpath("figures", "global_feature_histograms.png").resolve()
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out_path, dpi=300)
print("Saved:", out_path)
plt.close(fig)


In [None]:
# z-score each feature column (NaNs omitted) and flag entries with |z| > 3
feature_names = [f"f{k}" for k in range(1, 41)]
X = df.loc[:, feature_names].to_numpy(dtype=float)
z = stats.zscore(X, nan_policy="omit", axis=0)

outliers = np.abs(z) > 3
outlier_counts = outliers.sum(axis=0)        # flagged entries per feature
outlier_prop   = outlier_counts / len(df)    # relative to the number of rows in df

outlier_df = pd.DataFrame({
    "feature": feature_names,
    "count": outlier_counts,
    "proportion": outlier_prop,
})
print(outlier_df)

# bar chart of the per-feature outlier counts
plt.figure(figsize=(10, 4))
sns.barplot(x="feature", y="count", data=outlier_df, color="C0")
plt.xticks(rotation=90, fontsize=7)
plt.xlabel("Feature")
plt.ylabel("Outlier count (>3σ)")
plt.tight_layout()
out_path = Path.cwd().parent.joinpath("figures", "global_feature_outliers.png").resolve()
plt.savefig(out_path, dpi=300)
print("Saved:", out_path)
plt.close()


In [None]:
# True wherever a value in X is NaN or +/-inf
mask = np.logical_not(np.isfinite(X))
plt.figure(figsize=(10, 6))
sns.heatmap(mask, cbar=False)
plt.title("Missing / non-finite values in Global Features")
plt.xlabel("Feature")
plt.ylabel("Signature index")
plt.tight_layout()
out_path = Path.cwd().parent.joinpath("figures", "global_missing_heatmap.png").resolve()
plt.savefig(out_path, dpi=300)
print("Saved:", out_path)
plt.close()


In [None]:
# Pairwise correlations between the 40 feature columns, linear and rank-based
feat_cols = [f"f{k}" for k in range(1, 41)]
feature_frame = df[feat_cols]
pear  = feature_frame.corr(method="pearson")
spear = feature_frame.corr(method="spearman")
figures_dir = Path.cwd().parent / "figures"

# clustermap returns a ClusterGrid; the figure is reached through its .fig attribute
cg1 = sns.clustermap(pear, cmap="coolwarm", center=0, figsize=(10,10))
cg1.fig.suptitle("Pearson correlation (clustered)", y=1.02)
out_path = (figures_dir / "global_correlation_heatmap.png").resolve()
cg1.fig.savefig(out_path, dpi=300, bbox_inches="tight")
plt.close(cg1.fig)

cg2 = sns.clustermap(spear, cmap="coolwarm", center=0, figsize=(10,10))
cg2.fig.suptitle("Spearman correlation (clustered)", y=1.02)
out_path = (figures_dir / "global_correlation_spearman.png").resolve()
cg2.fig.savefig(out_path, dpi=300, bbox_inches="tight")
plt.close(cg2.fig)

# cell output: shapes of the two correlation matrices
pear.shape, spear.shape


In [None]:
# boolean mask for the strict upper triangle (diagonal excluded),
# so each feature pair appears only once
tri = np.triu(np.ones(pear.shape, dtype=bool), k=1)
upper_abs = pear.where(tri).abs()
high_corr = upper_abs.stack().sort_values(ascending=False)

# keep the pairs whose absolute Pearson correlation exceeds 0.9
redundant = high_corr.loc[high_corr > 0.9]
print("Highly correlated feature pairs (|r| > 0.9):")
print(redundant)