# Local Functions EDA

In [None]:

import re
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import io

%matplotlib inline

# Directory searched for the raw local-function .mat files.
local_dir = Path('../data/raw/LocalFunctions')
# Materialise the glob, then sort in place so files are visited in path order.
files = list(local_dir.glob('*.mat'))
files.sort()

def load_local(path):
    """Read the 'localFunctions' variable from the MAT-file at *path*.

    The file is loaded with ``squeeze_me=True``, so unit-length dimensions
    are removed from the returned array.
    """
    contents = io.loadmat(path, squeeze_me=True)
    return contents['localFunctions']


In [None]:

# Build one row per input file: user id, session id, sequence length, file stem.
# File stems are expected to contain 'u<digits>s<digits>'.
pattern = re.compile(r'u(\d+)s(\d+)')
lengths = []
skipped = []
for fp in files:
    match = pattern.search(fp.stem)
    if match is None:
        # Without this guard a stray .mat file aborts the whole loop with an
        # AttributeError on None; checking first also avoids loading it.
        skipped.append(fp.name)
        continue
    user, session = match.groups()
    data = load_local(fp)
    # The first dimension of the loaded array is taken as the sequence length.
    # NOTE(review): load_local squeezes singleton dimensions, so a file whose
    # array has a single row would report its column count here -- confirm
    # whether such files can occur.
    n = data.shape[0] if data is not None else 0
    lengths.append({'user': user, 'session': session, 'length': n, 'file': fp.stem})
if skipped:
    print(f"Skipped {len(skipped)} file(s) not matching 'u<N>s<N>': {skipped}")
# Explicit columns keep the frame usable by later cells even when no file
# was found (an empty list would otherwise give a frame without columns).
length_df = pd.DataFrame(lengths, columns=['user', 'session', 'length', 'file'])


In [None]:

# Headline counts: one row per signature file, users counted by distinct id.
n_signatures = len(length_df)
n_users = length_df['user'].nunique()
print(f"Total signatures: {n_signatures} from {n_users} users")

# Descriptive statistics of the sequence lengths across all files.
length_stats = length_df["length"].describe()
print(length_stats)


In [None]:

# Per-user distribution of sequence lengths, drawn with quartile markers.
violin_path = Path('../figures/local_length_violin.png')
# savefig raises if the output directory is missing, so create it up front.
violin_path.parent.mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(10, 6))
sns.violinplot(x='user', y='length', data=length_df, inner='quartile')
plt.tight_layout()
plt.savefig(violin_path, dpi=300)


In [None]:

# Per-user length statistics, plus the spread between longest and shortest.
summary = (
    length_df
    .groupby('user')['length']
    .agg(['mean', 'std', 'min', 'max'])
    .assign(range=lambda stats: stats['max'] - stats['min'])
)
print(summary)


In [None]:

# Optional link to the first global feature. Global-feature files are looked
# up in global_dir by the same stem as the local-function files.
from scipy import io
from pathlib import Path

global_dir = Path('../data/processed/GlobalFeatures')


def get_global(stem):
    """Return the first entry of 'globalFeatures' for *stem*, or NaN.

    NaN is returned when ``global_dir / f'{stem}.mat'`` does not exist.
    """
    mat_path = global_dir / f'{stem}.mat'
    if not mat_path.exists():
        return np.nan
    features = io.loadmat(mat_path, squeeze_me=True)['globalFeatures']
    # squeeze_me collapses a 1x1 variable to a 0-d array, which cannot be
    # indexed with [0]; atleast_1d leaves arrays with 1+ dimensions untouched.
    return np.atleast_1d(features)[0]


length_df['duration'] = [get_global(stem) for stem in length_df['file']]

# Scatter of sequence length against the 'duration' column of length_df.
scatter_path = Path('../figures/length_vs_duration.png')
# savefig raises if the output directory is missing, so create it up front.
scatter_path.parent.mkdir(parents=True, exist_ok=True)

plt.figure()
sns.scatterplot(x='duration', y='length', data=length_df)
plt.xlabel('Global feature 1 (duration)')
plt.ylabel('Sequence length')
plt.tight_layout()
plt.savefig(scatter_path, dpi=300)
