In [22]:
import pandas as pd
import numpy as np
from scipy.signal import medfilt
from pathlib import Path
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [23]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import re

# Folder with the exported RR-interval text files and the root folder for
# everything this notebook writes.
RAW_DIR = Path("data/export/tremor-peso-fade@duck.com")
OUT_DIR = Path("data")
OUT_DIR.mkdir(exist_ok=True)

# Maps the lower-cased first name found in an export file name to the ZHAW
# short ID used in output file names. 'micheelle' is kept as an alias so that
# both spellings resolve to the same ID.
ZHAW_KUERZEL = {
    'oskar': 'steinosk',
    'micheelle': 'linamic',
    'michelle': 'linamic',
    'eros': 'halero01'
}

# Every recording is anchored to the same start timestamp; it does not depend
# on the file, so it is built once instead of once per iteration.
start_time = datetime(2025, 10, 16, 9, 20, 31)

for f in RAW_DIR.glob("*_int_*.txt"):
    # Expected stem: "<person>_int_<digit>...". The glob is looser than this
    # regex, so a stray file can match the glob but not the pattern.
    match = re.match(r"([A-Za-z]+)_int_(\d)", f.stem)
    if match is None:
        print(f"⚠️ skipped {f.name}: name does not match <person>_int_<digit>")
        continue
    person, intensity = match.groups()
    # Both 'michelle' and 'micheelle' are keys of ZHAW_KUERZEL, so the
    # lower-cased name can be looked up directly. An unknown name still
    # raises KeyError below, which names the offending key.
    person_key = person.lower()

    # One RR interval per line; unit is milliseconds (see unit="ms" below).
    rr = pd.read_csv(f, header=None)[0].values
    # Each beat is timestamped at the end of its RR interval.
    time_axis = start_time + pd.to_timedelta(rr.cumsum(), unit="ms")
    # 60000 ms per minute divided by the interval length gives beats/minute.
    bpm = 60000 / rr

    df = pd.DataFrame({
        "time": time_axis,
        "rr_interval": rr,
        "bpm": bpm
    })

    zhaw_id = ZHAW_KUERZEL[person_key]
    out_file = OUT_DIR / f"raw/{zhaw_id}/{zhaw_id}_{intensity}_lsdlpr25.csv"
    out_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_file, index=False)
    print(f"✅ {out_file.name}")


✅ linamic_3_lsdlpr25.csv
✅ halero01_1_lsdlpr25.csv
✅ linamic_1_lsdlpr25.csv
✅ halero01_3_lsdlpr25.csv
✅ halero01_2_lsdlpr25.csv
✅ linamic_2_lsdlpr25.csv
✅ steinosk_1_lsdlpr25.csv
✅ steinosk_2_lsdlpr25.csv
✅ steinosk_3_lsdlpr25.csv


In [24]:
# Per-participant questionnaire data, keyed by the lower-cased first name.
# Every key/value pair of a participant is copied as a constant column into
# that participant's cleaned CSV files, so key names and key order are part of
# the output format.
# NOTE(review): the key 'wellbeeing' is misspelled (sic); it is a column name
# in the written files, so it is left as-is here.
METADATA = {
    'oskar': {
        'person_id': '001',  # assigned participant ID (TODO: confirm)
        'sex': 1,  # 0 = female, 1 = male
        'age': 26,  # age in years (TODO: confirm)
        'exercise': 'running',  # running, stairs, lunges, mountain_climbers
        'diet': 0,  # 0 = meat, 1 = vegetarian, 2 = vegan
        'caffeine': 1,  # 0 = no, 1 = yes
        'sleep_q': 2,  # sleep quality: 0 = poor, 1 = medium, 2 = good
        'sleep_h': 7,  # sleep duration in hours
        'wellbeeing': 20,  # WHO-5 questionnaire (0-25 points)
        'cold_hist': 0,  # 0 = no, 1 = yes
        'asthma': 0,  # 0 = no, 1 = yes
        'smoker': 1,  # 0 = no, 1 = yes
        'fitness_level': 0,  # 0 = low, 1 = medium, 2 = high
        'study_modus': 1,  # 0 = full-time, 1 = part-time
        'bmi': 21,  # body mass index
        'rpm': 0,  # repetitions per minute (cadence)
    },
    'michelle': {
        'person_id': '002',  # assigned participant ID (TODO: confirm)
        'sex': 0,  # 0 = female, 1 = male
        'age': 22,  # age in years (TODO: confirm)
        'exercise': 'running',  # running, stairs, lunges, mountain_climbers
        'diet': 0,  # 0 = meat, 1 = vegetarian, 2 = vegan
        'caffeine': 0,  # 0 = no, 1 = yes
        'sleep_q': 0,  # sleep quality: 0 = poor, 1 = medium, 2 = good
        'sleep_h': 0.5,  # sleep duration in hours — NOTE(review): 0.5 h looks like a placeholder; confirm
        'wellbeeing': 21,  # WHO-5 questionnaire (0-25 points)
        'cold_hist': 0,  # 0 = no, 1 = yes
        'asthma': 0,  # 0 = no, 1 = yes
        'smoker': 0,  # 0 = no, 1 = yes
        'fitness_level': 1,  # 0 = low, 1 = medium, 2 = high
        'study_modus': 0,  # 0 = full-time, 1 = part-time
        'bmi': 0.0,  # body mass index — NOTE(review): 0.0 is not a plausible BMI; presumably unfilled, confirm
        'rpm': 0,  # repetitions per minute (cadence)
    },
    'eros': {
        'person_id': '003',  # assigned participant ID (TODO: confirm)
        'sex': 1,  # 0 = female, 1 = male
        'age': 26,  # age in years (TODO: confirm)
        'exercise': 'running',  # running, stairs, lunges, mountain_climbers
        'diet': 0,  # 0 = meat, 1 = vegetarian, 2 = vegan
        'caffeine': 1,  # 0 = no, 1 = yes
        'sleep_q': 2,  # sleep quality: 0 = poor, 1 = medium, 2 = good
        'sleep_h': 8,  # sleep duration in hours
        'wellbeeing': 20,  # WHO-5 questionnaire (0-25 points)
        'cold_hist': 0,  # 0 = no, 1 = yes
        'asthma': 1,  # 0 = no, 1 = yes
        'smoker': 0,  # 0 = no, 1 = yes
        'fitness_level': 1,  # 0 = low, 1 = medium, 2 = high
        'study_modus': 0,  # 0 = full-time, 1 = part-time
        'bmi': 26,  # body mass index
        'rpm': 0,  # repetitions per minute (cadence)
    }
}



# Rebinds ZHAW_KUERZEL from the conversion cell without the 'micheelle' alias,
# so its keys are exactly the keys of METADATA (the processing loop uses each
# key to index METADATA).
ZHAW_KUERZEL = {'oskar': 'steinosk', 'michelle': 'linamic', 'eros': 'halero01'}


In [25]:



def process_person(person, zhaw_id):
    """Clean every raw recording of one participant and return comparison stats.

    This is the read-only variant: it does not write any file. A later cell in
    this notebook defines ``process_person`` again (with per-person trimming
    and CSV export); once that cell has run, it replaces this definition.

    Args:
        person: Participant key. Unused in this variant; kept so the signature
            matches the later definition.
        zhaw_id: ZHAW short ID; selects ``data/raw/<zhaw_id>/``.

    Returns:
        A list with one dict per intensity file containing the raw and cleaned
        BPM arrays ('raw', 'clean') plus 'intensity', 'mean_raw', 'mean_clean',
        'std_raw' and 'std_clean'.
    """
    # Only process raw files (not cleaned ones)
    raw_files = sorted(Path("data").glob(f"raw/{zhaw_id}/{zhaw_id}_[1-3]_lsdlpr25.csv"))
    comparisons = []

    for raw_file in raw_files:
        # Stem is "<zhaw_id>_<intensity>_lsdlpr25". Splitting from the right
        # keeps the intensity at index 1 even if the ID contained underscores.
        # (The previous index 3 was out of range for every file.)
        intensity = int(raw_file.stem.rsplit('_', 2)[1])
        df = pd.read_csv(raw_file)

        bpm_raw = df['bpm'].values
        bpm_clean = clean_bpm(bpm_raw)
        # Number of samples whose value was changed by the cleaning step.
        artifacts = np.sum(bpm_raw != bpm_clean)

        comparisons.append({
            'intensity': intensity,
            'raw': bpm_raw,
            'clean': bpm_clean,
            'mean_raw': df['bpm'].mean(),
            'mean_clean': bpm_clean.mean(),
            # pandas .std() uses ddof=1; use the same estimator for the
            # cleaned array so both values are comparable.
            'std_raw': df['bpm'].std(),
            'std_clean': np.std(bpm_clean, ddof=1),
        })
        print(f"✅ {zhaw_id}_activity_intensity_{intensity}_cleaned_lsdlpr25.csv (artifacts: {artifacts})")

    return comparisons

In [26]:
def clean_bpm(bpm, window=5, person=None):
    """Median-filter a BPM series and replace implausible values.

    Args:
        bpm: 1-D array of beats-per-minute values.
        window: Kernel size of the first median filter (passed to ``medfilt``).
        person: Participant key; ``'eros'`` enables the extra cleaning stage.

    Returns:
        Array of the same length as ``bpm`` with the cleaned values.
    """
    def _replace_outside(values, low, high):
        # Values strictly outside [low, high] become the median of `values`.
        out_of_range = (values < low) | (values > high)
        return np.where(out_of_range, np.median(values), values)

    # Stage 1 (everyone): median filter, then clamp to the 40-200 BPM band.
    smoothed = _replace_outside(medfilt(bpm, kernel_size=window), 40, 200)

    if person != 'eros':
        return smoothed

    # Stage 2 (Eros only): wider median filter ...
    smoothed = medfilt(smoothed, kernel_size=11)

    # ... IQR fence (1.5 x IQR beyond the quartiles) ...
    q25, q75 = np.percentile(smoothed, [25, 75])
    spread = q75 - q25
    smoothed = _replace_outside(smoothed, q25 - 1.5 * spread, q75 + 1.5 * spread)

    # ... and a centred rolling median over 7 samples.
    rolling = pd.Series(smoothed).rolling(window=7, center=True, min_periods=1)
    return rolling.median().values

def process_person(person, zhaw_id):
    """Clean all raw recordings of one participant, export them, return stats.

    For every ``data/raw/<zhaw_id>/<zhaw_id>_<1-3>_lsdlpr25.csv`` file the BPM
    column is cleaned with ``clean_bpm``, the participant's ``METADATA`` entry
    is attached as constant columns, and the result is written to
    ``data/clean/<zhaw_id>/<zhaw_id>_<intensity>_lsdlpr25.csv``.

    Args:
        person: Participant key; indexes ``METADATA`` and selects the
            person-specific trimming and cleaning.
        zhaw_id: ZHAW short ID used in the input and output paths.

    Returns:
        A list with one dict per intensity file containing the raw and cleaned
        BPM arrays ('raw', 'clean') plus 'intensity', 'mean_raw', 'mean_clean',
        'std_raw' and 'std_clean'.

    Raises:
        KeyError: If ``person`` is not a key of ``METADATA``.
    """
    # Number of leading samples dropped before cleaning, per person and
    # intensity. NOTE(review): the earlier inline comments described different
    # cuts ("first 10 and last 10", "first 30"); the counts below are the ones
    # the code actually applied — confirm which is intended.
    leading_samples_to_drop = {'eros': {2: 70, 3: 60}}

    # Only process raw files (not cleaned ones)
    raw_files = sorted(Path("data").glob(f"raw/{zhaw_id}/{zhaw_id}_[1-3]_lsdlpr25.csv"))
    comparisons = []

    for raw_file in raw_files:
        # Stem is "<zhaw_id>_<intensity>_lsdlpr25".
        intensity = int(raw_file.stem.split('_')[1])
        df = pd.read_csv(raw_file)

        n_drop = leading_samples_to_drop.get(person, {}).get(intensity, 0)
        if n_drop:
            df = df.iloc[n_drop:].reset_index(drop=True)

        bpm_clean = clean_bpm(df['bpm'].values, person=person)
        # Number of samples whose value was changed by the cleaning step.
        artifacts = np.sum(df['bpm'].values != bpm_clean)

        row = {
            'time': df['time'],
            'rr_interval': df['rr_interval'],
            'bpm_raw': df['bpm'],
            'bpm_clean': bpm_clean,
            'intensity': intensity,
            'artifacts': artifacts
        }
        # Scalars are broadcast to every row by the DataFrame constructor.
        row.update(METADATA[person])

        df_clean = pd.DataFrame(row)
        out_path = Path(f"data/clean/{zhaw_id}/{zhaw_id}_{intensity}_lsdlpr25.csv")
        out_path.parent.mkdir(parents=True, exist_ok=True)

        df_clean.to_csv(out_path, index=False)
        # The label below is informational only; the file written is out_path.
        print(f"✅ {zhaw_id}_{intensity}_cleaned_lsdlpr25.csv (artifacts: {artifacts})")

        comparisons.append({
            'intensity': intensity,
            'raw': df['bpm'].values,
            'clean': bpm_clean,
            'mean_raw': df['bpm'].mean(),
            'mean_clean': bpm_clean.mean(),
            # pandas .std() uses ddof=1 while ndarray.std() defaults to ddof=0;
            # use ddof=1 on both sides so raw and clean are comparable.
            'std_raw': df['bpm'].std(),
            'std_clean': np.std(bpm_clean, ddof=1)
        })
        print(f"✅ {zhaw_id}_{intensity} (artifacts: {artifacts})")

    return comparisons


In [27]:

def plot_comparison(person, zhaw_id, data):
    """Show raw vs. cleaned BPM traces and summary bars for one participant.

    The figure has two rows and one column per entry of ``data``: the top row
    overlays the raw and cleaned series, the bottom row shows mean/std and
    min/max bars for both.

    Args:
        person: Participant key; used (capitalised) in the figure title.
        zhaw_id: ZHAW short ID. Unused; kept for a uniform call signature.
        data: List of dicts as returned by ``process_person`` (keys
            'intensity', 'raw', 'clean', 'mean_raw', 'mean_clean', 'std_raw',
            'std_clean').
    """
    # Size the grid from the data instead of assuming exactly three
    # intensities: with fewer entries the "Stats" titles would land on the
    # wrong subplots, with more the fourth column would not exist.
    n_cols = max(len(data), 1)
    fig = make_subplots(
        rows=2, cols=n_cols,
        subplot_titles=[f"Intensity {d['intensity']}" for d in data] + [f"Stats {d['intensity']}" for d in data],
        specs=[[{'type': 'scatter'} for _ in range(n_cols)],
               [{'type': 'bar'} for _ in range(n_cols)]]
    )

    for i, d in enumerate(data, 1):
        fig.add_trace(
            go.Scatter(y=d['raw'], name=f"Raw {d['intensity']}", opacity=0.6, line=dict(color='red')),
            row=1, col=i
        )
        fig.add_trace(
            go.Scatter(y=d['clean'], name=f"Clean {d['intensity']}", line=dict(color='green')),
            row=1, col=i
        )

        # (x labels, y values, series label, colour, opacity) — same four bar
        # traces, in the same order, as before.
        bar_specs = [
            (['Mean', 'Std'], [d['mean_raw'], d['std_raw']], 'Raw', 'red', 0.6),
            (['Mean', 'Std'], [d['mean_clean'], d['std_clean']], 'Clean', 'green', None),
            (['Min', 'Max'], [min(d['clean']), max(d['clean'])], 'Clean', 'green', None),
            (['Min', 'Max'], [min(d['raw']), max(d['raw'])], 'Raw', 'red', 0.6),
        ]
        for labels, values, series, color, opacity in bar_specs:
            fig.add_trace(
                go.Bar(x=labels, y=values, name=f"{series} {d['intensity']}",
                       marker=dict(color=color), opacity=opacity),
                row=2, col=i
            )
    fig.update_layout(height=800, title_text=f"{person.capitalize()} - Raw vs Cleaned BPM", showlegend=False)
    fig.show()

# Section banner; printed once when this cell runs, before any participant is processed.
print("=== Datenaufbereitung & Vergleich ===\n")

def plot_info_table_plotly(person, zhaw_id):
    """Display one participant's METADATA entry as a two-column Plotly table.

    Args:
        person: Participant key; indexes ``METADATA`` and appears in the title.
        zhaw_id: ZHAW short ID. Not used by this function.
    """
    info = METADATA[person]
    header_style = dict(values=["Attribut", "Wert"], fill_color='paleturquoise', align='left')
    # First column: attribute names, second column: their values.
    cell_style = dict(values=[list(info), list(info.values())], fill_color='lavender', align='left')

    fig = go.Figure(data=[go.Table(header=header_style, cells=cell_style)])
    fig.update_layout(title_text=f"{person.capitalize()} - Metadaten")
    fig.show()

# Driver: for every participant in ZHAW_KUERZEL, clean and export the
# recordings, then show the raw-vs-clean comparison figure and the metadata
# table. `person`, `zhaw_id` and `data` remain bound as notebook globals
# after the loop.
for person, zhaw_id in ZHAW_KUERZEL.items():
    print(f"Processing {person.capitalize()}...")
    # List of per-intensity comparison dicts for this participant.
    data = process_person(person, zhaw_id)
    plot_comparison(person, zhaw_id, data)
    plot_info_table_plotly(person, zhaw_id)
    # Blank line between participants in the text output.
    print()

print("✅ Datenaufbereitung abgeschlossen!")

=== Datenaufbereitung & Vergleich ===

Processing Oskar...
✅ steinosk_1_cleaned_lsdlpr25.csv (artifacts: 231)
✅ steinosk_1 (artifacts: 231)
✅ steinosk_2_cleaned_lsdlpr25.csv (artifacts: 241)
✅ steinosk_2 (artifacts: 241)
✅ steinosk_3_cleaned_lsdlpr25.csv (artifacts: 291)
✅ steinosk_3 (artifacts: 291)



Processing Michelle...
✅ linamic_1_cleaned_lsdlpr25.csv (artifacts: 224)
✅ linamic_1 (artifacts: 224)
✅ linamic_2_cleaned_lsdlpr25.csv (artifacts: 240)
✅ linamic_2 (artifacts: 240)
✅ linamic_3_cleaned_lsdlpr25.csv (artifacts: 288)
✅ linamic_3 (artifacts: 288)



Processing Eros...
✅ halero01_1_cleaned_lsdlpr25.csv (artifacts: 188)
✅ halero01_1 (artifacts: 188)
✅ halero01_2_cleaned_lsdlpr25.csv (artifacts: 164)
✅ halero01_2 (artifacts: 164)
✅ halero01_3_cleaned_lsdlpr25.csv (artifacts: 235)
✅ halero01_3 (artifacts: 235)



✅ Datenaufbereitung abgeschlossen!


In [28]:
# Final Combined Dataset for Submission
import pandas as pd
from pathlib import Path

# Collect every cleaned file. The cleaning step writes to
# data/clean/<zhaw_id>/<zhaw_id>_<intensity>_lsdlpr25.csv, so search there;
# the previous pattern ("data/*_cleaned_lsdlpr25.csv") matched no file and the
# loop below silently did nothing.
cleaned_files = sorted(Path("data/clean").glob("*/*_lsdlpr25.csv"))

if not cleaned_files:
    print("⚠️ No cleaned files found under data/clean - run the cleaning cell first.")

# Show summary statistics (describe) for each cleaned file
for file in cleaned_files:
    df = pd.read_csv(file)
    display(df.describe())
