In [1]:
import pandas as pd

In [2]:
all_df = pd.read_csv("/home/think/Desktop/all_df.csv")

In [3]:
all_df.drop(all_df.columns[[0]], axis=1, inplace=True)

In [4]:
all_df

Unnamed: 0,id,x,y,z
0,1,0.100000,0.100000,0.100000
1,1,0.095657,0.102960,0.107116
2,1,0.091243,0.105749,0.114304
3,1,0.086763,0.108366,0.121566
4,1,0.082225,0.110810,0.128902
...,...,...,...,...
4999995,500,0.697298,-0.201389,0.617136
4999996,500,0.696305,-0.202385,0.618926
4999997,500,0.695297,-0.203362,0.620717
4999998,500,0.694274,-0.204320,0.622507


In [5]:
import pandas as pd

def process_samples(df_lookup, all_df, select_ids=None, variables=None, calc_funcs=None):
    """
    Combines df_lookup and all_df_cleaned, and applies calculations to selected samples and variables.

    Parameters
    ----------
    df_lookup : pd.DataFrame
        Contains ['id', 'sample_name', 'ic_x', 'ic_y', 'ic_z', ...].
    all_df : pd.DataFrame
        Contains cleaned time series with columns ['id', 'x', 'y', 'z'].
    select_ids : list[int] or None
        If provided, only process these sample IDs. If None, process all.
    variables : list[str] or None
        Which variables ('x', 'y', 'z') to include. None means all present in all_df.
    calc_funcs : dict or None
        Mapping: {'func_name': func}, where func(series) returns a scalar.

    Returns
    -------
    result_df : pd.DataFrame
        Each row: id, sample_name, initial conditions, and calculated columns.
    """

    # Default values
    if select_ids is None:
        select_ids = df_lookup['id'].tolist()
    if variables is None:
        variables = [c for c in ['x', 'y', 'z'] if c in all_df.columns]
    if calc_funcs is None:
        calc_funcs = {
            'mean': pd.Series.mean,
            'std': pd.Series.std,
            'max': pd.Series.max,
            'min': pd.Series.min
        }

    records = []

    for sid in select_ids:
        lookup_row = df_lookup.loc[df_lookup['id'] == sid]
        if lookup_row.empty:
            print(f"⚠️ id {sid} not found in df_lookup — skipping.")
            continue

        entry = lookup_row.iloc[0].to_dict()
        ts = all_df[all_df['id'] == sid]

        if ts.empty:
            print(f"⚠️ id {sid} has no time series data — skipping.")
            continue

        for var in variables:
            if var not in ts.columns:
                print(f"⚠️ Column {var} not in time series — skipping for id {sid}.")
                continue
            series = ts[var].dropna()

            for func_name, func in calc_funcs.items():
                col = f"{var}_{func_name}"
                entry[col] = func(series)

        records.append(entry)

    result_df = pd.DataFrame(records)
    return result_df


In [6]:
import nolds

def extract_chaos_features(series):
    """Extract chaos features from a 1D time series."""
    return {
        'lyap': nolds.lyap_r(series),
        'corr_dim': nolds.corr_dim(series, emb_dim=10),
        'ap_entropy': nolds.sampen(series),
    }

def compute_features_by_id(df_all):
    """
    Agrupa por 'id' y calcula las features para cada grupo.

    Parameters
    ----------
    df_all : pd.DataFrame
        DataFrame con columnas ['id', 'x', 'y', 'z']

    Returns
    -------
    pd.DataFrame
        DataFrame con columnas ['id', 'lyap', 'corr_dim', 'ap_entropy']
    """

    feature_rows = []

    for id_value, group in df_all.groupby("id"):
        try:
            x_series = group["y"].values
            features = extract_chaos_features(x_series)
            features["id"] = id_value
            feature_rows.append(features)
        except Exception as e:
            print(f"Error calculating features for id {id_value}: {e}")

    return pd.DataFrame(feature_rows)

In [7]:
subset_df = all_df.iloc[1:10000]

In [8]:
subset_df_new = subset_df[["id", "y"]]

In [7]:
features_df = compute_features_by_id(subset_df)
features_df = features_df.sort_values("id").reset_index(drop=True)

print(features_df.head())

       lyap  corr_dim  ap_entropy  id
0  0.000575  1.359338    0.068571   1


In [8]:
features_df

Unnamed: 0,lyap,corr_dim,ap_entropy,id
0,0.000575,1.359338,0.068571,1


In [None]:
import nolds

funcs = {
    'lyap': lambda s: nolds.lyap_r(s.values),
    'corr_dim': lambda s: nolds.corr_dim(s.values, emb_dim=10),
    'ap_entropy': lambda s: nolds.sampen(s.values),
}
ids = [302]

df_res = process_samples(df_lookup, 
                         all_df_cleaned,
                         select_ids=None,
                         variables=['x'], 
                         calc_funcs=funcs)

# tsfresh

In [9]:
import pandas as pd
from tsfresh import extract_features

def process_tsfresh_features(
    all_df,
    select_ids=None,
    variables=None,
    default_fc_parameters=None
):
    """
    Extracts tsfresh features per id and per variable separately.

    Parameters
    ----------
    all_df : pd.DataFrame
        Must have columns ['id', 'x', 'y', 'z'], where multiple rows per id.
    select_ids : list[int], optional
        Which ids to process.
    variables : list[str], optional
        Which of ['x', 'y', 'z'] to process. None = all present.
    default_fc_parameters : dict, optional
        Feature calc params passed to tsfresh.extract_features()

    Returns
    -------
    pd.DataFrame
        One row per id, columns like x__feature1, y__feature1, etc.
    """
    if select_ids is None:
        select_ids = all_df['id'].unique().tolist()
    if variables is None:
        variables = [c for c in ['x', 'y', 'z'] if c in all_df.columns]

    records = []

    for sid in select_ids:
        ts = all_df[all_df['id'] == sid]
        record = {'id': sid}
        if ts.empty:
            print(f"⚠️ id {sid} not found in all_df — skipping")
            continue

        for var in variables:
            if var not in ts.columns:
                print(f"⚠️ column '{var}' not found — skipping {sid}")
                continue

            # Shape: id, time index, value
            temp_df = pd.DataFrame({
                'id': sid,
                'time': range(len(ts)),
                'value': ts[var].values
            })
            # extract_features expects column_id, column_sort, column_value
            features_df = extract_features(
                temp_df,
                column_id='id',
                column_sort='time',
                column_value='value',
                default_fc_parameters=default_fc_parameters
            )

            # Prefix the feature columns with the variable name
            features_df.columns = [f"{var}__{c}" for c in features_df.columns]

            # Save features as a dict
            record.update(features_df.iloc[0].to_dict())

        records.append(record)

    return pd.DataFrame(records)


In [None]:
tsdf = process_tsfresh_features(subset_df)

Feature Extraction: 100%|██████████| 1/1 [01:08<00:00, 68.39s/it]
Feature Extraction:   0%|          | 0/1 [00:03<?, ?it/s]Process ForkPoolWorker-5:
Process ForkPoolWorker-7:
Process ForkPoolWorker-8:
Process ForkPoolWorker-6:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/think/

KeyboardInterrupt: 

Process ForkPoolWorker-11:
Process ForkPoolWorker-9:
Process ForkPoolWorker-10:
Process ForkPoolWorker-12:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/think/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/think/anacond

In [11]:
tsdf

Unnamed: 0,id,x__value__variance_larger_than_standard_deviation,x__value__has_duplicate_max,x__value__has_duplicate_min,x__value__has_duplicate,x__value__sum_values,x__value__abs_energy,x__value__mean_abs_change,x__value__mean_change,x__value__mean_second_derivative_central,...,z__value__fourier_entropy__bins_5,z__value__fourier_entropy__bins_10,z__value__fourier_entropy__bins_100,z__value__permutation_entropy__dimension_3__tau_1,z__value__permutation_entropy__dimension_4__tau_1,z__value__permutation_entropy__dimension_5__tau_1,z__value__permutation_entropy__dimension_6__tau_1,z__value__permutation_entropy__dimension_7__tau_1,z__value__query_similarity_count__query_None__threshold_0.0,z__value__mean_n_absolute_max__number_of_maxima_7
0,1,0.0,0.0,0.0,0.0,-143.879924,2644.996813,0.012985,-0.000113,2e-06,...,0.045395,0.090729,0.136002,0.707022,0.72883,0.750632,0.77243,0.794223,,1.881531


In [11]:
import pandas as pd
from tsfresh import extract_features

def process_tsfresh_features_long(
    all_df,
    select_ids=None,
    variables=None,
    default_fc_parameters=None
):
    """
    Extracts tsfresh features per id and per variable separately,
    returning one row per (id, axis).
    """
    if select_ids is None:
        select_ids = all_df['id'].unique().tolist()
    if variables is None:
        variables = [c for c in ['x', 'y', 'z'] if c in all_df.columns]

    records = []
    for sid in select_ids:
        ts = all_df[all_df['id'] == sid]
        if ts.empty:
            continue

        for var in variables:
            temp_df = pd.DataFrame(
                {'id': sid, 'time': range(len(ts)), 'value': ts[var].values}
            )
            features_df = extract_features(
                temp_df,
                column_id='id',
                column_sort='time',
                column_value='value',
                default_fc_parameters=default_fc_parameters
            )
            features_df['id'] = sid
            features_df['axis'] = var
            records.append(features_df)

    # Concatenate all results vertically
    result_df = pd.concat(records, ignore_index=True)
    return result_df


In [14]:
df_20 = pd.read_csv('/home/think/Desktop/5x4_sample/5x4_df_samples.csv')

In [16]:
df_20.drop(df_20.columns[[0]], axis=1, inplace=True)

In [17]:
df_20

Unnamed: 0,id,x,y,z
0,1,0.100000,0.100000,0.100000
1,1,0.097913,0.101250,0.096328
2,1,0.095850,0.102482,0.092863
3,1,0.093809,0.103694,0.089594
4,1,0.091790,0.104888,0.086511
...,...,...,...,...
199995,20,1.167056,-0.475198,0.340047
199996,20,1.153929,-0.491741,0.335677
199997,20,1.140256,-0.507738,0.331315
199998,20,1.126071,-0.523168,0.326961


In [19]:
tsdf = process_tsfresh_features_long(df_20,
                                     variables=['y','z'])

Feature Extraction: 100%|██████████| 1/1 [01:08<00:00, 68.06s/it]
Feature Extraction: 100%|██████████| 1/1 [01:07<00:00, 67.67s/it]
Feature Extraction: 100%|██████████| 1/1 [01:07<00:00, 67.70s/it]
Feature Extraction: 100%|██████████| 1/1 [01:07<00:00, 67.15s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.83s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.24s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.74s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.66s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.53s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.34s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.63s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.40s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.40s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.44s/it]
Feature Extraction: 100%|██████████| 1/1 [01:06<00:00, 66.40s/it]
Feature Ex

In [20]:
tsdf

Unnamed: 0,value__variance_larger_than_standard_deviation,value__has_duplicate_max,value__has_duplicate_min,value__has_duplicate,value__sum_values,value__abs_energy,value__mean_abs_change,value__mean_change,value__mean_second_derivative_central,value__median,...,value__fourier_entropy__bins_100,value__permutation_entropy__dimension_3__tau_1,value__permutation_entropy__dimension_4__tau_1,value__permutation_entropy__dimension_5__tau_1,value__permutation_entropy__dimension_6__tau_1,value__permutation_entropy__dimension_7__tau_1,value__query_similarity_count__query_None__threshold_0.0,value__mean_n_absolute_max__number_of_maxima_7,id,axis
0,1.0,0.0,0.0,0.0,-5810.998895,172503.8,0.034451,-0.000405,-2.403713e-06,-0.135249,...,0.136002,0.717249,0.742774,0.768292,0.793801,0.819302,,10.782609,1,y
1,1.0,0.0,0.0,0.0,6166.263451,53078.9,0.017596,-8e-06,1.873779e-07,0.035871,...,0.226363,0.713542,0.741573,0.770152,0.799008,0.828661,,22.733313,1,z
2,1.0,0.0,0.0,0.0,-5682.592723,186525.9,0.037237,-0.000682,-3.895942e-06,-0.194775,...,0.136002,0.717344,0.743008,0.768665,0.794313,0.819953,,10.784995,2,y
3,1.0,0.0,0.0,0.0,6253.59554,53859.94,0.017854,-1.8e-05,4.635646e-07,0.036477,...,0.226363,0.712023,0.739888,0.767258,0.794619,0.821671,,22.766655,2,z
4,1.0,0.0,0.0,0.0,-6416.173857,186217.2,0.037061,-0.000725,-1.199111e-06,-0.291802,...,0.136002,0.716822,0.742272,0.767715,0.793149,0.818575,,10.748417,3,y
5,1.0,0.0,0.0,0.0,6489.307424,66389.59,0.019601,-2.7e-05,7.380067e-07,0.036729,...,0.226363,0.709625,0.73525,0.760867,0.786811,0.81336,,22.454316,3,z
6,1.0,0.0,0.0,0.0,-6633.287157,198316.8,0.039067,-0.000879,-1.473859e-06,-0.338088,...,0.136002,0.716852,0.742467,0.768073,0.793672,0.819262,,10.787108,4,y
7,1.0,0.0,0.0,0.0,6725.579703,63331.22,0.019808,-3.7e-05,9.956665e-07,0.037139,...,0.226363,0.708066,0.734219,0.760666,0.787392,0.814779,,22.763815,4,z
8,1.0,0.0,0.0,0.0,-6533.249611,716223.5,0.418179,-0.000234,1.882891e-05,-0.716866,...,0.485184,0.856926,1.026291,1.202566,1.387428,1.581708,,24.109279,5,y
9,1.0,0.0,0.0,0.0,214055.218308,5207506.0,0.564407,0.002971,-3.219095e-05,20.932648,...,0.395585,0.801189,0.945321,1.093743,1.244378,1.401122,,42.461762,5,z


In [21]:
tsdf.to_csv("/home/think/Desktop/5x4_sample/5x4_cf_y_z.csv")