In [1]:
!pip install tsfresh



In [2]:
import os
import pandas as pd
import numpy as np
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters, MinimalFCParameters, extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

In [3]:
# Read the DataFrame from the HDF5 file
df = pd.read_hdf(r"C:\Users\AMRAN\OneDrive - Aalborg Universitet\8. semester\Projekt\CODE\Machine Learning/Dataset(intrinsic).h5", key='dataset')

In [4]:
print(df)

                  Source  Time (ms)          Type         Value
0        i19052025001000        0.0  Nset (1/min)      0.000000
1        i21062025004001        0.0   Torque (Nm)     -0.000704
2        i21062025004001        0.0   Current (V)     -0.015869
3        i21062025004001        0.0     Angle (°)      0.000000
4        i21062025004001        0.0    Depth (mm)      0.134766
...                  ...        ...           ...           ...
5601810  i19052025001000     4127.0   Current (V)      0.437012
5601811  i19052025001000     4127.0     Angle (°)  18594.267600
5601812  i19052025001000     4127.0  Nset (1/min)      0.000000
5601813  i19052025001000     4127.0    Depth (mm)     20.283203
5601814  i19052025001000     4127.0   Torque (Nm)      0.697212

[5601815 rows x 4 columns]


In [5]:
df_intrinsic = df

In [10]:
df_intrinsic = df[df['Source'].str.startswith('i')]
print(df_intrinsic)

                  Source  Time (ms)          Type         Value
0        i19052025001000        0.0  Nset (1/min)      0.000000
1        i21062025004001        0.0   Torque (Nm)     -0.000704
2        i21062025004001        0.0   Current (V)     -0.015869
3        i21062025004001        0.0     Angle (°)      0.000000
4        i21062025004001        0.0    Depth (mm)      0.134766
...                  ...        ...           ...           ...
5601810  i19052025001000     4127.0   Current (V)      0.437012
5601811  i19052025001000     4127.0     Angle (°)  18594.267600
5601812  i19052025001000     4127.0  Nset (1/min)      0.000000
5601813  i19052025001000     4127.0    Depth (mm)     20.283203
5601814  i19052025001000     4127.0   Torque (Nm)      0.697212

[5601815 rows x 4 columns]


In [11]:
# Rename columns to meet tsfresh requirements
df_intrinsic = df_intrinsic.rename(columns={'Source': 'id', 'Time (ms)': 'time', 'Type': 'kind', 'Value': 'value'})

print(df_intrinsic)


                      id    time          kind         value
0        i19052025001000     0.0  Nset (1/min)      0.000000
1        i21062025004001     0.0   Torque (Nm)     -0.000704
2        i21062025004001     0.0   Current (V)     -0.015869
3        i21062025004001     0.0     Angle (°)      0.000000
4        i21062025004001     0.0    Depth (mm)      0.134766
...                  ...     ...           ...           ...
5601810  i19052025001000  4127.0   Current (V)      0.437012
5601811  i19052025001000  4127.0     Angle (°)  18594.267600
5601812  i19052025001000  4127.0  Nset (1/min)      0.000000
5601813  i19052025001000  4127.0    Depth (mm)     20.283203
5601814  i19052025001000  4127.0   Torque (Nm)      0.697212

[5601815 rows x 4 columns]


In [12]:
df_intrinsic['id'] = 'id_' + df_intrinsic['id']

# Replace "idt" with "id" in the column
df_intrinsic['id'] = df_intrinsic['id'].str.replace('id_t', 'id', regex=True)
df_intrinsic['id'] = df_intrinsic['id'].str.replace('id_i', 'id', regex=True)
print(df_intrinsic)

print("Unique IDs:", df_intrinsic['id'].nunique())
print("Unique kinds:", df_intrinsic['kind'].unique())
print("Rows per ID-kind:")
print(df_intrinsic.groupby(['id', 'kind']).size().sort_values())



                       id    time          kind         value
0        id19052025001000     0.0  Nset (1/min)      0.000000
1        id21062025004001     0.0   Torque (Nm)     -0.000704
2        id21062025004001     0.0   Current (V)     -0.015869
3        id21062025004001     0.0     Angle (°)      0.000000
4        id21062025004001     0.0    Depth (mm)      0.134766
...                   ...     ...           ...           ...
5601810  id19052025001000  4127.0   Current (V)      0.437012
5601811  id19052025001000  4127.0     Angle (°)  18594.267600
5601812  id19052025001000  4127.0  Nset (1/min)      0.000000
5601813  id19052025001000  4127.0    Depth (mm)     20.283203
5601814  id19052025001000  4127.0   Torque (Nm)      0.697212

[5601815 rows x 4 columns]
Unique IDs: 319
Unique kinds: ['Nset (1/min)' 'Torque (Nm)' 'Current (V)' 'Angle (°)' 'Depth (mm)']
Rows per ID-kind:
id                kind        
id21062025004034  Torque (Nm)     1498
                  Nset (1/min)    1498
 

In [13]:
# Define feature extraction settings
#settings = MinimalFCParameters()
settings = EfficientFCParameters()

# Extract features
extracted_features = extract_features(df_intrinsic, column_id="id", column_kind="kind", column_sort="time", column_value="value", default_fc_parameters=EfficientFCParameters())


Feature Extraction: 100%|██████████| 30/30 [04:18<00:00,  8.63s/it]


In [14]:
# Step 2: Impute missing (NaN) values
imputed_features = impute(extracted_features)

 'Angle (°)__query_similarity_count__query_None__threshold_0.0'
 'Current (V)__query_similarity_count__query_None__threshold_0.0'
 'Depth (mm)__friedrich_coefficients__coeff_0__m_3__r_30'
 'Depth (mm)__friedrich_coefficients__coeff_1__m_3__r_30'
 'Depth (mm)__friedrich_coefficients__coeff_2__m_3__r_30'
 'Depth (mm)__friedrich_coefficients__coeff_3__m_3__r_30'
 'Depth (mm)__max_langevin_fixed_point__m_3__r_30'
 'Depth (mm)__query_similarity_count__query_None__threshold_0.0'
 'Nset (1/min)__friedrich_coefficients__coeff_0__m_3__r_30'
 'Nset (1/min)__friedrich_coefficients__coeff_1__m_3__r_30'
 'Nset (1/min)__friedrich_coefficients__coeff_2__m_3__r_30'
 'Nset (1/min)__friedrich_coefficients__coeff_3__m_3__r_30'
 'Nset (1/min)__max_langevin_fixed_point__m_3__r_30'
 'Nset (1/min)__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


In [15]:
print(imputed_features)

                  Torque (Nm)__variance_larger_than_standard_deviation  \
id19052025001000                                                0.0      
id19052025001001                                                0.0      
id19052025001002                                                0.0      
id19052025001003                                                0.0      
id19052025001004                                                0.0      
...                                                             ...      
id22062025006013                                                0.0      
id22062025006014                                                0.0      
id22062025006015                                                0.0      
id22062025006016                                                0.0      
id22062025006017                                                0.0      

                  Torque (Nm)__has_duplicate_max  \
id19052025001000                             0.0   
id19052

In [16]:
# Drop columns which only contain NaN or infinity after imputation
imputed_features = imputed_features.replace([np.inf, -np.inf], np.nan)
imputed_features = imputed_features.dropna(axis=1, how="all")

In [17]:
imputed_features.to_csv("tsfresh_efficient_features(intrinsic).csv")

In [18]:
print(extracted_features.columns)

Index(['Torque (Nm)__variance_larger_than_standard_deviation',
       'Torque (Nm)__has_duplicate_max', 'Torque (Nm)__has_duplicate_min',
       'Torque (Nm)__has_duplicate', 'Torque (Nm)__sum_values',
       'Torque (Nm)__abs_energy', 'Torque (Nm)__mean_abs_change',
       'Torque (Nm)__mean_change',
       'Torque (Nm)__mean_second_derivative_central', 'Torque (Nm)__median',
       ...
       'Nset (1/min)__fourier_entropy__bins_5',
       'Nset (1/min)__fourier_entropy__bins_10',
       'Nset (1/min)__fourier_entropy__bins_100',
       'Nset (1/min)__permutation_entropy__dimension_3__tau_1',
       'Nset (1/min)__permutation_entropy__dimension_4__tau_1',
       'Nset (1/min)__permutation_entropy__dimension_5__tau_1',
       'Nset (1/min)__permutation_entropy__dimension_6__tau_1',
       'Nset (1/min)__permutation_entropy__dimension_7__tau_1',
       'Nset (1/min)__query_similarity_count__query_None__threshold_0.0',
       'Nset (1/min)__mean_n_absolute_max__number_of_maxima_7'],
   

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Assume that labels are in a separate dataframe
labels = pd.read_csv(r"C:\Users\AMRAN\OneDrive - Aalborg Universitet\8. semester\Projekt\CODE\Machine Learning\Data/Labels_names.csv")
labels = labels.rename(columns={'File Name': 'id'})

# Create a LabelEncoder instance
encoder = LabelEncoder()

# Fit the encoder and transform the labels
labels['Label'] = encoder.fit_transform(labels['Label'])

# Print original labels and their encoded values
for original, encoded in zip(encoder.classes_, range(len(encoder.classes_))):
    print(f'Original: {original}, Encoded: {encoded}')

# Merge features and labels
data = extracted_features.merge(labels, left_index=True, right_on='id')

print(data)

Original: N, Encoded: 0
Original: NS, Encoded: 1
Original: RS, Encoded: 2
Original: UT, Encoded: 3
     Torque (Nm)__variance_larger_than_standard_deviation  \
0                                                  0.0      
233                                                0.0      
1                                                  0.0      
2                                                  0.0      
3                                                  0.0      
..                                                 ...      
194                                                0.0      
195                                                0.0      
196                                                0.0      
197                                                0.0      
198                                                0.0      

     Torque (Nm)__has_duplicate_max  Torque (Nm)__has_duplicate_min  \
0                               0.0                             0.0   
233                       

In [20]:
# Split data into features and target variable
X = data.drop(columns=['Label'])
X = data.drop(columns=['id'])
y = data['Label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.90625


In [21]:
from sklearn.metrics import classification_report

# Evaluate the model's performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98        63
           1       0.75      0.92      0.83        13
           2       0.90      0.56      0.69        16
           3       0.85      0.94      0.89        36

    accuracy                           0.91       128
   macro avg       0.87      0.85      0.85       128
weighted avg       0.91      0.91      0.90       128

