In [3]:
import os
import re
import numpy as np
import pandas as pd
import scipy.stats as sp
import antropy as ant
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def load_data_index(data_path: str, bearing: int, index: int) -> pd.DataFrame:
    """
    Read ``<data_path>/<index>.csv`` and keep only the columns for one bearing.

    The file is parsed as a semicolon-delimited CSV. A column is kept when the
    decimal text of ``bearing`` occurs anywhere in its name (plain substring
    match), so the result may be empty if no column name contains it.

    Args:
        data_path: Directory that holds the numbered CSV files.
        bearing: Bearing number used to select columns.
        index: Numeric stem of the CSV file to read.

    Returns:
        A DataFrame restricted to the matching columns, in file order.
    """
    bearing_tag = str(bearing)
    frame = pd.read_csv(f'{data_path}/{index}.csv', delimiter=';')
    # Substring test, not an exact match: "4" also selects e.g. "b14z".
    selected = [name for name in frame.columns if bearing_tag in name]
    return frame[selected]

def spectral_flatness(x: pd.Series) -> float:
    """
    Return the spectral flatness of a signal.

    Flatness is the ratio of the geometric mean to the arithmetic mean of the
    power spectrum, where the power spectrum is the squared magnitude of the
    full FFT of ``x``.

    Args:
        x: One-dimensional sequence of samples.

    Returns:
        Geometric mean of the power spectrum divided by its arithmetic mean.
    """
    magnitudes = np.abs(np.fft.fft(x))
    # Small offset keeps the logarithm finite for bins with zero power.
    power = magnitudes**2 + 1e-10
    geo_mean = np.exp(np.log(power).mean())
    return geo_mean / power.mean()

def _single_axis_features(signal: pd.Series) -> dict:
    """Compute the unsuffixed feature values for one signal column."""
    # RMS is reused as the crest-factor denominator.
    rms = np.sqrt(np.mean(signal**2))
    return {
        'mean': signal.mean(),
        'std': signal.std(),
        'rms': rms,
        'kurtosis': sp.kurtosis(signal),
        'variance': signal.var(),
        'crest_factor': np.max(np.abs(signal)) / rms,
        'skewness': sp.skew(signal),
        'spectral_flatness': spectral_flatness(signal),
        'sample_entropy': ant.sample_entropy(signal),
    }


def get_features(df: pd.DataFrame) -> dict:
    """
    Extract statistical features from the first two columns of a DataFrame.

    Column 0 is reported with the suffix ``_x`` and column 1 with ``_y``.
    For each column the result holds mean, standard deviation, RMS, kurtosis,
    variance, crest factor (peak absolute value over RMS), skewness, spectral
    flatness and sample entropy.

    Args:
        df: DataFrame with at least two columns; further columns are ignored.

    Returns:
        Dict keyed ``<feature>_<axis>``, ordered feature by feature with the
        ``x`` entry before the ``y`` entry.
    """
    per_axis = {
        'x': _single_axis_features(df.iloc[:, 0]),
        'y': _single_axis_features(df.iloc[:, 1]),
    }
    feature_names = list(per_axis['x'])
    # Feature name is the outer loop so x/y entries stay interleaved.
    return {
        f'{name}_{axis}': values[name]
        for name in feature_names
        for axis, values in per_axis.items()
    }

def process_file(data_path: str, bearing: int, index: int) -> pd.DataFrame:
    """
    Turn one numbered CSV file into a single-row feature DataFrame.

    Args:
        data_path: Directory that holds the numbered CSV files.
        bearing: Bearing number whose columns are analysed.
        index: Numeric stem of the CSV file to process.

    Returns:
        A one-row DataFrame whose columns are the extracted features plus an
        ``index`` column; the row label is also ``index``.
    """
    signals = load_data_index(data_path, bearing, index)
    # The file number is stored as a column as well as the row label.
    row = {**get_features(signals), 'index': index}
    return pd.DataFrame(row, index=[index])

def generate_dataset(data_path: str, bearing: int = 4) -> pd.DataFrame:
    """
    Build the feature dataset from every numbered CSV file in ``data_path``.

    Each file ``0.csv`` .. ``<highest>.csv`` is processed in a thread pool,
    the per-file feature rows are stacked, and the contents of
    ``bearing_conditions.csv`` (semicolon-delimited) are appended column-wise,
    aligned on the row label.

    Args:
        data_path: Directory holding the numbered CSV files and
            ``bearing_conditions.csv``.
        bearing: Bearing number whose signal columns are analysed. Defaults
            to 4, the value that used to be hard-coded.

    Returns:
        The combined DataFrame indexed by the ``index`` column, or an empty
        DataFrame when the directory contains no numbered CSV file.

    Raises:
        Any exception raised while processing a file is re-raised here via
        ``future.result()``.
    """
    data_length = find_highest_number_in_filenames(data_path)
    if data_length is None:
        return pd.DataFrame()

    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_file, data_path, bearing, index)
            for index in range(data_length + 1)
        ]
        dataframe_list = [
            future.result()
            for future in tqdm(
                as_completed(futures),
                total=len(futures),
                desc="Processing files",
            )
        ]

    # as_completed yields futures in completion order, which varies between
    # runs; sorting by file number gives a stable row order.
    df = pd.concat(dataframe_list).sort_index()
    classification = pd.read_csv(f'{data_path}/bearing_conditions.csv', delimiter=';')
    dataset = pd.concat([df, classification], axis=1)
    dataset.set_index('index', inplace=True)

    return dataset

def find_highest_number_in_filenames(folder_path: str) -> "int | None":
    """
    Find the highest purely numeric CSV filename in a folder.

    Only directory entries whose full name is ``<digits>.csv`` are
    considered; everything else is ignored.

    Args:
        folder_path: Directory to scan.

    Returns:
        The largest number found as an int, or ``None`` when no entry
        matches.
    """
    pattern = re.compile(r'^(\d+)\.csv$')
    # Match each name once and reuse the match object.
    matches = (pattern.match(name) for name in os.listdir(folder_path))
    numbers = [int(match.group(1)) for match in matches if match]
    return max(numbers, default=None)

def plot_scatter_matrix(df: pd.DataFrame, hue: str = 'b4_state') -> None:
    """
    Plot a pairwise scatter matrix of the DataFrame and display it.

    Args:
        df: Data to plot; must contain the column named by ``hue``.
        hue: Column used to colour the points. Defaults to ``'b4_state'``,
            the column name that used to be hard-coded.
    """
    sns.pairplot(df, hue=hue)
    plt.show()

# Example usage: build the feature dataset from the relative directory "data".
# NOTE(review): this runs at module level, so the whole processing pass starts
# as soon as this cell/file is executed.
data_path = "data"
dataset = generate_dataset(data_path)

Processing files:   1%|▏                           | 15/1724 [00:02<04:57,  5.74it/s]


KeyboardInterrupt: 