In [16]:
import os
import glob
import numpy as np
import pandas as pd

from scipy.fft import fft

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier

In [17]:
# Preprocessing: FFT-based feature extraction
def extract_features(signal: np.ndarray, n_freqs=20) -> dict:
    """Return the magnitudes of the first ``n_freqs`` non-DC FFT bins of a signal.

    Parameters
    ----------
    signal : array-like
        One-dimensional sequence of numeric samples. It is converted to a
        float array, so lists and integer arrays are accepted as well.
    n_freqs : int, default 20
        Number of frequency bins to keep, starting at bin 1. Bin 0 (the DC
        component) is skipped.

    Returns
    -------
    dict
        Maps ``"fft_1"`` ... ``"fft_<n_freqs>"`` to the magnitude (a Python
        ``float``) of the corresponding FFT bin.

    Raises
    ------
    ValueError
        If ``signal`` is not one-dimensional, or if it has too few samples to
        provide ``n_freqs`` bins after the DC component.
    """
    samples = np.asarray(signal, dtype=float)

    # fft() transforms along the last axis, so a 2-D input would silently
    # yield per-row spectra instead of the spectrum of one signal.
    if samples.ndim != 1:
        raise ValueError(
            f"signal must be one-dimensional, got shape {samples.shape}"
        )

    # Bins 1..n_freqs are read below, which needs at least n_freqs + 1 samples.
    if samples.size <= n_freqs:
        raise ValueError(
            f"signal has {samples.size} samples but at least {n_freqs + 1} "
            f"are required to extract {n_freqs} FFT bins"
        )

    magnitudes = np.abs(fft(samples))
    return {f"fft_{k}": float(magnitudes[k]) for k in range(1, n_freqs + 1)}

In [18]:
# Loading the dataset: one feature row per CSV recording
def _collect_features(csv_files, label: str, rows: list, labels: list) -> None:
    """Append the FFT features and ``label`` of every CSV in ``csv_files``."""
    for csv_file in csv_files:
        frame = pd.read_csv(csv_file, header=None)
        # Only the first column of each file is used as the signal.
        rows.append(extract_features(frame.iloc[:, 0].values))
        labels.append(label)


def load_dataset(base_path: str):
    """Build the feature table and label vector from the dataset directory.

    Reads every ``*.csv`` directly inside ``<base_path>/normal`` (labelled
    ``"normal"``) and every ``*.csv`` inside each sub-folder of
    ``<base_path>/imbalance`` (labelled ``"imbalance"``). Each file is read
    without a header row and its first column is passed to
    ``extract_features``.

    Files and folders are visited in sorted order. ``glob.glob`` and
    ``os.listdir`` return entries in arbitrary order, and the row order
    determines which samples a seeded train/test split selects, so sorting
    keeps results reproducible across machines.

    Parameters
    ----------
    base_path : str
        Directory containing the ``normal`` and ``imbalance`` folders.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        One row of features per CSV file, and the matching class labels.

    Raises
    ------
    FileNotFoundError
        If ``<base_path>/imbalance`` does not exist (raised by ``os.listdir``).
    """
    rows = []
    labels = []

    # normal
    normal_dir = os.path.join(base_path, "normal")
    normal_files = sorted(glob.glob(os.path.join(normal_dir, "*.csv")))
    _collect_features(normal_files, "normal", rows, labels)

    # imbalance: recordings are grouped in one sub-folder per condition
    imbalance_dir = os.path.join(base_path, "imbalance")
    for folder in sorted(os.listdir(imbalance_dir)):
        folder_path = os.path.join(imbalance_dir, folder)
        # Skip stray files (e.g. OS metadata) sitting next to the sub-folders.
        if not os.path.isdir(folder_path):
            continue
        folder_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
        _collect_features(folder_files, "imbalance", rows, labels)

    return pd.DataFrame(rows), pd.Series(labels)

In [19]:
# Dataset root. Set the DATASET_BASE_PATH environment variable to run the
# notebook on another machine; the default keeps the original local path.
BASE_PATH = os.environ.get("DATASET_BASE_PATH", "/Users/pedrocm/Downloads/archive")

X_fft, y_fft = load_dataset(BASE_PATH)

# Fail early with a readable message instead of training on an empty table.
assert not X_fft.empty, f"no CSV recordings found under {BASE_PATH!r}"
assert len(X_fft) == len(y_fft)

print(X_fft.shape)
print(y_fft.value_counts())

(382, 20)
imbalance    333
normal        49
Name: count, dtype: int64


In [20]:
# Splitting into train and test sets: 30% of the samples are held out for
# testing. Stratifying on the labels keeps the class proportions the same in
# both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_fft,
    y_fft,
    stratify=y_fft,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Model definition: gradient-boosted trees (the estimator is only configured here).
# NOTE(review): the labels built by load_dataset are strings; recent XGBoost
# releases expect integer-encoded classes in fit() - confirm before training.
# NOTE(review): the recorded class counts are 333 "imbalance" vs 49 "normal";
# consider class weighting (e.g. scale_pos_weight) - TODO confirm.
xgb = XGBClassifier(
    # ensemble size and tree complexity
    n_estimators=300,
    max_depth=5,
    # shrinkage applied to each boosting step
    learning_rate=0.05,
    # per-tree row and column sampling fractions
    subsample=0.8,
    colsample_bytree=0.8,
    # metric reported during evaluation
    eval_metric="logloss",
)
