<a href="https://colab.research.google.com/github/mavikulov/Diploma/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

import kagglehub

# Fetch the newest published version of the PPG dataset through kagglehub and
# report the directory it was stored in.
DATASET_HANDLE = "ucimachinelearning/photoplethysmography-ppg-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

In [None]:
!pip install catboost

In [None]:
# Data handling.
import pandas as pd
# None removes the column limit, so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)
import numpy as np

# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.animation as animation
from mlxtend.plotting import (plot_confusion_matrix,
                              plot_decision_regions)
from sklearn.metrics import accuracy_score, confusion_matrix

from collections import Counter

import warnings
# NOTE: this silences every warning for the rest of the session, including
# solver/convergence and failed-fit warnings raised by the estimators below.
warnings.filterwarnings("ignore")

# Read the CSV from the directory returned by kagglehub.dataset_download()
# (stored in `path`) instead of a hard-coded absolute cache location, which only
# exists for one user account and one dataset version.
data = pd.read_csv(f"{path}/PPG_Dataset.csv")
data.head()

In [None]:
# Report the size of the loaded frame.
n_rows, n_cols = data.shape
print(f'Dataset: {n_rows} rows and {n_cols} columns')

In [None]:
# Count how many rows carry each label and show the counts as text and as a pie chart.
target = data['Label']
label_counts = target.value_counts()
print(label_counts)

pie_options = dict(labels=label_counts.index, autopct='%1.1f%%', startangle=90)
plt.pie(label_counts, **pie_options)
plt.title('Label Classes Distribution')
plt.show()

В данном датасете классы распределены почти поровну (в соотношении примерно 1:1).

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/10/10 split: first hold out 20% of the rows, then cut that
# hold-out in half to obtain the validation and test parts.
data_train, data_for_test_and_validation = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['Label'],
)
data_valid, data_test = train_test_split(
    data_for_test_and_validation,
    test_size=0.5,
    random_state=42,
    stratify=data_for_test_and_validation['Label'],
)

splits = {"Train": data_train, "Valid": data_valid, "Test": data_test}
for split_name, split_frame in splits.items():
    print(f"Data {split_name} Shape: {split_frame.shape}")
for split_name, split_frame in splits.items():
    print(f"{split_name} Samples: {split_frame.shape[0]}")

In [None]:
# Explore on an independent copy so the training frame itself is never modified.
data_eda = data_train.copy(deep=True)
data_eda.head(5)

In [None]:
from sklearn.decomposition import PCA

# Project the feature columns of the EDA copy onto a single principal component;
# set_output makes the transform return a DataFrame.
pca = PCA(n_components=1, random_state=42).set_output(transform='pandas')
eda_features = data_eda.drop(columns='Label')
components = pca.fit_transform(eda_features)
components.head()

In [None]:
# Put the principal component side by side with the label of the same row.
eda_labels = data_eda['Label']
data_pca = pd.concat(objs=[components, eda_labels], axis='columns')
data_pca.head()

In [None]:
# Cumulative share of variance captured by the fitted component(s).
cumulative_variance = pca.explained_variance_ratio_.cumsum()
print(f"Доля вариации после PCA: {cumulative_variance}")

In [None]:
# Separate the feature columns from the 'Label' column for every split.
X_train = data_train.drop(columns='Label')
y_train = data_train['Label']
X_valid = data_valid.drop(columns='Label')
y_valid = data_valid['Label']
X_test = data_test.drop(columns='Label')
y_test = data_test['Label']

named_parts = [
    ("X_train", X_train), ("y_train", y_train),
    ("X_valid", X_valid), ("y_valid", y_valid),
    ("X_test", X_test), ("y_test", y_test),
]
for part_name, part in named_parts:
    print(f"{part_name} Shape: {part.shape}")

In [None]:
# Fit the one-component PCA on the training features only, then reuse the fitted
# projection for the validation and test features.
pca = PCA(n_components=1, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_valid_pca, X_test_pca = (pca.transform(features) for features in (X_valid, X_test))

In [None]:
# Map the string labels to floats: 'Normal' -> 0.0, 'MI' -> 1.0.
binary_encoder = {'Normal': 0., 'MI': 1.}
y_train_encoder, y_valid_encoder, y_test_encoder = (
    labels.map(binary_encoder) for labels in (y_train, y_valid, y_test)
)

# Class counts per split after encoding.
for encoded_labels in (y_train_encoder, y_valid_encoder, y_test_encoder):
    print(Counter(encoded_labels))

## ML-решение

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Candidate classifiers compared below. Estimators with a stochastic component
# are seeded with 42 (the same seed used for the splits and PCA) so that a
# Restart & Run All reproduces the same scores.
log_reg = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier()
gnb = GaussianNB()
dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
cat = CatBoostClassifier(verbose=0, random_state=42)

models = [log_reg, svc, knn, gnb, dtc, rfc, gbc, xgb, cat]

In [None]:
# Fit every candidate on the PCA-projected training data and record its accuracy
# on the training and validation splits, keyed by the estimator's class name.
scores_train, scores_valid = {}, {}

for position, model in enumerate(models, start=1):
    name_model = model.__class__.__name__
    model.fit(X_train_pca, y_train_encoder.values)

    y_pred_train, y_pred_valid = model.predict(X_train_pca), model.predict(X_valid_pca)
    scores_train[name_model] = accuracy_score(y_train_encoder, y_pred_train)
    scores_valid[name_model] = accuracy_score(y_valid_encoder, y_pred_valid)

    print(f"{position}.- {name_model} finished!!\n")

In [None]:
# Grouped bar chart: one train bar and one validation bar per model.
name_models = list(scores_train)
acc_train = [scores_train[name] for name in name_models]
acc_valid = [scores_valid[name] for name in name_models]

x = np.arange(len(name_models))
width = 0.35
half_width = width / 2

fig, ax = plt.subplots(figsize=(15, 5.2))

rects1 = ax.bar(x - half_width, acc_train, width=width, label='Train')
rects2 = ax.bar(x + half_width, acc_valid, width=width, label='Valid')

ax.set_ylabel("Accuracy", fontsize=10, fontweight='bold', color='black')
ax.set_title("Metric Performance: Accuracy", fontsize=12, fontweight='bold', color='black')
ax.set_xticks(x)
ax.set_xticklabels(
    name_models,
    rotation=45,
    ha='right',
    fontsize=8,
    color='black',
    fontweight='bold',
)
# Dashed reference line at the lowest validation accuracy.
ax.axhline(y=min(acc_valid), linewidth=2.0, linestyle='--', color='lime')
ax.legend()

def autolabel(rects):
    """Annotate every bar in ``rects`` with its height rounded to 4 decimals.

    The text is drawn on the module-level ``ax``, centred horizontally on the
    bar and shifted 3 points above its top.
    """
    text_style = dict(ha='center', va='bottom', fontweight='bold', color='black', fontsize=8)
    for bar in rects:
        value = round(bar.get_height(), 4)
        center_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(f'{value}',
                    xy=(center_x, value),
                    xytext=(0, 3),
                    textcoords="offset points",
                    **text_style)

# Label both bar groups, tighten the layout and render the figure.
autolabel(rects1)
autolabel(rects2)
fig.tight_layout()
# plt.show() is used (as in the other plotting cells) because Figure.show() does
# nothing on the non-interactive inline backend except emit a warning.
plt.show()

In [None]:
# Score the already-fitted models on the held-out test split. They were trained
# in the fit/score loop above on exactly this training data; refitting here
# would only repeat the work and, for randomised estimators, would test a
# different model than the one that was validated.
for model in models:
    y_pred_test = model.predict(X_test_pca)
    print(f"{type(model).__name__}: {accuracy_score(y_test_encoder, y_pred_test):.4f}")

In [None]:
# Train a fresh logistic regression and show its confusion matrix on the test
# split, with both absolute counts and normalised shares in each cell.
best_model = LogisticRegression().fit(X_train_pca, y_train_encoder.values)
y_pred_test = best_model.predict(X_test_pca)
cf_matrix = confusion_matrix(y_test_encoder.values, y_pred_test)

matrix_options = dict(show_absolute=True, show_normed=True, class_names=['Normal', 'MI'])
fig, ax = plot_confusion_matrix(conf_mat=cf_matrix, **matrix_options)
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for logistic regression.
# The solver is fixed to 'liblinear' because it supports both the 'l1' and 'l2'
# penalties. With the default 'lbfgs' solver every 'l1' candidate fails to fit
# and is scored NaN, so half of the grid was never actually evaluated.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'max_iter': [100, 200, 300],
}

# 5-fold cross-validated search on the training split; GridSearchCV works on
# clones of `log_reg`, so the estimator fitted earlier is left untouched.
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_pca, y_train_encoder.values)

print(f"Лучшие гиперпараметры: {grid_search.best_params_}")
print(f"Лучшее значение accuracy: {grid_search.best_score_:.4f}")

# best_estimator_ is refitted on the whole training split with the best parameters.
the_best_model = grid_search.best_estimator_
y_pred = the_best_model.predict(X_test_pca)
test_accuracy = accuracy_score(y_test_encoder.values, y_pred)
print(f"Accuracy на тестовых данных: {test_accuracy:.4f}")

## Фильтрация + пороговые классификаторы

In [None]:
# Full (unsplit) feature matrix and numerically encoded labels for the filtering experiments.
X = data.drop(columns=['Label'])
y = data['Label'].map(binary_encoder)

### Рекуррентные фильтры

In [None]:
from scipy.signal import butter, filtfilt

def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Parameters
    ----------
    lowcut, highcut : float
        Lower and upper cut-off frequencies, in the same units as ``fs``.
    fs : float
        Sampling frequency.
    order : int, optional
        Filter order passed to ``scipy.signal.butter`` (default 5).

    Returns
    -------
    tuple
        ``(b, a)`` numerator and denominator coefficient arrays.
    """
    # butter() expects the band edges as fractions of the Nyquist frequency.
    half_rate = 0.5 * fs
    band_edges = [lowcut / half_rate, highcut / half_rate]
    numerator, denominator = butter(order, band_edges, btype='band')
    return numerator, denominator

def apply_bandpass_filter(data, lowcut, highcut, fs):
    """Band-pass filter ``data`` with a forward-backward Butterworth filter.

    The coefficients come from ``butter_bandpass(lowcut, highcut, fs)`` with its
    default order and are applied with ``scipy.signal.filtfilt``.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs)
    filtered = filtfilt(numerator, denominator, data)
    return filtered

# Filter every recording along its own samples. Each row of X is one labelled
# example, so the filter has to run along the row; DataFrame.apply defaults to
# axis=0, which ran the filter down each column, i.e. across different recordings.
# filtfilt works on the last axis of a 2-D array, so the whole matrix is
# filtered row by row in a single call.
# NOTE(review): assumes the columns are consecutive samples of the signal, taken
# at 100 Hz - confirm against the dataset description.
filtered_values = apply_bandpass_filter(X.to_numpy(), 0.5, 45, 100)
X_rec_filtered = pd.DataFrame(filtered_values, index=X.index, columns=X.columns)
plt.plot(X_rec_filtered.iloc[0])