<a href="https://colab.research.google.com/github/palanore1/data-manipulation/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instalare dependente

In [None]:
# Mount Google Drive under /content/drive/ (Colab-only); the dataset paths used
# by the loading cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!pip install sktime

In [None]:
import pandas as pd
import sktime
from sktime.datasets import load_from_tsfile
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score ,confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn import preprocessing

## Importarea Racket Sports Dataset



In [None]:
# Column names given to the six sensor dimensions plus the action label.
racket_columns = ['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y', 'gyr_z', 'type']

# Read both official splits from the .ts files.
train_x, train_y = load_from_tsfile("/content/drive/MyDrive/ML_datasets/RacketSports_TRAIN.ts")
test_x, test_y = load_from_tsfile("/content/drive/MyDrive/ML_datasets/RacketSports_TEST.ts")

# Training split: sensor columns and label column side by side.
X_df_train = pd.DataFrame(train_x)
y_df_train = pd.DataFrame(train_y, columns=['type'])
df_racket_train = pd.concat([X_df_train, y_df_train], axis=1).set_axis(racket_columns, axis=1)

# Test split, assembled the same way.
X_df_test = pd.DataFrame(test_x)
y_df_test = pd.DataFrame(test_y, columns=['type'])
df_racket_test = pd.concat([X_df_test, y_df_test], axis=1).set_axis(racket_columns, axis=1)

# Train followed by test, re-indexed from 0.
df_racket = pd.concat([df_racket_train, df_racket_test], ignore_index=True)

## Importarea MITBIH si PTB Datasets

In [None]:
def _read_heartbeat_csv(path):
    """Read a header-less heartbeat CSV and give its columns readable names.

    Every column except the last is named ``beat_1`` ... ``beat_k`` in file
    order and the last column is named ``type``. The column count is taken
    from the file itself, so each file is labelled independently of the others.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The file contents with the renamed columns.
    """
    df = pd.read_csv(path, header=None)
    n_signal_cols = len(df.columns) - 1
    df.columns = [f"beat_{i+1}" for i in range(n_signal_cols)] + ['type']
    return df


#################### MIT Dataset ##############################
df_mit_train = _read_heartbeat_csv('/content/drive/MyDrive/ML_datasets/mitbih_train.csv')
df_mit_test = _read_heartbeat_csv('/content/drive/MyDrive/ML_datasets/mitbih_test.csv')

################### PTB Dataset ###############################
df_normal = _read_heartbeat_csv('/content/drive/MyDrive/ML_datasets/ptbdb_normal.csv')
df_abnormal = _read_heartbeat_csv('/content/drive/MyDrive/ML_datasets/ptbdb_abnormal.csv')

# NORMAL ++ ABNORMAL: stack both PTB files into one frame with a fresh 0..n-1 index.
df_ptb = pd.concat([df_normal, df_abnormal], ignore_index=True)


### Prelucrarea dataseturilor

In [None]:
# Separate the PTB label column from the signal columns.
y_ptb = df_ptb['type']
X_ptb = df_ptb.drop(columns=['type'])

# Hold out 20% of the rows as a test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X_ptb, y_ptb, test_size=0.2, random_state=42)

# Re-attach the labels to each split and renumber the rows from 0.
df_PBT_train, df_PBT_test = (
    pd.concat([features, labels], axis=1).reset_index(drop=True)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# 3.1. Explorarea Datelor (Exploratory Data Analysis)

## 1. Analiza echilibrului de clase

### Plotting Racket Training set

In [None]:
# Class balance of the RacketSports training split: one bar per action type.
sns.set_theme(style="whitegrid")

ax = sns.countplot(data=df_racket_train, x="type", width=0.5)

# Render the x tick labels in a small font.
plt.setp(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show()

### Plotting MIT train set

In [None]:
# Class balance of the MIT-BIH training split: one bar per 'type' value.
ax = sns.countplot(x='type', data=df_mit_train)

### Plotting PTB train set

In [None]:
# Class balance of the PTB *training* split (df_PBT_train), matching the
# RacketSports and MIT-BIH plots, which also show the training split only.
ax = sns.countplot(data=df_PBT_train, x='type')

## 2. Vizualizarea seriilor de timp

### 1) Afișați câte un exemplu de serie pentru fiecare tip de acțiune (în setul de date RacketSports)


* a) Afișați valorile de accelerometru pe dimensiunile x, y și z pe același
grafic.
* b) Afișați valorile de giroscop pe dimensiunile x, y și z pe același grafic.

In [None]:
############ plotting function ########################

def plot_graph(hit_type):
    """Plot one randomly chosen RacketSports training series of a given action.

    A random row of ``df_racket_train`` whose ``type`` equals ``hit_type`` is
    drawn; its three accelerometer series go on the left axes and its three
    gyroscope series on the right axes (shared y axis). Each curve is labelled
    with its own axis name and a legend is shown on both panels.

    Parameters
    ----------
    hit_type : str
        Value of the ``type`` column to sample from.
    """
    # .sample() returns a one-row frame; take that row directly instead of
    # looking its index label up again.
    sample = df_racket_train.loc[df_racket_train['type'] == hit_type].sample().iloc[0]
    movement_type = sample['type']

    fig, axs = plt.subplots(1, 2, figsize=(16, 4), sharey=True)

    for axis in ('x', 'y', 'z'):
        axs[0].plot(sample[f'acc_{axis}'], label=f'{axis}_acc')
        axs[1].plot(sample[f'gyr_{axis}'], label=f'{axis}_gyr')

    axs[0].set_title(f'Accelerometru {movement_type}')
    axs[0].legend()

    axs[1].set_title(f'Gyroscope {movement_type}')
    axs[1].legend()

    plt.show()

In [None]:
# One example figure per action type, each followed by an empty output line.
for hit_type in ('badminton_smash', 'badminton_clear',
                 'squash_forehandboast', 'squash_backhandboast'):
    plot_graph(hit_type)
    print()

### 2) Afișați câte un exemplu de serie pentru fiecare categorie de aritmie din seturile de date MIT-BIH / PTB.

In [None]:
def plot_mit(heartbeat_type):
    """Plot one randomly chosen MIT-BIH training beat of the given class.

    Parameters
    ----------
    heartbeat_type : number
        Value of the ``type`` column to sample from.
    """
    selected_row = df_mit_train.loc[df_mit_train['type'] == heartbeat_type].sample()

    # Read the signal straight from the sampled row; using its index label as
    # an iloc position is only correct while the frame has a default 0..n-1 index.
    row_series = selected_row.iloc[0, :-1]
    # Samples equal to 0.0 are left out of the plot.
    row_series_filtered = row_series[row_series != 0.0]

    plt.plot(row_series_filtered)
    plt.xticks([])
    plt.title(f'heartbeat_type = {heartbeat_type}')
    plt.show()

In [None]:
def plot_ptb(heartbeat_type):
    """Plot one randomly chosen PTB training beat of the given class.

    Parameters
    ----------
    heartbeat_type : number
        Value of the ``type`` column to sample from; 0 is titled "normal",
        any other value "abnormal".
    """
    selected_row = df_PBT_train.loc[df_PBT_train['type'] == heartbeat_type].sample()

    # Read the signal straight from the sampled row; using its index label as
    # an iloc position is only correct while the frame has a default 0..n-1 index.
    row_series = selected_row.iloc[0, :-1]
    # Samples equal to 0.0 are left out of the plot.
    row_series_filtered = row_series[row_series != 0.0]

    plt.plot(row_series_filtered)
    plt.xticks([])
    plt.title('heartbeat_type = normal' if heartbeat_type == 0 else 'heartbeat_type = abnormal')
    plt.show()

Plotting MIT for each type of heartbeat

In [None]:
# One example beat for each MIT-BIH type 0..4, separated by an empty output line.
for heartbeat_type in range(5):
    if heartbeat_type:
        print()
    plot_mit(heartbeat_type)

Plotting PTB for both types of heartbeats (normal/abnormal)

In [None]:
# One example beat for PTB type 0 (normal) and type 1 (abnormal),
# separated by an empty output line.
for heartbeat_type in (0, 1):
    if heartbeat_type:
        print()
    plot_ptb(heartbeat_type)

### 3) **Pentru seturile de date cu aritmii afișați un grafic al mediei și deviației standard per unitate de timp,** pentru fiecare clasă de aritmie. Media și deviația standard se calculează peste toate exemplele (atât din train, cât și din test set).

In [None]:
def plot_class_mean_std(df, label, title):
    """Plot the per-column mean and standard deviation of one class.

    Rows of ``df`` whose ``type`` equals ``label`` are kept, the last column
    is dropped, and every 0.0 is replaced by NaN so that it is ignored by the
    statistics. The column means are drawn as points with the column standard
    deviations as error bars, followed by an empty output line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``type`` column as its last column.
    label : number
        Class value to select.
    title : str
        Figure title.
    """
    class_rows = df[df['type'] == label].reset_index(drop=True)
    signals = class_rows.drop(columns=class_rows.columns[-1]).replace(0.0, np.nan)
    means = signals.mean()
    stds = signals.std()

    plt.figure(figsize=(24, 6))
    plt.errorbar(means.index, means, yerr=stds, fmt='o', capsize=3, capthick=2)
    plt.xticks([])
    plt.title(title)
    plt.show()
    print()


# MIT-BIH: statistics over train and test rows together.
df_mit_full = pd.concat([df_mit_train, df_mit_test]).reset_index(drop=True)

for mit_label in range(5):
    plot_class_mean_std(df_mit_full, mit_label, f'MIT_type{mit_label}')

# PTB: df_ptb already holds every row (normal and abnormal files).
for ptb_label in range(2):
    plot_class_mean_std(df_ptb, ptb_label, f'PTB_type{ptb_label}')

### 4) Pentru setul de date RacketSports afișați distribuția valorilor per fiecare axă de accelerometru și giroscop în parte / per acțiune

In [None]:
# Flatten the per-row series so every sensor reading becomes its own row.
sensor_axes = ['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y', 'gyr_z']
df_racket_exploded = df_racket.explode(sensor_axes)

# One figure per sensor axis, with one distribution per action type (hue).
# NOTE(review): sns.distplot is reported as deprecated by recent seaborn releases — confirm.
*leading_axes, last_axis = sensor_axes
for axis_name in leading_axes:
    sns.FacetGrid(df_racket_exploded, hue='type', height=7).map(sns.distplot, axis_name).add_legend()
    print()
sns.FacetGrid(df_racket_exploded, hue='type', height=7).map(sns.distplot, last_axis).add_legend()

# 3.2. Extragere manuala a atributelor și utilizarea algoritmilor clasici de Învățare Automată

## Standardizare + extragere features toate seturile


### Functii aux

In [None]:
def std_racket(df):
    """Standardize every sensor series of a RacketSports frame on its own.

    Each cell of the six sensor columns is turned into a column vector and
    passed through ``StandardScaler.fit_transform``, so the scaler is refit on
    every individual cell. The scaled cells are wrapped back into
    ``pd.Series`` and the last column of ``df`` is appended unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the six sensor columns and the label as last column.

    Returns
    -------
    pandas.DataFrame
        Scaled sensor columns followed by the label column.
    """
    sensor_columns = ['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y', 'gyr_z']
    scaler = StandardScaler()

    def _standardize(series):
        column_vector = np.array(series).reshape(-1, 1)
        return scaler.fit_transform(column_vector).flatten()

    scaled_cells = df[sensor_columns].applymap(_standardize)
    scaled_cells = scaled_cells.applymap(pd.Series)

    labels = df.iloc[:, -1]
    return pd.concat([scaled_cells, labels], axis=1)

In [None]:
def std_heartbeat(df):
    """Standardize all columns of a heartbeat frame except the last one.

    A new ``StandardScaler`` is fit on the frame passed in. The result has
    the scaled columns renamed ``beat_1`` ... ``beat_k`` and the untouched
    last column named ``type``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column is the label.

    Returns
    -------
    pandas.DataFrame
        Scaled signal columns followed by the ``type`` column.
    """
    signal_values = df.iloc[:, :-1].values
    label_values = df.iloc[:, -1].values

    scaled = StandardScaler().fit_transform(signal_values)

    column_names = [f'beat_{i+1}' for i in range(scaled.shape[1])]
    column_names.append('type')

    return pd.DataFrame(data=np.column_stack((scaled, label_values)), columns=column_names)

In [None]:
def extract_features_racket(df):
    """Compute summary features for every row of a RacketSports frame.

    For each of the six sensor columns, twelve statistics of the series held
    in the cell are computed (columns ``<sensor>_<stat>``). Two combined
    features follow: ``avg_result_accl`` (mean of the square root of the
    summed squares of all six series) and ``sma`` (sum over the six series of
    ``sum(|x| / 100)``). The last column of ``df`` is appended unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the six sensor columns and the label as last column.

    Returns
    -------
    pandas.DataFrame
        One row per input row: 72 statistics, the two combined features and
        the label column.
    """
    sensor_cols = ['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y', 'gyr_z']

    def _aad(x):
        return np.mean(np.absolute(x - np.mean(x)))

    def _range(x):
        return np.max(x) - np.min(x)

    def _mad(x):
        return np.median(np.absolute(x - np.median(x)))

    def _iqr(x):
        return np.percentile(x, 75) - np.percentile(x, 25)

    def _neg_count(x):
        return np.sum(x < 0)

    def _pos_count(x):
        return np.sum(x > 0)

    def _above_mean_count(x):
        return np.sum(x > np.mean(x))

    def _scaled_abs_sum(x):
        return np.sum(abs(x)/100)

    # (suffix, function) pairs; the order fixes the output column order.
    stat_functions = [
        ('mean', np.mean),
        ('std', np.std),
        ('aad', _aad),
        ('min', np.min),
        ('max', np.max),
        ('maxmin_diff', _range),
        ('median', np.median),
        ('mad', _mad),
        ('IQR', _iqr),
        ('neg_count', _neg_count),
        ('pos_count', _pos_count),
        ('above_mean_count', _above_mean_count),
    ]

    stat_columns = {
        f'{col}_{stat}': df[col].apply(func)
        for col in sensor_cols
        for stat, func in stat_functions
    }
    racket_features = pd.DataFrame(stat_columns)

    # Accumulate left to right over the sensors, in column order.
    squared_sum = df[sensor_cols[0]]**2
    sma = df[sensor_cols[0]].apply(_scaled_abs_sum)
    for col in sensor_cols[1:]:
        squared_sum = squared_sum + df[col]**2
        sma = sma + df[col].apply(_scaled_abs_sum)

    racket_features['avg_result_accl'] = [magnitude.mean() for magnitude in squared_sum**0.5]
    racket_features['sma'] = sma

    labels = df.iloc[:, -1]
    return pd.concat([racket_features, labels], axis=1)

In [None]:
def window_df(df, segment_length=2 * 125):
    """Cut a heartbeat frame into consecutive row windows.

    Every 0.0 in the non-label columns is treated as missing. The rows are
    then split into consecutive windows of ``segment_length`` rows (the last
    window may be shorter); windows containing any missing value are skipped.
    The caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame whose last column is the label.
    segment_length : int, optional
        Number of rows per window. Defaults to 2 * 125 (2 seconds at 125 Hz).

    Returns
    -------
    list of (numpy.ndarray, scalar)
        For each kept window: the non-label values flattened row by row, and
        the label of the window's first row.
    """
    # Work on derived arrays instead of assigning into `df`, so the frame
    # passed in is not modified.
    signal_values = df.iloc[:, :-1].replace(0.0, np.nan).to_numpy()
    label_values = df.iloc[:, -1].to_numpy()
    values = np.column_stack((signal_values, label_values))

    segments2 = []
    for start in range(0, values.shape[0], segment_length):
        segment = values[start:start + segment_length, :]
        if not np.isnan(segment).any():
            target = segment[0, -1]  # label of the first row stands for the whole window
            segment_data = segment[:, :-1].flatten()
            segments2.append((segment_data, target))

    return segments2

In [None]:
def extract_features_heartbeat(df):
    """Compute per-column statistics for every window of a heartbeat frame.

    The frame is windowed with ``window_df``. For each window and each
    non-label column, twelve statistics of that column's values inside the
    window are stored under ``beat_<i>_<stat>``; the window label is stored
    under ``type``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column is the label.

    Returns
    -------
    pandas.DataFrame
        One row per window.
    """
    n_signal_cols = df.shape[1] - 1
    rows = []

    for segment_data, target in window_df(df):
        row = {}
        for col_idx in range(n_signal_cols):
            prefix = f'beat_{col_idx+1}'
            # The window is flattened row by row, so one column's values
            # are every n_signal_cols-th element starting at col_idx.
            values = segment_data[col_idx::n_signal_cols]

            center = np.mean(values)
            spread = np.std(values)
            middle = np.median(values)
            lowest = np.min(values)
            highest = np.max(values)

            row.update({
                prefix + '_mean': center,
                prefix + '_std': spread,
                prefix + '_avg_abs_dev': np.mean(np.abs(values - center)),
                prefix + '_minimum': lowest,
                prefix + '_maximum': highest,
                prefix + '_max_min_diff': highest - lowest,
                prefix + '_median': middle,
                prefix + '_med_abs_dev': np.median(np.abs(values - middle)),
                prefix + '_iqr': np.percentile(values, 75) - np.percentile(values, 25),
                prefix + '_neg_count': np.sum(values < 0),
                prefix + '_pos_count': np.sum(values > 0),
                prefix + '_above_mean_count': np.sum(values > center),
            })
        row['type'] = target
        rows.append(row)

    # One DataFrame row per window.
    return pd.DataFrame(rows)

### Racket Sports

Standardizarea setului de date RacketSports

In [None]:
# Standardize the sensor series of both splits; std_racket rescales every
# series on its own, so the two calls do not share any statistics.
df_racket_train_std = std_racket(df_racket_train)
df_racket_test_std = std_racket(df_racket_test)

Feature Extraction for RacketSports

In [None]:
# Summary features (per-sensor statistics, 'avg_result_accl', 'sma') of the standardized training series.
df_racket_features_train = extract_features_racket(df_racket_train_std)

In [None]:
# Same feature extraction for the standardized test series.
df_racket_features_test = extract_features_racket(df_racket_test_std)

### MIT-BIH

Standardizarea setului de date MIT-BIH

In [None]:
# Standardize the MIT-BIH signal columns of each split.
# NOTE(review): std_heartbeat fits a new StandardScaler on every call, so the
# test split is scaled with its own statistics — confirm this is intended.
df_MIT_train_scaled = std_heartbeat(df_mit_train)
df_MIT_test_scaled = std_heartbeat(df_mit_test)

Feature Extraction for MIT-BIH

In [None]:
# Window the scaled MIT-BIH training rows and compute the per-column statistics of each window.
df_MIT_features_train = extract_features_heartbeat(df_MIT_train_scaled)

In [None]:
# Same windowed feature extraction for the scaled MIT-BIH test rows.
df_MIT_features_test = extract_features_heartbeat(df_MIT_test_scaled)

### PTB

Standardizarea setului de date PTB

In [None]:
# Standardize each PTB split, then discard the 'beat_187' column of the scaled frame.
df_PBT_train_scaled = std_heartbeat(df_PBT_train).drop(columns=['beat_187'])
df_PBT_test_scaled = std_heartbeat(df_PBT_test).drop(columns=['beat_187'])

Feature Extraction for PTB

In [None]:
# Window the scaled PTB training rows and compute the per-column statistics of each window.
df_PBT_features_train = extract_features_heartbeat(df_PBT_train_scaled)

In [None]:
# Same windowed feature extraction for the scaled PTB test rows.
df_PBT_features_test = extract_features_heartbeat(df_PBT_test_scaled)

## Feature selection + aplicare algoritmi

### Definire functii

In [None]:
def compute_var_threshold2(df_features_train, df_features_test, threshold=0.1):
    """Variance-threshold feature selection fit on train and applied to test.

    The selector is fit on the training feature columns only (every column
    except the last). The label column is never passed through the filter, so
    it is always kept as the last column and may hold non-numeric labels.

    Parameters
    ----------
    df_features_train : pandas.DataFrame
        Training features with the label as last column.
    df_features_test : pandas.DataFrame
        Test features with the same columns as the training frame.
    threshold : float, optional
        Features whose training variance does not exceed this value are
        removed. Defaults to 0.1.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` frames holding the selected features followed by
        the label column, both re-indexed from 0.
    """
    label_col = df_features_train.columns[-1]
    train_inputs = df_features_train.iloc[:, :-1]

    selector = VarianceThreshold(threshold)
    selector.fit(train_inputs)
    selected_columns = list(train_inputs.columns[selector.get_support()])

    def _keep_selected(df_features):
        # Same columns, same order, for both splits; label goes last.
        return df_features[selected_columns + [label_col]].reset_index(drop=True)

    return _keep_selected(df_features_train), _keep_selected(df_features_test)

In [None]:
def compute_var_threshold(df_features):
    """Keep the feature columns that pass a 0.1 variance threshold.

    The last column of ``df_features`` is treated as the label: it is not
    passed through the filter and is appended after the kept features.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Features with the label as last column.

    Returns
    -------
    pandas.DataFrame
        Selected feature columns followed by the label column.
    """
    inputs, labels = df_features.iloc[:, :-1], df_features.iloc[:, -1]

    # Boolean mask of the columns that pass the filter.
    support_mask = VarianceThreshold(0.1).fit(inputs).get_support()

    return pd.concat([inputs.iloc[:, support_mask], labels], axis=1)

def compute_select_percentile(df_features):
    """Keep the top 50% of feature columns ranked by ``f_classif``.

    The last column of ``df_features`` is used as the target for the scoring
    function and is appended after the kept features.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Features with the label as last column.

    Returns
    -------
    pandas.DataFrame
        Selected feature columns followed by the label column.
    """
    inputs = df_features.iloc[:, :-1]
    labels = df_features.iloc[:, -1]

    selector = SelectPercentile(f_classif, percentile=50).fit(inputs, labels)
    kept = inputs.iloc[:, selector.get_support()]

    return pd.concat([kept, labels], axis=1)

In [None]:
def grid_search_RFC(selected_features, cv, random_state=42):
    """Grid-search Random Forest hyper-parameters with cross-validation.

    Parameters
    ----------
    selected_features : pandas.DataFrame
        Features with the label as last column.
    cv : int or cross-validation splitter
        Passed to ``GridSearchCV``.
    random_state : int, optional
        Seed of the forest used during the search. Defaults to 42, the seed
        the ``apply_RFC*`` functions use for the final model, so that the
        search is repeatable and matches the model that is evaluated.

    Returns
    -------
    dict
        Best parameter combination found.
    """
    X_features = selected_features.iloc[:, :-1]
    y_features = selected_features.iloc[:, -1]

    # Hyper-parameter values to search over.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 2, 4]
    }

    rf = RandomForestClassifier(random_state=random_state)

    grid_search = GridSearchCV(rf, param_grid, cv=cv)
    grid_search.fit(X_features, y_features)

    best_params_RFC = grid_search.best_params_
    best_score_RFC = grid_search.best_score_

    print(f'Best Hyperparameters: {best_params_RFC}')
    print(f'Best Score: {best_score_RFC}')

    return best_params_RFC

In [None]:
def apply_RFC_full(selected_features, selected_features_test, best_params_RFC):
    """Train a Random Forest on one frame and evaluate it on another.

    Both frames carry the label in their last column. Accuracy, confusion
    matrix and classification report are printed.

    Parameters
    ----------
    selected_features : pandas.DataFrame
        Training features and label.
    selected_features_test : pandas.DataFrame
        Test features and label.
    best_params_RFC : dict
        Must provide 'n_estimators', 'max_depth', 'min_samples_split' and
        'min_samples_leaf'.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` on the test frame.
    """
    X_train_r, y_train_r = selected_features.iloc[:, :-1], selected_features.iloc[:, -1]
    X_test_r, y_test_r = selected_features_test.iloc[:, :-1], selected_features_test.iloc[:, -1]

    # Only these four tuned settings are forwarded to the classifier.
    tuned = {
        name: best_params_RFC[name]
        for name in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
    }
    rfc = RandomForestClassifier(random_state=42, **tuned)
    rfc.fit(X_train_r, y_train_r)

    y_pred_r = rfc.predict(X_test_r)

    accuracy_RFC = accuracy_score(y_test_r, y_pred_r)
    cm_RFC = confusion_matrix(y_test_r, y_pred_r)
    cr_RFC = classification_report(y_test_r, y_pred_r, zero_division = 1)

    for caption, value in (("RFC Accuracy:", accuracy_RFC),
                           ("RFC Confusion Matrix:\n", cm_RFC),
                           ("RFC Classification Report:\n", cr_RFC)):
        print(caption, value)

    return accuracy_RFC, cm_RFC

In [None]:
def apply_RFC(selected_features, best_params_RFC):
    """Train and evaluate a Random Forest on an 80/20 split of one frame.

    The last column of ``selected_features`` is the label. The split uses
    ``random_state=42``. Accuracy, confusion matrix and classification report
    of the held-out 20% are printed.

    Parameters
    ----------
    selected_features : pandas.DataFrame
        Features and label.
    best_params_RFC : dict
        Must provide 'n_estimators', 'max_depth', 'min_samples_split' and
        'min_samples_leaf'.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` on the held-out part.
    """
    X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
        selected_features.iloc[:, :-1],
        selected_features.iloc[:, -1],
        test_size=0.2,
        random_state=42,
    )

    # Only these four tuned settings are forwarded to the classifier.
    tuned = {
        name: best_params_RFC[name]
        for name in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
    }
    rfc = RandomForestClassifier(random_state=42, **tuned)
    rfc.fit(X_train_r, y_train_r)

    y_pred_r = rfc.predict(X_test_r)

    accuracy_RFC = accuracy_score(y_test_r, y_pred_r)
    cm_RFC = confusion_matrix(y_test_r, y_pred_r)
    cr_RFC = classification_report(y_test_r, y_pred_r, zero_division = 1)

    for caption, value in (("RFC Accuracy:", accuracy_RFC),
                           ("RFC Confusion Matrix:\n", cm_RFC),
                           ("RFC Classification Report:\n", cr_RFC)):
        print(caption, value)

    return accuracy_RFC, cm_RFC

In [None]:
def grid_search_SVM(selected_features, cv):
    """Grid-search SVM hyper-parameters inside a scaling pipeline.

    The estimator is a ``StandardScaler`` followed by an ``SVC``; candidates
    are scored by accuracy.

    Parameters
    ----------
    selected_features : pandas.DataFrame
        Features with the label as last column.
    cv : int or cross-validation splitter
        Passed to ``GridSearchCV``.

    Returns
    -------
    dict
        Best parameter combination; keys carry the ``svm__`` step prefix.
    """
    X_features = selected_features.iloc[:, :-1]
    y_features = selected_features.iloc[:, -1]

    # Scaling is part of the pipeline, so it is refit within every CV fit.
    pipe = Pipeline([
        ('preprocessor', StandardScaler()),
        ('svm', SVC()),
    ])

    param_grid = {
        'svm__C': [0.1, 1, 10],
        'svm__kernel': ['linear', 'rbf'],
        'svm__gamma': ['scale', 'auto'],
    }

    grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='accuracy')
    grid.fit(X_features, y_features)

    best_params_SVM, best_score_SVM = grid.best_params_, grid.best_score_

    print("Best parameters:", best_params_SVM)
    print("Best score:", best_score_SVM)

    return best_params_SVM

In [None]:
def apply_SVM2(selected_features_train, selected_features_test, best_params_SVM):
    """Train an SVM on one frame and evaluate it on another.

    Both frames carry the label in their last column. Features are
    standardized with a scaler fit on the training frame only. Accuracy,
    confusion matrix and classification report are printed.

    Parameters
    ----------
    selected_features_train : pandas.DataFrame
        Training features and label.
    selected_features_test : pandas.DataFrame
        Test features and label.
    best_params_SVM : dict
        Must provide 'svm__kernel', 'svm__C' and 'svm__gamma'.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` on the test frame.
    """
    y_train_s = selected_features_train.iloc[:, -1]
    y_test_s = selected_features_test.iloc[:, -1]

    # Fit the scaler on train, reuse it unchanged for test.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(selected_features_train.iloc[:, :-1])
    X_test_s = scaler.transform(selected_features_test.iloc[:, :-1])

    clf = SVC(
        kernel=best_params_SVM['svm__kernel'],
        C=best_params_SVM['svm__C'],
        gamma=best_params_SVM['svm__gamma'],
    )
    clf.fit(X_train_s, y_train_s)

    y_pred_s = clf.predict(X_test_s)

    accuracy_SVM = accuracy_score(y_test_s, y_pred_s)
    cm_SVM = confusion_matrix(y_test_s, y_pred_s)
    cr_SVM = classification_report(y_test_s, y_pred_s, zero_division = 1)

    for caption, value in (("SVM Accuracy:", accuracy_SVM),
                           ("SVM Confusion Matrix:\n", cm_SVM),
                           ("SVM Classification Report:\n", cr_SVM)):
        print(caption, value)

    return accuracy_SVM, cm_SVM

In [None]:
def apply_SVM(selected_features, best_params_SVM):
    """Train and evaluate an SVM on an 80/20 split of one frame.

    The ``type`` column is the label. The split uses ``random_state=42`` and
    the features are standardized with a scaler fit on the training part
    only. Accuracy, confusion matrix and classification report of the
    held-out 20% are printed.

    Parameters
    ----------
    selected_features : pandas.DataFrame
        Features plus a ``type`` column.
    best_params_SVM : dict
        Must provide 'svm__kernel', 'svm__C' and 'svm__gamma'.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` on the held-out part.
    """
    X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
        selected_features.drop('type', axis=1),
        selected_features['type'],
        test_size=0.2,
        random_state=42,
    )

    # Fit the scaler on the training part, reuse it unchanged for the rest.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train_s)
    X_test_s = scaler.transform(X_test_s)

    clf = SVC(
        kernel=best_params_SVM['svm__kernel'],
        C=best_params_SVM['svm__C'],
        gamma=best_params_SVM['svm__gamma'],
    )
    clf.fit(X_train_s, y_train_s)

    y_pred_s = clf.predict(X_test_s)

    accuracy_SVM = accuracy_score(y_test_s, y_pred_s)
    cm_SVM = confusion_matrix(y_test_s, y_pred_s)
    cr_SVM = classification_report(y_test_s, y_pred_s, zero_division = 1)

    for caption, value in (("SVM Accuracy:", accuracy_SVM),
                           ("SVM Confusion Matrix:\n", cm_SVM),
                           ("SVM Classification Report:\n", cr_SVM)):
        print(caption, value)

    return accuracy_SVM, cm_SVM

In [None]:
def grid_search_GBC(selected_features, cv):
    """Grid-search XGBoost classifier hyper-parameters with cross-validation.

    The labels (last column) are integer-encoded with ``LabelEncoder`` before
    fitting; candidates are scored by accuracy and any failing fit raises.

    Parameters
    ----------
    selected_features : pandas.DataFrame
        Features with the label as last column.
    cv : int or cross-validation splitter
        Passed to ``GridSearchCV``.

    Returns
    -------
    dict
        Best parameter combination found.
    """
    X_features = selected_features.iloc[:, :-1]
    y_encoded = LabelEncoder().fit_transform(selected_features.iloc[:, -1])

    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [2, 3, 4],
        'min_child_weight': [1, 5, 10],
    }

    grid = GridSearchCV(
        xgb.XGBClassifier(),
        param_grid,
        cv=cv,
        scoring='accuracy',
        error_score='raise',
    )
    grid.fit(X_features, y_encoded)

    best_params_GBC, best_score_GBC = grid.best_params_, grid.best_score_

    print("Best parameters:", best_params_GBC)
    print("Best score:", best_score_GBC)

    return best_params_GBC

In [None]:
def apply_GBC2(selected_features_train, selected_features_test, best_params_GBC):
    """Train an XGBoost classifier on one frame and evaluate it on another.

    Both frames carry the label in their last column. The label encoder is
    fit on the training labels and only *applied* to the test labels, so the
    same class always maps to the same integer in both splits. Accuracy,
    confusion matrix and classification report are printed.

    Parameters
    ----------
    selected_features_train : pandas.DataFrame
        Training features and label.
    selected_features_test : pandas.DataFrame
        Test features and label.
    best_params_GBC : dict
        Must provide 'n_estimators', 'learning_rate', 'max_depth' and
        'min_child_weight'.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` on the test frame.

    Raises
    ------
    ValueError
        From ``LabelEncoder.transform`` if the test labels contain a class
        that is absent from the training labels.
    """
    X_train_g = selected_features_train.iloc[:, :-1]
    X_test_g = selected_features_test.iloc[:, :-1]
    y_train_g = selected_features_train.iloc[:, -1]
    y_test_g = selected_features_test.iloc[:, -1]

    model = XGBClassifier(n_estimators = best_params_GBC['n_estimators'], learning_rate = best_params_GBC['learning_rate'], \
                                    max_depth = best_params_GBC['max_depth'], min_child_weight = best_params_GBC['min_child_weight'])

    encoder = LabelEncoder()
    y_train_g = encoder.fit_transform(y_train_g)
    # Reuse the training mapping; re-fitting here could assign different
    # integers whenever the test split misses a class.
    y_test_g = encoder.transform(y_test_g)

    model.fit(X_train_g, y_train_g)

    y_pred_g = model.predict(X_test_g)

    accuracy_GBC = accuracy_score(y_test_g, y_pred_g)

    cm_GBC = confusion_matrix(y_test_g, y_pred_g)

    cr_GBC = classification_report(y_test_g, y_pred_g, zero_division = 1)

    print("GBC Accuracy:", accuracy_GBC)
    print("GBC Confusion Matrix:\n", cm_GBC)
    print("GBC Classification Report:\n", cr_GBC)

    return accuracy_GBC, cm_GBC

In [None]:
def apply_GBC(selected_features, best_params_GBC):
    """Train and evaluate an XGBoost classifier on an 80/20 split of one frame.

    The last column of ``selected_features`` is the label. The split uses
    ``random_state=42``. The label encoder is fit on the training labels and
    only *applied* to the held-out labels, so the same class always maps to
    the same integer. Accuracy, confusion matrix and classification report of
    the held-out 20% are printed.

    Parameters
    ----------
    selected_features : pandas.DataFrame
        Features and label.
    best_params_GBC : dict
        Must provide 'n_estimators', 'learning_rate', 'max_depth' and
        'min_child_weight'.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` on the held-out part.

    Raises
    ------
    ValueError
        From ``LabelEncoder.transform`` if the held-out labels contain a
        class that is absent from the training part.
    """
    X = selected_features.iloc[:, :-1]
    y = selected_features.iloc[:, -1]

    X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(X, y, test_size=0.2, random_state=42)

    model = XGBClassifier(n_estimators = best_params_GBC['n_estimators'], learning_rate = best_params_GBC['learning_rate'], \
                                    max_depth = best_params_GBC['max_depth'], min_child_weight = best_params_GBC['min_child_weight'])

    encoder = LabelEncoder()
    y_train_g = encoder.fit_transform(y_train_g)
    # Reuse the training mapping; re-fitting here could assign different
    # integers whenever the held-out part misses a class.
    y_test_g = encoder.transform(y_test_g)

    model.fit(X_train_g, y_train_g)

    y_pred_g = model.predict(X_test_g)

    accuracy_GBC = accuracy_score(y_test_g, y_pred_g)

    cm_GBC = confusion_matrix(y_test_g, y_pred_g)

    cr_GBC = classification_report(y_test_g, y_pred_g, zero_division = 1)

    print("GBC Accuracy:", accuracy_GBC)
    print("GBC Confusion Matrix:\n", cm_GBC)
    print("GBC Classification Report:\n", cr_GBC)

    return accuracy_GBC, cm_GBC

### Pentru Racket Sports

Feature Selection using Variance Threshold

In [None]:
# Choose the features on the training split only, then keep exactly the same
# columns (selected features + label, same order) in the test split. Running
# the filter separately on the test split could keep a different set of
# columns than the one the models are trained on.
selected_features_racket_train = compute_var_threshold(df_racket_features_train)
selected_features_racket_test = df_racket_features_test[selected_features_racket_train.columns]

Feature selection using Select Percentile

In [None]:
# Alternative selection: top 50% of the racket training features by f_classif score.
selected_features_racket2 = compute_select_percentile(df_racket_features_train)

Random Forest Classifier si Grid Search cu CV

In [None]:
# Random Forest grid search on the selected racket training features, cv=5.
best_params_racket_RFC = grid_search_RFC(selected_features_racket_train, 5)

In [None]:
# Fit on the racket training features with the best parameters and score on the test features.
racket_accuracy_RFC, racket_cm_RFC = apply_RFC_full(selected_features_racket_train, selected_features_racket_test, best_params_racket_RFC)

SVM si Grid Search cu CV

In [None]:
# SVM grid search on the selected racket training features, cv=5.
best_params_racket_SVM = grid_search_SVM(selected_features_racket_train, 5)

In [None]:
# Fit the SVM on the racket training features and score on the test features.
racket_accuracy_SVM, racket_cm_SVM = apply_SVM2(selected_features_racket_train, selected_features_racket_test, best_params_racket_SVM)

GradientBoostingClassifier si Grid Search cu CV

In [None]:
# XGBoost grid search on the selected racket training features, cv=5.
best_params_racket_GBC = grid_search_GBC(selected_features_racket_train, 5)

In [None]:
# Fit XGBoost on the racket training features and score on the test features.
racket_accuracy_GBC, racket_cm_GBC = apply_GBC2(selected_features_racket_train, selected_features_racket_test, best_params_racket_GBC)

In [None]:
# Show the confusion matrix of the racket XGBoost evaluation.
display(racket_cm_GBC)

### Pentru MIT-BIH

Feature selection using Variance Threshold


In [None]:
# Fit the variance filter on the MIT-BIH training features and apply the same selection to the test features.
selected_features_MIT_train, selected_features_MIT_test = compute_var_threshold2(df_MIT_features_train, df_MIT_features_test)

Feature selection using Select Percentile

In [None]:
# Rank the features on the training split only and keep exactly the same
# columns (selected features + label, same order) in the test split. Fitting
# SelectPercentile on the test split would use the test labels and could
# select a different set of columns.
selected_features_MIT_train2 = compute_select_percentile(df_MIT_features_train)
selected_features_MIT_test2 = df_MIT_features_test[selected_features_MIT_train2.columns]

Random Forest Classifier si Grid Search cu CV

In [None]:
# Random Forest grid search on the selected MIT-BIH training features, cv=3.
best_params_MIT_RFC = grid_search_RFC(selected_features_MIT_train, 3)

In [None]:
# Fit on the MIT-BIH training features with the best parameters and score on the test features.
MIT_accuracy_RFC, MIT_cm_RFC = apply_RFC_full(selected_features_MIT_train, selected_features_MIT_test, best_params_MIT_RFC)

SVM si Grid Search cu CV

In [None]:
# SVM grid search on the selected MIT-BIH training features, cv=3.
best_params_MIT_SVM = grid_search_SVM(selected_features_MIT_train, 3)

In [None]:
# Fit the SVM on the MIT-BIH training features and score on the test features.
MIT_accuracy_SVM, MIT_cm_SVM = apply_SVM2(selected_features_MIT_train, selected_features_MIT_test, best_params_MIT_SVM)

In [None]:
# Show the confusion matrix of the MIT-BIH SVM evaluation.
display(MIT_cm_SVM)

GradientBoostingClassifier si Grid Search cu CV

In [None]:
# XGBoost grid search on the selected MIT-BIH training features, cv=3.
best_params_MIT_GBC = grid_search_GBC(selected_features_MIT_train, 3)

In [None]:
# Fit XGBoost on the MIT-BIH training features and score on the *test*
# features (scoring on the training features would only report the training fit).
MIT_accuracy_GBC, MIT_cm_GBC = apply_GBC2(selected_features_MIT_train, selected_features_MIT_test, best_params_MIT_GBC)

### Pentru PTB

Feature Selection using Variance Threshold

In [None]:
# Fit the variance filter on the PTB training features and apply the same selection to the test features.
selected_features_PBT_train, selected_features_PBT_test = compute_var_threshold2(df_PBT_features_train, df_PBT_features_test)

Feature Selection using Select Percentile

In [None]:
# Alternative selection: top 50% of the PTB training features by f_classif score.
selected_features_PBT2 = compute_select_percentile(df_PBT_features_train)

Random Forest Classifier si Grid Search cu CV

In [None]:
# Random Forest grid search on the selected PTB training features, cv=5.
best_params_PBT_RFC = grid_search_RFC(selected_features_PBT_train, 5)

In [None]:
# Evaluate on an internal 80/20 split of the PTB training features only.
# The next cell assigns the same two names again with the train/test evaluation.
PBT_accuracy_RFC, PBT_cm_RFC = apply_RFC(selected_features_PBT_train, best_params_PBT_RFC)

In [None]:
# Fit on the PTB training features with the best parameters and score on the test features.
PBT_accuracy_RFC, PBT_cm_RFC = apply_RFC_full(selected_features_PBT_train, selected_features_PBT_test, best_params_PBT_RFC)

SVM si Grid Search cu CV

In [None]:
# SVM grid search on the selected PTB training features, cv=5.
best_params_PBT_SVM = grid_search_SVM(selected_features_PBT_train, 5)

In [None]:
# Evaluate on an internal 80/20 split of the PTB training features only.
# The next cell assigns the same two names again with the train/test evaluation.
PBT_accuracy_SVM, PBT_cm_SVM = apply_SVM(selected_features_PBT_train, best_params_PBT_SVM)

In [None]:
# Fit the SVM on the PTB training features and score on the test features.
PBT_accuracy_SVM, PBT_cm_SVM = apply_SVM2(selected_features_PBT_train, selected_features_PBT_test, best_params_PBT_SVM)

In [None]:
# Show the confusion matrix of the PTB SVM evaluation.
display(PBT_cm_SVM)

GradientBoostingClassifier si Grid Search cu CV

In [None]:
# XGBoost grid search on the selected PTB training features, cv=5.
best_params_PBT_GBC = grid_search_GBC(selected_features_PBT_train, 5)

In [None]:
# Fit XGBoost on the PTB training features and score on the *test* features
# (scoring on the training features would only report the training fit).
PBT_accuracy_GBC, PBT_cm_GBC = apply_GBC2(selected_features_PBT_train, selected_features_PBT_test, best_params_PBT_GBC)