# ML para predecir el gesto

Creación y evaluación de modelos de Machine Learning para predecir qué gesto está realizando el paciente

In [None]:
# ---------- importar librerías---------
# Manipular los datos
import pandas as pd
import numpy as np

# Gráficas
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
sns.set_theme(style="darkgrid")

# scikit-learn (ML en python)
## Procesar el dataset
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import LeaveOneGroupOut

## Modelos ML
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
## Evaluación de los modelos
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LearningCurveDisplay
## Hiperparametrizacion
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

## Seleccion de variables
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV # recursive

# Para ignorar los FutureWarning
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [None]:
# --------- Load the data ---------
# Per-repetition measurements table (output of leer_dataset.ipynb).
# Every column is read as object; numeric conversion is done in the preprocessing step.
ruta_medidas = '../csvFiles/medidasPerRepetition.csv'
df = pd.read_csv(ruta_medidas, dtype=object)
df.head()  # preview the first rows

In [None]:
# Results table: one row per classifier; each evaluation run adds a score column.
# Row order must match the order of the model lists passed to test_models.
nombres_clasificadores = [
    'K-Neighbors',
    'Decision tree',
    'Naive Bayes',
    'Suport Vector Machine',
    'Random Forest',
]
test_result = pd.DataFrame({'Clasificadores': nombres_clasificadores})


## 1. Preprocesado

### 1.1 Codificar variables

In [None]:
# ------ Prepare the dataset ------
# Encode the categorical "Position" column as ordinal codes. The category list is
# sorted so the label -> code mapping is identical on every run (iterating a set of
# strings has no stable order across interpreter sessions).
# NOTE(review): assumes "Position" has no missing values - sorting a mix of str and NaN would fail.
encoder = OrdinalEncoder(categories=[sorted(set(df["Position"].values))])
encoder.fit(df[["Position"]])
df["Position"] = encoder.transform(df[["Position"]])


def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# The CSV was read with dtype=object: convert every column that parses as numeric and
# leave the rest untouched (same result as the deprecated errors='ignore' option).
df = df.apply(_to_numeric_or_keep)

### 1.2 Dividir el dataset

In [None]:
# Hold out six subjects as the test set; every other subject is used for training.
test_subject_ids = [101, 105, 201, 202, 301, 302]
is_test = df['SubjectID'].isin(test_subject_ids)

test_df = df.loc[is_test]
train_df = df.loc[~is_test]

# Features: every column except the target.
# NOTE(review): 'SubjectID' stays in the feature matrices because later cells read it
# from train_X as the cross-validation group key; it is therefore also seen by the
# classifiers as a feature - confirm this is intended.
train_X = train_df.drop(columns=['GestureLabel'])
test_X = test_df.drop(columns=['GestureLabel'])

# Targets as 1-D arrays
train_y = train_df['GestureLabel'].to_numpy()
test_y = test_df['GestureLabel'].to_numpy()

## 2. Modelos de ML
Se ha creado una función para entrenar y evaluar los modelos

In [None]:
#-------Funcion para evaluar los modelos-----------
def test_models(modelos, tX, ty, df, column_name):
   new_evaluation = []
   for modelo in modelos:
      prediction = modelo.predict(tX) #  predicciones en los datos de prueba
      report = classification_report(ty, prediction, zero_division=0) # informe de evaluación
      score = f1_score(test_y, prediction, average='weighted', zero_division=0) 
      new_evaluation.append(score)
      print(f"\nModelo: {modelo.__class__.__name__}") 
      print(report) 
   df.loc[:, column_name] = new_evaluation



In [None]:
#----------- Confusion-matrix helper -----
def plot_confusion_matrix(modelo, tX, ty, ax=None):
    """
    Plot the confusion matrix of a fitted classifier as an annotated heatmap.

    Parameters:
    * modelo: fitted classifier exposing `predict`.
    * tX: test features.
    * ty: true target values for `tX`.
    * ax: matplotlib Axes to draw on; defaults to the current Axes.
    """
    if ax is None:
        # The signature advertises ax=None, so fall back to the current Axes
        # instead of failing on ax.set_title below.
        ax = plt.gca()

    prediction = modelo.predict(tX)
    cm = confusion_matrix(ty, prediction)
    sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", ax=ax)
    ax.set_title('Matriz de confusión ' + modelo.__class__.__name__)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')


In [None]:
# Fixed seed so the stochastic models (tree, forest) give the same results on every run.
RANDOM_STATE = 42

knn = KNeighborsClassifier()  # K-Neighbors classifier
dtree = DecisionTreeClassifier(random_state=RANDOM_STATE)  # Decision tree
nb = GaussianNB()  # Naive Bayes
svm = SVC()  # Support Vector Machine
rf = RandomForestClassifier(random_state=RANDOM_STATE)  # Random Forest

# Order must match the 'Clasificadores' rows of test_result.
models = [knn, dtree, nb, svm, rf]

# Fit every baseline model on the training subjects
for modelo in models:
    modelo.fit(train_X, train_y)

In [None]:
# Learning curves of every baseline model, two per row.
# The grid is sized from len(models) so adding or removing a model does not break the layout.
n_cols = 2
n_rows = (len(models) + n_cols - 1) // n_cols
# squeeze=False keeps `axes` 2-D even when a single row is enough
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 4 * n_rows), squeeze=False)
flat_axes = axes.flatten()

# Arguments shared by all curves: leave-one-group-out CV grouped by SubjectID
common_params = {
    "X": train_X,
    "y": train_y,
    "groups": train_X['SubjectID'],
    "cv": LeaveOneGroupOut(),
    "score_type": "both",
    "n_jobs": 4,
    "line_kw": {"marker": "o"},
    "std_display_style": "fill_between",
    "score_name": "Accuracy",
}

for ax, estimator in zip(flat_axes, models):
    LearningCurveDisplay.from_estimator(estimator, **common_params, ax=ax)
    # Relabel only the first two legend handles (train and validation curves)
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles[:2], ["Resultado del entrenamiento", "Resultado de la validación"])
    ax.set_title(f"Curva de aprendizaje de {estimator.__class__.__name__}")
    ax.set_ylim(0.5, 1.01)

# Remove the grid cells left without a model
for ax in flat_axes[len(models):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# Evaluate the baseline models (fitted in an earlier cell) on the held-out test subjects;
# the weighted F1 scores are stored in the 'Modelos' column of test_result.
test_models(models, test_X, test_y, test_result, 'Modelos')

## 3. Hiperparametrización

In [None]:
# ------------- Search spaces -------------
# One dictionary per classifier, consumed by aplicar_hiperparametrizacion.

# K-Neighbors.
# NOTE(review): 'p' presumably only affects the 'minkowski' metric, so the 'chebyshev'
# combinations are likely evaluated once per value of 'p' with identical results - confirm.
param_grid_knn = {
    'n_neighbors': [6, 7, 8],
    'weights': ('uniform', 'distance'),
    'leaf_size': (3, 4, 5, 6, 8),
    'p': (1,2),
    'metric': ('minkowski', 'chebyshev')
}

# Decision tree.
param_grid_dtree = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 40, 50, 60],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 2, 5, 7],
    'max_features': [None, 'sqrt', 'log2']
}

# Naive Bayes: 100 log-spaced smoothing values from 1 down to 1e-8.
param_grid_nb = {'var_smoothing': np.logspace(0, -8, num=100)}


# Support Vector Machine.
# NOTE(review): 'degree' and 'coef0' presumably only apply to some kernels, which would
# make part of this grid redundant - confirm against the SVC documentation.
param_grid_svm = {
    'C': [0.1, 0.2, 0.3],  # alternative tried earlier: np.linspace(0, 1, num=10)
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3],
    'coef0': [0.0, 0.1, 0.2]
}

# Random Forest: explored with RandomizedSearchCV rather than an exhaustive grid.
param_grid_rf = {
    'n_estimators': [300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 50, 60],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 5, 6],
}

In [None]:
def aplicar_hiperparametrizacion(X, y, random_state=42):
    """Tune the five classifiers and plot the learning curve of each best model.

    Every search uses leave-one-group-out cross-validation grouped by
    X['SubjectID'] and accuracy as the score. K-Neighbors, Decision tree,
    Naive Bayes and SVM use an exhaustive GridSearchCV; Random Forest uses
    RandomizedSearchCV with its default number of sampled candidates.

    Parameters:
    * X: training features; must contain a 'SubjectID' column.
    * y: training target values.
    * random_state: seed for the stochastic estimators and the randomized
      search, so repeated runs give the same result.

    Returns:
    * list with the best estimator of each search, in the order
      [K-Neighbors, Decision tree, Naive Bayes, SVM, Random Forest].
    """
    logo = LeaveOneGroupOut()
    subject_ids = X['SubjectID']
    search_kw = {'cv': logo, 'scoring': 'accuracy', 'n_jobs': -1}

    busquedas = [
        GridSearchCV(KNeighborsClassifier(), param_grid_knn, **search_kw),
        GridSearchCV(DecisionTreeClassifier(random_state=random_state), param_grid_dtree, **search_kw),
        GridSearchCV(GaussianNB(), param_grid_nb, **search_kw),
        GridSearchCV(SVC(), param_grid_svm, **search_kw),
        RandomizedSearchCV(RandomForestClassifier(random_state=random_state), param_grid_rf,
                           random_state=random_state, **search_kw),
    ]

    best_models = []
    for busqueda in busquedas:
        busqueda.fit(X, y, groups=subject_ids)
        best_modelo = busqueda.best_estimator_
        nombre = busqueda.estimator.__class__.__name__
        print(f"\nModelo: {nombre}")
        print("Mejores parámetros: ", busqueda.best_params_)
        print("Mejor resultado: ", busqueda.best_score_)
        best_models.append(best_modelo)

        # One explicit figure per model, so no empty stray figure is left behind
        fig, ax = plt.subplots(figsize=(6, 4))
        LearningCurveDisplay.from_estimator(
            best_modelo,
            X=X,
            y=y,
            groups=subject_ids,
            cv=logo,
            score_type="both",
            n_jobs=4,
            line_kw={"marker": "o"},
            std_display_style="fill_between",
            score_name="Accuracy",
            ax=ax,
        )
        # Relabel only the first two legend handles (train and validation curves)
        handles, _ = ax.get_legend_handles_labels()
        ax.legend(handles[:2], ["Resultado del entrenamiento", "Resultado de la validación"])
        ax.set_title(f"Curva de aprendizaje de {nombre}")
        plt.show()

    return best_models


best_models = aplicar_hiperparametrizacion(train_X, train_y)

In [None]:
# Evaluate the tuned models on the held-out test subjects;
# the weighted F1 scores are stored in the 'Hiperparametrización (H)' column of test_result.
test_models(best_models, test_X, test_y, test_result, 'Hiperparametrización (H)')

In [None]:
#----------- Confusion matrices of the tuned models -----
num_models = len(best_models)
num_cols = 2  # matrices per row
num_rows = (num_models + num_cols - 1) // num_cols  # ceiling division

# squeeze=False keeps `axes` 2-D even when a single row is enough
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 6 * num_rows), squeeze=False)
flat_axes = axes.flatten()

for ax, modelo in zip(flat_axes, best_models):
    plot_confusion_matrix(modelo, test_X, test_y, ax=ax)

# Hide the grid cells left without a model
for ax in flat_axes[num_models:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

## 4. Feature Selection

### 4.1 SelectKBest

In [None]:
# Keep SubjectID aside: it is the cross-validation group key and is excluded from
# the feature scoring below.
subject_id = train_X['SubjectID']
train_features = train_X.drop(columns=['SubjectID'])

# Univariate selection of the 50 best-scoring features (SelectKBest default score function)
feature_selection = SelectKBest(k=50)
selected_features = feature_selection.fit_transform(train_features, train_y)

# Column positions of the kept features. They are relative to the frame WITHOUT
# SubjectID, so the names must be looked up in that same frame.
selected = feature_selection.get_support(indices=True)
selected_columns = train_features.columns[selected]
print(selected_columns)

# Reduced training set: the selected features plus SubjectID.
# The new frame gets a fresh 0..n-1 index, hence the positional `.values` assignment.
X_fs = pd.DataFrame(selected_features, columns=selected_columns)
X_fs['SubjectID'] = subject_id.values


### 4.2 Entrenar con las variables seleccionadas

In [None]:
# Fixed seed so the stochastic models (tree, forest) give the same results on every run.
RANDOM_STATE = 42

# Fresh, untuned models trained on the selected features
knn_fs = KNeighborsClassifier()  # K-Neighbors classifier
dtree_fs = DecisionTreeClassifier(random_state=RANDOM_STATE)  # Decision tree
nb_fs = GaussianNB()  # Naive Bayes
svm_fs = SVC()  # Support Vector Machine
rf_fs = RandomForestClassifier(random_state=RANDOM_STATE)  # Random Forest

# Order must match the 'Clasificadores' rows of test_result.
models_fs = [knn_fs, dtree_fs, nb_fs, svm_fs, rf_fs]

logo = LeaveOneGroupOut()

for modelo in models_fs:
    print(f"\nModelo: {modelo.__class__.__name__}")
    # Fit on all training subjects; this fitted model is the one evaluated on the test set
    modelo.fit(X_fs, train_y)
    # Leave-one-subject-out accuracy on the training subjects
    scores = cross_val_score(modelo, X_fs, train_y, groups=X_fs['SubjectID'], scoring='accuracy', cv=logo)
    print("%0.2f accuracy con una desviación estándar de %0.2f" % (scores.mean(), scores.std()))

In [None]:
# Apply the feature selection fitted on the training data to the test subjects.
test_subject_id = test_X['SubjectID']
test_features = test_X.drop(columns=['SubjectID'])

selected_features_test = feature_selection.transform(test_features)

# Same layout as X_fs: the selected features followed by SubjectID
X_fs_test = pd.DataFrame(
    selected_features_test,
    columns=test_features.columns[selected],
)
X_fs_test['SubjectID'] = test_subject_id.values

# Score the feature-selection models on the reduced test set
test_models(models_fs, X_fs_test, test_y, test_result, 'Selección de variables (FS)')

In [None]:
# Refit the tuned configurations on the selected features. Clones are used so the
# estimators in `best_models` stay fitted on the full feature set and the earlier
# cells that use them can still be re-run.
best_models_fs = [clone(modelo).fit(X_fs, train_y) for modelo in best_models]
test_models(best_models_fs, X_fs_test, test_y, test_result, 'H + Fs')

### 4.3 Recursive feature elimination

In [None]:
# Fixed seed so the recursive elimination gives the same result on every run.
RANDOM_STATE = 42

# Estimators passed to RFECV in seleccion_variables
dtree_rfe = DecisionTreeClassifier(random_state=RANDOM_STATE)  # Decision tree
rf_rfe = RandomForestClassifier(n_estimators=5, random_state=RANDOM_STATE)  # Random Forest

models_with_coef = [dtree_rfe, rf_rfe]

In [None]:
def seleccion_variables(models, X, y):
    """Run recursive feature elimination with cross-validation for each model.

    For every estimator, fits an RFECV (accuracy, leave-one-group-out CV
    grouped by X['SubjectID']), prints the optimal number of features, their
    names and the best mean CV accuracy, and plots the mean CV accuracy
    against the number of features kept.

    Parameters:
    * models: iterable of estimators to wrap in RFECV.
    * X: training features; must contain a 'SubjectID' column.
    * y: training target values.
    """
    for model in models:
        rfecv = RFECV(
            estimator=model,
            scoring="accuracy",
            cv=LeaveOneGroupOut(),
            n_jobs=-1,  # use all cores when fitting across folds
        )
        rfecv.fit(X, y, groups=X['SubjectID'])

        nombre = rfecv.estimator.__class__.__name__
        mean_scores = rfecv.cv_results_['mean_test_score']

        print(f"\nModelo: {nombre}")
        print(f"Número óptimo de características: {rfecv.n_features_}")
        print(rfecv.get_feature_names_out())
        # Report the actual CV score rather than the repr of the `score` method
        print(f"Mejor accuracy media (CV): {mean_scores.max():.3f}")

        # RFECV runs with its default step and minimum, so the x axis is built
        # on the assumption that entry i corresponds to i + 1 features.
        fig, ax = plt.subplots()
        ax.plot(range(1, len(mean_scores) + 1), mean_scores)
        ax.set_title(f"RFECV {nombre}")
        ax.set_xlabel("Número de características")
        ax.set_ylabel("Accuracy")
        plt.show()


seleccion_variables(models_with_coef, train_X, train_y)

# Evaluación modelos

In [None]:
# Display the weighted F1-score table filled in by the test_models calls
test_result

In [None]:
# Long format: one row per (classifier, technique) pair
df_melted = pd.melt(
    test_result,
    id_vars=['Clasificadores'],
    value_vars=['Modelos', 'Hiperparametrización (H)', 'Selección de variables (FS)', 'H + Fs'],
    var_name='score_type',
    value_name='score',
)

# One line per classifier across the four techniques
fig, ax = plt.subplots(figsize=(6, 4))
sns.lineplot(x='score_type', y='score', hue='Clasificadores', data=df_melted,
             marker='o', palette='Set1', ax=ax)

# The x axis holds the technique (the melted column names), not the classifier
ax.set_xlabel('Técnica')
ax.set_ylabel('F1-score')
ax.set_title('F1-Scores por Clasificador')
ax.set_ylim(0, 1)
ax.grid(True)

fig.tight_layout()
plt.show()