# Explicación de la selección de variables de la Fase 2

Para entender cómo se clasifica cada repetición de los gestos como correcta o incorrecta, se generarán gráficos que visualicen la importancia de las variables seleccionadas y cómo estas son utilizadas por el modelo.

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree  # plot_tree, used to draw decision trees

In [None]:
# Load the measurements CSV. dtype=object makes pandas keep every column as a
# generic object column instead of inferring numeric dtypes.
df = pd.read_csv(
    filepath_or_buffer='../Resultados/medidasPerRepetition.csv',
    dtype=object,
)

In [None]:
# For every gesture: load its saved pipeline, recover the features kept by the
# 'select_features' step, draw one violin plot per selected feature split by
# CorrectLabel, and then draw a classifier-specific visualisation.
gestures = df['GestureName'].unique()

for gesture in gestures:

    df_gesture = df[df['GestureName'] == gesture]
    X = df_gesture.drop(['CorrectLabel'], axis=1)
    # Keep the target as a 1-D Series: seaborn expects vector-like x/y inputs,
    # not a single-column DataFrame.
    y = df_gesture['CorrectLabel']

    modelo_gesto_path = f'../Resultados/modelo_{gesture}.sav'
    # NOTE(security): joblib.load unpickles arbitrary objects. Only load model
    # files that were produced by this project.
    best_pipeline, expected_columns = joblib.load(modelo_gesto_path)

    # Feature names as emitted by the 'preprocessing' step of the pipeline.
    preprocessor = best_pipeline.named_steps['preprocessing']
    feature_names = preprocessor.get_feature_names_out()

    # Boolean mask of the columns kept by the 'select_features' step.
    select_kbest = best_pipeline.named_steps['select_features']
    selected_features_mask = select_kbest.get_support()

    # Strip the transformer prefixes so the names can be used to index X.
    selected_features = [
        name.replace('remainder__', '').replace('encoder__', '')
        for name in feature_names[selected_features_mask]
    ]

    # Raw (un-transformed) values of the selected features for this gesture.
    df_selected = X[selected_features]

    # Violin plots: one panel per selected feature, three panels per row.
    n_features = len(selected_features)
    if n_features:  # plt.subplots cannot build a grid with zero rows
        n_cols = 3
        n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division
        # squeeze=False always returns a 2-D array of axes, whatever the grid.
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False
        )
        axes = axes.flatten()

        for i, feature in enumerate(selected_features):
            # The CSV is loaded with dtype=object, so numbers may arrive as
            # strings; seaborn needs a numeric variable for the violin axis.
            # Values that cannot be parsed become NaN and are not drawn.
            values = pd.to_numeric(df_selected[feature], errors='coerce')
            sns.violinplot(x=y, y=values, ax=axes[i])
            axes[i].set_title(f'Violin plot of {feature} vs CorrectLabel')
            axes[i].set_xlabel('CorrectLabel')
            axes[i].set_ylabel(feature)

        # Remove the unused panels of the last row.
        for unused_ax in axes[n_features:]:
            fig.delaxes(unused_ax)

        plt.tight_layout()
        plt.show()

    # Visualise the fitted classifier.
    classifier = best_pipeline.named_steps['classifier']
    classifier_name = classifier.__class__.__name__
    # Class labels in the order the fitted model uses them, so that rows and
    # tree leaves are labelled consistently with the model itself.
    class_names = [str(label) for label in classifier.classes_]

    if classifier_name == 'GaussianNB':
        # theta_ holds one row per class and one column per feature.
        mean_df = pd.DataFrame(
            classifier.theta_, columns=selected_features, index=class_names
        )
        plt.figure(figsize=(14, 4))
        sns.heatmap(mean_df, annot=True, cmap="coolwarm", center=0)
        plt.title('Media de cada clase seleccionada por clase')
        plt.show()

    elif classifier_name == 'DecisionTreeClassifier':
        fig = plt.figure(figsize=(8, 6), dpi=300)
        tree.plot_tree(classifier,
                       feature_names=selected_features,
                       class_names=class_names,
                       filled=True)
        plt.show()

    elif classifier_name == 'LogisticRegression':
        # Show the learned coefficients; plotting the raw feature table would
        # say nothing about the fitted model.
        coefficients = classifier.coef_
        # A binary model exposes a single coefficient row, which refers to the
        # second entry of classes_; otherwise there is one row per class.
        row_labels = class_names[1:] if coefficients.shape[0] == 1 else class_names
        coef_df = pd.DataFrame(
            coefficients, columns=selected_features, index=row_labels
        )
        plt.figure(figsize=(14, 4))
        sns.heatmap(coef_df, annot=True, cmap="coolwarm", center=0)
        plt.title('Coeficientes de la regresión logística por variable seleccionada')
        plt.tight_layout()
        plt.show()

    elif classifier_name == 'RandomForestClassifier':
        # Draw at most the first five trees; a smaller forest gets fewer panels.
        n_trees = min(5, len(classifier.estimators_))
        fig, axes = plt.subplots(
            nrows=1, ncols=n_trees, figsize=(10, 2), dpi=900, squeeze=False
        )
        # zip stops after n_trees panels, so only the first trees are drawn.
        for index, (estimator, ax) in enumerate(zip(classifier.estimators_, axes[0])):
            tree.plot_tree(estimator,
                           feature_names=selected_features,
                           class_names=class_names,
                           filled=True,
                           ax=ax)
            # Title every panel, not only the last one.
            ax.set_title('Estimator: ' + str(index), fontsize=11)
        plt.tight_layout()
        plt.show()