## Aprendizado não supervisionado

In [1]:
import pandas as pd

In [3]:
# Load the dataset from CSV; the path is relative to the notebook's working
# directory (the file name suggests TF-IDF features -- confirm).
data = pd.read_csv('../data/data_tfidf.csv')

### Dividir treinamento e teste

In [None]:
# Keep the 'resultado' column aside; it is used below as the k-NN target and
# as the reference partition for the adjusted Rand index.
labels = data['resultado']

In [None]:
# Encode the two outcome strings as integers in one call; any other value is
# left untouched.
labels = labels.replace({'DEFERIDO': 1, 'INDEFERIDO': 0})

### Pré-processamento

#### Funções de Pré-Processamento

In [None]:
def get_correlation_df(df, num_features):
    """Keep the columns that are, on average, least correlated with the rest.

    Each column is scored by the mean absolute value of its row in
    ``df.corr()`` (the self-correlation on the diagonal is part of that mean).

    Args:
        df: Input DataFrame.
        num_features: How many of the lowest-scoring columns to keep.

    Returns:
        ``df`` restricted to the selected columns, ordered from the lowest
        score upwards.
    """
    mean_abs_corr = df.corr().abs().mean()
    ranked = mean_abs_corr.sort_values()
    kept_columns = list(ranked.index[:num_features])
    return df[kept_columns]

In [None]:
def get_pca_df(df, num_components):
    """Project ``df`` with PCA and return the result as a new DataFrame.

    Args:
        df: Feature matrix to reduce.
        num_components: Forwarded unchanged as ``n_components`` to
            ``sklearn.decomposition.PCA``.

    Returns:
        A DataFrame built from the transformed array; it gets default integer
        row and column labels, not the labels of ``df``.
    """
    from sklearn.decomposition import PCA

    reducer = PCA(n_components=num_components)
    projected = reducer.fit_transform(df)
    return pd.DataFrame(projected)

In [None]:
def get_k_nn_accuracy(df, labels):
    """Return the hold-out accuracy of a 1-nearest-neighbour classifier.

    The data is split 70/30 with a fixed seed (``random_state=42``), a
    Euclidean 1-NN model is fitted on the training part and scored on the
    test part.

    Args:
        df: Feature matrix.
        labels: Target values aligned with the rows of ``df``.

    Returns:
        Accuracy on the 30% test split.
    """
    from sklearn.metrics import accuracy_score
    # train_test_split was used without ever being imported (NameError).
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    X_train, X_test, y_train, y_test = train_test_split(
        df, labels, test_size=0.3, random_state=42)

    knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
    knn.fit(X_train, y_train)

    return accuracy_score(y_test, knn.predict(X_test))

In [None]:
def get_best_representation(df1, df2, labels):
    """Pick whichever of two representations scores higher with k-NN.

    Both accuracies (from ``get_k_nn_accuracy``) are printed. ``df1`` wins
    only when it is strictly better; on a tie ``df2`` is returned.

    Args:
        df1: First candidate feature matrix.
        df2: Second candidate feature matrix.
        labels: Target values shared by both candidates.

    Returns:
        ``df1`` or ``df2``.
    """
    accuracy1, accuracy2 = [
        get_k_nn_accuracy(candidate, labels) for candidate in (df1, df2)
    ]

    print(f'1: {accuracy1} x 2: {accuracy2}')

    if accuracy1 > accuracy2:
        return df1
    return df2

#### Execução

In [None]:
# Show how many samples carry each label value.
labels.value_counts()

In [None]:
# Build the feature matrix from the loaded frame. The original cell read
# `data_tfidf` before it was ever assigned (NameError); the CSV is held in
# `data`. The target column 'resultado' is removed too: it still contains the
# raw outcome strings and must not be fed to the distance-based models.
data_tfidf = data.drop(columns=[
    'advogado_2', 'requerente_3', 'requerente_4',
    'requerente_5', 'requerente_6', 'requerente_7',
    'impugnado_2', 'impugnado', 'fiscal_de_lei_nome',
    'formal_request', 'free_judicial', 'judicial_secrecy', 'id',
    'resultado',
])

In [None]:
# k-NN accuracy on the full feature matrix, before any reduction.
accuracy1 = get_k_nn_accuracy(data_tfidf, labels)
print(accuracy1)

##### Seleção de atributos pela correlação

In [None]:
# Correlation-based selection keeping 3 columns; preview the first rows.
reduced_dataset_corr_3 = get_correlation_df(data_tfidf, 3)
reduced_dataset_corr_3.head()

In [None]:
# Correlation-based selection keeping 9 columns; preview the first rows.
reduced_dataset_corr_9 = get_correlation_df(data_tfidf, 9)
reduced_dataset_corr_9.head()

In [None]:
# Keep whichever correlation-based subset gives the higher k-NN accuracy.
selected_df_corr = get_best_representation(
    reduced_dataset_corr_3, reduced_dataset_corr_9, labels)
selected_df_corr.head()

##### Redução de dimensionalidade pelo PCA

In [None]:
# PCA with n_components=0.9 (a float is read by scikit-learn as the fraction
# of variance to retain); preview the first rows.
dataset_pca_90_cov = get_pca_df(data_tfidf, 0.9)
dataset_pca_90_cov.head()

In [None]:
# PCA keeping exactly 3 components; preview the first rows.
dataset_pca_3_cps = get_pca_df(data_tfidf, 3)
dataset_pca_3_cps.head()

In [None]:
# Keep whichever PCA projection gives the higher k-NN accuracy.
selected_df_pca = get_best_representation(
    dataset_pca_90_cov, dataset_pca_3_cps, labels)
selected_df_pca.head()

### Aplicação dos modelos

#### Funções

In [None]:
import numpy as np

In [None]:
def get_indices_values(df, labels, model_labels):
    """Score one clustering with two internal indices and one external index.

    Args:
        df: Feature matrix the clustering was computed on.
        labels: Reference labels, used only for the adjusted Rand index.
        model_labels: Cluster label assigned to each row of ``df``.

    Returns:
        Tuple ``(davies_bouldin, silhouette, adjusted_rand)``. The first two
        are ``nan`` when ``model_labels`` has fewer than two distinct values
        or as many distinct values as samples: scikit-learn raises
        ``ValueError`` for such labellings (e.g. DBSCAN marking every point
        as noise), which used to abort the whole sweep.
    """
    from sklearn.metrics import davies_bouldin_score, silhouette_score, adjusted_rand_score

    # The adjusted Rand index is defined for any labelling.
    indice_cr = adjusted_rand_score(labels, model_labels)

    n_distinct = len(set(model_labels))
    if 1 < n_distinct < len(model_labels):
        indice_db = davies_bouldin_score(df, model_labels)
        indice_sil = silhouette_score(df, model_labels, metric='euclidean')
    else:
        indice_db = indice_sil = float('nan')

    return indice_db, indice_sil, indice_cr

In [None]:
def get_kmeans_labels(df, num_clusters):
    """Cluster ``df`` with k-means and return one label per row.

    Args:
        df: Feature matrix.
        num_clusters: Number of clusters.

    Returns:
        Array of cluster labels from ``KMeans.fit_predict``.
    """
    from sklearn.cluster import KMeans

    km = KMeans(n_clusters=num_clusters, init='k-means++',
                max_iter=300, n_init=10, random_state=42)

    # fit_predict fits the model itself; the extra fit() that preceded it only
    # repeated the same seeded computation.
    return km.fit_predict(df)

In [None]:
def get_hierarquico_labels(df, num_clusters, linkage):
    """Cluster ``df`` with agglomerative clustering and return the labels.

    Args:
        df: Feature matrix.
        num_clusters: Number of clusters to cut the hierarchy into.
        linkage: Linkage criterion forwarded to ``AgglomerativeClustering``.

    Returns:
        Array of cluster labels, one per row of ``df``.
    """
    from sklearn.cluster import AgglomerativeClustering

    hiera_aglo = AgglomerativeClustering(
        n_clusters=num_clusters, metric='euclidean', linkage=linkage)

    # fit_predict already fits; the separate fit() call built the same
    # hierarchy twice.
    return hiera_aglo.fit_predict(df)

In [None]:
def get_em_labels(df, num_components, cov_type, random_state=None):
    """Cluster ``df`` with a Gaussian mixture (EM) and return the labels.

    Args:
        df: Feature matrix.
        num_components: Number of mixture components.
        cov_type: Forwarded as ``covariance_type`` to ``GaussianMixture``.
        random_state: Optional seed forwarded to ``GaussianMixture``. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            int for reproducible labels.

    Returns:
        Array with the component index assigned to each row of ``df``.
    """
    from sklearn.mixture import GaussianMixture

    gmm = GaussianMixture(n_components=num_components,
                          covariance_type=cov_type,
                          random_state=random_state)

    # fit_predict fits the mixture itself; a preceding fit() only ran EM an
    # extra time and discarded the result.
    return gmm.fit_predict(df)

In [None]:
def get_dbscan_labels(df, eps_value, minimum_samples):
    """Cluster ``df`` with DBSCAN and return the labels.

    Args:
        df: Feature matrix.
        eps_value: Forwarded as ``eps`` to ``DBSCAN``.
        minimum_samples: Forwarded as ``min_samples`` to ``DBSCAN``.

    Returns:
        Array of cluster labels, one per row of ``df``; scikit-learn marks
        noise points with ``-1``.
    """
    from sklearn.cluster import DBSCAN

    dbscan = DBSCAN(eps=eps_value, min_samples=minimum_samples)

    # fit_predict already fits; the separate fit() call duplicated the work.
    return dbscan.fit_predict(df)

In [None]:
def get_models_labels(df, **kwargs):
    """Run the four clustering models for every group count from 2 to 20.

    Args:
        df: Feature matrix to cluster.
        **kwargs: ``linkage`` (hierarchical), ``cov_type`` (EM) and
            ``eps_value`` (DBSCAN). Missing keys are passed on as ``None``.

    Returns:
        DataFrame indexed 2..20 with a ``Grupos`` column and one column per
        model (``Kmeans``, ``Hierarquico``, ``EM``, ``DBSCAN``) whose cells
        hold the label arrays. For DBSCAN the group count is passed as
        ``minimum_samples``, not as a number of clusters.
    """
    import pandas as pd

    linkage = kwargs.get('linkage')
    cov_type = kwargs.get('cov_type')
    eps_value = kwargs.get('eps_value')

    rows = []
    for n_groups in range(2, 21):
        rows.append({
            'Grupos': n_groups,
            'Kmeans': get_kmeans_labels(df, n_groups),
            'Hierarquico': get_hierarquico_labels(df, n_groups, linkage),
            'EM': get_em_labels(df, n_groups, cov_type),
            'DBSCAN': get_dbscan_labels(df, eps_value, n_groups),
        })

    df_labels = pd.DataFrame(rows)
    # Shift the row labels so that row k holds the run with k groups.
    df_labels.index = df_labels.index + 2

    return df_labels

In [None]:
def get_cluster_indices(df, labels, models_labels, is_corr):
    """Compute DB, silhouette and CR for every model and group count.

    Args:
        df: Feature matrix the labels were computed on.
        labels: Reference labels for the adjusted Rand index.
        models_labels: Table indexed 2..20 with one label-array column per
            model, as returned by ``get_models_labels``.
        is_corr: When true the ``DBSCAN`` column is not evaluated and no
            ``DBSCAN`` column is produced.

    Returns:
        DataFrame indexed 2..20 with a ``Grupos`` column and one column per
        evaluated model; each cell is a dict with keys ``DB``, ``SIL``, ``CR``.
    """
    import pandas as pd

    model_names = ['Kmeans', 'Hierarquico', 'EM']
    if not is_corr:
        model_names.append('DBSCAN')

    rows = []
    for n_groups in range(2, 21):
        row = {'Grupos': n_groups}
        for model_name in model_names:
            db_value, sil_value, cr_value = get_indices_values(
                df, labels, models_labels.loc[n_groups, model_name])
            row[model_name] = {
                'DB': db_value, 'SIL': sil_value, 'CR': cr_value}
        rows.append(row)

    df_indices = pd.DataFrame(rows)
    # Row k holds the scores of the run with k groups.
    df_indices.index = df_indices.index + 2

    return df_indices

In [None]:
def plot_cluster_indices(df_orig, df_corr, df_pca, cluster_type):
    """Plot DB, silhouette and CR of one model for each data representation.

    One subplot per index; each subplot has one line per representation
    (original, correlation-selected, PCA) against the ``Grupos`` column.

    Args:
        df_orig: Index table for the original features.
        df_corr: Index table for the correlation-selected features. Skipped
            when ``cluster_type`` is ``'DBSCAN'``.
        df_pca: Index table for the PCA features.
        cluster_type: Column to plot (``'Kmeans'``, ``'Hierarquico'``,
            ``'EM'`` or ``'DBSCAN'``); its cells are dicts keyed by
            ``'DB'``, ``'SIL'`` and ``'CR'``.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    indices_list = ['DB', 'SIL', 'CR']

    # Fixed colour per representation: with automatic colours the same series
    # was drawn in a different colour in every subplot.
    series = [('Base original', df_orig, '#636EFA')]
    if cluster_type != 'DBSCAN':
        series.append(('Correlação', df_corr, '#EF553B'))
    series.append(('PCA', df_pca, '#00CC96'))

    fig = make_subplots(rows=1, cols=3, subplot_titles=(
        'Índice - Davies-Bouldin', 'Índice - Silhouette', 'Índice - Adjusted Rand Score'))

    for index, indice in enumerate(indices_list, 1):
        for name, frame, color in series:
            values = frame[cluster_type].apply(lambda cell: cell[indice])
            fig.add_trace(
                go.Scatter(
                    x=frame['Grupos'], y=values, name=name,
                    # One legend entry per representation instead of three.
                    legendgroup=name, showlegend=(index == 1),
                    line={'color': color}),
                row=1, col=index)

    # Single-line f-string: a replacement field spanning two lines is a
    # SyntaxError before Python 3.12.
    fig.update_layout(title=f'Gráfico de {cluster_type.upper()}',
                      showlegend=True, boxmode='group')
    fig.show()

#### Geração de gráficos

In [None]:
# Run every model for 2..20 groups on each representation, with the same
# hyper-parameters (ward linkage, full covariance, eps=0.1).
original_models_labels = get_models_labels(
    data_tfidf, linkage='ward', cov_type='full', eps_value=.1)
pca_models_labels = get_models_labels(
    selected_df_pca, linkage='ward', cov_type='full', eps_value=.1)
corr_models_labels = get_models_labels(
    selected_df_corr, linkage='ward', cov_type='full', eps_value=.1)

In [None]:
# Score the labels of each representation. The last argument (is_corr) is
# True only for the correlation-selected data, which skips DBSCAN there.
original_cluster_indices = get_cluster_indices(
    data_tfidf, labels, original_models_labels, False)
pca_cluster_indices = get_cluster_indices(
    selected_df_pca, labels, pca_models_labels, False)
corr_cluster_indices = get_cluster_indices(
    selected_df_corr, labels, corr_models_labels, True)

In [None]:
%pip install --upgrade jupyter notebook jupyterlab plotly

In [None]:
# Index curves for k-means across the three representations.
plot_cluster_indices(original_cluster_indices,
                     corr_cluster_indices, pca_cluster_indices, 'Kmeans')

In [None]:
# Index curves for agglomerative clustering across the three representations.
plot_cluster_indices(original_cluster_indices,
                     corr_cluster_indices, pca_cluster_indices, 'Hierarquico')

In [None]:
# Index curves for the Gaussian mixture (EM) across the three representations.
plot_cluster_indices(original_cluster_indices,
                     corr_cluster_indices, pca_cluster_indices, 'EM')

In [None]:
# Index curves for DBSCAN; the correlation table is passed but not drawn
# because plot_cluster_indices skips it for 'DBSCAN'.
plot_cluster_indices(original_cluster_indices,
                     corr_cluster_indices, pca_cluster_indices, 'DBSCAN')

### Comitê de Agrupamento

##### Funções

In [None]:
import numpy as np

In [None]:
def build_binary_matrix(clabels):
    """Build the co-membership matrix of a single clustering.

    Args:
        clabels: One-dimensional sequence of cluster labels (array or list).

    Returns:
        Float array of shape ``(n, n)`` where entry ``(i, j)`` is 1.0 when
        samples ``i`` and ``j`` carry the same label and 0.0 otherwise.
    """
    # Coerce to an array so that plain lists compare element-wise; with a
    # list, `clabels == clabels[i]` is a single False and every row was zero.
    label_array = np.asarray(clabels)

    # Broadcasting a column against a row compares all pairs at once,
    # replacing the Python-level loop over rows.
    same_cluster = label_array[:, None] == label_array[None, :]

    return same_cluster.astype(float)

In [None]:
def build_similarity_matrix(models_labels):
    """Average the co-membership matrices of several clusterings.

    Args:
        models_labels: 2-D array with one row of labels per clustering run
            and one column per sample.

    Returns:
        Array of shape ``(n_samples, n_samples)`` whose entry ``(i, j)`` is
        the fraction of runs that put samples ``i`` and ``j`` together.
    """
    n_runs = models_labels.shape[0]
    n_data = models_labels.shape[1]

    votes = np.zeros((n_data, n_data))
    for run_labels in models_labels:
        votes += build_binary_matrix(run_labels)

    return votes / n_runs

In [None]:
def get_ensemble_indices(df, final_labels, labels):
    """Score the ensemble clustering with DB, silhouette and adjusted Rand.

    Args:
        df: Feature matrix the clustering refers to.
        final_labels: Consensus cluster label of each row of ``df``.
        labels: Reference labels, used only for the adjusted Rand index.

    Returns:
        Tuple ``(davies_bouldin, silhouette, adjusted_rand)``. The first two
        are ``nan`` when ``final_labels`` has fewer than two distinct values
        or as many distinct values as samples, because scikit-learn raises
        ``ValueError`` for such labellings.
    """
    from sklearn.metrics import davies_bouldin_score, silhouette_score, adjusted_rand_score

    # The adjusted Rand index is defined for any labelling.
    indice_cr = adjusted_rand_score(labels, final_labels)

    n_distinct = len(set(final_labels))
    if 1 < n_distinct < len(final_labels):
        indice_db = davies_bouldin_score(df, final_labels)
        indice_sil = silhouette_score(df, final_labels, metric='euclidean')
    else:
        indice_db = indice_sil = float('nan')

    return indice_db, indice_sil, indice_cr

In [None]:
def get_kmeans_labels_en(df: pd.DataFrame, num_clusters):
    """Return k-means labels for the ensemble (``n_init=4``, seed 214).

    Args:
        df: Feature matrix.
        num_clusters: Number of clusters.

    Returns:
        The fitted model's ``labels_`` array.
    """
    from sklearn.cluster import KMeans

    fitted = KMeans(n_clusters=num_clusters, n_init=4,
                    random_state=214).fit(df)
    return fitted.labels_

In [None]:
def get_hierarquico_labels_en(df: pd.DataFrame, num_clusters):
    """Return agglomerative-clustering labels for the ensemble.

    Only ``n_clusters`` is set; every other option keeps scikit-learn's
    default.

    Args:
        df: Feature matrix.
        num_clusters: Number of clusters.

    Returns:
        The fitted model's ``labels_`` array.
    """
    from sklearn.cluster import AgglomerativeClustering

    fitted = AgglomerativeClustering(n_clusters=num_clusters).fit(df)
    return fitted.labels_

In [None]:
def get_em_labels_en(df: pd.DataFrame, num_components):
    """Return Gaussian-mixture labels for the ensemble (seed 214).

    Args:
        df: Feature matrix.
        num_components: Number of mixture components.

    Returns:
        NumPy array with the predicted component of each row of ``df``.
    """
    from sklearn.mixture import GaussianMixture

    mixture = GaussianMixture(n_components=num_components, random_state=214)
    mixture.fit(df)

    return np.array(mixture.predict(df))

In [None]:
def get_dbscan_labels_en(df, eps, min_samples):
    """Return DBSCAN labels for the ensemble.

    Args:
        df: Feature matrix.
        eps: Forwarded as ``eps`` to ``DBSCAN``.
        min_samples: Forwarded as ``min_samples`` to ``DBSCAN``.

    Returns:
        The fitted model's ``labels_`` array.
    """
    from sklearn.cluster import DBSCAN

    fitted = DBSCAN(eps=eps, min_samples=min_samples).fit(df)
    return fitted.labels_

In [None]:
def get_similarity_matrix_en(models_labels: np.ndarray):
    """Return the similarity matrix of the stacked label arrays.

    Thin wrapper that delegates to ``build_similarity_matrix``.

    Args:
        models_labels: 2-D array, one row of labels per clustering.

    Returns:
        Whatever ``build_similarity_matrix`` returns for ``models_labels``.
    """
    similarity = build_similarity_matrix(models_labels)
    return similarity

In [None]:
def get_final_labels_en(sim_matrix, num_clusters):
    """Derive consensus labels by spectral clustering of a similarity matrix.

    Args:
        sim_matrix: Square matrix used directly as the affinity
            (``affinity='precomputed'``).
        num_clusters: Number of consensus clusters.

    Returns:
        Array with one label per row of ``sim_matrix``.
    """
    from sklearn.cluster import SpectralClustering

    consensus_model = SpectralClustering(
        n_clusters=num_clusters,
        affinity='precomputed',
        n_init=5,
        random_state=214,
    )
    return consensus_model.fit_predict(sim_matrix)

In [None]:
def get_ensemble_df(df: pd.DataFrame, labels: np.ndarray, **kwargs) -> pd.DataFrame:
    """Build and score a clustering ensemble for every group count 2..20.

    For each group count the four base models (k-means, agglomerative,
    Gaussian mixture, DBSCAN) are run, their labels are combined into a
    similarity matrix, and spectral clustering on that matrix yields the
    consensus labels, which are then scored.

    Args:
        df: Feature matrix to cluster.
        labels: Reference labels for the adjusted Rand index.
        **kwargs: Optional ``eps_value`` for DBSCAN. Defaults to ``0.1``, the
            value that used to be hard-coded while ``kwargs`` was ignored.

    Returns:
        DataFrame indexed 2..20 with columns ``Grupos``, ``DB``,
        ``Silhouette``, ``CR`` and ``Labels`` (the consensus label array).
    """
    import pandas as pd

    eps_value = kwargs.get('eps_value', .1)

    indices_data = []

    for index in range(2, 21):
        # For DBSCAN the group count is used as min_samples, since the model
        # has no number-of-clusters parameter.
        models_labels = np.array([
            get_kmeans_labels_en(df, index),
            get_hierarquico_labels_en(df, index),
            get_em_labels_en(df, index),
            get_dbscan_labels_en(df, eps_value, index),
        ])

        sim_matrix = get_similarity_matrix_en(models_labels)
        final_labels = get_final_labels_en(sim_matrix, index)

        db_value, sil_value, cr_value = get_ensemble_indices(
            df, final_labels, labels)

        indices_data.append({'Grupos': index, 'DB': db_value,
                             'Silhouette': sil_value, 'CR': cr_value,
                             'Labels': final_labels})

    indices_ensemble_df = pd.DataFrame(indices_data)
    # Row k holds the ensemble built with k groups.
    indices_ensemble_df.index = indices_ensemble_df.index + 2

    return indices_ensemble_df

In [None]:
def plot_ensemble_indices(en_df_original: pd.DataFrame, en_df_corr: pd.DataFrame, en_df_pca: pd.DataFrame, indice_name: str) -> None:
    """Plot one ensemble index against the number of groups.

    Args:
        en_df_original: Ensemble table for the original features.
        en_df_corr: Ensemble table for the correlation-selected features.
        en_df_pca: Ensemble table for the PCA features.
        indice_name: Column to plot from each table (e.g. ``'DB'``,
            ``'Silhouette'`` or ``'CR'``). The tables must be indexed 2..20.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    import pandas as pd
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    indice_data = []

    for index in range(2, 21):
        # The tables are already indexed by group count, so the x value is
        # `index` itself; adding 2 again shifted the axis to 4..22.
        indice_values = {'Grupos': index,
                         'Original': en_df_original.loc[index, indice_name],
                         'Correlação': en_df_corr.loc[index, indice_name],
                         'PCA': en_df_pca.loc[index, indice_name]
                         }
        indice_data.append(indice_values)

    df = pd.DataFrame(indice_data)

    # subplot_titles expects a sequence of titles; a bare parenthesised string
    # is iterated character by character.
    fig = make_subplots(rows=1, cols=1, subplot_titles=[
        f'Índice - {indice_name}'])

    for column, trace_name in (('Original', 'Base original'),
                               ('Correlação', 'Correlação'),
                               ('PCA', 'PCA')):
        fig.add_trace(go.Scatter(
            x=df['Grupos'], y=df[column], name=trace_name), row=1, col=1)

    # Single-line f-string: a replacement field spanning two lines is a
    # SyntaxError before Python 3.12.
    title = f'Gráfico de comitê de agrupamento - Índice {indice_name}'
    fig.update_layout(title=title, showlegend=True, boxmode='group')
    fig.show()

##### Execução

In [None]:
# Build and score the clustering ensemble on each representation.
ensemble_df_original = get_ensemble_df(data_tfidf, labels)
ensemble_df_corr = get_ensemble_df(selected_df_corr, labels)
ensemble_df_pca = get_ensemble_df(selected_df_pca, labels)

In [None]:
# Ensemble curves for the 'DB' column.
plot_ensemble_indices(ensemble_df_original,
                      ensemble_df_corr, ensemble_df_pca, 'DB')

In [None]:
# Ensemble curves for the 'Silhouette' column.
plot_ensemble_indices(ensemble_df_original, ensemble_df_corr,
                      ensemble_df_pca, 'Silhouette')

In [None]:
# Ensemble curves for the 'CR' column.
plot_ensemble_indices(ensemble_df_original,
                      ensemble_df_corr, ensemble_df_pca, 'CR')

### Teste Estatístico

##### Funções

In [None]:
def get_cr_df() -> pd.DataFrame:
    """Create the empty table that will receive the CR values.

    Returns:
        DataFrame with rows labelled 2..20 and the columns ``kmeans``,
        ``hierarquico``, ``em``, ``dbscan`` and ``ensemble``; every cell
        starts out missing.
    """
    column_names = ['kmeans', 'hierarquico', 'em', 'dbscan', 'ensemble']
    return pd.DataFrame(index=range(2, 21), columns=column_names)

In [None]:
def fill_cr_values(df, ensemble_df):
    """Collect the CR of every model and of the ensemble into one table.

    Args:
        df: Index table whose ``Kmeans``, ``Hierarquico``, ``EM`` and
            ``DBSCAN`` cells are dicts containing a ``'CR'`` entry.
        ensemble_df: Ensemble table with a ``CR`` column.

    Returns:
        Table from ``get_cr_df`` with one filled row per group count 2..20.
    """
    result_df = get_cr_df()

    def cr_of(entry):
        return entry['CR']

    # Same order as the first four columns of the result table.
    per_model = [df[name].apply(cr_of)
                 for name in ('Kmeans', 'Hierarquico', 'EM', 'DBSCAN')]

    for num_cluster in range(2, 21):
        row = [series.loc[num_cluster] for series in per_model]
        row.append(ensemble_df.loc[num_cluster, 'CR'])
        result_df.loc[num_cluster] = row

    return result_df

In [None]:
def fill_cr_values_corr(df, ensemble_df):
    """Collect CR values for a table that has no DBSCAN column.

    Args:
        df: Index table whose ``Kmeans``, ``Hierarquico`` and ``EM`` cells
            are dicts containing a ``'CR'`` entry.
        ensemble_df: Ensemble table with a ``CR`` column.

    Returns:
        Table from ``get_cr_df`` without the ``dbscan`` column, with one
        filled row per group count 2..20.
    """
    result_df = get_cr_df().drop(columns='dbscan')

    def cr_of(entry):
        return entry['CR']

    # Same order as the first three columns of the result table.
    per_model = [df[name].apply(cr_of)
                 for name in ('Kmeans', 'Hierarquico', 'EM')]

    for num_cluster in range(2, 21):
        row = [series.loc[num_cluster] for series in per_model]
        row.append(ensemble_df.loc[num_cluster, 'CR'])
        result_df.loc[num_cluster] = row

    return result_df

In [None]:
def print_friedman_result(df):
    """Run the Friedman test across the columns of ``df`` and print it.

    Each column is passed to ``scipy.stats.friedmanchisquare`` as one group
    of measurements. The p-value and the statistic are printed with four
    decimals.

    Args:
        df: Table with one column per group and one row per block.

    Returns:
        None.
    """
    from scipy.stats import friedmanchisquare

    samples = [df[name] for name in df.columns]
    statistic, p_value = friedmanchisquare(*samples)

    print("Teste de Friedman")
    print(f"p-valor: {p_value:.4f}")
    print(f"qui-quadrado: {statistic:.4f}")

In [None]:
def print_nemenyi_result(df):
    """Print the result of the Nemenyi post-hoc test for ``df``.

    Args:
        df: Table handed unchanged to
            ``scikit_posthocs.posthoc_nemenyi_friedman``.

    Returns:
        None.
    """
    from scikit_posthocs import posthoc_nemenyi_friedman

    pairwise_result = posthoc_nemenyi_friedman(df)

    print("Teste Nemenyi (pós-hoc):")
    print(pairwise_result)

In [None]:
def plot_nemenyi_result(df, title):
    """Draw the Nemenyi post-hoc result for ``df`` as an annotated heatmap.

    Args:
        df: Table handed unchanged to
            ``scikit_posthocs.posthoc_nemenyi_friedman``.
        title: Title of the figure.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt
    from scikit_posthocs import posthoc_nemenyi_friedman

    pairwise_result = posthoc_nemenyi_friedman(df)

    # The colour scale is pinned to [0, 1].
    heatmap_options = {
        'annot': True,
        'cmap': 'coolwarm',
        'fmt': ".4f",
        'cbar': True,
        'linewidths': 0.5,
        'linecolor': 'black',
        'vmin': 0,
        'vmax': 1,
    }

    plt.figure(figsize=(5, 3))
    sns.heatmap(pairwise_result, **heatmap_options)
    plt.title(title)
    plt.show()

##### Execução

In [None]:
%pip install scikit-posthocs

In [None]:
# CR tables per representation; the correlation variant has no DBSCAN column.
cr_original = fill_cr_values(original_cluster_indices, ensemble_df_original)
cr_pca = fill_cr_values(pca_cluster_indices, ensemble_df_pca)
cr_corr = fill_cr_values_corr(corr_cluster_indices, ensemble_df_corr)

In [None]:
# Friedman test on the CR table of each representation.
for heading, cr_table in (("Original", cr_original),
                          ("Corr", cr_corr),
                          ("PCA", cr_pca)):
    print(heading)
    print_friedman_result(cr_table)

In [None]:
# Nemenyi post-hoc test on the CR table of each representation.
for heading, cr_table in (("Original", cr_original),
                          ("Corr", cr_corr),
                          ("PCA", cr_pca)):
    print(heading)
    print_nemenyi_result(cr_table)

In [None]:
# One Nemenyi heatmap per representation.
for cr_table, suffix in ((cr_original, "Original"),
                         (cr_corr, "Correlação"),
                         (cr_pca, "PCA")):
    plot_nemenyi_result(cr_table, f"Teste Nemenyi (pós-hoc) - {suffix}")