In [1]:
import os,sys,inspect
# Put the parent of the directory reported for the currently executing frame
# at the front of sys.path so modules located there can be imported.
# NOTE(review): inside a notebook the frame's "file" is kernel-specific;
# presumably it resolves to the notebook's folder - confirm on the kernel in use.
current_dir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
parent_dir = os.path.split(current_dir)[0]
sys.path[:0] = [parent_dir]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

# Кластеризация

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def get_models(data, name='Data'):
    """Fit one KMeans model for every cluster count from 2 to 15.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed unchanged to ``KMeans.fit`` and
        ``silhouette_score``.
    name : str, optional
        Label printed in front of the progress output.

    Returns
    -------
    models : list
        Fitted ``KMeans`` models; ``models[k - 2]`` has ``k`` clusters.
    inertia : list
        ``inertia_`` of each fitted model, in the same order as ``models``.
    silhouette : list
        Euclidean silhouette score of each model, computed from the labels
        assigned during fitting.
    """
    models = []
    inertia = []
    silhouette = []
    print(name, end=': ')
    for n_clusters in range(2, 16):
        print(n_clusters, end=', ')
        # A fixed random_state makes repeated runs produce the same models.
        model = KMeans(n_clusters=n_clusters, max_iter=5000, random_state=0)
        model.fit(data)

        # labels_ already holds the cluster of every training sample, so no
        # separate predict() pass over the data is needed for scoring.
        models.append(model)
        inertia.append(model.inertia_)
        silhouette.append(silhouette_score(data, model.labels_, metric='euclidean'))
    print('done!')
    return models, inertia, silhouette

Тут формируем массивы обучающих выборок для обоих методов:

In [None]:
# Read the quotient-deviation tables of methods A and B from the ../output directory.
quotient_deviation_df_A = pd.read_excel('../output/quotient_deviation_df_A.xlsx')
quotient_deviation_df_B = pd.read_excel('../output/quotient_deviation_df_B.xlsx')

In [None]:
# Convert both tables to NumPy arrays for clustering.
# Table A loses its 'Year' column first; table B is converted with all of its columns.
# NOTE(review): confirm table B contains no 'Year' (or other non-feature) column,
# otherwise that column is clustered as a feature.
quo_data_A  = np.array(quotient_deviation_df_A.drop(['Year'], axis=1))
quo_data_B  = np.array(quotient_deviation_df_B)

Обучаем модели для каждого метода:

In [None]:
# Fit the KMeans models for each method.
# NOTE(review): i_1 and s_1 are reassigned by the second call, so afterwards they
# hold only method B's inertia and silhouette lists.
models_quo_A, i_1, s_1 = get_models(quo_data_A, 'A quo')
models_quo_B, i_1, s_1 = get_models(quo_data_B, 'B quo')

A quo: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, done!
B quo: 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, done!


Функция сохранения графиков кластеров и их средних объектов:

In [None]:
def save_clusters(models, data, amount=15, num=3, data_type='below', ylim=None):
    """Save, for every cluster of one model, a plot of its centre and of its members.

    Figures are written to ``../output/{data_type}/{num}_clust/`` as
    ``mean_obj_cluster_{k}.png`` and ``cluster_{k}.png`` (``k`` starts at 1).

    Parameters
    ----------
    models : sequence
        Fitted KMeans models ordered by cluster count starting at 2, so the
        model with ``num`` clusters is ``models[num - 2]``.
    data : array-like
        Samples to assign to clusters; ``data[i]`` is drawn as one line.
    amount : int, optional
        Number of values per sample; the x axis runs from 1 to ``amount``.
    num : int, optional
        Cluster count of the model to plot.
    data_type : str, optional
        Name of the sub-directory of ``../output`` that receives the figures.
    ylim : sequence of two numbers, optional
        Y-axis limits applied to every figure when given.

    Raises
    ------
    OSError
        If the output directory cannot be created.
    """
    model = models[num - 2]
    all_predictions = model.predict(data)

    # Create the whole directory chain at once; an already existing directory
    # is fine, while real failures (e.g. permissions) are no longer hidden.
    out_dir = f'../output/{data_type}/{num}_clust'
    os.makedirs(out_dir, exist_ok=True)

    x_values = range(1, amount + 1)
    for j in range(num):
        # Centre ("mean object") of cluster j.
        fig, ax = plt.subplots(nrows=1, ncols=1)
        ax.plot(x_values, model.cluster_centers_[j])
        ax.set_title(f'Mean object (cluster №{j+1})')
        if ylim:
            ax.set_ylim(ylim)
        # Saved next to the member plot, inside the directory created above.
        fig.savefig(os.path.join(out_dir, f'mean_obj_cluster_{j+1}.png'))
        plt.close(fig)

        # Every sample assigned to cluster j, one line each.
        fig, ax = plt.subplots(nrows=1, ncols=1)
        els = 0
        for i, label in enumerate(all_predictions):
            if label == j:
                ax.plot(x_values, data[i], label=str(i))
                els += 1
        if ylim:
            ax.set_ylim(ylim)
        ax.set_title(f'Cluster №{j+1} ({els} elements)')
        fig.savefig(os.path.join(out_dir, f'cluster_{j+1}.png'))
        plt.close(fig)

In [None]:
def final_save_clusters(models, data, amount=15, num=3, data_type='below', ylim=None):
    """Save one 300-dpi figure per cluster: members in grey, centre in black.

    Figures are written to ``../output/final_{data_type}/{num}_clust/`` as
    ``cluster_{k}.png`` (``k`` starts at 1).

    Parameters
    ----------
    models : sequence
        Fitted KMeans models ordered by cluster count starting at 2, so the
        model with ``num`` clusters is ``models[num - 2]``.
    data : array-like
        Samples to assign to clusters; ``data[i]`` is drawn as one line.
    amount : int, optional
        Number of values per sample; the x axis runs from 1 to ``amount``.
    num : int, optional
        Cluster count of the model to plot.
    data_type : str, optional
        Suffix of the ``final_...`` sub-directory of ``../output``.
    ylim : sequence of two numbers, optional
        Y-axis limits applied to every figure when given.

    Raises
    ------
    OSError
        If the output directory cannot be created.
    """
    model = models[num - 2]
    all_predictions = model.predict(data)

    # Create the whole directory chain at once; an already existing directory
    # is fine, while real failures (e.g. permissions) are no longer hidden.
    out_dir = f'../output/final_{data_type}/{num}_clust'
    os.makedirs(out_dir, exist_ok=True)

    x_values = range(1, amount + 1)
    for j in range(num):
        fig, ax = plt.subplots(nrows=1, ncols=1, dpi=300)

        # Members of cluster j are drawn first so the centre ends up on top.
        els = 0
        for i, label in enumerate(all_predictions):
            if label == j:
                ax.plot(x_values, data[i], label=str(i), color='lightgray')
                els += 1

        # Horizontal reference line at y = 1.
        ax.axhline(y=1, color='dimgray', linestyle='--')
        # NOTE(review): the divider is fixed at x = 15 regardless of `amount`;
        # presumably it separates the 'Diam' and 'CWT' halves of a 30-value
        # sample - confirm before using another `amount`.
        ax.axvline(x=15, color='dimgray', linestyle='dotted')

        ax.plot(x_values, model.cluster_centers_[j], color='black')

        # Section labels placed in axes coordinates (fractions of the axes).
        ax.text(0.15, 0.90, 'Diam',
                verticalalignment='bottom', horizontalalignment='right',
                transform=ax.transAxes, fontsize=15)
        ax.text(0.6, 0.90, 'CWT',
                verticalalignment='bottom', horizontalalignment='right',
                transform=ax.transAxes, fontsize=15)

        if ylim:
            ax.set_ylim(ylim)
        ax.set_title(f'Cluster №{j+1} ({els} elements)')
        fig.savefig(os.path.join(out_dir, f'cluster_{j+1}.png'), dpi=300)
        plt.close(fig)

Сохраняем графики для всех моделей:

In [None]:
# Save cluster figures for the 3-, 4- and 5-cluster models of both methods.
# A named loop variable is used because `_` is IPython's last-output variable
# and conventionally marks a value that is not used.
for n_clusters in range(3, 6):
    save_clusters(models_quo_A, quo_data_A, 30, n_clusters, 'A_Quotient', [0.5, 1.5])
    save_clusters(models_quo_B, quo_data_B, 30, n_clusters, 'B_Quotient', [0.5, 1.5])

In [None]:
# Save the 300-dpi figures; only the 4-cluster model of method A is exported.
# A named loop variable is used because `_` is IPython's last-output variable
# and conventionally marks a value that is not used.
for n_clusters in range(4, 5):
    final_save_clusters(models_quo_A, quo_data_A, 30, n_clusters, 'A_Quotient', [0.7, 1.3])
    #final_save_clusters(models_quo_B, quo_data_B, 30, n_clusters, 'B_Quotient', [0.7, 1.3])

Сохраняем таблицы кластеризованных объектов:

In [None]:
# Add one 'Class k' column per cluster count k (2..15) holding the cluster that
# the k-cluster model predicts for each row; models[k-2] is the k-cluster model.
for i in range(2, 16):
    quotient_deviation_df_A[f'Class {i}'] = models_quo_A[i-2].predict(quo_data_A)
    #difference_deviation_df_A[f'Class {i}'] = models_diff_A[i-2].predict(diff_data_A)
    quotient_deviation_df_B[f'Class {i}'] = models_quo_B[i-2].predict(quo_data_B)
    #difference_deviation_df_B[f'Class {i}'] = models_diff_B[i-3].predict(diff_data_B)

# Write the tables, now including the 'Class k' columns, back to ../output.
# NOTE(review): table A is written without the index column while table B is
# written with it - confirm the asymmetry is intended.
quotient_deviation_df_A.to_excel('../output/quotient_deviation_df_A_CLASSIFIED.xlsx', index=False)
#difference_deviation_df_A.to_excel('output/difference_deviation_df_A_CLASSIFIED.xlsx', index=False)
quotient_deviation_df_B.to_excel('../output/quotient_deviation_df_B_CLASSIFIED.xlsx', index=True)
#difference_deviation_df_B.to_excel('output/difference_deviation_df_B_CLASSIFIED.xlsx', index=True)

Строим таблицу коэффициентов корреляции Спирмена для двух методов:

In [None]:
# Reload the classified tables from disk, replacing the in-memory frames.
# NOTE(review): method B is read from '..._CLASSIFIED_FIXED.xlsx', which no cell
# shown here writes - presumably a manually corrected copy of the file saved
# above; confirm it exists before running this cell.
quotient_deviation_df_A = pd.read_excel('../output/quotient_deviation_df_A_CLASSIFIED.xlsx')
quotient_deviation_df_B = pd.read_excel('../output/quotient_deviation_df_B_CLASSIFIED_FIXED.xlsx')

In [None]:
# Spearman correlation between equally named columns of tables A and B, with
# the 'Year' column removed from both; B's index is reset before the comparison.
QCorr = quotient_deviation_df_A.drop(['Year'], axis=1).corrwith(
        quotient_deviation_df_B.drop(['Year'], axis=1).reset_index(drop=True),
        method='spearman')

In [None]:
# Spearman coefficient and p-value between methods A and B for every column of
# table A. `spearmanr` comes from scipy.stats (imported in the notebook's
# import cell).
corrs = []
p_values = []
for column in quotient_deviation_df_A.columns:
    # spearmanr returns the correlation coefficient and its p-value.
    _c, _p = spearmanr(quotient_deviation_df_A[column], quotient_deviation_df_B[column])
    corrs.append(_c)
    p_values.append(_p)

# One row per column of table A: its name, coefficient and p-value.
spearman_corr_df = pd.DataFrame({'Feature':quotient_deviation_df_A.columns, 'Spearman': corrs, 'P-value': p_values})
spearman_corr_df.to_excel('../output/spearman_correlation_new.xlsx')

Строим сравнительные графики для двух методов и двух типов отклонений:

In [None]:
# Feature names laid out as a 5 x 3 grid, one inner list per row of subplots:
# D1..D15 and CWT1..CWT15, numbered row by row.
d_names = [[f'D{3 * row + col}' for col in (1, 2, 3)] for row in range(5)]

cwt_names = [[f'CWT{3 * row + col}' for col in (1, 2, 3)] for row in range(5)]

def save_corr_plots(df1, df2, names, corr_row, output_name=''):
    """Save a 5 x 3 grid of scatter plots comparing equally named columns.

    Parameters
    ----------
    df1, df2 : DataFrame-like
        Tables indexed by column name; ``df1[el]`` is plotted against
        ``df2[el]`` for every name in ``names``.
    names : sequence of sequences of str
        Column names arranged as rows of the grid; must fit into 5 rows of
        3 panels.
    corr_row : mapping
        Correlation value per column name, shown in each panel as ``r=...``.
    output_name : str, optional
        File name (without extension) of the PNG written to ``../output``.
    """
    fig, ax = plt.subplots(5, 3, figsize=(15, 12), dpi=300)
    for i, row in enumerate(names):
        for j, el in enumerate(row):
            panel = ax[i, j]
            panel.scatter(df1[el], df2[el])
            # Annotation is positioned in axes coordinates (fractions of the panel).
            panel.text(0.1, 0.8, f"{el},  r={corr_row[el]:0.4f}", transform=panel.transAxes)
    # Written to ../output, the directory every other output of this notebook uses.
    fig.savefig(f'../output/{output_name}.png', dpi=300)
    plt.close(fig)

# spearman_corr_df has the columns 'Feature', 'Spearman' and 'P-value' (no
# 'Quotient' column); index the coefficients by feature name so that
# save_corr_plots can look up the value for each panel.
spearman_by_feature = spearman_corr_df.set_index('Feature')['Spearman']

save_corr_plots(quotient_deviation_df_A, quotient_deviation_df_B, d_names, spearman_by_feature, 'Quotient_D_corr')
save_corr_plots(quotient_deviation_df_A, quotient_deviation_df_B, cwt_names, spearman_by_feature, 'Quotient_CWT_corr')