# Clusteringするノートブック

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc, font_manager
import seaborn as sns
from matplotlib.ticker import ScalarFormatter

In [None]:
# Default figure size and base font size for plots
plt.rcParams['figure.figsize'] = 10, 6
plt.rcParams["font.size"] = 13

# Register the IPAex Gothic font file (hard-coded absolute path) with
# matplotlib and make it the default font family
font_manager.fontManager.addfont('/Library/Fonts/ipaexg.ttf')
rc('font', family='IPAEXGothic')

# Plot settings for seaborn
# NOTE(review): sns.set(context=..., font_scale=...) likely overrides the
# font.size assigned above -- confirm which value is meant to win.
sns.set(context='talk', style='ticks', font=["IPAEXGothic"], font_scale=10/6, )

# データ読み込み

In [None]:
# Load the pickled DataFrame that the rest of the notebook works on and
# preview its first 3 rows.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle('../../pickles/KGE_1500.pkl')
df.head(3)

- 階層クラスタリングを用いて曖昧さを考慮したまとまりの作成を実施する

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster, cophenet, dendrogram

In [None]:
# Remember the index of the 'pred_vec' column so the cluster labels can be
# aligned back to the rows of df, then pull the vectors out as a plain list.
pred_vec_index = df['pred_vec'].index
list_pred_vec = df['pred_vec'].tolist()

# Hierarchical clustering of the vectors (Ward linkage, Euclidean distance).
result = linkage(list_pred_vec, method='ward', metric='euclidean')

In [None]:
# Cut heights expressed as fractions (10%, 15%, 20%, 25%) of the largest
# merge distance; column 2 of the linkage matrix holds the merge distances.
max_merge_distance = np.max(result[:, 2])
threshold, threshold2, threshold3, threshold4 = (
    fraction * max_merge_distance
    for fraction in (0.1, 0.15, 0.2, 0.25)
)

In [None]:
# Draw the dendrogram of the linkage result, labelling the leaves with the
# 'label' column and using the threshold3 cut for branch coloring.
fig, ax = plt.subplots(figsize=(15, 10))

leaf_labels = list(df['label'])
dendrogram(result, labels=leaf_labels, color_threshold=threshold3)

# Mark the threshold3 cut height with a dashed red line.
ax.axhline(threshold3, color='r', linestyle='--')

# Axis cosmetics: drop the top/right spines, blank x label, smaller leaf text.
sns.despine()
ax.set_xlabel('')
ax.set_ylabel('Threshold')
plt.xticks(fontsize=13)

# Optional extras, currently disabled:
# plt.title("pred_cluster_01")
# ax.legend()
# plt.savefig('./output/015_cluster.png', transparent = True, bbox_inches='tight')

# Kept as the final statement so the cell itself shows no output value.
ax.yaxis.set_major_formatter(ScalarFormatter(useMathText=True))

In [None]:
# `criterion` selects the rule fcluster uses to form flat clusters; other
# rules besides 'distance' are available.
# Position i of each returned array is the cluster id of input observation i.
cluster1, cluster2, cluster3, cluster4 = (
    fcluster(result, cut_height, criterion='distance')
    for cut_height in (threshold, threshold2, threshold3, threshold4)
)

In [None]:
# Collect the flat-cluster labels for every threshold into one DataFrame,
# indexed like the 'pred_vec' column so it lines up with the rows of df.
labels_by_threshold = {
    'class_thre_0.1': cluster1,
    'class_thre_0.15': cluster2,
    'class_thre_0.2': cluster3,
    'class_thre_0.25': cluster4,
}
_cluster = pd.DataFrame(labels_by_threshold, index=pred_vec_index)

display(_cluster.head())

In [None]:
# Number of distinct cluster ids in each threshold column
_cluster.nunique()

In [None]:
# Attach the cluster-label columns to the original data, side by side.
frames = [df, _cluster]
cluster_df = pd.concat(frames, axis='columns')
display(cluster_df.head())

In [None]:
# Rows whose 'pred' string contains the character "木"
cluster_df.loc[cluster_df['pred'].str.contains("木")]

In [None]:
# Rows assigned to cluster 2 at the 0.2 threshold
cluster_df.loc[cluster_df['class_thre_0.2'].eq(2)]

In [None]:
# For every cluster at the 0.2 threshold, print its size and show the
# relative frequency of each 'pred' value as a bar-styled table.
# The loop variable is deliberately NOT named `df`: rebinding the
# notebook-wide `df` to the last group would silently break any earlier
# cell (linkage, dendrogram labels, concat) that is re-run afterwards.
for cluster_num, members in cluster_df.groupby('class_thre_0.2'):
    print(f'クラスタ{cluster_num}： 要素数{len(members)}')
    pred_freq = members['pred'].value_counts(normalize=True).rename('相対頻度')
    display(pred_freq.to_frame().style.bar(vmin=0, vmax=1))
    print("")

In [None]:
# For every cluster at the 0.2 threshold, print its size and list the
# 'pred' and 'label' columns of its rows.
# The loop variable is deliberately NOT named `df`: rebinding the
# notebook-wide `df` to the last group would silently break any earlier
# cell (linkage, dendrogram labels, concat) that is re-run afterwards.
for cluster_num, members in cluster_df.groupby('class_thre_0.2'):
    print(f'クラスタ{cluster_num}： 要素数{len(members)}')
    display(members.loc[:, ['pred', 'label']])
    print("")