## 予備実験2までのnotebook

In [1]:
import glob
import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import export_graphviz
from pydotplus import graph_from_dot_data
from sklearn.metrics import classification_report

import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib

In [2]:
# data/result/に格納されている全csvファイルの読み込み
csv_files = glob.glob(os.path.join("../../data/result", "*.csv"))
df_list = []
for file in csv_files:
    tmp_df = pd.read_csv(file)
    df_list.append(tmp_df)
df = pd.concat(df_list, ignore_index=True)

In [3]:
# predの要素で, propertyが含まれる値を返す
df_prop = df.query('pred.str.contains("property")', engine='python').dropna()

# 'wikiPage'を含まないpropertyを返す
df_prop = df_prop[~df_prop['pred'].str.contains('wikiPage')]
# '画像'を含まないpropertyを返す
df_prop = df_prop[~df_prop['pred'].str.contains('画像')]

# predの頻出上位10の要素のindex_listを取得
top_10_df_prop = df_prop.groupby('pred').count().sort_values(['obj'], ascending=False).head(10)
top_10_prop_list = top_10_df_prop.index.to_list()

# 頻出上位10を持つdataの抽出
top_10_df = df[df['pred'].isin(top_10_prop_list)]

# predを目的変数(label)として扱うための準備
prop_to_label = dict(zip(top_10_prop_list, range(0,10)))
column_name = dict(map(lambda k: (k[1],k[0].split("/")[-1]), prop_to_label.items()))

In [4]:
# モデル読み込み
model = Word2Vec.load('../../Models/japanese-word2vec-model-builder/word2vec.gensim.model')

In [5]:
# モデルを利用した文字のエンべディング
## 未知語に関しては, 今回はNaNで対応
def vectorize(model, word):
    try:
        output = model.wv[word]
        return output
    except:
        return np.nan

In [6]:
# obj要素に関してはLOD同士が繋がっているため, 基本的にURIで記述されている.
# また, (県の魚:〇〇)の要素や数値データが入っている場合もあるため, それぞれ最後尾の要素を値として扱う. 
def preprocessing(obj):
    if type(obj) != str:
        output = obj
    elif len(obj.split("：")) != 1:
        output = obj.split("：")[-1]
    else:
        output = obj.split("/")[-1]
    return output

In [7]:
top_10_df

Unnamed: 0,key,pred,obj
12,沖縄県,http://ja.dbpedia.org/property/after,http://ja.dbpedia.org/resource/琉球列島米国軍政府
13,沖縄県,http://ja.dbpedia.org/property/after,-----
14,沖縄県,http://ja.dbpedia.org/property/before,http://ja.dbpedia.org/resource/琉球列島米国民政府
15,沖縄県,http://ja.dbpedia.org/property/before,http://ja.dbpedia.org/resource/琉球政府
16,沖縄県,http://ja.dbpedia.org/property/before,http://ja.dbpedia.org/resource/琉球藩
...,...,...,...
56783,高知県,http://ja.dbpedia.org/property/歌など,県の歌：高知県民の歌（1953年制定）
56784,高知県,http://ja.dbpedia.org/property/歌など,県の魚：カツオ
56788,高知県,http://ja.dbpedia.org/property/隣接都道府県,http://ja.dbpedia.org/resource/大分県
56789,高知県,http://ja.dbpedia.org/property/隣接都道府県,http://ja.dbpedia.org/resource/徳島県


In [8]:
# vectorization
## key要素(本データでは〇〇県)は,単純エンベディング対象.
## obj要素は, 上記の関数で整えた後に, エンベディング.
cp_top_10_df = top_10_df.copy()
cp_top_10_df['key_vec'] = cp_top_10_df['key'].map(lambda x:vectorize(model,x))
cp_top_10_df['label'] = cp_top_10_df['pred'].map(prop_to_label)
obj_list = cp_top_10_df['obj'].map(lambda x:preprocessing(x))
cp_top_10_df['obj_vec'] = obj_list.map(lambda x:vectorize(model,x))

In [9]:
# vectorization
## key要素(本データでは〇〇県)は,単純エンベディング対象.
## obj要素は, 上記の関数で整えた後に, エンベディング.
# cp_df_prop = df_prop.copy()
# cp_df_prop['key_vec'] = cp_df_prop['key'].map(lambda x:vectorize(model,x))
# cp_df_prop['label'] = cp_df_prop['pred']
# obj_list = cp_df_prop['obj'].map(lambda x:preprocessing(x))
# cp_df_prop['obj_vec'] = obj_list.map(lambda x:vectorize(model,x))

In [10]:
# ベクトル化前のデータとベクトル化後のデータの結合 + 未知語のNaNの排除
top_10_df_vec = pd.merge(top_10_df,cp_top_10_df, left_index=True, right_on=top_10_df.index)
top_10_df_vec = top_10_df_vec.reset_index(drop=True)
top_10_df_vec = top_10_df_vec.drop(["key_0","key_y","pred_y","obj_y"], axis=1)
top_10_df_vec = top_10_df_vec.dropna().reset_index(drop=True)

In [11]:
# ベクトル化前のデータとベクトル化後のデータの結合 + 未知語のNaNの排除
# df_vec = pd.merge(df_prop, cp_df_prop, left_index=True, right_on=df_prop.index)
# df_vec = df_vec.reset_index(drop=True)
# df_vec = df_vec.drop(["key_0","key_y","pred_y","obj_y"], axis=1)
# df_vec = df_vec.dropna().reset_index(drop=True)

In [12]:
# 目的変数のバイナライズ + 説明変数の切り出し
lb = LabelBinarizer()
lb.fit(top_10_df_vec.label)
label_dence = lb.fit_transform(top_10_df_vec.label)
input_data = list(top_10_df_vec['obj_vec'].values)
# X_train, X_test, y_train, y_test = train_test_split(input_data, label_dence)

In [13]:
# top_10_df_prop = df_vec.groupby('pred_x').count().sort_values(['obj_x'], ascending=False).head(10)
# top_10_prop_list = top_10_df_prop.index.to_list()

# 頻出上位10を持つdataの抽出
# top_10_df = df_vec[df_vec['pred_x'].isin(top_10_prop_list)]
# top_10_df = top_10_df.reset_index(drop=True)
# 
# predを目的変数(label)として扱うための準備
# prop_to_label = dict(zip(top_10_prop_list, range(0,10)))
# column_name = dict(map(lambda k: (k[1],k[0].split("/")[-1]), prop_to_label.items()))

In [14]:
# マルチラベル分類
forest = RandomForestClassifier(n_estimators = 500, random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest.fit(input_data, label_dence)

MultiOutputClassifier(estimator=RandomForestClassifier(n_estimators=500,
                                                       random_state=1),
                      n_jobs=-1)

In [15]:
estimator = multi_target_forest.estimators_
y_pred = multi_target_forest.predict(input_data)
y_pred_proba = multi_target_forest.predict_proba(input_data)

In [16]:
list(column_name.values())

['隣接都道府県',
 'before',
 '歌など',
 '所在地',
 '説明',
 '表記',
 'years',
 'after',
 'シンボル名',
 'title']

In [17]:
## 一応のclassification_report
# cl_report = classification_report(label_dence, y_pred)
# print(cl_report)

In [19]:
np_pred = np.array(y_pred)
np_proba = np.array(y_pred_proba)
np_proba_trans = np_proba.transpose(1,0,2)

In [20]:
# ラベル有の閾値の設定
pred_0 = (np_proba_trans[:,:,1] > 0.0).astype(int)
pred_1 = (np_proba_trans[:,:,1] > 0.1).astype(int)
pred_2 = (np_proba_trans[:,:,1] > 0.2).astype(int)
pred_3 = (np_proba_trans[:,:,1] > 0.3).astype(int)

In [21]:
df = pd.DataFrame(np_pred)
df_0 = pd.DataFrame(pred_0)
df_1 = pd.DataFrame(pred_1)
df_2 = pd.DataFrame(pred_2)
df_3 = pd.DataFrame(pred_3)

df_1_new = df_1.rename(columns=column_name)
df_2_new = df_2.rename(columns=column_name)
df_3_new = df_3.rename(columns=column_name)

In [22]:
# 要素数の頻度
# Value_count の出力方法をライブラリ見る. 
    ## indexでラベル表示を確認すると, 0が隠れているのが見えた。

# df_3.value_counts().index

In [23]:
df_pre_obj = top_10_df_vec.loc[:,['pred_x','obj_x','obj_vec']]
# df_1_pre_obj = pd.concat([df_pre_obj,df_1_new],axis=1)
df_2_pre_obj = pd.concat([df_pre_obj,df_2_new],axis=1)
# df_3_pre_obj = pd.concat([df_pre_obj,df_3_new],axis=1)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(df_2_pre_obj.loc[:,:'obj_vec'], df_2_pre_obj.loc[:,'隣接都道府県':])

In [26]:
print(len(X_train))
print(len(y_train))

443
443


In [27]:
print(X_train.obj_vec.values.shape)
print(y_train.values.shape)

(443,)
(443, 10)


In [28]:
# マルチラベル分類
multi_target_forest.fit(X_train.obj_vec.values.tolist(), y_train.values)

MultiOutputClassifier(estimator=RandomForestClassifier(n_estimators=500,
                                                       random_state=1),
                      n_jobs=-1)

In [29]:
estimator = multi_target_forest.estimators_
y_pred = multi_target_forest.predict(X_test.obj_vec.values.tolist())
y_pred_proba = multi_target_forest.predict_proba(X_test.obj_vec.values.tolist())

In [30]:
file_name = 'multi_tree.png'
dot_data = export_graphviz(estimator[1][1],
                class_names=list(column_name.values()), 
                filled=True, 
                rounded=True)

graph = graph_from_dot_data( dot_data )

graph.write_png(file_name)

True

In [31]:
len(y_test)

148

In [32]:
from sklearn.metrics import recall_score, precision_score, f1_score

In [43]:
forest = RandomForestClassifier(n_estimators = 500, random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
data = df_2_pre_obj.obj_vec.values.tolist()
target = df_2_pre_obj.loc[:,'隣接都道府県':].values

In [44]:
print(len(data))
print(len(target))

591
591


In [63]:
from sklearn.model_selection import cross_validate, KFold
kf = KFold(n_splits=5,
            shuffle=True,
            random_state=0)

# 交差検証
score_ring = ['precision_micro','precision_macro',
              'recall_micro', 'recall_macro',
              'f1_micro','f1_macro']

scores = cross_validate(multi_target_forest, data, target, cv=kf, scoring=score_ring)
# 各分割におけるスコア
print('Cross-Validation scores: {}'.format(scores))
# スコアの平均値
# import numpy as np
# print('Average score: {}'.format(np.mean(scores)))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)


Cross-Validation scores: {'fit_time': array([12.30914211,  6.74028516,  7.10914898,  6.59875822,  6.4571991 ]), 'score_time': array([3.07269287, 3.10598683, 3.12462807, 3.01742601, 3.08750081]), 'test_precision_micro': array([1.        , 0.95867769, 0.98347107, 0.99186992, 0.9826087 ]), 'test_precision_macro': array([0.7       , 0.76948718, 0.8875    , 0.89230769, 0.78690476]), 'test_recall_micro': array([0.92741935, 0.93548387, 0.93700787, 0.94573643, 0.88976378]), 'test_recall_macro': array([0.63609854, 0.72309524, 0.80424242, 0.81756098, 0.66261905]), 'test_f1_micro': array([0.9623431 , 0.94693878, 0.95967742, 0.96825397, 0.9338843 ]), 'test_f1_macro': array([0.66386765, 0.73218594, 0.83765046, 0.84162257, 0.71098735])}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)


In [65]:
micro = np.mean(scores['test_precision_micro'])
macro = np.mean(scores['test_precision_macro'])
print(micro, macro)

micro = np.mean(scores['test_recall_micro'])
macro = np.mean(scores['test_recall_macro'])
print(micro, macro)

micro = np.mean(scores['test_f1_micro'])
macro = np.mean(scores['test_f1_macro'])
print(micro, macro)

0.9833254749363878 0.8072399267399266
0.927082262691657 0.7287232455094722
0.9542195113747963 0.7572627939074765


In [50]:
average = "micro"

# Recallの平均
print(recall_score(y_true=y_test, y_pred=y_pred, average=average))

# Recallの平均
print(precision_score(y_true=y_test, y_pred=y_pred, average=average))
# -> 0.875

# macro-F1
print(f1_score(y_true=y_test, y_pred=y_pred, average=average))
# -> 0.9333333333333333

0.88125
0.986013986013986
0.9306930693069307


In [33]:
## 一応のclassification_report
cl_report = classification_report(y_test, y_pred)
print(cl_report)

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        51
           1       0.96      0.89      0.93        28
           2       1.00      0.38      0.55         8
           3       1.00      0.92      0.96        13
           4       0.94      0.83      0.88        18
           5       1.00      1.00      1.00        22
           6       1.00      1.00      1.00        14
           7       1.00      0.20      0.33         5
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         1

   micro avg       0.99      0.88      0.93       160
   macro avg       0.79      0.62      0.66       160
weighted avg       0.98      0.88      0.91       160
 samples avg       0.91      0.89      0.90       160



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
np_pred = np.array(y_pred)
# np_proba = np.array(y_pred_proba)
# np_proba_trans = np_proba.transpose(1,0,2)

In [35]:
pred_df = pd.DataFrame(np_pred)
pred_df_new = pred_df.rename(columns=column_name)

In [36]:
y_test.columns = y_test.columns + '_01'
test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

In [38]:
test_pred_df = pd.concat([test_df,pred_df_new], axis=1)

In [39]:
test_pred_df[test_pred_df.loc[:,'隣接都道府県_01':'title_01'].sum(axis=1) > 1]

Unnamed: 0,pred_x,obj_x,obj_vec,隣接都道府県_01,before_01,歌など_01,所在地_01,説明_01,表記_01,years_01,...,隣接都道府県,before,歌など,所在地,説明,表記,years,after,シンボル名,title
2,http://ja.dbpedia.org/property/隣接都道府県,http://ja.dbpedia.org/resource/能登半島,"[0.020314734, 0.20471144, -0.03847818, 0.14281...",1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
12,http://ja.dbpedia.org/property/before,http://ja.dbpedia.org/resource/高知県,"[-0.021919195, 0.10427152, 0.04427553, 0.12341...",1,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
13,http://ja.dbpedia.org/property/隣接都道府県,http://ja.dbpedia.org/resource/高知県,"[-0.021919195, 0.10427152, 0.04427553, 0.12341...",1,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
21,http://ja.dbpedia.org/property/after,http://ja.dbpedia.org/resource/新川県,"[-0.022987725, -0.05611024, -0.12044697, 0.021...",0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
24,http://ja.dbpedia.org/property/隣接都道府県,http://ja.dbpedia.org/resource/石川県,"[-0.031856198, 0.120232046, 0.12296562, 0.0831...",1,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
58,http://ja.dbpedia.org/property/after,http://ja.dbpedia.org/resource/江戸川,"[0.081049874, 0.18472259, -0.08038298, 0.08492...",0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
59,http://ja.dbpedia.org/property/説明,http://ja.dbpedia.org/resource/法華経寺,"[0.037549566, 0.095200725, 0.07064349, 0.07455...",0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
60,http://ja.dbpedia.org/property/before,http://ja.dbpedia.org/resource/石川県,"[-0.031856198, 0.120232046, 0.12296562, 0.0831...",1,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
77,http://ja.dbpedia.org/property/title,http://ja.dbpedia.org/resource/日本の首都,"[0.059102356, 0.29593015, -0.1518349, 0.178942...",0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
83,http://ja.dbpedia.org/property/隣接都道府県,http://ja.dbpedia.org/resource/鹿児島県,"[-0.025638053, 0.111953475, 0.030292727, 0.164...",1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [40]:
test_pred_df[test_pred_df.loc[:,'隣接都道府県':'title'].sum(axis=1) > 1]

Unnamed: 0,pred_x,obj_x,obj_vec,隣接都道府県_01,before_01,歌など_01,所在地_01,説明_01,表記_01,years_01,...,隣接都道府県,before,歌など,所在地,説明,表記,years,after,シンボル名,title
12,http://ja.dbpedia.org/property/before,http://ja.dbpedia.org/resource/高知県,"[-0.021919195, 0.10427152, 0.04427553, 0.12341...",1,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
13,http://ja.dbpedia.org/property/隣接都道府県,http://ja.dbpedia.org/resource/高知県,"[-0.021919195, 0.10427152, 0.04427553, 0.12341...",1,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
24,http://ja.dbpedia.org/property/隣接都道府県,http://ja.dbpedia.org/resource/石川県,"[-0.031856198, 0.120232046, 0.12296562, 0.0831...",1,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
60,http://ja.dbpedia.org/property/before,http://ja.dbpedia.org/resource/石川県,"[-0.031856198, 0.120232046, 0.12296562, 0.0831...",1,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
83,http://ja.dbpedia.org/property/隣接都道府県,http://ja.dbpedia.org/resource/鹿児島県,"[-0.025638053, 0.111953475, 0.030292727, 0.164...",1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
90,http://ja.dbpedia.org/property/隣接都道府県,http://ja.dbpedia.org/resource/高知県,"[-0.021919195, 0.10427152, 0.04427553, 0.12341...",1,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
