In [1]:
import glob
import os

import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelBinarizer

Slow version of gensim.models.doc2vec is being used


In [2]:
# Load every result CSV into a single frame, tagging each row with the name of
# the file it came from.
# glob returns paths in arbitrary, OS-dependent order; sorting keeps the row
# order of the concatenated frame (and everything derived from it) stable
# between runs and machines.
files = sorted(glob.glob(os.path.join("../../data/result", "*.csv")))
if not files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" error that hides the real cause.
    raise FileNotFoundError("No CSV files found under ../../data/result")
df_list = [
    pd.read_csv(path).assign(filename=os.path.basename(path))
    for path in files
]
df = pd.concat(df_list, ignore_index=True)

In [3]:
# Keep only the rows whose 'pred' value contains "property".
df_prop = df.query('pred.str.contains("property")', engine='python')
# Drop the predicates that mention 'wikiPage' or '画像'.
unwanted = df_prop['pred'].str.contains('wikiPage') | df_prop['pred'].str.contains('画像')
df_prop = df_prop[~unwanted]
# Rank the remaining predicates by their count of non-null 'obj' values and
# keep the ten most frequent ones.
pred_counts = df_prop.groupby('pred').count()
top_10_df_prop = pred_counts.sort_values(by='obj', ascending=False).head(10)
top_10_prop_list = list(top_10_df_prop.index)
# Rows of the full frame that use one of those predicates.
top_10_df = df[df['pred'].isin(top_10_prop_list)]
# Predicate -> integer class label, numbered in frequency order.
prop_to_label = {prop: label for label, prop in enumerate(top_10_prop_list)}

In [4]:
#top_10_prop_list

In [5]:
# Load the gensim model stored at this path (a Japanese word2vec model, going by the path).
# vectorize() reads vectors through `model.wv`, so the loaded object must expose that attribute.
# NOTE(review): gensim's load() unpickles the file, so only point this at trusted model files.
model = KeyedVectors.load('../../Models/japanese-word2vec-model-builder/word2vec.gensim.model')

In [6]:
def vectorize(model, word):
    """Look up the embedding of ``word`` in ``model.wv``.

    Args:
        model: Object exposing a ``wv`` attribute that maps tokens to vectors.
        word: Token to look up.

    Returns:
        The vector stored for ``word``, or the sentinel string ``"NaN"`` when
        the lookup fails because the token is unknown or is not a usable key
        (for example a float NaN coming from an empty cell).
    """
    try:
        return model.wv[word]
    except (KeyError, TypeError):
        # Only failed lookups map to the sentinel. Anything else (a model
        # without ``wv``, an interrupt, a real bug) must surface instead of
        # silently turning every row into "NaN".
        return "NaN"

In [7]:
def preprocessing(obj):
    """Reduce an object value to its trailing token.

    Values that are not exactly ``str`` are returned untouched. A string that
    contains a full-width colon ("：") yields the text after its last "：".
    Any other string yields the text after its last "/" (the whole string
    when it has no "/").
    """
    if type(obj) is not str:
        return obj
    separator = "：" if "：" in obj else "/"
    return obj.rsplit(separator, 1)[-1]

In [8]:
# Vectorization: attach embeddings and integer class labels to the top-10 rows.
# 'obj' values are first reduced to their trailing token by preprocessing().
obj_list = top_10_df['obj'].map(preprocessing)
cp_top_10_df = top_10_df.assign(
    key_vec=top_10_df['key'].map(lambda token: vectorize(model, token)),
    label=top_10_df['pred'].map(prop_to_label),
    obj_vec=obj_list.map(lambda token: vectorize(model, token)),
)

In [9]:
# Join the original rows with their vectorized copy, matching the left frame's
# index against the array of that same index passed as the right-hand key.
# Columns present in both frames come out with _x / _y suffixes, and the
# array key shows up as a "key_0" column; the following cell drops those.
# NOTE(review): cp_top_10_df is built from top_10_df plus the new columns, so
# this merge looks redundant. Confirm before simplifying, because the
# following drop relies on the exact suffixed column names.
top_10_df_vec = pd.merge(top_10_df,cp_top_10_df, left_index=True, right_on=top_10_df.index)
# Replace the inherited index with a fresh 0..n-1 range.
top_10_df_vec = top_10_df_vec.reset_index(drop=True)

In [10]:
# Discard the merge key and the duplicated (suffixed) columns left by the merge.
redundant_columns = ["key_0", "filename_x", "key_y", "pred_y", "obj_y", "filename_y"]
top_10_df_vec = top_10_df_vec.drop(columns=redundant_columns)

In [11]:
# Keep only the rows whose object value received an embedding.
# vectorize() marks failed lookups with the string "NaN"; every other value is
# a vector. Comparing the whole column with `!= "NaN"` pits array elements
# against a string, which NumPy warns about and which is fragile across
# versions, so each element is tested explicitly instead.
has_obj_vec = top_10_df_vec['obj_vec'].map(
    lambda vec: not (isinstance(vec, str) and vec == "NaN")
).astype(bool)  # astype keeps the mask boolean even when the frame is empty
top_10_df_vec_exclusion_nan = top_10_df_vec[has_obj_vec].reset_index(drop=True)

  result = libops.scalar_compare(x.ravel(), y, op)


In [12]:
# Features: one embedding vector per row, kept as a plain list of vectors.
input_data = list(top_10_df_vec_exclusion_nan['obj_vec'].values)
# Targets: integer class labels, binarized by LabelBinarizer.
label_data = top_10_df_vec_exclusion_nan['label'].values
label_dence = LabelBinarizer().fit_transform(label_data)
# Seed the shuffle so the split is reproducible from a fresh kernel; the
# forest below is seeded too, and an unseeded split would undo that.
X_train, X_test, y_train, y_test = train_test_split(
    input_data, label_dence, random_state=1
)

In [13]:
# Base learner: a small, seeded random forest.
forest = RandomForestClassifier(
    n_estimators=10,
    random_state=1,
)
# Wrap the forest in MultiOutputClassifier; n_jobs=-1 is passed through as in the original call.
multi_target_forest = MultiOutputClassifier(estimator=forest, n_jobs=-1)

In [14]:
# Quick size check (the original note here reads "591 * 50").
# Length of the first feature vector; evaluated but not echoed, because only
# the last expression of a cell is displayed.
len(input_data[0])
# Number of label rows; this is the value the cell shows.
label_dence.shape[0]

591

In [15]:
# Fit the per-label forests on the full data set, then predict on that same data.
# NOTE(review): X_train / X_test are not used here, so y_pred and y_pred_proba
# are in-sample predictions and any score computed from them is a training score.
fitted_forest = multi_target_forest.fit(input_data, label_dence)
y_pred = fitted_forest.predict(input_data)
y_pred_proba = fitted_forest.predict_proba(input_data)

In [16]:
# Accuracy of y_pred against the binarized labels. accuracy_score is imported
# in the notebook's import cell rather than mid-notebook.
# NOTE(review): y_pred comes from predicting on the rows the model was fitted
# on, so this is a training-set score, not a generalization estimate.
accuracy_score(label_dence, y_pred)

0.9695431472081218

In [17]:
# Convert the classifier outputs to arrays so they can be sliced.
np_pred = np.array(y_pred)
np_proba = np.array(y_pred_proba)
# Swap the first two axes and keep the last one in place.
# Presumably this turns (label, sample, class) into (sample, label, class). TODO confirm.
np_proba_trans = np.transpose(np_proba, (1, 0, 2))

In [53]:
# Set the thresholds for deciding that a label is present.
# Index 1 on the last axis is read as the "label present" probability.
# Assumes every output has exactly two classes. TODO confirm.
positive_proba = np_proba_trans[:, :, 1]
pred_0, pred_1, pred_2, pred_3 = (
    (positive_proba > threshold).astype(int)
    for threshold in (0.0, 0.1, 0.2, 0.3)
)

In [54]:
# Wrap the hard predictions and each thresholded variant in a DataFrame.
# pandas is already imported as `pd` in the notebook's import cell, so the
# redundant mid-notebook import was removed.
# NOTE(review): this rebinds `df`, which earlier held the concatenated CSV
# data. Re-running the earlier filtering cell after this one would then
# operate on predictions. Consider a distinct name once all users of `df`
# are known.
df = pd.DataFrame(np_pred)
df_0, df_1, df_2, df_3 = (
    pd.DataFrame(thresholded)
    for thresholded in (pred_0, pred_1, pred_2, pred_3)
)

In [85]:
# 要素数の頻度
#df_1[df_1 == 1]

#labels = df_1.value_counts(normalize=True).index.tolist()