In [None]:
import tensorflow as tf
from sklearn.decomposition import PCA
import numpy as np
import torch

from tqdm import tqdm
import pickle

In [None]:
import makeindex

In [None]:
def make_pca_vector(all_vector, n_components):
    """Reduce ``all_vector`` to ``n_components`` principal components.

    A new PCA is fitted on ``all_vector`` itself and the same data is then
    projected onto the fitted components. The summed explained-variance ratio
    of the kept components is printed.

    Args:
        all_vector: data accepted by scikit-learn's ``PCA.fit_transform``
            (presumably one row per vector -- confirm against caller).
        n_components: forwarded unchanged to ``PCA(n_components=...)``.

    Returns:
        The PCA-transformed data returned by ``fit_transform``.
    """
    pca = PCA(n_components=n_components)
    # The estimator has to be fitted before it can transform anything;
    # calling transform() on a fresh PCA raises NotFittedError.
    all_vector_pca = pca.fit_transform(all_vector)
    print('寄与率 : ' + str(sum(pca.explained_variance_ratio_)))
    return all_vector_pca

In [None]:
def get_keys_from_value(d, val):
    """Return a list of every key in ``d`` whose value equals ``val``.

    Keys appear in the iteration order of ``d.items()``; the list is empty
    when nothing matches.
    """
    matching_keys = []
    for key, value in d.items():
        if value == val:
            matching_keys.append(key)
    return matching_keys

In [None]:
# Base name of the files to save; it is embedded in the output paths under
# result/ when z_tensor and the model are pickled at the end of the notebook.
model_name = 'model_name'

In [None]:
# Load the data.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load .bin files that come from a trusted source.
with open('data/commit_files.bin', 'rb') as f:
    commit_files = pickle.load(f) # load commit_files

# Load the data (sentence vectors)
with open('data/all_vector.bin', 'rb') as f:
    all_vector = pickle.load(f) # load all_vector

with open('data/vector_set.bin', 'rb') as f:
    vector_set = pickle.load(f) # load vector_set

In [None]:
# Run PCA.
# NOTE: this rebinds all_vector to the reduced vectors, so re-running this cell
# without reloading the data applies PCA to already-reduced input.
all_vector = make_pca_vector(all_vector, n_components=65)

In [None]:
# Convert the vectors to a NumPy array and to a float32 TensorFlow tensor
sentence_vector = np.array(all_vector)
sentence_tensor = tf.cast(sentence_vector, tf.float32)

In [None]:
# Assign indices to files and items.
# (The makeindex helpers are defined outside this notebook; their exact
# return structures are not visible here.)
file_index, file_index_list = makeindex.make_file_index_list(commit_files)
version_index = makeindex.make_version_index(commit_files)
vector_index_set = makeindex.make_vector_index_set(vector_set)
Fi_num = makeindex.make_Fi_num(commit_files)
z_tensor = makeindex.z_init(file_index, file_index_list, all_vector, vector_index_set, Fi_num)
# Every (row, column) index pair of z_tensor, enumerated row by row;
# update_z walks over this list.
z_index = [(z1, z2)
                for z1 in range(z_tensor.shape[0])
                for z2 in range(z_tensor.shape[1])]

In [None]:
# Loss function
def loss_func(x, z, model):
    """Return the negative sum of ``z``-weighted log-softmax scores.

    The score matrix is ``model.B @ x.T``. A log-softmax is taken over the
    first axis of that matrix (one normalisation per column), the result is
    multiplied elementwise by ``z``, summed over all entries and negated.

    Args:
        x: input vectors; converted with ``np.array`` and transposed before
            the matrix product (presumably one row per item -- confirm
            against caller).
        z: weights multiplied elementwise with the log-probabilities.
        model: object exposing the coefficient matrix as ``model.B``.

    Returns:
        A scalar tensor holding the loss.
    """
    features_t = np.array(x).T
    scores = tf.matmul(model.B, features_t)
    # log_softmax normalises over the last axis, so flip the matrix,
    # normalise, and flip it back to the layout of ``scores``.
    log_prob = tf.transpose(tf.nn.log_softmax(tf.transpose(scores)))
    weighted = log_prob * z
    return -tf.math.reduce_sum(weighted)

In [None]:
class Model:
    """Softmax model parameterised by a single coefficient matrix ``B``.

    ``B`` is a ``tf.Variable`` initialised to all ones with shape
    ``(z.shape[0], len(sentence_vector[0]))``.
    """

    def __init__(self, sentence_vector, z):
        """Create ``B`` sized from the rows of ``z`` and the first input vector."""
        n_rows = z.shape[0]
        n_features = len(sentence_vector[0])
        self.B = tf.Variable(tf.ones([n_rows, n_features]))

    def __call__(self, sentence_vector):
        """Return softmax probabilities for ``sentence_vector``.

        The scores ``B @ sentence_vector.T`` are normalised over their first
        axis (one distribution per column); the result has the same layout as
        the score matrix.
        """
        scores = tf.matmul(self.B, np.array(sentence_vector).T)
        # softmax works on the last axis: transpose, normalise, transpose back
        probabilities = tf.nn.softmax(tf.transpose(scores))
        return tf.transpose(probabilities)

In [None]:
# Update Z
def update_z(x, z, model, file_index, file_index_list, vector_index_set, z_index):
    """Recompute the assignment matrix from the model's current probabilities.

    The probabilities ``p`` are the softmax of ``model.B @ x.T`` taken over
    the first axis. For every ``(z1, z2)`` in ``z_index`` the candidate rows
    are ``F_i = file_index_list[vector_index_set[z2][0]]``:

    * if ``z1`` is in ``F_i`` the entry becomes ``p[z1][z2]`` divided by the
      sum of ``p[:, z2]`` over the rows listed in ``F_i``;
    * otherwise the entry becomes 0.

    Entries whose index pair is not listed in ``z_index`` keep the initial
    value 1.

    Args:
        x: input vectors, converted with ``np.array`` and transposed.
        z: only its ``shape`` is used, to size the result.
        model: object exposing the coefficient matrix as ``model.B``.
        file_index: unused; kept so existing callers keep working.
        file_index_list: indexable by ``vector_index_set[z2][0]``; each entry
            must support ``in`` and NumPy fancy indexing.
        vector_index_set: indexable by column index ``z2``.
        z_index: iterable of ``(row, column)`` pairs to update.

    Returns:
        A NumPy array with the shape of ``z``.
    """
    new_z = np.ones(z.shape)
    bx = tf.matmul(model.B, np.array(x).T)
    T_bx = tf.transpose(bx)
    p = np.array(tf.transpose(tf.nn.softmax(T_bx)))

    # The normaliser depends only on the column z2, not on z1, so compute it
    # once per column instead of once per (z1, z2) pair.
    column_sum = {}
    for z1, z2 in z_index:
        F_i = file_index_list[vector_index_set[z2][0]]
        if z1 in F_i:
            if z2 not in column_sum:
                column_sum[z2] = p.T[z2][F_i].sum()
            new_z[z1][z2] = p[z1][z2] / column_sum[z2]
        else:
            new_z[z1][z2] = 0
    return new_z

In [None]:
# Optimise the regression coefficients with Adam (up to 10000 steps by default)
def train(model, x, z, max_steps=10000, learning_rate=0.001, tol=1):
    """Fit ``model.B`` by minimising ``loss_func(x, z, model)`` with Adam.

    Training stops early when the loss becomes NaN (the step index and
    ``'nan'`` are printed) or when one update changes the loss by at most
    ``tol`` in absolute value.

    Args:
        model: object exposing the trainable ``tf.Variable`` ``model.B``.
        x: input vectors passed to ``loss_func``.
        z: weights passed to ``loss_func``.
        max_steps: maximum number of gradient steps.
        learning_rate: Adam learning rate.
        tol: absolute loss-change threshold for the early stop.

    Returns:
        None; ``model.B`` is updated in place.
    """
    # Build the optimizer once: Adam keeps running moment estimates between
    # steps, and re-creating it inside the loop would reset them every time.
    optimizer = tf.optimizers.Adam(learning_rate)
    for i in range(max_steps):
        with tf.GradientTape() as t:
            current_loss = loss_func(x, z, model)

        # Check the loss of the arguments actually being trained on rather
        # than recomputing it from notebook-level globals.
        if np.isnan(current_loss.numpy()):
            print(i)
            print('nan')
            break

        # Gradient and update happen outside the tape context so that these
        # operations are not themselves recorded.
        grads = t.gradient(current_loss, [model.B])
        optimizer.apply_gradients(zip(grads, [model.B]))

        # Re-evaluate after the step and stop once the loss barely moves.
        if abs(loss_func(x, z, model).numpy() - current_loss.numpy()) <= tol:
            break

In [None]:
# Create the model; B is sized from z_tensor's first dimension and the
# length of the first sentence vector.
model = Model(sentence_vector, z_tensor)

In [None]:
# Run the training (100 rounds). Each round fits model.B against the current
# z_tensor, then recomputes z from the updated model and casts it back to a
# float32 tensor for the next round.
for i in tqdm(range(100)):
    train(model, sentence_tensor, z_tensor)
    new_z = update_z(sentence_tensor, z_tensor, model, file_index, file_index_list, vector_index_set, z_index)
    z_tensor = tf.cast(new_z, tf.float32)

In [None]:
# Compute the probabilities P with the trained model
pred_p = model(sentence_vector)

In [None]:
# Pickle the final z_tensor to result/z_<model_name>.bin
with open('result/z_'+model_name+'.bin', 'wb') as f:
    pickle.dump(z_tensor, f)

In [None]:
# Pickle the trained model object to result/<model_name>.bin
with open('result/'+model_name+'.bin', 'wb') as f:
    pickle.dump(model, f)