In [None]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm 
import openpyxl
import pickle

In [None]:
def adjacent_node(file_name, java_module):
    """Return the nodes adjacent to ``file_name``.

    Args:
        file_name: Node to look up; compared against element 0 of each edge.
        java_module: Iterable of edges, each indexable as ``edge[0]``
            (source) and ``edge[1]`` (target).

    Returns:
        A list with ``edge[1]`` of every edge whose ``edge[0]`` equals
        ``file_name``, in edge order. Duplicates are kept.
    """
    return [edge[1] for edge in java_module if file_name == edge[0]]

In [None]:
def similarity(listA, listB):
    """Compute the Jaccard similarity of two collections.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored.

    Args:
        listA: First iterable of hashable items.
        listB: Second iterable of hashable items.

    Returns:
        ``|A & B| / |A | B|`` as a float. When both inputs are empty the
        ratio is undefined; 0.0 is returned instead of raising
        ``ZeroDivisionError``.
    """
    set_a = set(listA)
    set_b = set(listB)
    union = set_a | set_b
    if not union:
        # Two empty collections share nothing measurable.
        return 0.0
    return len(set_a & set_b) / len(union)

In [None]:
def min_similarity(file_list, java_module):
    """Find the smallest non-zero similarity between two distinct files.

    Each file's adjacency list is obtained with ``adjacent_node`` and pairs
    are scored with ``similarity``.

    Args:
        file_list: Re-iterable collection of file names.
        java_module: Iterable of edges (``edge[0]`` source, ``edge[1]`` target).

    Returns:
        The minimum similarity over all ordered pairs of different files,
        ignoring pairs where neither file has a neighbour and pairs whose
        similarity is 0. Returns 1 when no pair qualifies.
    """
    # Build every adjacency list once; rebuilding both lists for each
    # (f1, f2) pair inside the nested loop is loop-invariant work.
    neighbours = [(f, adjacent_node(f, java_module)) for f in file_list]

    min_sim = 1
    for f1, an1 in tqdm(neighbours):
        for f2, an2 in neighbours:
            # With no neighbours on either side the similarity is undefined.
            if not an1 and not an2:
                continue
            sim = similarity(an1, an2)
            if sim < min_sim and sim != 0 and f1 != f2:
                min_sim = sim
    return min_sim

In [None]:
def data_polish(java_line, java_module, threshold):
    """Run one data-polishing pass over a graph.

    For every entry of ``java_line`` a neighbourhood is built: the node
    itself plus the source of every edge that points at it. Two different
    nodes are connected in the output when the similarity of their
    neighbourhoods is at least ``threshold``.

    The original notes say ``java_module`` should be an undirected graph
    (presumably each edge stored in both directions - TODO confirm). Every
    ordered pair is tested with a symmetric measure, so the output contains
    both ``[a, b]`` and ``[b, a]`` for each connected pair.

    Args:
        java_line: Iterable of entries whose element 0 is the node name.
        java_module: Iterable of edges (``edge[0]`` source, ``edge[1]`` target).
        threshold: Minimum similarity required to keep a pair.

    Returns:
        A list of ``[name_a, name_b]`` edges, ordered by ``java_line``.
    """
    # Index the edge list once (target -> sources) instead of rescanning
    # every edge for every node.
    sources_by_target = {}
    for edge in java_module:
        sources_by_target.setdefault(edge[1], set()).add(edge[0])

    # Neighbourhood of each node, the node itself included.
    neighbourhoods = []
    for entry in java_line:
        name = entry[0]
        neighbourhoods.append((name, {name} | sources_by_target.get(name, set())))

    new_java_module = []
    for name_a, hood_a in tqdm(neighbourhoods):
        for name_b, hood_b in neighbourhoods:
            # Check the cheap name comparison first so a node is never
            # scored against itself.
            if name_a != name_b and threshold <= similarity(hood_a, hood_b):
                new_java_module.append([name_a, name_b])

    return new_java_module

In [None]:
def data_polish_repeat(java_line ,java_module, threshold, times):
    """Apply ``data_polish`` repeatedly, feeding each result into the next pass.

    The first pass always runs, so for integer ``times`` values below 1 the
    graph is still polished once.

    Args:
        java_line: Node entries forwarded unchanged to every pass.
        java_module: Edge list used as input of the first pass.
        threshold: Similarity threshold forwarded to every pass.
        times: Requested number of passes.

    Returns:
        The edge list produced by the last pass.
    """
    polished = data_polish(java_line, java_module, threshold)

    for _ in range(times - 1):
        polished = data_polish(java_line, polished, threshold)

    return polished

In [None]:
def single_creek_detection(file_name, java_module):
    """Collect every node reachable from ``file_name``, ignoring edge direction.

    The search proceeds in rounds. Each round scans the edge list for
    unseen neighbours of the nodes discovered in the previous round and
    puts the newly found nodes in front of the result.

    Args:
        file_name: Starting node (must be hashable).
        java_module: Re-iterable collection of edges (``edge[0]``, ``edge[1]``).

    Returns:
        A list of the connected nodes: most recently discovered round first,
        ``file_name`` last. Contains only ``file_name`` when it has no edges.
    """
    group_Z = [file_name]
    seen = {file_name}      # constant-time membership for everything found so far
    frontier = [file_name]  # nodes whose neighbours have not been expanded yet

    while frontier:
        next_frontier = []
        for member in frontier:
            for edge in java_module:
                # Follow the edge in both directions.
                if member == edge[1] and edge[0] not in seen:
                    seen.add(edge[0])
                    next_frontier.append(edge[0])
                if member == edge[0] and edge[1] not in seen:
                    seen.add(edge[1])
                    next_frontier.append(edge[1])

        # Newly discovered nodes go in front of the previously found ones.
        group_Z = next_frontier + group_Z
        # Older members were fully expanded in earlier rounds, so only the
        # new nodes can still lead anywhere.
        frontier = next_frontier

    return group_Z

In [None]:
def creek_detection(java_commit, java_module):
    """Detect every cluster among the files listed in ``java_commit``.

    Only edges whose two endpoints both appear in ``java_commit`` are
    considered. Each file not yet assigned to a cluster starts a new one
    via ``single_creek_detection``.

    Args:
        java_commit: Iterable of entries whose element 0 is a file name.
        java_module: Iterable of edges (``edge[0]``, ``edge[1]``).

    Returns:
        A list of clusters, each a list of file names, in the order their
        first file appears in ``java_commit``.
    """
    commit_files = [entry[0] for entry in java_commit]
    # Set copy for constant-time endpoint checks; the list keeps the order.
    commit_file_set = set(commit_files)

    commit_module = [
        edge for edge in java_module
        if edge[0] in commit_file_set and edge[1] in commit_file_set
    ]

    creek_list = []
    assigned = set()  # files that already belong to a detected cluster

    for file_name in tqdm(commit_files):
        if file_name in assigned:
            continue
        creek = single_creek_detection(file_name, commit_module)
        assigned.update(creek)
        creek_list.append(creek)

    return creek_list

In [None]:
def commit_filter(java_commit, creek):
    """Print the size and the commit files of each cluster that contains any.

    Args:
        java_commit: Iterable of entries whose element 0 is a file name.
        creek: Iterable of clusters (lists of file names).

    For every cluster sharing at least one file with ``java_commit`` two
    lines are printed: a label followed by the cluster length, then the set
    of shared files.
    """
    commit_set = set([entry[0] for entry in java_commit])

    for cluster in creek:
        shared = set(cluster) & commit_set
        if shared:
            print('クラスタの大きさ' + str(len(cluster)))
            print(shared)

In [None]:
def include_commit_creek(java_commit, creek):
    """Print every cluster that contains at least one ``java_commit`` file.

    Args:
        java_commit: Iterable of entries whose element 0 is a file name.
        creek: Iterable of clusters (lists of file names).
    """
    commit_set = set([entry[0] for entry in java_commit])

    for cluster in creek:
        # Skip clusters that share no file with the commit.
        if not (set(cluster) & commit_set):
            continue
        print(cluster)

In [None]:
def select_file_creek(file_name, creek):
    """Print every cluster that contains ``file_name``.

    Args:
        file_name: File to search for.
        creek: Iterable of clusters (lists of file names).
    """
    matching = (cluster for cluster in creek if file_name in cluster)
    for cluster in matching:
        print(cluster)

In [None]:
def _load_pickle(path):
    """Read one pickled object from ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


java_line = _load_pickle('java_line.bin')  # original note: "load commit_set"
java_commit = _load_pickle('java_commit.bin')
prob_list = _load_pickle('prob_list.bin')
java_module = _load_pickle('java_module.bin')  # the AST-based one, per the original note

In [None]:
# Number of entries in java_module (the functions above iterate it as an edge list).
len(java_module)

In [None]:
# Element 0 of every java_commit entry (used as the file name below).
commit_files = [entry[0] for entry in java_commit]

# Keep only the edges whose two endpoints both appear in commit_files.
commit_module = [
    edge for edge in java_module
    if edge[0] in commit_files and edge[1] in commit_files
]

len(commit_module)

In [None]:
# Print the java_commit files that are the source of no edge in commit_module.
for entry in java_commit:
    if not adjacent_node(entry[0], commit_module):
        print(entry[0])

In [None]:
# Polishing parameters: requested number of passes, and the minimum
# similarity a pair of nodes needs to stay connected.
times = 10
threshold = 0.25
polished_graph = data_polish_repeat(java_line , java_module, threshold, times)

In [None]:
# Group the polished graph into clusters. java_line supplies the file names
# here (creek_detection reads element 0 of each entry).
creek = creek_detection(java_line, polished_graph)
creek

In [None]:
# For each cluster containing a java_commit file, print its size and the matching files.
commit_filter(java_commit, creek)

In [None]:
# Print every cluster that contains at least one java_commit file.
include_commit_creek(java_commit, creek)

In [None]:
# Print every cluster that contains the chosen file.
file_name = 'java/javax/el/ImportHandler.java'
select_file_creek(file_name, creek)

In [None]:
# Persist the detected clusters for later reuse.
# NOTE(review): this cell used to dump an undefined name `result`, which
# raises NameError on a fresh kernel. `creek` is assumed to be the intended
# object, judging by the output file name - TODO confirm with the author.
with open('creek_java_module.bin', 'wb') as f:
    pickle.dump(creek, f)

In [None]:
# Save the results.
result_path = '../result'

# Create the output directory on first use and report that it was created.
if not os.path.exists(result_path):
    os.mkdir(result_path)
    print(result_path + 'を作成しました.')

# Output file names (without extension), tagged with the number of passes.
Excel_name, Excel_name2 = (
    'result_' + str(times) + suffix for suffix in ('回_dependency', '回_creek')
)

# Convert to data frames: the polished edge list and the detected clusters.
df = pd.DataFrame(polished_graph, columns=['source', 'target'])
df2 = pd.DataFrame(creek)

# Write both frames as Excel workbooks.
for frame, stem in ((df, Excel_name), (df2, Excel_name2)):
    frame.to_excel(result_path + '/' + stem + '.xlsx', sheet_name='new_sheet_name')

In [None]:
# Save the node and edge tables.
# This cell does not create result_path; the directory must already exist.
result_path = '../result'

# Edge table: the edges kept in commit_module.
Excel_name = 'edge'
df = pd.DataFrame(commit_module, columns=['source', 'target'])
df.to_excel(result_path + '/' + Excel_name + '.xlsx', sheet_name='new_sheet_name')

# Node table: one row per java_commit file, the file name serving as both
# id and label.
node = [[entry[0], entry[0]] for entry in java_commit]

Excel_name = 'node'
df2 = pd.DataFrame(node, columns=['id', 'label'])
df2.to_excel(result_path + '/' + Excel_name + '.xlsx', sheet_name='new_sheet_name')