In [None]:
import base64
import json
import os
import pickle
import re

import numpy as np
import requests
from bs4 import BeautifulSoup
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
def get_tag_name(repository_url, client_id, client_secret, timeout=30):
    """Return the tag names of a GitHub repository.

    The repository metadata is fetched first and its ``tags_url`` is then
    followed page by page (via the ``Link`` response header) so that
    repositories with more tags than fit in one page are fully covered.

    Args:
        repository_url: ``https://github.com/<owner>/<repo>`` URL, with or
            without a trailing ``.git``.
        client_id: User name part of the HTTP basic-auth pair sent to the API.
        client_secret: Password part of the HTTP basic-auth pair.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list[str]: Tag names in the reverse of the order the API lists them
        (presumably oldest first -- the API ordering is not checked here).

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Strip only a trailing ".git"; a blanket replace would also mangle
    # repository names such as "user.github.io".
    repo_url = repository_url.rstrip('/')
    if repo_url.endswith('.git'):
        repo_url = repo_url[:-len('.git')]
    api = repo_url.replace('https://github.com/', 'https://api.github.com/repos/', 1)
    auth = (client_id, client_secret)

    repo_response = requests.get(api, auth=auth, timeout=timeout)
    # Fail loudly on rate limiting / bad credentials instead of hitting a
    # KeyError on the error payload below.
    repo_response.raise_for_status()
    next_url = repo_response.json()['tags_url']

    tag_list = []
    while next_url:
        tags_response = requests.get(next_url, auth=auth, timeout=timeout)
        tags_response.raise_for_status()
        tag_list.extend(tag['name'] for tag in tags_response.json())
        # `links` is parsed from the Link header; no "next" entry means this
        # was the last page.
        next_url = tags_response.links.get('next', {}).get('url')

    tag_list.reverse()
    return tag_list

In [None]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

class BertTextVectorixer:
    """Turn a sentence into one vector by mean-pooling BERT token embeddings.

    The tokenizer and model are loaded from the pre-trained checkpoint named
    by ``model_name``. ``vectorize`` averages the hidden states of the final
    encoder layer over all tokens of the sentence.
    """

    # Maximum number of tokens passed to the model in one call.
    # NOTE(review): assumes the loaded checkpoint has 512 position embeddings
    # (true for the standard BERT base/large releases) -- confirm if other
    # checkpoints are used.
    max_tokens = 512

    def __init__(self, model_name):
        """Load the tokenizer and the model for ``model_name``."""
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        # Inference only: evaluation mode disables dropout, which would
        # otherwise make the embeddings differ from call to call.
        self.model.eval()

    def tokenize_(self, sentence):
        """Return the list of word-piece tokens for ``sentence``."""
        return self.tokenizer.tokenize(sentence)

    def vectorize(self, sentence):
        """Return the mean final-layer embedding of ``sentence``.

        Args:
            sentence: Text to embed. Special markers such as ``[CLS]`` and
                ``[SEP]`` are not added here; the caller supplies them.

        Returns:
            numpy.ndarray: 1-D array, the token embeddings averaged over the
            sequence axis.

        Raises:
            ValueError: If the sentence produces no tokens.
        """
        tokens = self.tokenize_(sentence)
        if not tokens:
            raise ValueError('sentence produced no tokens; nothing to vectorize')
        if len(tokens) > self.max_tokens:
            # Keep the final token so a trailing terminator such as "[SEP]"
            # survives the truncation.
            tokens = tokens[:self.max_tokens - 1] + tokens[-1:]

        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])

        # By default the model returns a list with every encoder layer, so
        # indexing it with [0] would pick the FIRST layer. Asking for only the
        # final layer makes the result the actual last hidden states.
        with torch.no_grad():
            last_hidden_states, _ = self.model(
                tokens_tensor, output_all_encoded_layers=False)

        # Batch holds a single sentence: take it and average over its tokens.
        embedding = last_hidden_states[0].detach().numpy()
        return np.mean(embedding, axis=0)

In [None]:
def commit_message(repository_url, client_id, client_secret, from_ver, to_ver, timeout=30):
    """Collect the changed files and the message of every commit between two refs.

    The GitHub compare endpoint is queried for ``from_ver...to_ver`` and each
    listed commit is then fetched individually to obtain its file list.

    Args:
        repository_url: ``https://github.com/<owner>/<repo>`` URL, with or
            without a trailing ``.git``.
        client_id: User name part of the HTTP basic-auth pair sent to the API.
        client_secret: Password part of the HTTP basic-auth pair.
        from_ver: Base ref (e.g. a tag name) of the comparison.
        to_ver: Head ref of the comparison; also used as the key prefix.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        tuple[dict, dict]: ``(commit_files, sentence_set)``. Both are keyed by
        ``"<to_ver>_commit<index>"``. ``commit_files`` maps to the list of
        file names touched by the commit; ``sentence_set`` maps to the commit
        message with newlines replaced by spaces and wrapped in
        ``"[CLS] ... [SEP]"``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Strip only a trailing ".git" and always append the compare path. The
    # path used to be inserted by replacing ".git", so a URL without that
    # suffix silently queried the wrong endpoint.
    repo_url = repository_url.rstrip('/')
    if repo_url.endswith('.git'):
        repo_url = repo_url[:-len('.git')]
    api = repo_url.replace('https://github.com/', 'https://api.github.com/repos/', 1)
    compare_url = '{api}/compare/{f}...{t}'.format(api=api, f=from_ver, t=to_ver)
    auth = (client_id, client_secret)

    compare_response = requests.get(compare_url, auth=auth, timeout=timeout)
    compare_response.raise_for_status()
    comparison = compare_response.json()

    commit_files = {}
    sentence_set = {}
    for index, commit_summary in enumerate(comparison['commits']):
        key_name = to_ver + '_commit' + str(index)

        # The per-commit request is what supplies the file list used below.
        detail_response = requests.get(commit_summary['url'], auth=auth, timeout=timeout)
        detail_response.raise_for_status()
        commit_detail = detail_response.json()

        message = commit_detail['commit']['message'].replace('\n', ' ')
        # Markers are added here because the vectorizer does not add them.
        sentence_set[key_name] = '[CLS] ' + message + ' [SEP]'
        commit_files[key_name] = [
            entry['filename'] for entry in commit_detail.get('files', [])
        ]

    return commit_files, sentence_set

In [None]:
# Pre-trained BERT checkpoint to load.
# Alternative checkpoint names: 'bert-large-cased', 'bert-large-uncased',
# 'bert-base-cased'.
model_name = 'bert-base-uncased'

# Build the sentence vectorizer for the selected checkpoint.
BTV = BertTextVectorixer(model_name)

In [None]:
# Repository to analyse.
repository_url = "https://github.com/okamumu/gospn.git"
# GitHub API client id / client secret. They are read from the environment so
# that real secrets never have to be typed into (and saved with) the notebook;
# both fall back to the empty-string placeholders when the variables are unset.
client_id = os.environ.get('GITHUB_CLIENT_ID', '')
client_secret = os.environ.get('GITHUB_CLIENT_SECRET', '')

In [None]:
# Tag names of the repository, in the order returned by get_tag_name.
tag_list = get_tag_name(
    repository_url=repository_url,
    client_id=client_id,
    client_secret=client_secret,
)

In [None]:
# Files changed by each commit (commit_files) and each commit's message
# (sentence_set), gathered for every pair of consecutive tags.
commit_files = {}
sentence_set = {}
# Pair each tag with its successor. Iterating over the pairs (instead of over
# all indices with a break on the last one) lets the progress bar reach its
# total and handles a tag list with fewer than two entries naturally.
tag_pairs = list(zip(tag_list, tag_list[1:]))
for from_ver, to_ver in tqdm(tag_pairs):
    commit_files_iter, sentence_set_iter = commit_message(
        repository_url, client_id, client_secret, from_ver, to_ver)
    commit_files.update(commit_files_iter)
    sentence_set.update(sentence_set_iter)

In [None]:
# Vectorize every commit message; each value is a one-element list holding
# the sentence vector.
vector_set = {
    key: [BTV.vectorize(sentence)]
    for key, sentence in sentence_set.items()
}

In [None]:
# Flat list of all sentence vectors, in the iteration order of vector_set.
all_vector = [vectors[0] for vectors in vector_set.values()]

In [None]:
# Persist the collected results, one pickle file per object.
for output_path, payload in (
    ('commit_files_commit.bin', commit_files),
    ('all_vector_commit.bin', all_vector),
    ('vector_set_commit.bin', vector_set),
):
    with open(output_path, 'wb') as f:
        pickle.dump(payload, f)