In [14]:
from dataclasses import dataclass
from random import randint, sample
from typing import Dict, List, Set

In [30]:
@dataclass
class CommitInfo:
    """A commit together with the message each system produced for it.

    Every attribute is annotated as ``str`` and is stored exactly as it was
    passed in; no conversion or validation is performed.
    """
    commit: str
    circlet_link: str
    original_message: str
    gum_tree: str
    gum_tree_confidence: str
    schnb: str
    schnb_confidence: str
    code2seq: str
    nb: str

    @staticmethod
    def parse_from_list(list_: List[str]) -> 'CommitInfo':
        """Build a ``CommitInfo`` from the already-split cells of one row.

        Cells are picked by position; position 2 is not read, and anything
        past position 9 is ignored.

        :param list_: cells of a single row, in file order.
        :return: the populated ``CommitInfo``.
        :raises IndexError: if ``list_`` is a list with fewer than ten cells.
        """
        # Attribute name -> position of its cell in the row.
        column_of_field = {
            'commit': 0,
            'circlet_link': 1,
            'original_message': 3,
            'gum_tree': 4,
            'gum_tree_confidence': 5,
            'schnb': 6,
            'schnb_confidence': 7,
            'code2seq': 8,
            'nb': 9,
        }
        picked = {field: list_[column] for field, column in column_of_field.items()}
        return CommitInfo(**picked)

    def almost_pretty_print_for_labeling(self, sep='^'):
        """Render this commit as one row of the labeling sheet.

        The row starts with a spreadsheet ``HYPERLINK`` formula pointing at
        ``circlet_link``, followed by an empty cell and then the original
        message and the systems' outputs. Every cell, including the last one,
        is followed by ``sep``; no line terminator is appended.

        :param sep: cell separator.
        :return: the formatted row.
        """
        cells = (
            f'=HYPERLINK("{self.circlet_link}";"circlet")',
            '',  # intentionally empty cell between the link and the message
            self.original_message,
            self.gum_tree,
            self.gum_tree_confidence,
            self.schnb,
            self.schnb_confidence,
            self.code2seq,
            self.nb,
        )
        return ''.join(f'{cell}{sep}' for cell in cells)
        

In [31]:
# Load the merged results file: one CommitInfo per data row.

commits: List[CommitInfo] = []
input_file = "merged_4_system_results.csv"

# The handle gets its own name so `input_file` keeps holding the path string.
with open(input_file, 'r') as merged_results:
    for line in merged_results:
        # Drop the line terminator before splitting; otherwise the last cell
        # of every row ends with "\n" and can never compare equal to "".
        row = line.rstrip("\n")

        # Skip the header row and blank lines (a blank line has no cells to
        # parse and would otherwise fail with IndexError).
        if not row or row.startswith("Commit"):
            continue

        commits.append(CommitInfo.parse_from_list(row.split("^")))

In [32]:
# Sanity check: how many CommitInfo records are in `commits`.
len(commits)

37077

In [33]:
def is_commit_ok_for_labeling(commit: CommitInfo) -> bool:
    """Tell whether a commit qualifies for manual labeling.

    A commit is rejected outright when its original message equals the
    literal "no message". Otherwise it qualifies when at least three of its
    four generated messages (gum_tree, schnb, code2seq, nb) differ from the
    empty string.

    :param commit: the commit to check.
    :return: True if the commit qualifies, False otherwise.
    """
    if commit.original_message == "no message":
        return False

    generated_messages = (commit.gum_tree, commit.schnb, commit.code2seq, commit.nb)
    non_empty_count = sum(1 for message in generated_messages if message != "")
    return non_empty_count >= 3

In [35]:
# Write one labeling sheet per user. Each sheet gets up to
# `max_count_commits_per_user` randomly chosen eligible commits, and no commit
# is handed to more than one user.
# NOTE(review): the selection is unseeded, so every run produces different sheets.
users = ["Miller", "Usov", "Murycheva"]
max_count_commits_per_user: int = 30
max_commits_number: int = len(commits)
selected_commits: Set[int] = set()
sep = '^'

# Filter once, then shuffle once: slicing a random permutation gives each user
# a uniformly random, mutually disjoint subset. Unlike retrying random indices
# until enough are found, this cannot loop forever when too few commits qualify.
eligible_indices = [index for index in range(max_commits_number)
                    if is_commit_ok_for_labeling(commits[index])]
shuffled_indices = sample(eligible_indices, len(eligible_indices))

for user_number, user in enumerate(users):
    first = user_number * max_count_commits_per_user
    # Exactly `max_count_commits_per_user` rows (the previous `<=` loop wrote
    # one extra); fewer only if the eligible commits run out.
    user_indices = shuffled_indices[first:first + max_count_commits_per_user]

    if len(user_indices) < max_count_commits_per_user:
        print(f"Only {len(user_indices)} eligible commits left for {user}")

    file_name_output = f"for_{user}_labeling.csv"
    with open(file_name_output, 'w') as output_file:
        output_file.write(f'Circlet Link{sep}Your Answer{sep}Original Message{sep}'
                          f'GumTree{sep}GumTree Confidence{sep}Small Changes NB{sep}SCHNB Confidence{sep}'
                          f'code2seq{sep}Naive Bayes{sep}\n')

        for commit_index in user_indices:
            selected_commits.add(commit_index)
            cur_commit = commits[commit_index]
            # Use the same separator as the header row.
            output_file.write(f"{cur_commit.almost_pretty_print_for_labeling(sep=sep)}\n")
                
                
        
