<center><h1>General Global Doc2Vec</h1></center> 

### Libraries

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import os
from os import listdir
from os.path import isfile, join
import operator
import traceback
import logging

from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from gensim.test.utils import get_tmpfile
from gensim import similarities
from gensim import models

import warnings
import javalang
import re
import glob
import math 
import time
from scipy import spatial
import scipy.spatial.distance
import xml.etree.ElementTree as ET
import requests
import multiprocessing
from tqdm import tqdm_notebook
from time import gmtime, strftime
from random import randint


from __future__ import division
warnings.simplefilter(action='ignore', category=FutureWarning)

<center><h1>Evaluators</h1></center> 

In [15]:
def evaluate_helper(ranked_files,fixes):
    """
    @Receives: list of ranked files(which is predicted by algorithm) and the ground truth(fixes)
    @Process: This is a function aimed to help evaluate, and calculates the AP and first_pos 
    @Returns: A dictionary containing the AP(average precision) and first_pos (first retrieved fix file)
    """
    found=0
    first_pos=-1
    average_precision=0
    for i,predictionFix in enumerate(ranked_files):
        for actualFix in fixes:
            if actualFix.split('.')[-2] == predictionFix:
                if first_pos==-1:
                    first_pos=i+1
                found+=1
                average_precision+=found/(i+1)        

    AP=average_precision/found if found>0 else 0
    return {"AP":AP,"first_pos":first_pos}


def evaluate(all_bugs_df,source_codes_df):
    """
    @Receives: The main dataframe and the path to ClassName.txt 
    which contains the name of all sourcefiles in this version of code
    @Process: Evaluates the predicted files for each bugreport in all_bugs_df
    @Returns: MAP and MRR calculated from eligible bugreports(the ones with
    at least one fix file in this version of code) in the dataframe and number of eligible bug reports.
    """
    all_results=[]
    top_founds=[]
    average_precisions=[]
    for i,br in all_bugs_df.iterrows():
        if not source_codes_df.loc[source_codes_df.filename.apply(lambda filename: any(fix in filename for fix in br['fix']))].empty:
            predicted_files=br['total_score'].keys()
            result=evaluate_helper(predicted_files,br['fix'])
            top_founds.append(result['first_pos'])
            average_precisions.append(result['AP'])
            all_results.append(result)
        else:
            top_founds.append(-1)
            average_precisions.append(0.0)
    all_bugs_df["top_found"]=top_founds
    all_bugs_df["average_precision"]=average_precisions
    
    #Calculating the MAP and MRR
    MAP,MRR=(0,0)
    if len(all_results)>0:
        for result in all_results:
            MAP+=result['AP']
            MRR+=1/result['first_pos'] if result['first_pos']>0 else 0
        MAP/=len(all_results)
        MRR/=len(all_results)
        print("eligible_br_count: ",len(all_results))
    return (MAP,MRR,len(all_results))


<center><h1>Splitting code and natural language</h1></center> 

In [16]:
def camel_case_split(doc):
    """
    @Receives: a document
    @Process: split it by Capital letters
    @Return: a list of lower cased words
    """
    splitted=re.split('\W+',doc)
    return [term.lower() for each in splitted for term in re.sub('(?!^)([A-Z][a-z]+)', r' \1', each).split()]

def split_natural_lang(doc):
    """
    @Receives: a document in natural language
    @Process: splits it as described in BugLocator
    @Return: a list of lower cased words
    """
    wordList=[]
    word=''
    for char in doc:
        if char.isalnum() or char=='\'':
            word+=char
        else:
            if len(word)>0:
                wordList.append(word)
                word=''
    if len(word)>0:
        wordList.append(word)
    return wordList


def code_splitter(sourceCode):

    contentBuf = []
    wordBuf = []
    for char in sourceCode:
        if ((char >= 'a' and char <= 'z') or (char >= 'A' and char <= 'Z')):
            wordBuf.append(char)
            continue
        length = len(wordBuf)
        if (length != 0):
            k = 0
            for i in range(length-1):
                j=i+1
                first_char = wordBuf[i]
                second_char = wordBuf[j]
                if ((first_char >= 'A' and first_char <= 'Z') and (second_char >= 'a' and second_char <= 'z')):
                    contentBuf.append(wordBuf[k:i])
                    contentBuf.append(' ')
                    k = i
                    continue
                if ((first_char >= 'a' and first_char <= 'z') and (second_char >= 'A' and second_char <= 'Z')):
                    contentBuf.append(wordBuf[k:j])
                    contentBuf.append(' ')
                    k = j
                    continue
            if (k < length):
                contentBuf.append(wordBuf[k:])
                contentBuf.append(" ")
            wordBuf=[]
    words=''
    for each in contentBuf:
        if isinstance(each,str):
            words+=each
        else: 
            for term in each: 
                words+=term
    words= words.split()
    contentBuf = []
    for i in range(len(words)):
        if (words[i].strip()!="" and len(words[i]) >= 2):
            contentBuf.append(words[i])
    return contentBuf

def split_code(doc):
    """
    @Receives: a code
    @Process: splits it as described in BugLocator
    @Return: a list of lower cased words
    """
    wordList=[]
    word=''
    for char in doc:
        if char.isalpha():
            word+=char
        else:
            if len(word)>0:
                wordList+=camel_case_split(word)
                word=''
    if len(word)>0:
        wordList+=camel_case_split(word)
    return wordList



def general_preprocessor(doc,mode):
    JavaKeywords=["abstract", "continue", "for", 
                "new", "switch", "assert", "default", "goto", "package", 
                "synchronized", "boolean", "do", "if", "private", "this", 
                "break", "double", "implements", "protected", "throw", "byte", 
                "else", "import", "public", "throws", "case", "enum", 
                "instanceof", "return", "transient", "catch", "extends", "int", 
                "short", "try", "char", "final", "interface", "static", "void", 
                "class", "finally", "long", "strictfp", "volatile", "const", 
                "float", "native", "super", "while", "org", "eclipse", "swt", 
                "string", "main", "args", "null", "this", "extends", "true", 
                "false"]
    stop_words=["a", "a's", "able", "about", "above",
                "according", "accordingly", "across", "actually", "after",
                "afterwards", "again", "against", "ain't", "all", "allow",
                "allows", "almost", "alone", "along", "already", "also",
                "although", "always", "am", "among", "amongst", "an", "and",
                "another", "any", "anybody", "anyhow", "anyone", "anything",
                "anyway", "anyways", "anywhere", "apart", "appear",
                "appreciate", "appropriate", "are", "aren't", "around", "as",
                "aside", "ask", "asking", "associated", "at", "available",
                "away", "awfully", "b", "be", "became", "because", "become",
                "becomes", "becoming", "been", "before", "beforehand",
                "behind", "being", "believe", "below", "beside", "besides",
                "best", "better", "between", "beyond", "both", "brief", "but",
                "by", "c", "c'mon", "c's", "came", "can", "can't", "cannot",
                "cant", "cause", "causes", "certain", "certainly", "changes",
                "clearly", "co", "com", "come", "comes", "concerning",
                "consequently", "consider", "considering", "contain",
                "containing", "contains", "corresponding", "could", "couldn't",
                "course", "currently", "d", "definitely", "described",
                "despite", "did", "didn't", "different", "do", "does",
                "doesn't", "doing", "don't", "done", "down", "downwards",
                "during", "e", "each", "edu", "eg", "eight", "either", "else",
                "elsewhere", "enough", "entirely", "especially", "et", "etc",
                "even", "ever", "every", "everybody", "everyone", "everything",
                "everywhere", "ex", "exactly", "example", "except", "f", "far",
                "few", "fifth", "first", "five", "followed", "following",
                "follows", "for", "former", "formerly", "forth", "four",
                "from", "further", "furthermore", "g", "get", "gets",
                "getting", "given", "gives", "go", "goes", "going", "gone",
                "got", "gotten", "greetings", "h", "had", "hadn't", "happens",
                "hardly", "has", "hasn't", "have", "haven't", "having", "he",
                "he's", "hello", "help", "hence", "her", "here", "here's",
                "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
                "hi", "him", "himself", "his", "hither", "hopefully", "how",
                "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've", "ie",
                "if", "ignored", "immediate", "in", "inasmuch", "inc",
                "indeed", "indicate", "indicated", "indicates", "inner",
                "insofar", "instead", "into", "inward", "is", "isn't", "it",
                "it'd", "it'll", "it's", "its", "itself", "j", "just", "k",
                "keep", "keeps", "kept", "know", "knows", "known", "l", "last",
                "lately", "later", "latter", "latterly", "least", "less",
                "lest", "let", "let's", "like", "liked", "likely", "little",
                "look", "looking", "looks", "ltd", "m", "mainly", "many",
                "may", "maybe", "me", "mean", "meanwhile", "merely", "might",
                "more", "moreover", "most", "mostly", "much", "must", "my",
                "myself", "n", "name", "namely", "nd", "near", "nearly",
                "necessary", "need", "needs", "neither", "never",
                "nevertheless", "new", "next", "nine", "no", "nobody", "non",
                "none", "noone", "nor", "normally", "not", "nothing", "novel",
                "now", "nowhere", "o", "obviously", "of", "off", "often", "oh",
                "ok", "okay", "old", "on", "once", "one", "ones", "only",
                "onto", "or", "other", "others", "otherwise", "ought", "our",
                "ours", "ourselves", "out", "outside", "over", "overall",
                "own", "p", "particular", "particularly", "per", "perhaps",
                "placed", "please", "plus", "possible", "presumably",
                "probably", "provides", "q", "que", "quite", "qv", "r",
                "rather", "rd", "re", "really", "reasonably", "regarding",
                "regardless", "regards", "relatively", "respectively", "right",
                "s", "said", "same", "saw", "say", "saying", "says", "second",
                "secondly", "see", "seeing", "seem", "seemed", "seeming",
                "seems", "seen", "self", "selves", "sensible", "sent",
                "serious", "seriously", "seven", "several", "shall", "she",
                "should", "shouldn't", "since", "six", "so", "some",
                "somebody", "somehow", "someone", "something", "sometime",
                "sometimes", "somewhat", "somewhere", "soon", "sorry",
                "specified", "specify", "specifying", "still", "sub", "such",
                "sup", "sure", "t", "t's", "take", "taken", "tell", "tends",
                "th", "than", "thank", "thanks", "thanx", "that", "that's",
                "thats", "the", "their", "theirs", "them", "themselves",
                "then", "thence", "there", "there's", "thereafter", "thereby",
                "therefore", "therein", "theres", "thereupon", "these", "they",
                "they'd", "they'll", "they're", "they've", "think", "third",
                "this", "thorough", "thoroughly", "those", "though", "three",
                "through", "throughout", "thru", "thus", "to", "together",
                "too", "took", "toward", "towards", "tried", "tries", "truly",
                "try", "trying", "twice", "two", "u", "un", "under",
                "unfortunately", "unless", "unlikely", "until", "unto", "up",
                "upon", "us", "use", "used", "useful", "uses", "using",
                "usually", "uucp", "v", "value", "various", "very", "via",
                "viz", "vs", "w", "want", "wants", "was", "wasn't", "way",
                "we", "we'd", "we'll", "we're", "we've", "welcome", "well",
                "went", "were", "weren't", "what", "what's", "whatever",
                "when", "whence", "whenever", "where", "where's", "whereafter",
                "whereas", "whereby", "wherein", "whereupon", "wherever",
                "whether", "which", "while", "whither", "who", "who's",
                "whoever", "whole", "whom", "whose", "why", "will", "willing",
                "wish", "with", "within", "without", "won't", "wonder",
                "would", "would", "wouldn't", "x", "y", "yes", "yet", "you",
                "you'd", "you'll", "you're", "you've", "your", "yours",
                "yourself", "yourselves", "z", "zero","quot"]
    
    porter = PorterStemmer()
    Java_keyWords=[porter.stem(each.strip().lower()) for each in JavaKeywords]
    natural_stop_words=[porter.stem(each.strip().lower()) for each in stop_words]
#     stop_words = set(stopwords.words('english'))
    
    processed_doc=[]
    if mode=="code":
        splitted_doc=[porter.stem(term.lower()) for term in code_splitter(doc)]
        processed_doc=[term for term in splitted_doc if not(term in Java_keyWords or
                                                            term in natural_stop_words or len(term)<2)]
    elif mode=="text":
        splitted_doc=[porter.stem(term.lower()) for term in split_natural_lang(doc)]
        processed_doc=[term for term in splitted_doc if not(term in natural_stop_words or len(term)<2)]
    return processed_doc

<center><h1>Loading Bugreports and Code into pandas Dataframe</h1></center> 

In [17]:
def loadBugs2df(PATH,project):
    """
    @Receives: the path to bug repository (the xml file)
    @Process: Parses the xml file and reads the fix files per bug id. 
    @Returns: Returns the dataframe
    """
    print("Loading Bug reports ... ")
    all_bugs_df=pd.DataFrame([],columns=["id","fix","text","fixdate"])
    bugRepo = ET.parse(PATH).getroot()
    buglist=[]                   
    for bug in tqdm_notebook(bugRepo.findall('bug')):
        bugDict=dict({"id":bug.attrib['id'],"fix":[],"fixdate":bug.attrib['fixdate']
                      ,"summary":None,"description":None,"project":project,"average_precision":0.0})
        for bugDetail in bug.find('buginformation'):
            if bugDetail.tag=='summary':
                bugDict["summary"]=bugDetail.text
            elif bugDetail.tag=='description':
                bugDict["description"]=bugDetail.text
        bugDict["fix"]=np.array([fixFile.text.replace('/','.').lower() for fixFile in bug.find('fixedFiles')])
        summary=str(bugDict['summary']) if str(bugDict['summary']) !=np.nan else ""
        description=str(bugDict['description']) if str(bugDict['description']) !=np.nan else ""
        processed_text=general_preprocessor(summary+" "+description,"text")
        bugDict["text"]=processed_text
        buglist.append(bugDict)
    all_bugs_df=all_bugs_df.append(pd.DataFrame(buglist))
    return all_bugs_df.set_index('id')

def classNames_methodNames(node):
    result=''
    if isinstance(node,javalang.tree.MethodDeclaration) or isinstance(node,javalang.tree.ClassDeclaration):
        return node.name.lower()+' '
    if not (isinstance(node,javalang.tree.PackageDeclaration) or
        isinstance(node,javalang.tree.FormalParameter) or
       isinstance(node,javalang.tree.Import)):
        if node:
            if isinstance(node, javalang.ast.Node):
                for childNode in node.children:
                    result+=classNames_methodNames(childNode)
    return result
    
def traverse_node(node,i=0):
    i+=1
    result=''
    if not(isinstance(node,javalang.tree.PackageDeclaration)
            or isinstance(node,javalang.tree.FormalParameter)            
            or isinstance(node,javalang.tree.Import)
            or isinstance(node,javalang.tree.CompilationUnit)):
        if node:
            if (isinstance(node,int) or isinstance(node,str) or isinstance(node,float)) and i==2:
                result+=node+' '
            if isinstance(node, javalang.ast.Node):
                for childNode in node.children:
                    result+=traverse_node(childNode,i)
    return result

def code_parser(code):
    try:
        tree = javalang.parse.parse(code)
        return ''.join([traverse_node(node) for path, node in tree]) + ' ' + ''.join([classNames_methodNames(node)
                                                                                      for path, node in tree])
    except Exception as e: 
        print(e)
        return ''


def loadSourceFiles2df(PATH,group,project):
    """
    Receives: group name and project name 
    Process: open the source file directory and finds all the java files,
             and after preprocessing(using code_preprocessor) load them into a pandas dataframe 
    Returns: dataframe >> "filename","code","size"
    """
    print('Loading source files of {} from group:{} ...'.format(project,group))
    PATH=os.path.join("../Bench4BL/data",group,project,"gitrepo")
    all_source_files=glob.glob(PATH+'/**/*.java', recursive=True)
    source_codes_df=pd.DataFrame([])
    sourceCodesList=[]

    for filename in tqdm_notebook(all_source_files):
        code=open(filename,encoding='ISO-8859-1').read()
        processed_code=general_preprocessor(code_parser(code),'code')
        if 'src/' in filename:
            sourceCodesList.append(dict({"filename":filename.split('src/')[1].replace('/','.').lower(),
                                         "code":processed_code,"unprocessed_code":code,
                                         "size":len(processed_code),'project':project}))
        else:
            sourceCodesList.append(dict({"filename":filename.split(project)[1].replace('/','.').lower(),
                                         "code":processed_code,"unprocessed_code":code,
                                         "size":len(processed_code),'project':project}))
    source_codes_df=source_codes_df.append(pd.DataFrame(sourceCodesList))
    return source_codes_df


<center><h1>TFIDF</h1></center> 

In [18]:
def getNormValue(x,maximum,minimum):
    return 6*((x - minimum)/(maximum - minimum))

def getLenScore(length):
    return (math.exp(length) / (1 + math.exp(length)))

def calulateLengthScore(source_codes_df):
    """
    Receives: a list of sizes of codes and the index
    Process: calculate a boost score for the specified index based on length of that code
    Returns: length boosting score 
    """
    average_size=source_codes_df['size'].mean()
    standard_deviation=source_codes_df['size'].std() 
    low=average_size-3*standard_deviation
    high= average_size+3*standard_deviation
    minimum=int(low) if low>0 else 0
        
    len_scores=[]
    for i,eachLen in source_codes_df['size'].items():
        score=0
        nor=getNormValue(eachLen,high,minimum)
        if eachLen!=0:
            if eachLen>=low and eachLen<=high:
                score=getLenScore(nor)
            elif eachLen<low:
                score=0.5
            elif eachLen>high:
                score = 1.0
        len_scores.append(score)
    source_codes_df['lengthScore']=len_scores

    return source_codes_df
    
def inverse_doc_freq(idf,D):
    return math.log(D/idf)

def term_freq(tf_list):
    return [(math.log(tf+1)) for tf in tf_list]

def np_normalizer(arr):
    """
    @Receives: a list of numbers
    @Process: normalizes all the values and map them to range of [0,1]
    @Returns: list of normalized numbers
    """
    if len(arr)>0:
        maximum=np.amax(arr)
        minimum=np.amin(arr)
        if maximum!=minimum:
            return (arr-minimum)/(maximum-minimum)
    return arr

def normalizer(Dict):
    """
    @Receives: a list of numbers
    @Process: normalizes all the values and map them to range of [0,1]
    @Returns: list of normalized numbers
    """
    if len(Dict)>0:
        maximum=max(Dict.items(), key=operator.itemgetter(1))[1]
        minimum=min(Dict.items(), key=operator.itemgetter(1))[1]
        for key,value in Dict.items():
            if maximum!=minimum:
                Dict[key]=(value-minimum)/(maximum-minimum)
            else:
                Dict[key]=1.0
            
    return Dict
    

def TFIDF_transform(all_bugs_df,source_codes_df):
    
    print("\tTransforming to TF.IDF ...")
    dictionary = gensim.corpora.Dictionary(list(source_codes_df['code']))
    corpus = [dictionary.doc2bow(doc) for doc in list(source_codes_df['code'])]
    tfidf_weights = models.TfidfModel(corpus,wlocal=term_freq,wglobal=inverse_doc_freq,normalize=False)
    source_codes_df['tfidf_vector']=tfidf_weights[corpus]
    all_bugs_df['tfidf_vector']=all_bugs_df.text.apply(lambda x: tfidf_weights[dictionary.doc2bow(x)])
    return (all_bugs_df,source_codes_df,len(dictionary))

def cos_matrix_multiplication(matrix, vector):
    """
    Calculating pairwise cosine distance using matrix vector multiplication.
    """
    dotted = matrix.dot(vector)
    matrix_norms = np.linalg.norm(matrix, axis=1)
    vector_norm = np.linalg.norm(vector)
    matrix_vector_norms = np.multiply(matrix_norms, vector_norm)
    neighbors = np.divide(dotted, matrix_vector_norms)
    return neighbors

<center><h1> Genral Global Doc2Vec Model </h1></center> 


In [19]:
def load_all_BRs(dataPath):
    print('\tLoading all bug reports ... ')
    all_bugs_df=pd.DataFrame([])
    all_groups=[folder for folder in listdir(dataPath)]
    for group in tqdm_notebook(all_groups):
        all_projects= [folder for folder in listdir(os.path.join(dataPath,group))]
        for project in all_projects:
            data_path=os.path.join(dataPath,group,project,"bugrepo","repository.xml")
            all_bugs_df=all_bugs_df.append(loadBugs2df(data_path,project))
            print(len(all_bugs_df))
    return all_bugs_df

def load_all_SCs(dataPath):
    print('\tLoading all source codes ... ')
    source_codes_df=pd.DataFrame([])
    all_groups=[folder for folder in listdir(dataPath)]
    for group in tqdm_notebook(all_groups):
        all_projects= [folder for folder in listdir(os.path.join(dataPath,group))]
        for project in all_projects:
            source_path=os.path.join(dataPath,group,project,"gitrepo")
            source_codes_df=source_codes_df.append(loadSourceFiles2df(source_path,group,project))
    return source_codes_df

In [20]:
def build_Doc2Vec_models(vec_size,alpha,window_size,all_bugs_df,source_codes_df):
    """
    Process: 1- Loads all the bug reports from all the group/projects in Data directory
             2- Makes a Doc2Vec model and trains it based on all the bugreports
    Returns: Trained model
    """
    print("\n\t Now building the Combined Doc2Vec model ... ")
    dmm_model_path=os.path.join(os.getcwd(),'Models','combined_doc2vec_model_dmm')
    dbow_model_path=os.path.join(os.getcwd(),'Models','combined_doc2vec_model_dbow')
    fname_dmm = get_tmpfile(dmm_model_path)
    fname_dbow = get_tmpfile(dbow_model_path)
    if os.path.isfile(dmm_model_path) and os.path.isfile(dbow_model_path):
        revectorize=False
        model_dmm = Doc2Vec.load(fname_dmm)
        model_dbow = Doc2Vec.load(fname_dbow)
        print("*** Combined Doc2Vec Model is Loaded. ***")            
    else:
        revectorize=True
        documents = [TaggedDocument(all_bugs_df.iloc[i].text, [i]) for i in range(len(all_bugs_df))]
        documents = documents + [TaggedDocument(source_codes_df.iloc[i].code, [len(all_bugs_df)+i]) for i in range(len(source_codes_df))]

        model_dmm = Doc2Vec(vector_size=vec_size, window=window_size, min_count=2,
                        workers=multiprocessing.cpu_count(),
                        alpha=alpha, min_alpha=alpha/2,dm=1)
        model_dmm.build_vocab(documents)
        model_dmm.train(documents,total_examples=model_dmm.corpus_count,epochs=20)
        model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
        model_dmm.save(fname_dmm)
        
        model_dbow = Doc2Vec(dm=0, vector_size=vec_size, negative=5,
                             hs=0,min_count=2, sample = 0, workers=multiprocessing.cpu_count(),
                             alpha=alpha, min_alpha=alpha/3)
        model_dbow.build_vocab(documents)
        model_dbow.train(documents,total_examples=model_dbow.corpus_count,epochs=20)
        model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
        model_dbow.save(fname_dbow)
        print("*** Combined Doc2Vec Model is Trained. ***")
    concatinated_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])
    print(">> Size of Vocabulary is: {}".format(len(model_dmm.wv.vocab)))
    print(">> Number of whole Documents: {}".format(model_dmm.corpus_count))
    
    return (concatinated_model,revectorize)

In [21]:
def synthesize(sourceCodeScores,bugReportScores):
    sourceCodeScores=normalizer(sourceCodeScores)
    bugReportScores=normalizer(bugReportScores)
    for file in bugReportScores.keys():
        if file in sourceCodeScores.keys():
            sourceCodeScores[file]=sourceCodeScores[file]*0.8+bugReportScores[file]*0.2
    return sourceCodeScores


<center><h1>Main BugLocalization class</h1></center> 

In [22]:
class BugLocalizer:
    
    TFIDF_transformed=False
    dictionary_length=0
    all_projects_source_codes=pd.DataFrame([])
    all_projects_bugreports=pd.DataFrame([])
    
    def __init__(self,group,project,result_path,dataPath):
        self.group=group
        self.project=project
        self.resultPath=result_path
        self.dataPath=dataPath
        self.dataFolder=os.path.join(os.getcwd(),'Data')
        if not os.path.exists(self.dataFolder):
            os.makedirs(self.dataFolder)
            
    def execute(self):
        print("\t ****** Localizing Bugs for group: {} , project: {} ******".format(self.group,self.project))
        vectorize=self.loadEverything()
        revectorize=False
        
        if not BugLocalizer.TFIDF_transformed:
            (self.all_projects_bugreports,BugLocalizer.all_projects_source_codes,BugLocalizer.dictionary_length)=TFIDF_transform(all_bugs_df=BugLocalizer.all_projects_bugreports,
                                                                                                     source_codes_df=BugLocalizer.all_projects_source_codes)                                       
            BugLocalizer.TFIDF_transformed=True

        self.loadBugCurpus()
        self.loadSourceFiles()
        self.localize()
        self.evaluate()
        self.to_csv()
        self.write_result()
        
    def loadEverything(self):
        vectorize=False
        if BugLocalizer.all_projects_bugreports.empty:
            bugReportFile=os.path.join(self.dataFolder,'allBugReports.pickle')
            if not os.path.isfile(bugReportFile):
                BugLocalizer.all_projects_bugreports=load_all_BRs(dataPath=self.dataPath)
                vectorize=True
            else: 
                BugLocalizer.all_projects_bugreports=pd.read_pickle(bugReportFile)
                print("*** All Bug Reports are Loaded. ***")

        if BugLocalizer.all_projects_source_codes.empty:
            sourceCodeFile=os.path.join(self.dataFolder,'allSourceCodes.pickle')
            if not os.path.isfile(sourceCodeFile):
                BugLocalizer.all_projects_source_codes=load_all_SCs(dataPath=self.dataPath)
                vectorize=True
            else:
                BugLocalizer.all_projects_source_codes=pd.read_pickle(sourceCodeFile)
                print("*** All Source Codes are Loaded. ***")
        return vectorize
    
    def loadBugCurpus(self):
        self.all_bugs_df=BugLocalizer.all_projects_bugreports.loc[BugLocalizer.all_projects_bugreports['project']==self.project,:]
    
    def loadSourceFiles(self):
        self.source_codes_df=BugLocalizer.all_projects_source_codes.loc[BugLocalizer.all_projects_source_codes['project']==self.project,:]
    
    def vectorizeBugreports(self):
        BugLocalizer.all_projects_bugreports['doc2vec_vector']=np.array(BugLocalizer.all_projects_bugreports.text.apply(BugLocalizer.combined_Doc2vec.infer_vector))
        
    def vectorizeSourceCodes(self):
        BugLocalizer.all_projects_source_codes['doc2vec_vector']=np.array(BugLocalizer.all_projects_source_codes.code.apply(BugLocalizer.combined_Doc2vec.infer_vector))
     

    def localize(self):
        
        print("Localizing Now ...")
        self.source_codes_df=calulateLengthScore(self.source_codes_df)
        scores=[]
        direct_tfidf_index = similarities.SparseMatrixSimilarity(list(self.source_codes_df.tfidf_vector),num_features=BugLocalizer.dictionary_length)
        indirect_tfidf_index = similarities.SparseMatrixSimilarity(list(self.all_bugs_df.tfidf_vector),num_features=BugLocalizer.dictionary_length)
        indirectScores=[]
        BRRanks=[]
        for i, br in tqdm_notebook(self.all_bugs_df.iterrows()):
            
            try:
                direct_tfidf_similarities=direct_tfidf_index[br.tfidf_vector]
                direct_tfidf_similarities=np_normalizer(direct_tfidf_similarities)
                sourceCodeScores={self.source_codes_df.iloc[j].filename.split('.')[-2]: (direct_tfidf_similarities[j])*self.source_codes_df.iloc[j].lengthScore 
                                                  for j in range(len(self.source_codes_df))
                                                  if len(self.source_codes_df.iloc[j].filename.split('.'))>1}
                
                indirect_tfidf_similarities=indirect_tfidf_index[br.tfidf_vector]                
                bugReportScores=dict({})
                for j,(idx,other_br) in enumerate(self.all_bugs_df.iterrows()):
                    for fixFile in other_br.fix:
                        if idx != i:
                            if fixFile.split('.')[-2] in bugReportScores.keys():
                                if indirect_tfidf_similarities[j]>=bugReportScores[fixFile.split('.')[-2]]:
                                    bugReportScores[fixFile.split('.')[-2]]=indirect_tfidf_similarities[j]
                            else:
                                bugReportScores[fixFile.split('.')[-2]]=indirect_tfidf_similarities[j]
                indirectScores.append(bugReportScores)
                BRRanks.append({idx:indirect_tfidf_similarities[j] for j,(idx,other_br) in enumerate(self.all_bugs_df.iterrows())})
                ranking=synthesize(sourceCodeScores,bugReportScores)
                scores.append({file:score for file,score in sorted(ranking.items(),key=lambda tup: tup[1],reverse=True)})
            except Exception as e:
                logging.error(traceback.format_exc())
                scores.append({})
        self.all_bugs_df['Indirect_scores']=indirectScores
        self.all_bugs_df['BR_scores']=BRRanks
        self.all_bugs_df['total_score']=scores

    def evaluate(self):
        self.result=evaluate(self.all_bugs_df,self.source_codes_df)
        print("Result/"+self.group+"_"+self.project+":\n\t",'*'*4," MAP: ",self.result[0],'*'*4,'\n\t','*'*4," MRR: ",self.result[1],'*'*4,"\n","-"*100)

    def to_csv(self):
        BugReports_path=os.path.join(self.resultPath,'BugReports') 
        SourceFiles_path=os.path.join(self.resultPath,'SourceFiles')
        if not os.path.exists(BugReports_path):
            os.makedirs(BugReports_path)
        if not os.path.exists(SourceFiles_path):
            os.makedirs(SourceFiles_path)
        result_Bug_file=os.path.join(BugReports_path,self.project+"_BugReports.csv")
        result_source_file=os.path.join(SourceFiles_path,self.project+"_SourceFiles.csv")
        self.all_bugs_df.to_csv(result_Bug_file)
        if len(self.source_codes_df)<300:
            self.source_codes_df.to_csv(result_source_file)
        
    def write_result(self):
        group_result=open(os.path.join(self.resultPath,"results_{}_{}.csv".format(self.group,self.project)),'w')
        group_result.write(group+" , MAP , MRR , #ofBugReports\n")
        group_result.write(project+','+str(self.result[0])+','+str(self.result[1])+','+str(self.result[2])+"\n")
        group_result.close()
        

### MAIN

In [23]:
def folder_structure(run_name):
    result_path=os.path.join(os.getcwd(),"Result",run_name,strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    return result_path
    
def get_all_groups(config):
    return [folder for folder in listdir(config['DATA_PATH'])] if config['specefic_groups']==[] else config['specefic_groups']

def get_all_projects(config,group):
    return [folder for folder in listdir(os.path.join(config['DATA_PATH'],group))] if config['specefic_projects']==[] else config['specefic_projects']


if __name__=="__main__":

    config={'specefic_groups':['Spring'],'specefic_projects':[],'DATA_PATH':os.path.join('../Bench4BL','data')}
    run_name='7-GlobalBugLocator'
    result_path=folder_structure(run_name)
    for group in get_all_groups(config):
        for project in get_all_projects(config,group):
            core=BugLocalizer(group=group,project=project,
                              result_path=result_path,dataPath=config['DATA_PATH'])
            core.execute()


	 ****** Localizing Bugs for group: Spring , project: DATAREST ******
*** All Bug Reports are Loaded. ***
*** All Source Codes are Loaded. ***
	Transforming to TF.IDF ...
Localizing Now ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

eligible_br_count:  121
Result/Spring_DATAREST:
	 ****  MAP:  0.4490201506512615 **** 
	 ****  MRR:  0.6522439736024456 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SECOAUTH ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  66
Result/Spring_SECOAUTH:
	 ****  MAP:  0.4224606945689348 **** 
	 ****  MRR:  0.5125508176380477 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: ROO ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  558
Result/Spring_ROO:
	 ****  MAP:  0.35704445449433353 **** 
	 ****  MRR:  0.458686195544525 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: DATAGRAPH ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  12
Result/Spring_DATAGRAPH:
	 ****  MAP:  0.1330306942586024 **** 
	 ****  MRR:  0.19904055941998589 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SHL ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  10
Result/Spring_SHL:
	 ****  MAP:  0.336721955491016 **** 
	 ****  MRR:  0.40970892839927836 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SOCIALLI ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  4
Result/Spring_SOCIALLI:
	 ****  MAP:  0.4609445701357466 **** 
	 ****  MRR:  0.7083333333333333 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SGF ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  98
Result/Spring_SGF:
	 ****  MAP:  0.39043793555768747 **** 
	 ****  MRR:  0.637547546406265 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SEC ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  362
Result/Spring_SEC:
	 ****  MAP:  0.5050957635861384 **** 
	 ****  MRR:  0.6268171765666212 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: LDAP ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  52
Result/Spring_LDAP:
	 ****  MAP:  0.40048466252502823 **** 
	 ****  MRR:  0.4999351799516273 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: DATAREDIS ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  49
Result/Spring_DATAREDIS:
	 ****  MAP:  0.5308210096719018 **** 
	 ****  MRR:  0.7435523660013456 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: AMQP ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  97
Result/Spring_AMQP:
	 ****  MAP:  0.3838264704708613 **** 
	 ****  MRR:  0.5383982058312548 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SOCIAL ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  13
Result/Spring_SOCIAL:
	 ****  MAP:  0.5684048476942464 **** 
	 ****  MRR:  0.5869433198380567 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SPR ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  123
Result/Spring_SPR:
	 ****  MAP:  0.28079401649646407 **** 
	 ****  MRR:  0.3717372366713278 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SOCIALTW ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  8
Result/Spring_SOCIALTW:
	 ****  MAP:  0.7854166666666667 **** 
	 ****  MRR:  0.8375 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: DATACMNS ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  152
Result/Spring_DATACMNS:
	 ****  MAP:  0.5377867771288761 **** 
	 ****  MRR:  0.6943900593827476 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: BATCHADM ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  20
Result/Spring_BATCHADM:
	 ****  MAP:  0.43024239385057966 **** 
	 ****  MRR:  0.559749968862872 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: MOBILE ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  11
Result/Spring_MOBILE:
	 ****  MAP:  0.6524257125057903 **** 
	 ****  MRR:  0.8409090909090909 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: BATCH ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  354
Result/Spring_BATCH:
	 ****  MAP:  0.407679293770265 **** 
	 ****  MRR:  0.5850184952462338 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SHDP ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  45
Result/Spring_SHDP:
	 ****  MAP:  0.44097836022922016 **** 
	 ****  MRR:  0.5880282626734297 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: ANDROID ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  9
Result/Spring_ANDROID:
	 ****  MAP:  0.3276150336084019 **** 
	 ****  MRR:  0.6462962962962963 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SOCIALFB ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  15
Result/Spring_SOCIALFB:
	 ****  MAP:  0.5561141986283488 **** 
	 ****  MRR:  0.648994708994709 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: DATAMONGO ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  264
Result/Spring_DATAMONGO:
	 ****  MAP:  0.3601272880644268 **** 
	 ****  MRR:  0.5138057186752736 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SWF ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  105
Result/Spring_SWF:
	 ****  MAP:  0.45044285334664264 **** 
	 ****  MRR:  0.5490116009868069 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: SWS ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  159
Result/Spring_SWS:
	 ****  MAP:  0.42290301283378834 **** 
	 ****  MRR:  0.5403831617190844 **** 
 ----------------------------------------------------------------------------------------------------
	 ****** Localizing Bugs for group: Spring , project: DATAJPA ******
Localizing Now ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

eligible_br_count:  144
Result/Spring_DATAJPA:
	 ****  MAP:  0.48877886650884034 **** 
	 ****  MRR:  0.6813327250867564 **** 
 ----------------------------------------------------------------------------------------------------


In [12]:

method='7-GlobalBugLocator'
runNumber="run"
all_results_csv=[os.path.join(os.getcwd(),"Result",method,runNumber,folder) 
                 for folder in listdir(os.path.join(os.getcwd(),"Result",method,runNumber)) if '.csv' in folder]
results_df=pd.DataFrame([])
for result_csv in all_results_csv:
    res=pd.read_csv(result_csv,index_col=[0],header=0)
    results_df=results_df.append(res)

project_size_df=pd.read_csv('project_size.csv',index_col=[0],header=0)
results_df
results_df=pd.merge(results_df, project_size_df,
                                      left_index=True,
                                      right_index=True)
results_df=results_df.reset_index()
results_df=results_df.set_index(' #ofBugReports')

results_df.to_csv(os.path.join(os.getcwd(),"Result",method,'result.csv'))



<center><h1>Testing</h1></center> 

In [None]:
import numpy
from gensim.models.deprecated import keyedvectors
from gensim.models.deprecated.keyedvectors import EuclideanKeyedVectors
import numpy as np
import scipy.spatial.distance

# print(len(BugLocalizer.all_projects_source_codes.loc[BugLocalizer.all_projects_source_codes['project']=='IO']))

vectors_all=np.array(BugLocalizer.all_projects_source_codes.iloc[:20].doc2vec_vector)
vector_1=BugLocalizer.all_projects_bugreports.iloc[0].doc2vec_vector
print(vector_1.shape,vectors_all.shape)
EuclideanKeyedVectors.cosine_similarities(vector_1=vector_1,vectors_all=vectors_all)

In [30]:
sourceCodes_df=pd.DataFrame(BugLocalizer.all_projects_source_codes.groupby('project').size())
sourceCodes_df.columns=["SourceCodeSize"]
sourceCodes_df.index.name='Project'
sourceCodes_df.to_csv('project_size.csv')