<center><h1>Global Doc2Vec</h1></center> 

- In this method, we first train a general, global Doc2Vec model using all bug reports and all source code files of all projects. The model is expected to learn the common language used in code and reports, and we later use this globally trained model to connect new bug reports to the relevant source code files. 
- Note that TF.IDF scores are calculated using the __global data__. That means that for each term we calculate the term frequency and inverse document frequency of that term using all bug reports and source codes of all projects. 
- Refer to the third row of the following table (Method5 - Global Doc2Vec) to better understand the details of this method.

<img src="Methods.png">

### Required Libraries

In [None]:
from __future__ import division

import logging
import multiprocessing
import os
import warnings
from os import listdir
from os.path import isfile, join
from time import gmtime, strftime

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim import models
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from gensim.test.utils import get_tmpfile
from tqdm import tqdm_notebook

warnings.simplefilter(action='ignore', category=FutureWarning)

<center><h1>Evaluators</h1></center> 

In [29]:
def evaluate_helper(ranked_files,fixes):
    """
    @Receives: ranked_files - iterable of predicted file names, best match first
               fixes - collection of ground-truth fix file names; a ranked file is a hit
               when any fix is a substring of it
    @Process: Walks the ranking once and accumulates precision at every hit.
              Each ranked file counts as at most one hit, even when several fixes
              match it, so the precision at any rank never exceeds 1.
    @Returns: A dictionary containing the AP(average precision, 0 when nothing is found)
              and first_pos (1-based rank of the first retrieved fix file, -1 when nothing is found)
    """
    found=0
    first_pos=-1
    precision_sum=0
    for rank,predicted_file in enumerate(ranked_files,start=1):
        # One hit per ranked position: counting once per matching fix would
        # inflate `found` and let the precision grow beyond 1.
        if not any(actual_fix in predicted_file for actual_fix in fixes):
            continue
        if first_pos==-1:
            first_pos=rank
        found+=1
        precision_sum+=found/rank

    AP=precision_sum/found if found>0 else 0
    return {"AP":AP,"first_pos":first_pos}


def evaluate(all_bugs_df,source_codes_df):
    """
    @Receives: all_bugs_df - bug reports; every row is read through its 'fix' and
               'total_score' entries
               source_codes_df - source files; only the 'filename' column is read
    @Process: A bug report is eligible when at least one of its fix names is a substring
              of some filename in source_codes_df. Eligible reports are scored with
              evaluate_helper using the keys of 'total_score' as the ranking; the others
              get top_found=-1 and average_precision=0.0. Both values are written back
              to all_bugs_df as the columns "top_found" and "average_precision".
    @Returns: (MAP, MRR, number of eligible bug reports); MAP and MRR are computed over
              the eligible bug reports only and stay 0 when there are none.
    """
    eligible_results=[]
    first_positions=[]
    ap_values=[]
    for _,bug_report in all_bugs_df.iterrows():
        has_fix=source_codes_df.filename.apply(
            lambda filename: any(fix in filename for fix in bug_report['fix']))
        if source_codes_df.loc[has_fix].empty:
            # None of the fixed files exists in this version of the code.
            first_positions.append(-1)
            ap_values.append(0.0)
            continue
        ranking=bug_report['total_score'].keys()
        outcome=evaluate_helper(ranking,bug_report['fix'])
        first_positions.append(outcome['first_pos'])
        ap_values.append(outcome['AP'])
        eligible_results.append(outcome)
    all_bugs_df["top_found"]=first_positions
    all_bugs_df["average_precision"]=ap_values

    # Mean average precision and mean reciprocal rank over the eligible reports
    eligible_count=len(eligible_results)
    MAP,MRR=(0,0)
    if eligible_count>0:
        MAP=sum(outcome['AP'] for outcome in eligible_results)/eligible_count
        MRR=sum(1/outcome['first_pos'] if outcome['first_pos']>0 else 0
                for outcome in eligible_results)/eligible_count
        print("eligible_br_count: ",eligible_count)
    return (MAP,MRR,eligible_count)


<center><h1> General Global Doc2Vec Model </h1></center> 


In [None]:

def build_Doc2Vec_models(vec_size,alpha,window_size,all_bugs_df,source_codes_df):
    """
    Receives: vec_size - vector_size of both Doc2Vec models
              alpha - initial learning rate (min_alpha is alpha/2 for DM and alpha/3 for DBOW)
              window_size - context window, passed to the DM model only
              all_bugs_df - dataframe whose 'text' column holds the bug report documents
              source_codes_df - dataframe whose 'code' column holds the source code documents
    Process: 1- If both saved models exist under <cwd>/Models they are loaded from disk
             2- Otherwise a DM and a DBOW Doc2Vec model are trained for 20 epochs on all
                bug reports followed by all source codes, and saved under <cwd>/Models
             3- Both models are wrapped in a ConcatenatedDoc2Vec
    Returns: (concatenated model, revectorize) where revectorize is True when the models
             were freshly trained and False when they were loaded from disk
    """
    print("\n\t Now building the Combined Doc2Vec model ... ")
    models_dir=os.path.join(os.getcwd(),'Models')
    dmm_model_path=os.path.join(models_dir,'combined_doc2vec_model_dmm')
    dbow_model_path=os.path.join(models_dir,'combined_doc2vec_model_dbow')
    # NOTE(review): get_tmpfile presumably joins its argument onto the temp directory, which
    # leaves an absolute path unchanged - confirm against the installed gensim version.
    fname_dmm = get_tmpfile(dmm_model_path)
    fname_dbow = get_tmpfile(dbow_model_path)
    # Test the very paths that are loaded from / saved to.
    if os.path.isfile(fname_dmm) and os.path.isfile(fname_dbow):
        revectorize=False
        model_dmm = Doc2Vec.load(fname_dmm)
        model_dbow = Doc2Vec.load(fname_dbow)
        print("*** Combined Doc2Vec Model is Loaded. ***")            
    else:
        revectorize=True
        # Create the target folders before training so that a missing directory
        # cannot make save() fail after all the training work has been done.
        for model_file in (fname_dmm,fname_dbow):
            target_dir=os.path.dirname(model_file)
            if target_dir and not os.path.exists(target_dir):
                os.makedirs(target_dir)

        # Tags are consecutive integers: bug reports first, source codes after them.
        bug_count=len(all_bugs_df)
        documents = [TaggedDocument(text, [i]) for i,text in enumerate(all_bugs_df.text)]
        documents += [TaggedDocument(code, [bug_count+i]) for i,code in enumerate(source_codes_df.code)]

        model_dmm = Doc2Vec(vector_size=vec_size, window=window_size, min_count=2,
                        workers=multiprocessing.cpu_count(),
                        alpha=alpha, min_alpha=alpha/2,dm=1)
        model_dmm.build_vocab(documents)
        model_dmm.train(documents,total_examples=model_dmm.corpus_count,epochs=20)
        model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
        model_dmm.save(fname_dmm)
        
        model_dbow = Doc2Vec(dm=0, vector_size=vec_size, negative=5,
                             hs=0,min_count=2, sample = 0, workers=multiprocessing.cpu_count(),
                             alpha=alpha, min_alpha=alpha/3)
        model_dbow.build_vocab(documents)
        model_dbow.train(documents,total_examples=model_dbow.corpus_count,epochs=20)
        model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
        model_dbow.save(fname_dbow)
        print("*** Combined Doc2Vec Model is Trained. ***")
    concatinated_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])
    print(">> Size of Vocabulary is: {}".format(len(model_dmm.wv.vocab)))
    print(">> Number of whole Documents: {}".format(model_dmm.corpus_count))
    
    return (concatinated_model,revectorize)


<center><h1>Main BugLocalization class</h1></center> 

In [None]:
class BugLocalizer:
    """
    Localizes the bug reports of one project with the shared, globally trained Doc2Vec model.

    The model and the bug report / source code dataframes of all projects live on the class,
    so they are built or loaded once and reused by every per-project instance. Set
    BugLocalizer.dataFolder and call BugLocalizer.loadEverything() before creating instances.
    """

    # Shared concatenated Doc2Vec model; built on the first execute() call.
    combined_Doc2vec=None
    # Source codes and bug reports of all projects, filled by loadEverything().
    all_projects_source_codes=pd.DataFrame([])
    all_projects_bugreports=pd.DataFrame([])
    # True until the doc2vec vectors of all documents have been inferred and pickled.
    vectorize=True
    # Folder holding allBugReports.pickle and allSourceCodes.pickle.
    dataFolder=""
    
    def __init__(self,project,result_path):
        """
        Receives: project - value of the 'project' column selecting this project's rows
                  result_path - folder that receives the csv files of this run
        """
        self.project=project
        self.resultPath=result_path
        # dataFolder defaults to "" and os.makedirs("") raises, so only create a configured folder.
        if self.dataFolder and not os.path.exists(self.dataFolder):
            os.makedirs(self.dataFolder)
            
    def execute(self):
        """Runs the whole pipeline for this project: model, vectors, ranking, evaluation, output."""
        print("\t ****** Localizing Bugs for project: {} ******".format(self.project))
        if BugLocalizer.combined_Doc2vec is None:
            # NOTE(review): the returned revectorize flag is not consulted anywhere;
            # vectorization is driven by BugLocalizer.vectorize alone.
            (BugLocalizer.combined_Doc2vec,_revectorize)=build_Doc2Vec_models(vec_size=100,alpha=0.045,window_size=5,
                                               all_bugs_df=BugLocalizer.all_projects_bugreports,
                                               source_codes_df=BugLocalizer.all_projects_source_codes)
            
        if BugLocalizer.vectorize:
            print('\n\t Vectorizing Now ...')
            self.vectorizeBugreports()
            self.vectorizeSourceCodes()
            # Persist the frames together with their new 'doc2vec_vector' column.
            bugReportFile=os.path.join(self.dataFolder,'allBugReports.pickle')
            sourceCodeFile=os.path.join(self.dataFolder,'allSourceCodes.pickle')
            BugLocalizer.all_projects_bugreports.to_pickle(bugReportFile)
            BugLocalizer.all_projects_source_codes.to_pickle(sourceCodeFile)
            BugLocalizer.vectorize=False
            
        self.loadBugCurpus()
        self.loadSourceFiles()
        self.localize()
        self.evaluate()
        self.to_csv()
        self.write_result()
        
    @staticmethod
    def loadEverything():
        """
        Loads the pickled bug reports and source codes of all projects from dataFolder into
        the class-level dataframes. A frame that is already filled is left untouched; a
        missing pickle only prints a message.
        """
        if BugLocalizer.all_projects_bugreports.empty:
            bugReportFile=os.path.join(BugLocalizer.dataFolder,'allBugReports.pickle')
            if not os.path.isfile(bugReportFile):
                print("The bug reports file (allBugReports.pickle) does not exist. please run the step0 first")
            else: 
                BugLocalizer.all_projects_bugreports=pd.read_pickle(bugReportFile)
                print("*** All Bug Reports are Loaded. ***")

        if BugLocalizer.all_projects_source_codes.empty:
            sourceCodeFile=os.path.join(BugLocalizer.dataFolder,'allSourceCodes.pickle')
            if not os.path.isfile(sourceCodeFile):
                print("The source codes file (allSourceCodes.pickle) does not exist. please run the step0 first")
            else:
                BugLocalizer.all_projects_source_codes=pd.read_pickle(sourceCodeFile)
                print("*** All Source Codes are Loaded. ***")
    
    def loadBugCurpus(self):
        """Selects this project's bug reports and remembers the group of the first one."""
        all_reports=BugLocalizer.all_projects_bugreports
        # Explicit copy: columns are added to this slice later on.
        self.all_bugs_df=all_reports.loc[all_reports['project']==self.project,:].copy()
        # Positional access: the filtered frame keeps the original index labels,
        # so label 0 is not guaranteed to exist.
        self.group=self.all_bugs_df["group"].iloc[0]
        
    def loadSourceFiles(self):
        """Selects this project's source code rows."""
        all_sources=BugLocalizer.all_projects_source_codes
        self.source_codes_df=all_sources.loc[all_sources['project']==self.project,:].copy()
    
    def vectorizeBugreports(self):
        """Infers a doc2vec vector for the 'text' of every bug report of every project."""
        BugLocalizer.all_projects_bugreports['doc2vec_vector']=np.array(BugLocalizer.all_projects_bugreports.text.apply(BugLocalizer.combined_Doc2vec.infer_vector))

    def vectorizeSourceCodes(self):
        """Infers a doc2vec vector for the 'code' of every source file of every project."""
        BugLocalizer.all_projects_source_codes['doc2vec_vector']=np.array(BugLocalizer.all_projects_source_codes.code.apply(BugLocalizer.combined_Doc2vec.infer_vector))
     
    def localize(self):
        """
        Scores every source file against every bug report of this project and stores, per bug
        report, a dict {filename: score} ordered by descending score in the 'total_score' column.
        The score is the normalized doc2vec similarity multiplied by the file's lengthScore.
        A bug report whose scoring raises gets an empty dict and the error is logged.
        """
        print("Localizing Now ...")
        scores=[]
        # NOTE(review): calulateLengthScore, cos_matrix_multiplication and np_normalizer are
        # not defined in this section of the file; they must be available in the namespace.
        self.source_codes_df=calulateLengthScore(self.source_codes_df)        
        doc2vec_index=np.array(list(self.source_codes_df.doc2vec_vector))
        # Pull the per-file columns out once instead of building a row object per file and bug report.
        filenames=list(self.source_codes_df.filename)
        length_scores=list(self.source_codes_df.lengthScore)
        for _, br in tqdm_notebook(self.all_bugs_df.iterrows()):
            try:
                doc2vec_similarities=cos_matrix_multiplication(doc2vec_index, br.doc2vec_vector)
                doc2vec_similarities=np_normalizer(doc2vec_similarities)
                sourceCodeScores={filename: doc2vec_similarities[j]*length_scores[j]
                                  for j,filename in enumerate(filenames)}
                scores.append(dict(sorted(sourceCodeScores.items(),key=lambda tup: tup[1],reverse=True)))
            except Exception:
                # Best effort: one failing bug report must not abort the whole project.
                logging.exception("Scoring a bug report of project %s failed",self.project)
                scores.append({})                
        self.all_bugs_df['total_score']=scores

    def evaluate(self):
        """Computes (MAP, MRR, eligible count) with the module-level evaluate() and prints MAP and MRR."""
        self.result=evaluate(self.all_bugs_df,self.source_codes_df)
        print("Result/"+self.group+"_"+self.project+":\n\t",'*'*4," MAP: ",self.result[0],'*'*4,'\n\t','*'*4," MRR: ",self.result[1],'*'*4,"\n","-"*100)

    def to_csv(self):
        """
        Writes this project's bug reports to <resultPath>/BugReports and, only when the project
        has fewer than 100 source files, its source files to <resultPath>/SourceFiles.
        """
        BugReports_path=os.path.join(self.resultPath,'BugReports') 
        SourceFiles_path=os.path.join(self.resultPath,'SourceFiles')
        if not os.path.exists(BugReports_path):
            os.makedirs(BugReports_path)
        if not os.path.exists(SourceFiles_path):
            os.makedirs(SourceFiles_path)
        result_Bug_file=os.path.join(BugReports_path,self.project+"_BugReports.csv")
        result_source_file=os.path.join(SourceFiles_path,self.project+"_SourceFiles.csv")
        self.all_bugs_df.to_csv(result_Bug_file)
        if  len(self.source_codes_df)<100:
            self.source_codes_df.to_csv(result_source_file)
        
    def write_result(self):
        """
        Writes results_<group>_<project>.csv into resultPath: a header line starting with the
        group name, followed by one line with project, MAP, MRR and the eligible bug report count.
        """
        result_file=os.path.join(self.resultPath,"results_{}_{}.csv".format(self.group,self.project))
        # Use the instance's own group/project instead of whatever globals happen to exist,
        # and let the context manager close the file even when a write fails.
        with open(result_file,'w') as group_result:
            group_result.write(self.group+" , MAP , MRR , #ofBugReports\n")
            group_result.write(self.project+','+str(self.result[0])+','+str(self.result[1])+','+str(self.result[2])+"\n")
        

### MAIN

In [None]:
def folder_structure(run_name):
    """
    Returns the folder <cwd>/Result/<run_name>/<current UTC time as YYYY-mm-dd HH:MM:SS>,
    creating it (and any missing parents) when it does not exist yet.
    """
    timestamp=strftime("%Y-%m-%d %H:%M:%S", gmtime())
    result_path=os.path.join(os.getcwd(),"Result",run_name,timestamp)
    if os.path.exists(result_path):
        return result_path
    os.makedirs(result_path)
    return result_path
    

if __name__=="__main__":

    # Name of this experiment; it is also the sub-folder of ./Result that receives the output.
    run_name='1-GlobalDoc2Vec'
    result_path=folder_structure(run_name)

    # Point the localizer at ./Data and load the pickled corpora into the class-level frames.
    BugLocalizer.dataFolder=os.path.join(os.getcwd(),'Data')
    BugLocalizer.loadEverything()

    # One localizer per distinct project found in the bug reports.
    all_projects=set(BugLocalizer.all_projects_bugreports.project)
    for project in all_projects:
        core=BugLocalizer(project,result_path)
        core.execute()



<center><h1>Result</h1></center> 

In [None]:

# Collects the per-project result csv files of one run, joins them with the project sizes
# and writes the combined table to Result/<method>/result.csv.
method='1-GlobalDoc2Vec'
runNumber="run1"
run_dir=os.path.join(os.getcwd(),"Result",method,runNumber)
# Only real csv files (name ends with .csv), in a deterministic order.
all_results_csv=[os.path.join(run_dir,entry)
                 for entry in sorted(listdir(run_dir)) if entry.endswith('.csv')]

# Read every file first and concatenate once: DataFrame.append was removed in pandas 2.0
# and copies the whole frame on every call.
per_project_frames=[pd.read_csv(result_csv,index_col=[0],header=0) for result_csv in all_results_csv]
if per_project_frames:
    # Drop the index name (the group header of the csv files) so that reset_index below
    # always yields a column called 'index', as the append-based version did.
    results_df=pd.concat(per_project_frames).rename_axis(None)
else:
    results_df=pd.DataFrame([])

# Both frames are indexed by project name; keep the projects present in both.
project_size_df=pd.read_csv('project_size.csv',index_col=[0],header=0)
results_df=pd.merge(results_df, project_size_df,
                                      left_index=True,
                                      right_index=True)
results_df=results_df.reset_index()
# The column name carries the leading space of the header written by the result files.
results_df=results_df.set_index(' #ofBugReports')

results_df.to_csv(os.path.join(os.getcwd(),"Result",method,'result.csv'))
