In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Directory that holds the raw CSV exports loaded by this notebook.
DATA_ROOT = "../data/raw"

## LOADING DATA

In [41]:
# Locations of the five raw CSV exports, each resolved against DATA_ROOT.
# NOTE(review): only two of these constants carry the `_PATH` suffix; the names
# are left untouched because the loading cell refers to them as spelled here.
GIT_COMMITS_PATH = f"{DATA_ROOT}/GIT_COMMITS.csv"
GIT_COMMITS_CHANGES = f"{DATA_ROOT}/GIT_COMMITS_CHANGES.csv"
SONAR_MEASURES_PATH = f"{DATA_ROOT}/SONAR_MEASURES.csv"
SZZ_FAULT_INDUCING_COMMITS = f"{DATA_ROOT}/SZZ_FAULT_INDUCING_COMMITS.csv"
JIRA_ISSUES = f"{DATA_ROOT}/JIRA_ISSUES.csv"

In [42]:
# Read every raw CSV export into its own DataFrame, using pandas' default
# parsing options (no explicit dtypes or date parsing are requested).
git_commits = pd.read_csv(GIT_COMMITS_PATH)
git_commits_changes = pd.read_csv(GIT_COMMITS_CHANGES)
sonar_measures = pd.read_csv(SONAR_MEASURES_PATH)
szz_fault_inducing_commits = pd.read_csv(SZZ_FAULT_INDUCING_COMMITS)
jira_issues = pd.read_csv(JIRA_ISSUES)

In [43]:
# Sanity check: list the change rows that have no `linesAdded` value.
git_commits_changes.loc[git_commits_changes['linesAdded'].isna()]

Unnamed: 0,projectID,commitHash,oldPath,newPath,changeType,diff,linesAdded,linesRemoved,nloc,complexity,tokenCount,methods


In [44]:
# Number of distinct commit hashes present in the per-file change table.
len(git_commits_changes['commitHash'].unique())

128279

## FILTERING COLUMNS

-------------------------------------------------------------------------------------------------------------------------------

In [45]:
# Commit hash -> committer date lookup, joined onto the per-commit table later on.
git_dates = git_commits.loc[:, ['commitHash', 'committerDate']]

In [46]:
# Per-commit aggregation spec: total lines added, total lines removed, and the
# number of non-null projectID values in the group (one per change row).
agg = dict(
    linesAdded=['sum'],
    linesRemoved=['sum'],
    projectID=['count'],
)

# Collapse the per-file change rows into one row per (projectID, commitHash).
gcg_by_commit = (
    git_commits_changes
    .groupby(['projectID', 'commitHash'])
    .agg(agg)
)

In [47]:
# Row count after grouping: one row per (projectID, commitHash) pair.
gcg_by_commit.shape[0]

128279

In [48]:
# Move the (projectID, commitHash) group keys out of the index and back into
# ordinary columns.
gcg_by_commit = gcg_by_commit.reset_index()

In [49]:
# Replace the column labels with flat names, in positional order: the two group
# keys followed by the three aggregates. 'entropylike' is the per-commit count
# produced by the 'projectID' -> 'count' entry of the aggregation spec.
gcg_by_commit.columns = ['projectID', 'commitHash', 'lines_added', 'lines_removed', 'entropylike']

In [50]:
# Attach each commit's committer date; commits with no match in git_dates are
# dropped by the inner join.
gcg_by_commit = gcg_by_commit.merge(git_dates, how='inner', on='commitHash')

In [51]:
# Order the commits by project and then by committer date.
# NOTE(review): the CSV is read without date parsing, so committerDate is
# presumably compared as text here; confirm its format sorts chronologically.
gcg_by_commit = gcg_by_commit.sort_values(['projectID', 'committerDate'])

In [52]:
# Running net size of each project: cumulative lines added minus cumulative
# lines removed, accumulated in the current row order and restarted for every
# project. The frame must already be sorted by project and committer date.
#
# A per-project cumulative sum replaces the earlier row-by-row iterrows() loop,
# which was slow on a table of this size and seeded its state with a hard-coded
# project name.
per_project_cumsum = (
    gcg_by_commit
    .groupby('projectID', sort=False)[['lines_added', 'lines_removed']]
    .cumsum()
)

# `total_lines` stays available as a standalone name; it is index-aligned with
# gcg_by_commit, so the column assignment below matches rows correctly.
total_lines = per_project_cumsum['lines_added'] - per_project_cumsum['lines_removed']
gcg_by_commit['total_lines'] = total_lines

In [53]:
# Keep only rows whose running project size is non-negative; this avoids
# 2 lines of wrong data in the commons-cli project.
gcg_by_commit = gcg_by_commit.loc[gcg_by_commit['total_lines'].ge(0)]

In [54]:
# Share of the project's running size that this commit added.
# assign() builds a new frame instead of writing a column into the filtered
# slice produced by the previous cell, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
# Rows with total_lines == 0 yield inf (or NaN for 0/0) here.
gcg_by_commit = gcg_by_commit.assign(
    **{'added/total_lines': gcg_by_commit['lines_added'] / gcg_by_commit['total_lines']}
)

In [55]:
# Keep only rows where the commit added no more lines than the running project
# size; this avoids 1 line of wrong data in the commons-cli project.
# inf and NaN ratios fail the comparison and are removed as well.
gcg_by_commit = gcg_by_commit.loc[gcg_by_commit['added/total_lines'].le(1)]

In [56]:
# Reduce the per-commit table to the join key and the two engineered features.
gcg_by_commit = gcg_by_commit.loc[:, ['commitHash', 'entropylike', 'added/total_lines']]

In [57]:
# JIRA issues of type 'Bug', reduced to the issue key and its priority label.
jira_bugs = jira_issues.loc[jira_issues['type'] == 'Bug', ['key', 'priority']]

In [58]:
# Keep only the fault-inducing commit hash and the JIRA key it is linked to,
# renaming the hash column to `commitHash` so it can serve as a join key.
# NOTE(review): the preview of this table shows the same (commitHash, key) pair
# on consecutive rows, so duplicate pairs may be carried into later merges;
# confirm whether they should be dropped.
szz_fault_inducing_commits = (
    szz_fault_inducing_commits
    .loc[:, ['faultInducingCommitHash', 'key']]
    .rename(columns={'faultInducingCommitHash': 'commitHash'})
)
szz_fault_inducing_commits.head()

Unnamed: 0,commitHash,key
0,49cbb142a2b5d7d89aab077dc63f7646828c9408,DAEMON-370
1,43d485a0e99c613daffba7d5b410e8e4a302e4f7,DAEMON-370
2,4f232e47002359f1a8156b5f03b227c7bae9874a,DAEMON-370
3,4f232e47002359f1a8156b5f03b227c7bae9874a,DAEMON-370
4,c880c8ebad0e83a068301d0bbf8c76be36a59962,BEAM-5145


In [59]:
# Pair every fault-inducing commit with the priority of the issue it is linked
# to; keys that are absent from jira_bugs are dropped by the inner join.
Y = szz_fault_inducing_commits.merge(jira_bugs, how='inner', on='key')

In [60]:
def priorityToCategory(p: str):
    """Translate a priority label into its integer class code.

    Codes: 'No bug' -> 0, 'Trivial' -> 1, 'Minor' -> 2, 'Blocker' -> 3,
    'Major' -> 4, 'Critical' -> 5.

    Any other value (an unlisted label, None, NaN) yields None.

    NOTE(review): 'Blocker' is coded below 'Major' and 'Critical', so anything
    that ranks bugs by this code treats a blocker as less severe than those
    two; confirm that this ordering is intended.
    """
    codes = {
        'No bug': 0,
        'Trivial': 1,
        'Minor': 2,
        'Blocker': 3,
        'Major': 4,
        'Critical': 5,
    }
    return codes.get(p)


# Replace every priority label with its integer class code; labels that
# priorityToCategory does not recognise come back as missing values.
Y['priority'] = Y['priority'].apply(priorityToCategory)

In [61]:
# The target table only needs the join key and the encoded priority.
Y = Y.loc[:, ['commitHash', 'priority']]

In [62]:
# True  -> keep every bug linked to a commit (several targets per commit).
# False -> keep a single target per commit.
multitarget = True

In [63]:
# Single-target mode: collapse to one row per commit, keeping the largest
# priority code among the bugs linked to it. groupby already orders its
# output by commitHash, so no separate sort is needed.
if not multitarget:
    Y = Y.groupby('commitHash', as_index=False).max()

In [64]:
# Keep only the join key and the two commit-level flags used as features.
git_commits = git_commits.loc[:, ['commitHash', 'inMainBranch', 'merge']]

In [65]:
# Remove the SonarQube columns that are excluded from the feature table.
#
# The frame is rebound rather than mutated with inplace=True, and
# errors='ignore' lets the cell be executed again without raising KeyError
# for columns that an earlier run already removed. Trade-off: a misspelled
# name in this list would be skipped silently, so edit it with care.
sonar_columns_to_drop = [
    'projectID',
    'SQAnalysisDate',
    'functionComplexityDistribution',
    'fileComplexityDistribution',
    'lastCommitDate',
    'nclocLanguageDistribution',
    'alertStatus',
    'qualityGateDetails',
    'qualityProfiles',
    'files',
]
sonar_measures = sonar_measures.drop(columns=sonar_columns_to_drop, errors='ignore')

In [66]:
# Commit-level flags joined with the SonarQube measures of the same commit;
# only commits present in both tables survive the inner join.
X = git_commits.merge(sonar_measures, how='inner', on='commitHash')

In [67]:
# Add the per-commit change features; commits without them are dropped.
X2 = X.merge(gcg_by_commit, how='inner', on='commitHash')

In [68]:
# Attach the targets. The left join keeps every feature row; commits with no
# entry in Y get a missing priority, and commits with several entries are
# repeated once per entry.
df = X2.merge(Y, how='left', on='commitHash')

In [69]:
# A missing priority is encoded as class 0.
# NOTE(review): besides commits that have no linked bug, this also assigns 0
# to rows whose priority label was not recognised by priorityToCategory
# (it returns None for those); confirm that this is intended.
df = df.assign(priority=df['priority'].fillna(0))

In [70]:
# Impute the remaining missing values with the mean of their column.
# numeric_only=True is required because the frame still holds non-numeric
# columns such as commitHash: recent pandas versions raise TypeError when asked
# for the mean of those, while older versions dropped them with a warning.
# Non-numeric columns are left untouched by the fill.
df = df.fillna(df.mean(numeric_only=True))

In [71]:
# Write the assembled dataset to disk. Only the multi-target variant is
# exported; the single-target export (formerly written to
# '../data/processed/bugs-singletarget_with_mean.csv') is currently disabled,
# so nothing is saved when multitarget is False.
if multitarget:
    df.to_csv('../data/processed/bugs-multitarget.csv', index=False)