# **Mining Task**
### By Mohammadreza Saeidi

Using Jira, this project collects all Bug Reports (BR) submitted. It then extracts all commits that add lines to the source code (not modify or delete lines). Finally, it determines corresponding Bug Fixing Changes for each BR.

Some of the techniques used in this project (e.g., the regex and the JQL query for Jira) are inspired by Borg et al.'s paper [1].

## Reference:
> [1] M. Borg, O. Svensson, K. Berg, and D. Hansson, “SZZ Unleashed: An Open Implementation of the SZZ Algorithm - Featuring Example Usage in a Study of Just-in-Time Bug Prediction for the Jenkins Project,” in *Proceedings of the 3rd ACM SIGSOFT International Workshop on Machine Learning Techniques for Software Quality Evaluation*, 2019, pp. 7–12. [doi:10.1145/3340482.3342742](https://arxiv.org/abs/1903.01742)




In [None]:
# Using PyDriller library to retrieve commits
!pip install pydriller

# Using Jira library to retrieve Issues
!pip install jira

In [2]:
# Configurations
# URL of the Git repository whose commits are mined
GIT_URL = "https://github.com/apache/activemq.git"
# Only commits on this branch are traversed
GIT_BRANCH_NAME = "main"
# Jira server that hosts the bug reports
JIRA_URL = "https://issues.apache.org/jira/"
# Number of issues requested from Jira per search call
JIRA_MAX_RESULT_COUNT = 1000
# Project name used in the "project = ..." clause of the JQL query
PROJECT_NAME = "ActiveMQ"
# Issue-key prefix (as in AMQ-<number>) searched for in commit messages
PROJECT_BRIEF_NAME = "AMQ"
#Obtained from [1]
ISSUE_TYPES = ["Bug"]
ISSUE_STATUS = ["Resolved", "Closed"]
ISSUE_RESOLUTION = ["Fixed"]

In [3]:
import pandas as pd
import dateutil.parser
from datetime import datetime
import re

# Retrieving Bug Reports from Jira

In [None]:
from jira import JIRA
def fetch_issues(jira_server_url:str, jql:str, max_results = None, jira_client = None) -> pd.DataFrame:
  """
  Retrieve every issue matching a JQL query from a Jira server.

  Issues are downloaded page by page; each page starts at the number of
  issues already received, so a server that returns fewer issues than
  requested is still handled correctly.

  Parameters:
    jira_server_url: URL of the Jira server (ignored when jira_client is given).
    jql: the JQL query selecting the issues.
    max_results: page size requested per call; defaults to JIRA_MAX_RESULT_COUNT.
    jira_client: optional, already constructed (e.g. authenticated) client
      exposing search_issues(); a new anonymous JIRA client is created when omitted.

  Returns:
    pd.DataFrame with the columns issue_id, issue_key, issue_created_date,
    issue_summary and issue_description, one row per issue, in the order
    returned by the server.
  """
  columns = [
      'issue_id',
      'issue_key',
      'issue_created_date',
      'issue_summary',
      'issue_description'
      ]

  if max_results is None:
    max_results = JIRA_MAX_RESULT_COUNT

  jira_server = jira_client if jira_client is not None else JIRA(server = jira_server_url)

  # Rows are collected in a plain list and converted once at the end:
  # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
  rows = []
  iteration = 0

  # Retrieve Issues iteratively, max_results issues in each iteration
  while True:
    iteration += 1
    print(f"Fetching Issues: iteraation #{iteration}")

    result = jira_server.search_issues(
        jql_str = jql,
        fields = ['created', 'summary', 'description'],
        startAt = len(rows),
        maxResults = max_results
        )

    for issue in result:
      rows.append({
          'issue_id' : issue.id,
          'issue_key' : issue.key,
          'issue_created_date': dateutil.parser.isoparse(issue.get_field('created')),
          'issue_summary' : issue.get_field('summary'),
          'issue_description': issue.get_field('description')
          })

    print(f"{len(result)} new Issues Recieved, All fetched issues: {len(rows)} out of {result.total}")

    # Breaking the loop when all issues have been retrieved
    if len(rows) >= result.total:
      print("All issues have been recieved.")
      break

    # An empty page before reaching the total means no further progress is
    # possible (e.g. the result set shrank meanwhile); stop instead of looping forever.
    if len(result) == 0:
      print("No more issues were returned by the server; stopping early.")
      break

  return pd.DataFrame(rows, columns = columns)


def create_jql(project_name:str, issue_types:list, issue_status:list, issue_resolution:list) -> str:
  """
  Build a JQL query that can be used for retrieving issues from Jira.

  Every non-empty argument contributes one clause; the clauses are combined
  with AND and the result is ordered by creation date, newest first.
  Values that are not plain words (e.g. "New Feature", "Won't Fix") are
  wrapped in double quotes so that the query stays valid.

  Parameters:
    project_name: Jira project name or key; skipped when empty.
    issue_types: accepted issue types; skipped when empty.
    issue_status: accepted statuses; skipped when empty.
    issue_resolution: accepted resolutions; skipped when empty.

  Returns:
    The JQL string, or an empty string when every argument is empty.
  """
  def quote(value) -> str:
    # Plain words are emitted unchanged; anything else becomes a quoted JQL string
    text = str(value)
    if text and all(ch.isalnum() or ch == '_' for ch in text):
      return text
    escaped = text.replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'

  clauses = []

  if project_name:
    clauses.append(f"project = {quote(project_name)}")

  list_filters = (
      ('issuetype', issue_types),
      ('status', issue_status),
      ('resolution', issue_resolution),
      )
  for field, values in list_filters:
    if values:
      clauses.append(f"{field} in ({','.join(quote(value) for value in values)})")

  # No filter at all yields an empty query, without the ORDER BY suffix
  if not clauses:
    return ""

  return " AND ".join(clauses) + " ORDER BY created DESC"

# Build the query from the configured filters, then download every matching bug report
jql = create_jql(
    project_name = PROJECT_NAME,
    issue_types = ISSUE_TYPES,
    issue_status = ISSUE_STATUS,
    issue_resolution = ISSUE_RESOLUTION,
    )
all_issues = fetch_issues(jira_server_url = JIRA_URL, jql = jql)

# Retrieving Commits from GitHub

In [None]:
from pydriller import Repository
def fetch_commits(git_url:str, start_date:datetime, branch_name = None) -> pd.DataFrame:
  """
  Retrieve the commits of a repository that only add lines.

  A commit is kept when (1) it has been submitted after start_date,
  (2) its message does not mention "merge", "cherry" or "nothing"
  (case-insensitive), and (3) it adds at least one line and deletes none.

  Parameters:
    git_url: path or URL of the Git repository.
    start_date: only commits after this date are traversed.
    branch_name: branch to traverse; defaults to GIT_BRANCH_NAME.

  Returns:
    pd.DataFrame with the columns commit_hash and commit_message,
    one row per kept commit, in traversal order.
  """
  columns = ['commit_hash', 'commit_message']

  if branch_name is None:
    branch_name = GIT_BRANCH_NAME

  repository = Repository(path_to_repo = git_url, since = start_date, only_in_branch = branch_name, order = 'reverse')
  excluding_pattern = re.compile("merge|cherry|nothing", flags = re.IGNORECASE)

  # Rows are collected in a plain list and converted once at the end:
  # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
  rows = []

  for commit in repository.traverse_commits():

    # Eliminating commits that belong to a merge request, cherry-pick, or nothing commit
    if excluding_pattern.search(commit.msg):
      continue

    added_lines_count = 0
    has_deleted_lines = False

    # Summing the added lines over the modified files (read once per commit);
    # a single deleted line already disqualifies the commit, so stop there
    for file in commit.modified_files:
      if file.deleted_lines > 0:
        has_deleted_lines = True
        break
      added_lines_count += file.added_lines

    # Eliminating commits that have deleted lines or have no added lines
    if has_deleted_lines or added_lines_count == 0:
      continue

    rows.append({
        'commit_hash' : commit.hash,
        'commit_message' : commit.msg
        })

    print(f"{len(rows)} commits were added")

  print("All commits have been recieved")
  return pd.DataFrame(rows, columns = columns)

# Only commits submitted after the oldest bug report are retrieved.
# The last row of all_issues is taken as the oldest report.
oldest_bug_report = all_issues['issue_created_date'].iloc[-1]
all_commits = fetch_commits(git_url = GIT_URL, start_date = oldest_bug_report)

# Merging BRs with their corresponding BFCs

In [None]:
def merge_issues_commits(issues:pd.DataFrame, commits:pd.DataFrame, project_key = None) -> pd.DataFrame:
  """
  Link each Bug Report (BR) to its corresponding Bug Fixing Changes (BFCs).

  A commit is linked to an issue when its message mentions the issue number
  either after the project key (optionally separated by '-', '_' or
  whitespace) or after '#', and the number is not followed by another digit.
  Matching is case-insensitive.

  Parameters:
    issues: frame with at least the columns issue_key, issue_summary
      and issue_description; issue_key must look like "<KEY>-<number>".
    commits: frame with the columns commit_hash and commit_message.
    project_key: issue-key prefix to look for; defaults to PROJECT_BRIEF_NAME.

  Returns:
    pd.DataFrame with the columns bug_id, bug_summery, bug_description,
    bfc_hash and bfc_message, one row per (issue, commit) link.
  """
  columns = ['bug_id', 'bug_summery', 'bug_description', 'bfc_hash', 'bfc_message']

  if project_key is None:
    project_key = PROJECT_BRIEF_NAME
  # The key is inserted into a regex, so any special character must be neutralised
  key_prefix = re.escape(project_key)

  # Extracted once so the inner loop works on plain Python values;
  # non-string messages (e.g. NaN) can never match and are dropped here
  candidates = [
      (commit_hash, commit_message)
      for commit_hash, commit_message in zip(commits['commit_hash'], commits['commit_message'])
      if isinstance(commit_message, str)
      ]

  # Rows are collected in a plain list and converted once at the end:
  # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
  rows = []

  for issue in issues.itertuples(index = False):
    print(f"Searching for {issue.issue_key}'s commits")

    num = issue.issue_key.split('-')[1]
    # Using Regex to find BFCs [1]
    pattern = re.compile(rf"(?:{key_prefix}(?:-|_|\s)*|#){num}(?:\D|$)", flags = re.IGNORECASE)

    for commit_hash, commit_message in candidates:
      if pattern.search(commit_message):
        rows.append({
            'bug_id' : issue.issue_key,
            'bug_summery' : issue.issue_summary,
            'bug_description' : issue.issue_description,
            'bfc_hash' : commit_hash,
            'bfc_message' : commit_message
            })

  return pd.DataFrame(rows, columns = columns)

# Link every fetched bug report to the commits whose messages reference it
merged_issues_commits = merge_issues_commits(
    issues = all_issues,
    commits = all_commits,
    )

# Storing the resulting dataset in a CSV file

In [10]:
# Write the BR/BFC links to BRs_and_BFCs.csv in the current working directory
merged_issues_commits.to_csv("BRs_and_BFCs.csv")

In [None]:
# Display the resulting dataset as the cell output
merged_issues_commits