# Imports

In [None]:
import requests  # To get the data
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from pymongo import MongoClient  # Database to store the data
import json  # File IO
from time import time  # To time the duration of the requests
from time import sleep
from IPython.display import display, clear_output
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Load Program Data

In [None]:
# Read the Jira source definitions from the shared JSON definition file
with open('../0. DataDefinition/jira_data_sources.json') as f:
    jira_data_sources = json.loads(f.read())

# Handle to the MongoDB database used by the download cells
db = MongoClient()['JiraRepos']

# Jiras that are now unreachable or empty; the cells below skip them by name
INVALID_JIRAS = ['Mindville', 'MariaDB']

# Investigate Jira Data Accessibility

In [None]:
def check_jira_url(jira_url):
    """Print a short accessibility report for one Jira server.

    Three checks run in order and the function returns as soon as one fails:

    1. the base URL can be contacted with a HEAD request,
    2. ``/rest/api/2/issuetype`` answers with a status code below 300,
    3. ``/rest/api/2/search`` answers with a status code below 300, in which
       case the ``total`` issue count from the JSON body is printed.

    Args:
        jira_url: Base URL of the Jira server; the REST paths are appended to
            it as-is.

    Returns:
        None. Every result is reported through ``print``.
    """

    def api_call_succeeded(response):
        """Return True for a status below 300; otherwise print the failure details."""
        if response.status_code < 300:
            return True
        print(response.status_code)
        print(response.text)
        print(response.url)
        print('❌ Jira API did not return a successful response')
        return False

    print('')
    print(f"💡 Check Jira: {jira_url}")
    print('')
    ## CHECK PROVIDED JIRA URL AVAILABILITY ##
    print(f'Checking Jira url existence with HEAD: {jira_url}')
    try:
        requests.head(jira_url)
    # requests raises its own ConnectionError class, which does not derive from
    # the builtin ConnectionError, so the builtin name would never match here.
    except requests.exceptions.ConnectionError:
        print('❌ Provided Jira base url does not exist')
        return
    else:
        print('✅ Provided Jira base url is reachable')

    ## CHECK PROVIDED JIRA URL API AVAILABILITY ##
    response = requests.get(jira_url + '/rest/api/2/issuetype')
    print('')
    print(f'Checking Jira api with GET: {response.url}')
    if not api_call_succeeded(response):
        return
    print('✅ Jira API returned a successful response')

    ## CHECK NUMBER OF ISSUES ##
    # maxResults=0 asks only for the search metadata (including the total), not the issues
    response = requests.get(jira_url + '/rest/api/2/search?jql=&maxResults=0')
    print('')
    print(f"Retrieving total issue count with GET: {response.url}")
    if not api_call_succeeded(response):
        return
    print(f"Total Number of Issues: {response.json()['total']}")
    print('✅ Jira API returned a successful response')

In [None]:
# Run the accessibility report for every configured Jira, leaving out the
# ones listed in INVALID_JIRAS (known to be unreachable or empty)
for jira_name, jira_obj in jira_data_sources.items():
    if jira_name not in INVALID_JIRAS:
        check_jira_url(jira_obj['jira_url'])

# Download Jira Data

### Helper Functions

In [None]:
def format_duration(start_time, end_time):
    """Format the span between two timestamps (in seconds) as ``HH:MM:SS.ffff``.

    The trailing four digits are the fractional second expressed in
    ten-thousandths, truncated rather than rounded.
    """
    elapsed = end_time - start_time
    # Whole minutes elapsed; hours are derived from this truncated value
    whole_minutes = int(elapsed / 60)
    parts = (
        int(whole_minutes / 60),     # hours (not wrapped at 24)
        int(whole_minutes % 60),     # minutes within the hour
        int(elapsed % 60),           # seconds within the minute
        int((elapsed % 1) * 10000),  # fractional second, four digits
    )
    return "{:02}:{:02}:{:02}.{:04}".format(*parts)

### Download Jira Issue Type Information

In [None]:
# Issue type definitions per Jira, keyed first by Jira name and then by issue type name
output_json = {}

for jira_name, jira_data in jira_data_sources.items():
    # Only query Jiras that are not known to be unreachable or empty
    if jira_name not in INVALID_JIRAS:
        issuetype_response = requests.get(jira_data['jira_url'] + '/rest/api/2/issuetype')
        output_json[jira_name] = {
            issuetype['name']: issuetype
            for issuetype in issuetype_response.json()
        }

# Persist the collected definitions as UTF-8 JSON
with open('jira_issuetype_information.json', 'w', encoding='utf-8') as json_file:
    json.dump(output_json, json_file, ensure_ascii=False, indent=4)

### Download Jira Issue Link Type Information

In [None]:
# Issue link type definitions per Jira, keyed first by Jira name and then by link type name
output_json = {}

for jira_name, jira_data in jira_data_sources.items():
    # Only query Jiras that are not known to be unreachable or empty
    if jira_name not in INVALID_JIRAS:
        issuelinktype_response = requests.get(jira_data['jira_url'] + '/rest/api/2/issueLinkType')
        # The definitions are read from the "issueLinkTypes" key of the response body
        output_json[jira_name] = {
            issuelinktype['name']: issuelinktype
            for issuelinktype in issuelinktype_response.json()['issueLinkTypes']
        }

# Persist the collected definitions as UTF-8 JSON
with open('jira_issuelinktype_information.json', 'w', encoding='utf-8') as json_file:
    json.dump(output_json, json_file, ensure_ascii=False, indent=4)

### Download Jira Issue Field Information

In [None]:
import requests
from requests.auth import HTTPBasicAuth
import json

# Raw /rest/api/2/field response per Jira, keyed by Jira name
jiras_fields_information = {}

for jira_name, jira_data in jira_data_sources.items():
    # Only query Jiras that are not known to be unreachable or empty
    if jira_name not in INVALID_JIRAS:
        field_url = f"{jira_data['jira_url']}/rest/api/2/field"
        jiras_fields_information[jira_name] = requests.get(field_url).json()

# Keep the field information on disk for later use
with open('jira_field_information.json', 'w', encoding='utf-8') as json_file:
    json.dump(jiras_fields_information, json_file, ensure_ascii=False, indent=4)

### Download Jira Data Commands

In [None]:
def download_and_write_data_mongo(
    jira_data_source,
    num_desired_results = None,  # Leave as "None" to download all, otherwise specify a number
    iteration_max = 250,  # Recommended to keep at or below 500
    start_index = 0,  # This allows you to start back up from a different place
    ):
    """Download issues (with changelog) from one Jira server into MongoDB.

    Issues are requested page by page from ``/rest/api/2/search``, buffered in
    memory and written with ``insert_many`` to ``db[jira_data_source['name']]``
    whenever at least 10000 issues are buffered, when nothing remains to be
    fetched, or when a page comes back without issues.

    Args:
        jira_data_source: Mapping providing at least ``'name'`` (the MongoDB
            collection name) and ``'jira_url'`` (the server base URL).
        num_desired_results: Cap on the number of results; a falsy value
            (``None`` or ``0``) means "everything the server reports".
        iteration_max: Maximum number of issues requested per call.
        start_index: ``startAt`` offset of the first request; it is also
            subtracted from the number of results still to fetch.

    Returns:
        None. Progress is printed; issues are stored in MongoDB.
    """

    def build_url(base_url, start_index, iteration_max=100):
        # The ORDER BY clause must be part of the jql parameter itself. Sending
        # it as a separate "&ORDER BY ..." parameter leaves the JQL empty, so
        # the requested ordering is never applied and paging is not anchored
        # to a stable order.
        return (
            base_url +
            f"/rest/api/2/search?"
            f"jql=ORDER%20BY%20created%20ASC"
            f"&startAt={start_index}"
            f"&maxResults={iteration_max}"
            f"&expand=changelog"
            )

    collection = db[jira_data_source['name']]

    # iteration_max is the number of issues the script will attempt to get at one time.
    # The Jira default max is 1000. Trying with 1000 consistently returned errors after a short while
    # as the object being returned was likely too large. Values of 500 or less serve no particular issue
    # to the script except that more calls (of smaller size) have to be made.

    # How many issues to collect before writing to MongoDB
    num_issues_per_write = 10000

    issues = []

    # Available and requested number of results (maxResults=0 fetches only the metadata)
    num_available_results = requests.get(build_url(jira_data_source['jira_url'], 0, 0)).json()['total']
    print(f'Number of Desired Results   : {num_desired_results if num_desired_results else "All"}')
    print(f'Number of Available Results : {num_available_results}')
    print('')

    # Set the number of results to retrieve based on information from Jira server
    if not num_desired_results:
        num_remaining_results = num_available_results
    else:
        num_remaining_results = min(int(num_desired_results), num_available_results)
    # Adjust remaining results based on their start index
    num_remaining_results -= start_index

    # Collect results while there are more results to gather
    issues_downloaded = 0
    max_count_width = len(str(num_remaining_results)) + 1
    print(f"Total Remaining:{num_remaining_results:< {max_count_width}}")
    while num_remaining_results > 0:

        # Start a timer for this particular chunk
        start_time = time()

        # Number of items to retrieve
        num_items_to_retrieve = min(iteration_max, num_remaining_results)

        # Get issues from Jira
        url = build_url(jira_data_source['jira_url'], start_index, num_items_to_retrieve)
        response_json = requests.get(url).json()
        # A response without an "issues" key counts as zero returned issues.
        # Otherwise the count would be undefined on the first page, or reuse
        # the previous page's value and silently skip a page later on.
        returned_issues = response_json.get('issues', [])
        issues.extend(returned_issues)
        num_returned_issues = len(returned_issues)

        # Adjust the remaining results to get
        num_remaining_results -= num_returned_issues

        # Print progress for user
        end_index = start_index + num_returned_issues - 1
        print(
            f"Total Remaining:{num_remaining_results:< {max_count_width}}  "
            f"Retrieved Items: {start_index:< {max_count_width}} - {end_index:< {max_count_width}}  "
            f"Duration: {format_duration(start_time, time())}")

        # Move the start index
        start_index += num_returned_issues

        # Write the issues to the database IF there are enough of them. This is a nice way to save state and
        # start over at a certain place if there are too many to download in one go.
        if len(issues) >= num_issues_per_write or num_remaining_results == 0 or num_returned_issues == 0:
            # insert_many rejects an empty list, so only write when something is buffered
            if issues:
                collection.insert_many(issues)
                print('... Issues written to database ...')

                issues_downloaded += len(issues)
                issues = []  # Clear the issues so that our memory doesn't get too full

        # If we have for some reason run out of results, we may want to react to this in some way
        if num_returned_issues == 0:
            print('Number of Returned Issues is 0. This is strange and should not happen. Investigate.')
            # Everything before start_index has been flushed above, so this is the resume point
            print(f'Issues before index {start_index} are stored; resume with start_index={start_index}.')
            return

    print('')
    print(f"Number of Downloaded Issues: {issues_downloaded}")

In [None]:
# One cell per Jira repository. Each call uses the defaults of
# download_and_write_data_mongo, i.e. it requests every available issue and
# stores the results in the MongoDB collection named by the source's 'name'.
# The "Last download time" notes record how long a previous run took.
# Last download time: 6h 47m
download_and_write_data_mongo(jira_data_sources['Apache'])

In [None]:
# Last download time: 0h 27m
download_and_write_data_mongo(jira_data_sources['Hyperledger'])

In [None]:
# Last download time: 0h 5m
download_and_write_data_mongo(jira_data_sources['IntelDAOS'])

In [None]:
# Last download time: 0h 12m
download_and_write_data_mongo(jira_data_sources['JFrog'])

In [None]:
# Last download time: 6h 26m
download_and_write_data_mongo(jira_data_sources['Jira'])

In [None]:
# Last download time: 0h 30m
download_and_write_data_mongo(jira_data_sources['JiraEcosystem'])

In [None]:
# Disabled: 'MariaDB' is listed in INVALID_JIRAS
# download_and_write_data_mongo(jira_data_sources['MariaDB'])

In [None]:
# Disabled: 'Mindville' is listed in INVALID_JIRAS
# download_and_write_data_mongo(jira_data_sources['Mindville'])

In [None]:
# Last download time: 1h 26m
download_and_write_data_mongo(jira_data_sources['Mojang'])

In [None]:
# Last download time: 3h 23m
download_and_write_data_mongo(jira_data_sources['MongoDB'])

In [None]:
# Last download time: 0h 50m
download_and_write_data_mongo(jira_data_sources['Qt'])

In [None]:
# Last download time: 3h 58m
download_and_write_data_mongo(jira_data_sources['RedHat'])

In [None]:
# Last download time: 0h 24m
download_and_write_data_mongo(jira_data_sources['Sakai'])

In [None]:
# Last download time: 1h 25m
download_and_write_data_mongo(jira_data_sources['SecondLife'])

In [None]:
# Last download time: 1h 25m
download_and_write_data_mongo(jira_data_sources['Sonatype'])

In [None]:
# Last download time: 0h 20m
download_and_write_data_mongo(jira_data_sources['Spring'])

### Download Jira Issue Comments

In [None]:
def download_jira_issue_comments(jira_data_source, *, max_comments_per_query=50, resume_at_date='0', query_wait_time_minutes=None):
    """Download the comments of every stored issue of one Jira and save them in MongoDB.

    Issues in ``db[jira_data_source['name']]`` whose ``fields.comments`` is
    still unset are processed in order of ``fields.created``. For each issue
    the comment endpoint is paged until all comments are collected, and the
    result is written to the issue document under ``fields.comments``. Issues
    answering with one of the known "not accessible" error messages are stored
    with whatever comments were collected up to that point (usually none).

    Args:
        jira_data_source: Mapping providing at least ``'name'`` (the MongoDB
            collection name) and ``'jira_url'`` (the server base URL).
        max_comments_per_query: Page size sent as ``maxResults``.
        resume_at_date: Only issues whose ``fields.created`` compares greater
            than or equal to this value are processed.
        query_wait_time_minutes: Pause after each comment request. NOTE(review):
            despite the name, the value is passed unchanged to ``sleep()``,
            which takes seconds. Confirm the intended unit before renaming
            or scaling.

    Raises:
        Exception: If the server returns ``errorMessages`` that are not among
            the accepted ones.
    """

    def build_url(base_url, issue_key, comments_start_index=0, max_comments_per_query=max_comments_per_query):
        return (
            base_url +
            f"/rest/api/2/issue/{issue_key}/comment"
            f"?orderBy=created"
            f"&startAt={comments_start_index}"
            f"&maxResults={max_comments_per_query}"
            )

    # Ignore Jiras that we know are now unreachable or empty
    if jira_data_source['name'] in INVALID_JIRAS:
        print(f"Cannot download comments for {jira_data_source['name']} due to innaccessible Jira repo.")
        return

    acceptable_failure_messages = [
        'Login Required',
        'Issue Does Not Exist',
        'Issue does not exist or you do not have permission to see it.',
    ]

    progress_bar_num_chunks = 100

    collection = db[jira_data_source['name']]

    print('Querying MongoDB for list of remaining issue keys to get comments for ...')

    # The data is downloaded per issue key, so we must get a complete list of all issue keys
    # This could be kept as a generator (by removing the explicit "list()" operator), but then we have to query the dataset twice.
    # This is a space-time tradeoff, and I have chosen time.
    jira_issue_keys = list(collection.aggregate([
        # Only get issues without comments already
        { '$match': { 'fields.comments': { '$eq': None } } },
        # We only need the issue id, key, created date
        { '$project': { 'key': 1, 'created': '$fields.created' } },
        # Only get issues created at or after our "resume" date
        { '$match': { 'created': { '$gte': resume_at_date } } },
        # Sort the results by created date so we can resume if failure occurs
        { '$sort': { 'created': 1 } },
    ], allowDiskUse=True))

    # Get the total count of issues so we can get an understanding of progress
    jira_query_issue_count = len(jira_issue_keys)
    num_issues_complete = 0

    print(f"There are {jira_query_issue_count} remaining issues.")

    # The session is used as a context manager so its connections are released
    # even when the download aborts with an exception.
    with requests.Session() as session:
        # Configure retries to make the requests more stable
        retry = Retry(total=4, connect=4, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        # Loop through the issue keys, downloading the comments one at a time
        for issue in jira_issue_keys:

            issue_comments = []
            comments_index = 0

            # Loop through comment downloads until all are downloaded. We don't know how many comments exist
            # until we ask for the first N, so we need a while loop
            while True:

                # Get the issue comments
                url = build_url(jira_data_source['jira_url'], issue['key'], comments_start_index=comments_index)
                # NOTE(security): verify=False disables TLS certificate verification for this request.
                response = session.get(url, verify=False)
                response_json = response.json()

                # Check if the response is valid. If not, we skip to the next issue. Some issues are private, etc., so we skip them
                if 'errorMessages' in response_json:
                    # Here are the error messages we may run into that we simply skip. We don't want to skip all error messages,
                    # so we only check a few here and otherwise break the script to investigate.
                    if any(message in response_json['errorMessages'] for message in acceptable_failure_messages):
                        break  # Break the while loop collecting comments for this issue, and move on to the next issue
                    # Otherwise ...
                    print(f"\nWas working on {issue['key']} with creation date {issue['created']} ({num_issues_complete:,} / {jira_query_issue_count:,})")
                    print('\nLast response json:')
                    display(response_json)
                    raise Exception('Not sure why the network request has failed.')

                # Save this round of issue comments
                page_comments = response_json['comments']
                issue_comments.extend(page_comments)
                comments_index = len(issue_comments)

                # Wait a little as to not overload the number of requests being sent
                if query_wait_time_minutes:
                    sleep(query_wait_time_minutes)

                # Stop once everything is collected. An empty page also ends the loop: if the server
                # reports a larger total than it actually returns, an equality-only check would
                # request the same page forever.
                if not page_comments or comments_index >= response_json['total']:
                    break

            # Write all of the comments to the MongoDB Jira issue at once
            collection.update_one(
                {'_id': issue['_id']},
                { '$set': { 'fields.comments': issue_comments } }
            )

            num_issues_complete += 1

            # Output progress
            clear_output(wait=True)
            print(f"Jira: {jira_data_source['name']}")
            print(f"Number of issues to download comments from: {jira_query_issue_count:,}")
            print(f"resume_at_date: {resume_at_date}")
            print('')
            print(f"Last confirmed issue {issue['key']} with creation date {issue['created']} ({num_issues_complete:,} / {jira_query_issue_count:,})")
            print(f"Progress: [{'#'*round((num_issues_complete/jira_query_issue_count)*progress_bar_num_chunks):.<{progress_bar_num_chunks}}]")

In [None]:
# One cell per Jira repository. Each call fetches the comments for the issues
# of that repository already stored in MongoDB that have no comments yet.
# The "Last download time" notes record how long a previous run took.
# Last download time: Multiple weeks due to extreme rate-limiting.
# query_wait_time_minutes is handed unchanged to sleep() after every comment request.
download_jira_issue_comments(
    jira_data_sources['Apache'],
    query_wait_time_minutes=0.1
)

In [None]:
# Last download time: 1h 40m (~290/m)
download_jira_issue_comments(
    jira_data_sources['Hyperledger']
)

In [None]:
# Last download time: 3h 10m (~50/m)
download_jira_issue_comments(
    jira_data_sources['IntelDAOS']
)

In [None]:
# Last download time: 1h 20m (~200/m)
download_jira_issue_comments(
    jira_data_sources['JFrog']
)

In [None]:
# Last download time: 21h 20m (~215/m)
download_jira_issue_comments(
    jira_data_sources['Jira']
)

In [None]:
# Last download time: 6h 40m (~105/m)
download_jira_issue_comments(
    jira_data_sources['JiraEcosystem']
)

In [None]:
# Disabled: 'MariaDB' is listed in INVALID_JIRAS
# # Last download time: h m (/m)
# download_jira_issue_comments(
#     jira_data_sources['MariaDB']
# )

In [None]:
# Disabled: 'Mindville' is listed in INVALID_JIRAS
# # Last download time: h m (/m)
# download_jira_issue_comments(
#     jira_data_sources['Mindville']
# )

In [None]:
# Last download time: 14h 20m (~490/m)
download_jira_issue_comments(
    jira_data_sources['Mojang']
)

In [None]:
# Last download time: 18h 20m (~125/m)
download_jira_issue_comments(
    jira_data_sources['MongoDB']
)

In [None]:
# Last download time: 6h 10m (~400/m)
download_jira_issue_comments(
    jira_data_sources['Qt']
)

In [None]:
# Last download time: 18h 5m (~325/m)
download_jira_issue_comments(
    jira_data_sources['RedHat']
)

In [None]:
# Last download time: 16h 50m (~50/m)
download_jira_issue_comments(
    jira_data_sources['Sakai']
)

In [None]:
# Last download time: 0h 16m (~115/m)
download_jira_issue_comments(
    jira_data_sources['SecondLife']
)

In [None]:
# Last download time: 10h 50m (~135/m)
download_jira_issue_comments(
    jira_data_sources['Sonatype']
)

In [None]:
# Last download time: 8h 0m (~145/m)
download_jira_issue_comments(
    jira_data_sources['Spring']
)