# Collect data of issues in github repositories

In [109]:
import json
import time
import datetime

import pandas as pd
import numpy as np

import requests
import re
from collections import OrderedDict


### Gather data via GitHub's GraphQL

In [25]:
# # See how to get a token here: https://github.blog/2013-05-16-personal-api-tokens/
api_token = "<Your Token>"

headers = {'Authorization': 'token %s' % api_token}


In [26]:
def run_query(query): 
    ''' A simple function to use requests.post to make the API call. Note the json= section. '''
    
    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))
    

In [51]:
# Create a GraphQL query that we will use to collect data        
def create_query(repo_name, hasNextPage=True, afterCursor=None):
    
    if hasNextPage and afterCursor:
        strAfter = ' after:"{}",'.format(afterCursor)
    else:
        strAfter = ''
        
    query = '''
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      search(first: 50,%s type: ISSUE, query: "repo:%s is:issue") {
        issueCount
        pageInfo {
          hasNextPage
          endCursor
        }
        edges {
          node {
            ... on Issue {
              number
              title
              url
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              author {
                login
              }
              authorAssociation
              participants(first:50) {
                totalCount
                nodes {
                  login
                  company
                }
              }
              comments(first:50) {
                totalCount
                nodes {
                  author {
                    login
                  }
                  authorAssociation
                  createdAt
                  bodyText
                }
              }
              labels(first:50) {
                nodes {
                  name
                }
              }
              activeLockReason
            }
          }
        }
      }
    }
    ''' % (strAfter, repo_name)
    
    return query
    

In [32]:
query = create_query("bitovi/funcunit")
# Get a result of the test query
result = run_query(query) # Execute the query
remaining_rate_limit = result["data"]["rateLimit"]["remaining"] # Drill down the dictionary
print("Remaining rate limit - {}".format(remaining_rate_limit))

Remaining rate limit - 4999


In [33]:
result

{'data': {'rateLimit': {'cost': 1,
   'remaining': 4999,
   'resetAt': '2019-07-02T04:01:46Z'},
  'search': {'issueCount': 151,
   'pageInfo': {'hasNextPage': True, 'endCursor': 'Y3Vyc29yOjUw'},
   'edges': [{'node': {'number': 246,
      'title': 'invalid usage of delete operator',
      'url': 'https://github.com/bitovi/funcunit/issues/246',
      'createdAt': '2019-05-29T04:20:32Z',
      'updatedAt': '2019-05-30T17:31:52Z',
      'closedAt': None,
      'state': 'OPEN',
      'closed': False,
      'locked': False,
      'author': {'login': 'koalixCZ'},
      'authorAssociation': 'NONE',
      'participants': {'totalCount': 3,
       'nodes': [{'login': 'koalixCZ', 'company': None},
        {'login': 'matthewp', 'company': 'Bitovi'},
        {'login': 'chasenlehara', 'company': '@bitovi'}]},
      'comments': {'totalCount': 1,
       'nodes': [{'author': {'login': 'matthewp'},
         'authorAssociation': 'CONTRIBUTOR',
         'createdAt': '2019-05-29T10:22:22Z',
         'bodyT

In [57]:
hasNextPage = result2['data']['search']['pageInfo']['hasNextPage']
endCursor = result2['data']['search']['pageInfo']['endCursor']

In [58]:
query2 = create_query("bitovi/funcunit", hasNextPage=hasNextPage, afterCursor=endCursor)
# Get a result of the test query
result2 = run_query(query2) # Execute the query

In [59]:
result2

{'data': {'rateLimit': {'cost': 1, 'remaining': 4996, 'resetAt': '2019-07-02T04:07:17Z'}, 'search': {'issueCount': 151, 'pageInfo': {'hasNextPage': False, 'endCursor': 'Y3Vyc29yOjE1MQ=='}, 'edges': [{'node': {'number': 2, 'title': 'Syn Click in IE8', 'url': 'https://github.com/bitovi/funcunit/issues/2', 'createdAt': '2010-08-30T20:49:08Z', 'updatedAt': '2010-10-07T15:07:01Z', 'closedAt': '2010-10-07T15:07:01Z', 'state': 'CLOSED', 'closed': True, 'locked': False, 'author': {'login': 'arian'}, 'authorAssociation': 'NONE', 'participants': {'totalCount': 3, 'nodes': [{'login': 'arian', 'company': 'Symbaloo'}, {'login': 'jupiterjs', 'company': None}, {'login': 'cpojer', 'company': 'Facebook'}]}, 'comments': {'totalCount': 10, 'nodes': [{'author': None, 'authorAssociation': 'NONE', 'createdAt': '2010-08-30T21:14:53Z', 'bodyText': 'This is happening outside of jsfiddle?'}, {'author': None, 'authorAssociation': 'NONE', 'createdAt': '2010-08-30T21:37:04Z', 'bodyText': 'Ah, if I do the test with

In [60]:
hasNextPage = result2['data']['search']['pageInfo']['hasNextPage']
endCursor = result2['data']['search']['pageInfo']['endCursor']

In [68]:
result2['data']['search']['edges']


[{'node': {'number': 2, 'title': 'Syn Click in IE8', 'url': 'https://github.com/bitovi/funcunit/issues/2', 'createdAt': '2010-08-30T20:49:08Z', 'updatedAt': '2010-10-07T15:07:01Z', 'closedAt': '2010-10-07T15:07:01Z', 'state': 'CLOSED', 'closed': True, 'locked': False, 'author': {'login': 'arian'}, 'authorAssociation': 'NONE', 'participants': {'totalCount': 3, 'nodes': [{'login': 'arian', 'company': 'Symbaloo'}, {'login': 'jupiterjs', 'company': None}, {'login': 'cpojer', 'company': 'Facebook'}]}, 'comments': {'totalCount': 10, 'nodes': [{'author': None, 'authorAssociation': 'NONE', 'createdAt': '2010-08-30T21:14:53Z', 'bodyText': 'This is happening outside of jsfiddle?'}, {'author': None, 'authorAssociation': 'NONE', 'createdAt': '2010-08-30T21:37:04Z', 'bodyText': 'Ah, if I do the test without Mootools, and changing onload to load, it works.  Mootools must be overwriting something that Syn depends on.'}, {'author': {'login': 'arian'}, 'authorAssociation': 'NONE', 'createdAt': '2010-08

In [94]:
list_repos = [
    "qunitjs/qunit",
    "mochajs/mocha",
    "facebook/jest",
    "jasmine/jasmine",
    "bitovi/funcunit",
    "GoogleChrome/puppeteer",
    "cypress-io/cypress"
]

In [95]:
raw_data_file = 'temp/raw_issues_data.txt'

In [96]:
with open(raw_data_file, 'w') as myfile:
    
    for cur_idx, cur_repo in enumerate(list_repos):
        
        hasNextPage = True
        endCursor = None
        
        cnt_nextPage = 0
        
        print('***** [{}] START repo#{}-{}" *****'.format(str(datetime.datetime.now()), cur_idx, cur_repo))
        
        while hasNextPage:

            # Slow down the requests
            time.sleep(1)
            
            cnt_nextPage += 1
            print('|-- [{}] page#{}" *****'.format(str(datetime.datetime.now()), cnt_nextPage))

            try:
                query = create_query(cur_repo, hasNextPage=hasNextPage, afterCursor=endCursor)
                # Get a result of the test query
                result = run_query(query) # Execute the query


                # See whether we need to go to the next page
                hasNextPage = result['data']['search']['pageInfo']['hasNextPage']
                endCursor = result['data']['search']['pageInfo']['endCursor']

                # Write data to file
                queried_data = result['data']['search']['edges']
                for cur_data in queried_data:
                    cur_json_str = json.dumps(cur_data['node'])
                    myfile.write(cur_json_str + '\n')
            except:
                print('There are errors when retrieving data!')


***** [2019-07-01 23:30:58.197355] START repo#0-qunitjs/qunit" *****
|-- [2019-07-01 23:30:59.202638] page#1" *****
|-- [2019-07-01 23:31:02.222314] page#2" *****
|-- [2019-07-01 23:31:07.841660] page#3" *****
|-- [2019-07-01 23:31:12.522847] page#4" *****
|-- [2019-07-01 23:31:19.097989] page#5" *****
|-- [2019-07-01 23:31:23.402208] page#6" *****
|-- [2019-07-01 23:31:28.689065] page#7" *****
|-- [2019-07-01 23:31:34.034337] page#8" *****
|-- [2019-07-01 23:31:41.188897] page#9" *****
|-- [2019-07-01 23:31:46.234173] page#10" *****
|-- [2019-07-01 23:31:52.261595] page#11" *****
|-- [2019-07-01 23:31:56.707553] page#12" *****
|-- [2019-07-01 23:32:01.282281] page#13" *****
|-- [2019-07-01 23:32:04.958036] page#14" *****
|-- [2019-07-01 23:32:08.693634] page#15" *****
***** [2019-07-01 23:32:09.285224] START repo#1-mochajs/mocha" *****
|-- [2019-07-01 23:32:10.290591] page#1" *****
|-- [2019-07-01 23:32:13.866720] page#2" *****
|-- [2019-07-01 23:32:20.111684] page#3" *****
|-- [2019-

In [97]:
with open(raw_data_file) as f:
    raw_content = f.readlines()


In [98]:
out_json_name_raw = 'temp/github_repo_issues_raw.json'

In [99]:
# Remove whitespace characters like `\n` at the end of each line
all_content = [x.strip() for x in raw_content] 

# Assemble json
all_json_str = ','.join(all_content)
out_json_str = '[{}]'.format(all_json_str)


with open(out_json_name_raw, 'w') as f:  
    f.write(out_json_str)

In [100]:
with open(out_json_name_raw) as json_file:  
    data_repo = json.load(json_file)

In [101]:
len(data_repo)

5851

In [102]:
data_repo[0]

{'number': 1399, 'title': 'Can the test file name be passed in the "suitestart" and "suiteend" callback ?', 'url': 'https://github.com/qunitjs/qunit/issues/1399', 'createdAt': '2019-06-20T23:29:44Z', 'updatedAt': '2019-06-20T23:29:44Z', 'closedAt': None, 'state': 'OPEN', 'closed': False, 'locked': False, 'author': {'login': 'muthu90ec'}, 'authorAssociation': 'NONE', 'participants': {'totalCount': 1, 'nodes': [{'login': 'muthu90ec', 'company': None}]}, 'comments': {'totalCount': 0, 'nodes': []}, 'labels': {'nodes': []}, 'activeLockReason': None}

In [110]:
list(data_repo[0].keys())

['number', 'title', 'url', 'createdAt', 'updatedAt', 'closedAt', 'state', 'closed', 'locked', 'author', 'authorAssociation', 'participants', 'comments', 'labels', 'activeLockReason']

In [153]:
data_repo[10]['labels']

{'nodes': []}

In [149]:
cur_data = data_repo[4]

In [163]:
clean_data = []

for cur_data in data_repo:
    new_data = OrderedDict() 

    cur_url = cur_data['url']
    new_data['repo_name'] = re.findall(r'https:\/\/github.com\/(.*)\/issues/\d', cur_url)[0]
    new_data['issue_id'] = cur_data['number']
    new_data['title'] = cur_data['title']
    new_data['url'] = cur_url
    new_data['createdAt'] = cur_data['createdAt']
    new_data['updatedAt'] = cur_data['updatedAt']
    new_data['closedAt'] = cur_data['closedAt']
    new_data['state'] = cur_data['state']
    new_data['closed'] = cur_data['closed']
    new_data['locked'] = cur_data['locked']
    new_data['activeLockReason'] = cur_data['activeLockReason']

    new_data['authorLogin']= cur_data['author']['login'] if cur_data['author'] else ''
    new_data['authorAssociation'] = cur_data['authorAssociation']

    new_data['participants'] = OrderedDict() 
    new_data['participants']['totalCount'] = cur_data['participants']['totalCount']
    new_data['participants']['data'] = cur_data['participants']['nodes']

    new_data['comments'] = OrderedDict() 
    new_data['comments']['totalCount'] = cur_data['comments']['totalCount']

    cur_all_comments = cur_data['comments']['nodes']
    cur_data_comments = []
    for cur_comment in cur_all_comments:
        new_comment = OrderedDict() 
        new_comment['authorLogin'] = cur_comment['author']['login'] if cur_comment['author'] else ''
        new_comment['authorAssociation'] = cur_comment['authorAssociation']
        new_comment['createdAt'] = cur_comment['createdAt']
        new_comment['contents'] = cur_comment['bodyText']
        cur_data_comments.append(new_comment)


    new_data['comments']['data'] = cur_data_comments

    new_data['labels'] = [x['name'] for x in cur_data['labels']['nodes']]
    
    clean_data.append(new_data)



In [165]:
len(clean_data)


5851

In [166]:
clean_data[2]

OrderedDict([('repo_name', 'qunitjs/qunit'), ('issue_id', 1394), ('title', 'Display progress in test runner during test suite execution'), ('url', 'https://github.com/qunitjs/qunit/issues/1394'), ('createdAt', '2019-05-13T22:53:27Z'), ('updatedAt', '2019-06-18T17:42:10Z'), ('closedAt', '2019-06-18T17:42:10Z'), ('state', 'CLOSED'), ('closed', True), ('locked', False), ('activeLockReason', None), ('authorLogin', 'ghaagsma'), ('authorAssociation', 'CONTRIBUTOR'), ('participants', OrderedDict([('totalCount', 2), ('data', [{'login': 'ghaagsma', 'company': None}, {'login': 'trentmwillis', 'company': '@Netflix'}])])), ('comments', OrderedDict([('totalCount', 1), ('data', [OrderedDict([('authorLogin', 'trentmwillis'), ('authorAssociation', 'MEMBER'), ('createdAt', '2019-05-17T16:12:02Z'), ('contents', 'Seems like a great idea to me! A pull request to implement would be appreciated 🙂')])])])), ('labels', ['component-reporter'])])

In [168]:
out_json_name_clean = 'temp/github_repo_issues_2019_July_1.json'

In [169]:
with open(out_json_name_clean, 'w') as fp:
    json.dump(clean_data, fp)

In [170]:
with open(out_json_name_clean) as json_file:  
    data_new = json.load(json_file)

In [171]:
len(data_new)

5851

In [185]:
data_new[704]

{'repo_name': 'mochajs/mocha', 'issue_id': 3959, 'title': 'async await execution crash no error reported', 'url': 'https://github.com/mochajs/mocha/issues/3959', 'createdAt': '2019-06-25T11:25:39Z', 'updatedAt': '2019-06-25T11:43:46Z', 'closedAt': '2019-06-25T11:43:46Z', 'state': 'CLOSED', 'closed': True, 'locked': False, 'activeLockReason': None, 'authorLogin': 'adrianwix', 'authorAssociation': 'NONE', 'participants': {'totalCount': 1, 'data': [{'login': 'adrianwix', 'company': None}]}, 'comments': {'totalCount': 1, 'data': [{'authorLogin': 'adrianwix', 'authorAssociation': 'NONE', 'createdAt': '2019-06-25T11:43:46Z', 'contents': "Hi, I don't know why every time I post an issue a magical light come to me and solve it hahhaaha.\nHere is the explanation of what exactly happened I case it helps someone in the future.\nThis test is from a Free Code Camp project I made months ago using express. I was using chai-http for API testing which requires me to export the index.js file from the ser