# Collect data of issues in github repositories

In [595]:
import json
import time
import datetime

import pandas as pd
import numpy as np

import requests
import re
from collections import OrderedDict


### Gather data via GitHub's GraphQL

In [25]:
# # See how to get a token here: https://github.blog/2013-05-16-personal-api-tokens/
api_token = "<Your Token>"

headers = {'Authorization': 'token %s' % api_token}


In [26]:
def run_query(query): 
    ''' A simple function to use requests.post to make the API call. Note the json= section. '''
    
    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))
    

In [591]:
# Create a GraphQL query that we will use to collect data        
def create_query(repo_name, hasNextPage=True, afterCursor=None):
    
    if hasNextPage and afterCursor:
        strAfter = ', after:"{}",'.format(afterCursor)
    else:
        strAfter = ''
        
    repo_parts = repo_name.split('/')
    repo_owner = repo_parts[0]
    name_of_repo = repo_parts[1]
        
    query = '''
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: %s, name: %s) {
        issues(first: 100 %s) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
                totalCount
                nodes {
                  login
                  company
                }
              }
              comments(first: 100) {
                totalCount
                nodes {
                  author {
                    login
                  }
                  authorAssociation
                  createdAt
                  bodyText
                  reactions(first:20) {
                    totalCount
                    nodes {
                      content
                    }
                  }
                }
              }
              reactions(first:100) {
                totalCount
                nodes {
                  content
                }
              }
              labels(first: 100) {
                nodes {
                  name
                }
              }
              milestone {
                id
                title
                description
                createdAt
                dueOn
                number
                closed


              }
            }
          }
        }
      }
    }
    ''' % (repo_owner, name_of_repo, strAfter)
        
#     query = '''
#     {
#       rateLimit {
#         cost
#         remaining
#         resetAt
#       }
#       search(first: 100,%s type: ISSUE, query: "repo:%s is:issue") {
#         issueCount
#         pageInfo {
#           hasNextPage
#           endCursor
#         }
#         edges {
#           node {
#             ... on Issue {
#               number
#               url
#               title
#               bodyText
#               createdAt
#               updatedAt
#               closedAt
#               state
#               closed
#               locked
#               activeLockReason
#               author {
#                 login
#               }
#               authorAssociation
#               participants(first:100) {
#                 totalCount
#                 nodes {
#                   login
#                   company
#                 }
#               }
#               comments(first:100) {
#                 totalCount
#                 nodes {
#                   author {
#                     login
#                   }
#                   authorAssociation
#                   createdAt
#                   bodyText
#                 }
#               }
#               labels(first:100) {
#                 nodes {
#                   name
#                 }
#               }
#             }
#           }
#         }
#       }
#     }
#     ''' % (strAfter, repo_name)
    
    return query
    

In [294]:
"bitovi/funcunit".split('/')

['bitovi', 'funcunit']

In [579]:
query = create_query("qunitjs/qunit")
result = run_query(query)

In [580]:
result



3645

Note that the api only return results update 1000 issues

In [617]:
list_repos = [
#     "qunitjs/qunit",
#     "mochajs/mocha",
#     "facebook/jest",
#     "jasmine/jasmine",
#     "bitovi/funcunit",
#     "GoogleChrome/puppeteer",
    "cypress-io/cypress"
]

# list_repos = [
#     "jasmine/jasmine",
# ]

# list_repos = ["bitovi/funcunit"]

In [608]:
get_raw_data_file = lambda x: 'temp/raw_issues_data_{}.txt'.format(x)

In [609]:
dict_status = {}

for cur_idx, cur_repo in enumerate(list_repos):
    
    dict_status[cur_repo] = {}
    
    print('***** [{}] START repo#{}-{}" *****'.format(str(datetime.datetime.now()), cur_idx, cur_repo))
    
    raw_data_file_cur_repo = get_raw_data_file(cur_repo.split('/')[1])
    with open(raw_data_file_cur_repo, 'w') as myfile:
        
        hasNextPage = True
        endCursor = None
        
        cnt_nextPage = 1
        
        
        while hasNextPage:
            
            print('|-- [{}] page#{}" *****'.format(str(datetime.datetime.now()), cnt_nextPage))

            try:
                query = create_query(cur_repo, hasNextPage=hasNextPage, afterCursor=endCursor)
                # Get a result of the test query
                result = run_query(query) # Execute the query
                
                print('* Remaining RateLimit: ', result['data']['rateLimit']['remaining'])
                
                if cnt_nextPage == 1:
                    dict_status[cur_repo]['issueCount'] = result['data']['repository']['issues']['totalCount']


                # See whether we need to go to the next page
                hasNextPage = result['data']['repository']['issues']['pageInfo']['hasNextPage']
                endCursor = result['data']['repository']['issues']['pageInfo']['endCursor']

                # Write data to file
                queried_data = result['data']['repository']['issues']['edges']
                for cur_data in queried_data:
                    cur_json_str = json.dumps(cur_data['node'])
                    myfile.write(cur_json_str + '\n')
                    
                cnt_nextPage += 1

                    
            except Exception as e: 
                print('There are errors when retrieving data - Retry the same query again if this happens!')
                print(e)
                
            # Slow down the requests
            time.sleep(120)
                
        dict_status[cur_repo]['total_queried_pages'] = cnt_nextPage
    

***** [2019-07-02 21:35:11.367109] START repo#0-mochajs/mocha" *****
|-- [2019-07-02 21:35:11.369093] page#1" *****
* Remaining RateLimit:  4896
|-- [2019-07-02 21:37:16.286225] page#2" *****
* Remaining RateLimit:  4792
|-- [2019-07-02 21:39:23.146005] page#3" *****
* Remaining RateLimit:  4896
|-- [2019-07-02 21:41:31.537777] page#4" *****
* Remaining RateLimit:  4792
|-- [2019-07-02 21:43:41.526847] page#5" *****
* Remaining RateLimit:  4896
|-- [2019-07-02 21:45:47.600411] page#6" *****
* Remaining RateLimit:  4792
|-- [2019-07-02 21:47:53.542685] page#7" *****
* Remaining RateLimit:  4896
|-- [2019-07-02 21:49:58.685013] page#8" *****
* Remaining RateLimit:  4792
|-- [2019-07-02 21:52:07.280852] page#9" *****
* Remaining RateLimit:  4896
|-- [2019-07-02 21:54:14.209111] page#10" *****
* Remaining RateLimit:  4792
|-- [2019-07-02 21:56:23.235786] page#11" *****
* Remaining RateLimit:  4896
|-- [2019-07-02 21:58:31.286612] page#12" *****
* Remaining RateLimit:  4792
|-- [2019-07-02 

|-- [2019-07-02 22:32:42.592378] page#1" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: facebook, name: jest) {
        issues(first: 100 ) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
                totalCount
                nodes {

|-- [2019-07-02 22:47:50.062577] page#4" *****
* Remaining RateLimit:  4896
|-- [2019-07-02 22:49:56.934709] page#5" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: facebook, name: jest) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOB5K5aQ==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
            

|-- [2019-07-02 23:05:05.491334] page#8" *****
* Remaining RateLimit:  4896
|-- [2019-07-02 23:07:12.800456] page#9" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: facebook, name: jest) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOCmw3fQ==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
            

|-- [2019-07-02 23:18:03.167476] page#10" *****
* Remaining RateLimit:  4896
|-- [2019-07-02 23:20:11.685388] page#11" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: facebook, name: jest) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOCu2gtA==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
          

|-- [2019-07-02 23:31:01.817416] page#12" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: facebook, name: jest) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOC1BreQ==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
              

|-- [2019-07-02 23:46:11.240887] page#15" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: facebook, name: jest) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHODBS2KA==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
              

|-- [2019-07-02 23:59:10.764323] page#17" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: facebook, name: jest) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHODLHn5w==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
              

|-- [2019-07-03 00:14:17.027457] page#20" *****
* Remaining RateLimit:  3648
|-- [2019-07-03 00:16:26.915619] page#21" *****
* Remaining RateLimit:  3544
|-- [2019-07-03 00:18:34.577758] page#22" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: facebook, name: jest) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHODqT7wA==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockRe

|-- [2019-07-03 00:33:39.897635] page#25" *****
* Remaining RateLimit:  2712
|-- [2019-07-03 00:35:46.792649] page#26" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: facebook, name: jest) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOEGVMrg==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
          

|-- [2019-07-03 00:52:59.716638] page#30" *****
* Remaining RateLimit:  4688
|-- [2019-07-03 00:55:07.088360] page#31" *****
* Remaining RateLimit:  4584
|-- [2019-07-03 00:57:16.137091] page#32" *****
* Remaining RateLimit:  4480
|-- [2019-07-03 00:59:24.700295] page#33" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: facebook, name: jest) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOE0d54w==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
           

|-- [2019-07-03 01:18:46.782225] page#38" *****
* Remaining RateLimit:  3440
|-- [2019-07-03 01:20:56.722529] page#39" *****
* Remaining RateLimit:  3336
|-- [2019-07-03 01:23:05.889499] page#40" *****
* Remaining RateLimit:  3232
|-- [2019-07-03 01:25:14.353676] page#41" *****
* Remaining RateLimit:  3128
|-- [2019-07-03 01:27:24.265190] page#42" *****
* Remaining RateLimit:  3024
|-- [2019-07-03 01:29:31.514590] page#43" *****
* Remaining RateLimit:  2920
|-- [2019-07-03 01:31:39.683463] page#44" *****
* Remaining RateLimit:  2816
|-- [2019-07-03 01:33:45.749884] page#45" *****
* Remaining RateLimit:  2712
|-- [2019-07-03 01:35:51.926851] page#46" *****
* Remaining RateLimit:  2608
***** [2019-07-03 01:37:53.313352] START repo#2-jasmine/jasmine" *****
|-- [2019-07-03 01:37:53.317725] page#1" *****
* Remaining RateLimit:  2504
|-- [2019-07-03 01:39:58.900546] page#2" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by return

|-- [2019-07-03 02:16:06.027402] page#2" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: GoogleChrome, name: puppeteer) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHODuoZ_A==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
      

|-- [2019-07-03 02:24:47.766143] page#2" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: GoogleChrome, name: puppeteer) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHODuoZ_A==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
      

|-- [2019-07-03 02:35:39.651826] page#3" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: GoogleChrome, name: puppeteer) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHODv6JoA==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
      

|-- [2019-07-03 02:46:31.539441] page#4" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: GoogleChrome, name: puppeteer) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHODxwthw==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
      

|-- [2019-07-03 02:57:22.116441] page#5" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: GoogleChrome, name: puppeteer) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOD0bFWQ==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
      

|-- [2019-07-03 03:08:10.510365] page#6" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: GoogleChrome, name: puppeteer) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOD37PzQ==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
      

|-- [2019-07-03 03:19:00.740708] page#7" *****
* Remaining RateLimit:  4792
|-- [2019-07-03 03:21:09.953493] page#8" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: GoogleChrome, name: puppeteer) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOEAhBZw==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
   

|-- [2019-07-03 03:32:00.230542] page#9" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: GoogleChrome, name: puppeteer) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOED1FIg==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
              authorAssociation
              participants(first: 100) {
      

|-- [2019-07-03 03:47:09.868531] page#12" *****
* Remaining RateLimit:  4896
|-- [2019-07-03 03:49:18.463045] page#13" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: GoogleChrome, name: puppeteer) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOEVQL5A==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
              number
              url
              title
              bodyText
              createdAt
              updatedAt
              closedAt
              state
              closed
              locked
              activeLockReason
              author {
                login

              }
 

|-- [2019-07-03 04:04:23.023867] page#16" *****
* Remaining RateLimit:  4896
|-- [2019-07-03 04:06:27.769461] page#17" *****
* Remaining RateLimit:  4792
|-- [2019-07-03 04:08:35.048589] page#18" *****
* Remaining RateLimit:  4896
|-- [2019-07-03 04:10:43.795259] page#19" *****
* Remaining RateLimit:  4792
|-- [2019-07-03 04:12:53.656197] page#20" *****
* Remaining RateLimit:  4896
|-- [2019-07-03 04:15:03.250907] page#21" *****
There are errors when retrieving data - Retry the same query again if this happens!
Query failed to run by returning code of 502. 
    {
      rateLimit {
        cost
        remaining
        resetAt
      }
      repository(owner: GoogleChrome, name: puppeteer) {
        issues(first: 100 , after:"Y3Vyc29yOnYyOpHOFNfnRw==",) {
          totalCount
          pageInfo {
            hasNextPage
            startCursor
            endCursor
          }
          edges {
            node {
              repository {
                nameWithOwner
              }
 

|-- [2019-07-03 04:30:12.076995] page#24" *****
* Remaining RateLimit:  4896
|-- [2019-07-03 04:32:19.157828] page#25" *****
* Remaining RateLimit:  4792
|-- [2019-07-03 04:34:29.095983] page#26" *****
* Remaining RateLimit:  4896
|-- [2019-07-03 04:36:34.593853] page#27" *****
* Remaining RateLimit:  4792
|-- [2019-07-03 04:38:43.879187] page#28" *****
* Remaining RateLimit:  4896
|-- [2019-07-03 04:40:53.552087] page#29" *****
* Remaining RateLimit:  4792
|-- [2019-07-03 04:43:01.617668] page#30" *****
* Remaining RateLimit:  4896
|-- [2019-07-03 04:45:08.468059] page#31" *****
* Remaining RateLimit:  4792
***** [2019-07-03 04:47:11.236851] START repo#5-cypress-io/cypress" *****
|-- [2019-07-03 04:47:11.239125] page#1" *****
There are errors when retrieving data - Retry the same query again if this happens!
'data'
|-- [2019-07-03 04:49:11.470712] page#1" *****
There are errors when retrieving data - Retry the same query again if this happens!
'data'
|-- [2019-07-03 04:51:11.716336] p

|-- [2019-07-03 06:37:25.211679] page#1" *****
There are errors when retrieving data - Retry the same query again if this happens!
'data'
|-- [2019-07-03 06:39:25.429065] page#1" *****
There are errors when retrieving data - Retry the same query again if this happens!
'data'
|-- [2019-07-03 06:41:25.652663] page#1" *****
There are errors when retrieving data - Retry the same query again if this happens!
'data'
|-- [2019-07-03 06:43:25.897371] page#1" *****
There are errors when retrieving data - Retry the same query again if this happens!
'data'
|-- [2019-07-03 06:45:26.132627] page#1" *****
There are errors when retrieving data - Retry the same query again if this happens!
'data'
|-- [2019-07-03 06:47:26.326320] page#1" *****
There are errors when retrieving data - Retry the same query again if this happens!
'data'


KeyboardInterrupt: 

In [564]:
result



In [570]:
last_issue = result['data']['search']['edges'][-1]['node']

In [312]:
dict_status

{'qunitjs/qunit': {'issueCount': 704, 'total_queried_pages': 15}, 'mochajs/mocha': {'issueCount': 2334, 'total_queried_pages': 20}, 'facebook/jest': {'issueCount': 4513, 'total_queried_pages': 20}, 'jasmine/jasmine': {'issueCount': 1186, 'total_queried_pages': 20}, 'bitovi/funcunit': {'issueCount': 152, 'total_queried_pages': 4}, 'GoogleChrome/puppeteer': {'issueCount': 3058, 'total_queried_pages': 20}, 'cypress-io/cypress': {'issueCount': 3520, 'total_queried_pages': 20}}

In [581]:
result



In [590]:
result['data']['repository']['issues']['edges']



In [612]:
list_repos = [
#     "qunitjs/qunit",
#     "mochajs/mocha",
#     "facebook/jest",
#     "jasmine/jasmine",
#     "bitovi/funcunit",
#     "GoogleChrome/puppeteer",
    "cypress-io/cypress"
]

In [611]:
for cur_idx, cur_repo in enumerate(list_repos):
    
    print('***** [{}] START cleaning data for repo#{}-{}" *****'.format(str(datetime.datetime.now()), cur_idx, cur_repo))
    
    name_repo = cur_repo.split('/')[1]
    raw_data_file_cur_repo = get_raw_data_file(name_repo)
    
    with open(raw_data_file_cur_repo) as f:
        raw_content = f.readlines()
        
    out_json_name_raw = 'temp/github_repo_issues_raw{}.json'.format(name_repo)
    
    # Remove whitespace characters like `\n` at the end of each line
    all_content = [x.strip() for x in raw_content] 

    # Assemble json
    all_json_str = ','.join(all_content)
    out_json_str = '[{}]'.format(all_json_str)


    with open(out_json_name_raw, 'w') as f:  
        f.write(out_json_str)
        
    with open(out_json_name_raw) as json_file:  
        data_repo = json.load(json_file)
        
    clean_data = []

    for cur_data in data_repo:
        new_data = OrderedDict() 

        cur_url = cur_data['url']
        new_data['repo_name'] = cur_data['repository']['nameWithOwner']
        new_data['issue_id'] = cur_data['number']
        new_data['url'] = cur_url
        new_data['title'] = cur_data['title']
        new_data['contents'] = cur_data['bodyText']
        new_data['createdAt'] = cur_data['createdAt']
        new_data['updatedAt'] = cur_data['updatedAt']
        new_data['closedAt'] = cur_data['closedAt']
        new_data['state'] = cur_data['state']
        new_data['closed'] = cur_data['closed']
        new_data['locked'] = cur_data['locked']
        new_data['activeLockReason'] = cur_data['activeLockReason']

        new_data['authorLogin']= cur_data['author']['login'] if cur_data['author'] else ''
        new_data['authorAssociation'] = cur_data['authorAssociation']

        new_data['participants_count'] = cur_data['participants']['totalCount']
        new_data['participants_data'] = cur_data['participants']['nodes']

        new_data['comments_count'] = cur_data['comments']['totalCount']

        cur_all_comments = cur_data['comments']['nodes']
        cur_data_comments = []
        for cur_comment in cur_all_comments:
            new_comment = OrderedDict() 
            new_comment['authorLogin'] = cur_comment['author']['login'] if cur_comment['author'] else ''
            new_comment['authorAssociation'] = cur_comment['authorAssociation']
            new_comment['createdAt'] = cur_comment['createdAt']
            new_comment['contents'] = cur_comment['bodyText']
            
            new_comment['reactions_count'] = cur_comment['reactions']['totalCount']
            new_comment['reactions_data'] = [x['content'] for x in cur_comment['reactions']['nodes']]
            
            cur_data_comments.append(new_comment)


        new_data['comments_data'] = cur_data_comments
        
        
        new_data['reactions_count'] = cur_data['reactions']['totalCount']
        new_data['reactions_data'] = [x['content'] for x in cur_data['reactions']['nodes']]

        new_data['labels'] = [x['name'] for x in cur_data['labels']['nodes']]
        
        new_data['milestone'] = cur_data['milestone']

        clean_data.append(new_data)
        
    out_json_name_clean = 'temp/github_repo_issues_{}.json'.format(name_repo)
    with open(out_json_name_clean, 'w') as fp:
        json.dump(clean_data, fp)

    

***** [2019-07-03 06:51:46.915433] START cleaning data for repo#0-mochajs/mocha" *****
***** [2019-07-03 06:51:47.643066] START cleaning data for repo#1-facebook/jest" *****
***** [2019-07-03 06:51:48.954500] START cleaning data for repo#2-jasmine/jasmine" *****
***** [2019-07-03 06:51:49.366246] START cleaning data for repo#3-bitovi/funcunit" *****
***** [2019-07-03 06:51:49.402447] START cleaning data for repo#4-GoogleChrome/puppeteer" *****


In [236]:
# with open(raw_data_file) as f:
#     raw_content = f.readlines()


In [237]:
# out_json_name_raw = 'temp/github_repo_issues_raw.json'

In [238]:
# # Remove whitespace characters like `\n` at the end of each line
# all_content = [x.strip() for x in raw_content] 

# # Assemble json
# all_json_str = ','.join(all_content)
# out_json_str = '[{}]'.format(all_json_str)


# with open(out_json_name_raw, 'w') as f:  
#     f.write(out_json_str)

In [239]:
# with open(out_json_name_raw) as json_file:  
#     data_repo = json.load(json_file)

In [242]:
# clean_data = []

# for cur_data in data_repo:
#     new_data = OrderedDict() 

#     cur_url = cur_data['url']
#     new_data['repo_name'] = re.findall(r'https:\/\/github.com\/(.*)\/issues/\d', cur_url)[0]
#     new_data['issue_id'] = cur_data['number']
#     new_data['url'] = cur_url
#     new_data['title'] = cur_data['title']
#     new_data['contents'] = cur_url['bodyText']
#     new_data['createdAt'] = cur_data['createdAt']
#     new_data['updatedAt'] = cur_data['updatedAt']
#     new_data['closedAt'] = cur_data['closedAt']
#     new_data['state'] = cur_data['state']
#     new_data['closed'] = cur_data['closed']
#     new_data['locked'] = cur_data['locked']
#     new_data['activeLockReason'] = cur_data['activeLockReason']

#     new_data['authorLogin']= cur_data['author']['login'] if cur_data['author'] else ''
#     new_data['authorAssociation'] = cur_data['authorAssociation']

#     new_data['participants_count'] = cur_data['participants']['totalCount']
#     new_data['participants_data'] = cur_data['participants']['nodes']

#     new_data['comments_count'] = cur_data['comments']['totalCount']

#     cur_all_comments = cur_data['comments']['nodes']
#     cur_data_comments = []
#     for cur_comment in cur_all_comments:
#         new_comment = OrderedDict() 
#         new_comment['authorLogin'] = cur_comment['author']['login'] if cur_comment['author'] else ''
#         new_comment['authorAssociation'] = cur_comment['authorAssociation']
#         new_comment['createdAt'] = cur_comment['createdAt']
#         new_comment['contents'] = cur_comment['bodyText']
#         cur_data_comments.append(new_comment)


#     new_data['comments_data'] = cur_data_comments

#     new_data['labels'] = [x['name'] for x in cur_data['labels']['nodes']]
    
#     clean_data.append(new_data)



In [315]:
len(clean_data)


1000

In [318]:
clean_data[3]

OrderedDict([('repo_name', 'cypress-io/cypress'), ('issue_id', 4630), ('url', 'https://github.com/cypress-io/cypress/issues/4630'), ('title', 'Your framework is f*cking bullsh*t'), ('contents', "It is impossible to configure your f*cking moronic framework to work with TypeScript and Webpack when project is bigger than hello f*cking world example you have. I spent all day tweaking different options and had no success. I'm f*cking done. F*ck you and your stupid framework."), ('createdAt', '2019-07-02T07:31:29Z'), ('updatedAt', '2019-07-02T08:15:11Z'), ('closedAt', '2019-07-02T08:05:30Z'), ('state', 'CLOSED'), ('closed', True), ('locked', False), ('activeLockReason', None), ('authorLogin', 'alexeychikk'), ('authorAssociation', 'NONE'), ('participants_count', 2), ('participants_data', [{'login': 'alexeychikk', 'company': None}, {'login': 'jennifer-shehane', 'company': '@cypress-io '}]), ('comments_count', 2), ('comments_data', [OrderedDict([('authorLogin', 'jennifer-shehane'), ('authorAsso

In [245]:
# out_json_name_clean = 'temp/github_repo_issues_2019_July_1.json'

In [246]:
# with open(out_json_name_clean, 'w') as fp:
#     json.dump(clean_data, fp)

In [613]:
out_json_name_clean

'temp/github_repo_issues_puppeteer.json'

In [614]:
with open(out_json_name_clean) as json_file:  
    data_new = json.load(json_file)

In [615]:
len(data_new)

3067

In [616]:
data_new[707]



In [322]:
dict_status

{'qunitjs/qunit': {'issueCount': 704, 'total_queried_pages': 15}, 'mochajs/mocha': {'issueCount': 2334, 'total_queried_pages': 20}, 'facebook/jest': {'issueCount': 4513, 'total_queried_pages': 20}, 'jasmine/jasmine': {'issueCount': 1186, 'total_queried_pages': 20}, 'bitovi/funcunit': {'issueCount': 152, 'total_queried_pages': 4}, 'GoogleChrome/puppeteer': {'issueCount': 3058, 'total_queried_pages': 20}, 'cypress-io/cypress': {'issueCount': 3520, 'total_queried_pages': 20}}

In [481]:
url = "https://api.github.com/repos/jasmine/jasmine/issues?state=all&q=is:issue&page=17&per_page=100"
# url = "https://api.github.com/search/issues?q=repo:jasmine/jasmine+is:issue&page=11&per_page=100"
res=requests.get(url,headers=headers)
result=res.json()

In [482]:
len(result)

100

In [483]:
result

[{'url': 'https://api.github.com/repos/jasmine/jasmine/issues/128', 'repository_url': 'https://api.github.com/repos/jasmine/jasmine', 'labels_url': 'https://api.github.com/repos/jasmine/jasmine/issues/128/labels{/name}', 'comments_url': 'https://api.github.com/repos/jasmine/jasmine/issues/128/comments', 'events_url': 'https://api.github.com/repos/jasmine/jasmine/issues/128/events', 'html_url': 'https://github.com/jasmine/jasmine/issues/128', 'id': 1592341, 'node_id': 'MDU6SXNzdWUxNTkyMzQx', 'number': 128, 'title': 'support for .with() modifier for spies', 'user': {'login': 'davidwkeith', 'id': 97965, 'node_id': 'MDQ6VXNlcjk3OTY1', 'avatar_url': 'https://avatars2.githubusercontent.com/u/97965?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/davidwkeith', 'html_url': 'https://github.com/davidwkeith', 'followers_url': 'https://api.github.com/users/davidwkeith/followers', 'following_url': 'https://api.github.com/users/davidwkeith/following{/other_user}', 'gists_url': 'https://

In [460]:
len(result['items'])

100

In [463]:
result

{'message': 'Only the first 1000 search results are available', 'documentation_url': 'https://developer.github.com/v3/search/'}