In [6]:
import requests
from config import TOKEN
import time
import json
import pandas as pd

# GitHub GraphQL endpoint that the queries in this notebook are POSTed to
GITHUB_GRAPHQL_URL = 'https://api.github.com/graphql'
# Alias of the token imported from config.
# NOTE(review): run_query reads TOKEN directly, so this alias looks unused in the visible cells — confirm before removing.
ACCESS_TOKEN = TOKEN

In [9]:
# GraphQL query returning summary statistics for a single repository.
# The un-aliased `issues` field counts only OPEN issues labelled "Epic";
# the `allIssues` alias counts issues with no label or state filter applied.
query_template = """
query ($repoOwner: String!, $repoName: String!) {
  repository(owner: $repoOwner, name: $repoName) {
    name
    stargazers {
      totalCount
    }
    issues(labels: ["Epic"], states: OPEN) {
      totalCount
    }
    allIssues: issues {
      totalCount
    }
    forkCount
    watchers {
      totalCount
    }
    createdAt
    updatedAt
    primaryLanguage {
      name
    }
  }
}
"""

In [6]:

def run_query(query, variables, max_attempts=5, retry_delay=5):
    """POST a GraphQL query to the GitHub API and return the decoded JSON body.

    Args:
        query: GraphQL query string.
        variables: Dict of variables referenced by the query.
        max_attempts: How many times to try the request when it fails before an
            HTTP response is received (values below 1 are treated as 1).
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.exceptions.RequestException: If every attempt fails at the
            network level (connection error, timeout, ...).
        Exception: If the API answers with a status code other than 200.
    """
    headers = {'Authorization': f'Bearer {TOKEN}'}
    payload = {'query': query, 'variables': variables}
    attempts = max(1, max_attempts)

    for attempt in range(1, attempts + 1):
        try:
            # 30 second timeout so a stalled connection cannot hang the notebook
            response = requests.post(GITHUB_GRAPHQL_URL, json=payload, headers=headers, timeout=30)
            break
        except requests.exceptions.RequestException:
            # Only transport-level failures are retried; KeyboardInterrupt and
            # programming errors propagate immediately.
            if attempt == attempts:
                raise
            time.sleep(retry_delay)

    if response.status_code == 200:
        return response.json()
    raise Exception(f"Query failed with status code {response.status_code}. {response.text}")

def get_repo_info(repo_owner, repo_name):
    """Run the repository summary query for one repository.

    Args:
        repo_owner: Login of the user or organisation that owns the repository.
        repo_name: Name of the repository.

    Returns:
        Whatever run_query returns for query_template with these variables.
    """
    return run_query(
        query_template,
        {"repoOwner": repo_owner, "repoName": repo_name},
    )

In [None]:
# Column order of the repository summary table
columns = ['Repo Name', 'Star Count', 'Epic Issue Count', 'Total Issue Count', 'Fork Count', 'Watcher Count', 'Created At', 'Updated At', 'Primary Language']

# Read the file and process each line ("<repo name> <repo owner>" per line)
file_path = 'repo_list.txt'  # Replace with your file path
rows = []
with open(file_path, 'r') as file:
    for line in file:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        repo_name, repo_owner = fields
        result = get_repo_info(repo_owner, repo_name)

        # The API answers HTTP 200 with a null repository when the repo cannot
        # be resolved; skip it instead of crashing on the lookups below.
        data = (result.get('data') or {}).get('repository')
        if data is None:
            print(f"Skipping {repo_owner}/{repo_name}: {result.get('errors')}")
            time.sleep(1)
            continue

        # Extracting data from the query result
        rows.append({
            'Repo Name': data['name'],
            'Star Count': data['stargazers']['totalCount'],
            'Epic Issue Count': data['issues']['totalCount'],
            'Total Issue Count': data['allIssues']['totalCount'],
            'Fork Count': data['forkCount'],
            'Watcher Count': data['watchers']['totalCount'],
            'Created At': data['createdAt'],
            'Updated At': data['updatedAt'],
            'Primary Language': data['primaryLanguage']['name'] if data['primaryLanguage'] else None
        })
        time.sleep(1)  # pause between requests

# Build the DataFrame once from the collected rows: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every row.
df = pd.DataFrame(rows, columns=columns)

# Output the DataFrame
print(df)

In [17]:
# Save the repository summary table to repos_info.csv (index=False: the row index is not written)
df.to_csv('repos_info.csv', index=False)

### Analyzing repos

In [14]:
import pandas as pd
# Load the repository summary table from repos_info.csv and display it
df = pd.read_csv('repos_info.csv')
df

Unnamed: 0,Repo Name,Star Count,Epic Issue Count,Total Issue Count,Fork Count,Watcher Count,Created At,Updated At,Primary Language
0,va.gov-team,261,1591,68116,192,169,2018-05-17T16:50:38Z,2023-11-26T17:35:21Z,HTML
1,wollok-ts,14,1,83,11,26,2018-08-30T21:17:50Z,2023-11-19T11:54:43Z,TypeScript
2,nuguri-msa,0,5,44,0,0,2022-12-13T05:44:01Z,2023-11-09T06:48:21Z,Java
3,project,0,40,352,0,14,2023-05-19T11:45:35Z,2023-11-09T12:11:49Z,
4,uW-Tools-Collab,0,2,15,0,3,2023-07-21T17:51:46Z,2023-07-21T17:51:46Z,
...,...,...,...,...,...,...,...,...,...
384,doc,11,42,2762,43,31,2016-07-04T17:00:57Z,2023-11-19T20:29:21Z,CSS
385,jpetstore-kubernetes,2,2,1206,19,1,2020-10-09T13:37:04Z,2023-01-31T18:33:00Z,Java
386,hilla,687,10,768,47,18,2021-04-27T13:25:07Z,2023-11-27T04:01:43Z,Java
387,mina,1847,88,5931,479,103,2017-12-18T01:10:17Z,2023-11-25T09:44:48Z,OCaml


In [15]:
# Keep repositories with an epic issue count of at least 5,
# then order them from the highest epic issue count to the lowest.
has_enough_epics = df['Epic Issue Count'] >= 5
df = df.loc[has_enough_epics].sort_values(by=['Epic Issue Count'], ascending=False)
df

Unnamed: 0,Repo Name,Star Count,Epic Issue Count,Total Issue Count,Fork Count,Watcher Count,Created At,Updated At,Primary Language
0,va.gov-team,261,1591,68116,192,169,2018-05-17T16:50:38Z,2023-11-26T17:35:21Z,HTML
278,entity,21,330,18483,56,21,2018-11-17T00:36:48Z,2023-11-27T17:24:10Z,JavaScript
316,va.gov-cms,62,320,10420,59,56,2018-10-22T16:10:58Z,2023-11-03T18:12:23Z,PHP
67,atd-data-tech,15,235,14389,2,12,2018-07-11T20:59:40Z,2023-11-15T13:48:55Z,Jupyter Notebook
311,airbyte,12367,227,15287,3236,172,2020-07-27T23:55:54Z,2023-11-27T18:16:58Z,Python
...,...,...,...,...,...,...,...,...,...
280,Strategy_game,1,5,139,2,2,2023-09-06T10:27:09Z,2023-10-11T05:44:24Z,Python
99,lcfs,1,5,204,2,2,2023-04-28T20:36:53Z,2023-11-06T06:45:20Z,JavaScript
95,gym-pro-fitness,0,5,22,0,1,2023-10-22T20:27:54Z,2023-11-24T08:55:35Z,HTML
361,midgard-ng,0,5,17,0,1,2023-11-16T15:29:44Z,2023-11-16T15:31:36Z,HTML


## Scraping Repos

In [17]:
# Preview the first 20 rows in the current ordering
df.head(20)

Unnamed: 0,Repo Name,Star Count,Epic Issue Count,Total Issue Count,Fork Count,Watcher Count,Created At,Updated At,Primary Language
0,va.gov-team,261,1591,68116,192,169,2018-05-17T16:50:38Z,2023-11-26T17:35:21Z,HTML
278,entity,21,330,18483,56,21,2018-11-17T00:36:48Z,2023-11-27T17:24:10Z,JavaScript
316,va.gov-cms,62,320,10420,59,56,2018-10-22T16:10:58Z,2023-11-03T18:12:23Z,PHP
67,atd-data-tech,15,235,14389,2,12,2018-07-11T20:59:40Z,2023-11-15T13:48:55Z,Jupyter Notebook
311,airbyte,12367,227,15287,3236,172,2020-07-27T23:55:54Z,2023-11-27T18:16:58Z,Python
112,alkemio,23,211,898,4,9,2020-05-25T09:34:13Z,2023-09-20T07:21:26Z,
143,prime-reportstream,62,192,6366,40,19,2020-10-15T19:03:50Z,2023-11-17T15:22:52Z,Kotlin
16,va-mobile-app,8,126,4445,0,7,2020-09-01T16:38:50Z,2023-11-24T16:08:45Z,TypeScript
262,ocl_issues,4,115,1701,1,15,2018-02-21T19:43:31Z,2023-02-16T18:38:56Z,
104,projectmanagement,5,113,1565,3,7,2019-11-07T10:40:45Z,2023-11-13T14:31:49Z,


In [20]:
# Re-rank the repositories by star count (most-starred first) and preview the top 20
df = df.sort_values('Star Count', ascending=False)
df.head(20)

Unnamed: 0,Repo Name,Star Count,Epic Issue Count,Total Issue Count,Fork Count,Watcher Count,Created At,Updated At,Primary Language
196,appsmith,29858,88,16877,3137,287,2020-06-30T04:07:36Z,2023-11-27T19:30:25Z,TypeScript
96,OpenRCT2,12447,14,11530,1459,199,2014-04-01T23:32:25Z,2023-11-27T18:37:27Z,C++
125,dvc,12435,6,4516,1124,136,2017-03-04T08:16:33Z,2023-11-27T17:10:42Z,Python
311,airbyte,12367,227,15287,3236,172,2020-07-27T23:55:54Z,2023-11-27T18:16:58Z,Python
231,metamask-extension,10897,111,9742,4716,534,2015-09-06T16:34:48Z,2023-11-27T19:05:28Z,JavaScript
376,uno,8092,68,5579,676,195,2018-05-07T11:52:27Z,2023-11-27T16:31:00Z,C#
291,dbt-core,7842,5,4666,1387,137,2016-03-10T02:38:00Z,2023-11-27T19:15:59Z,Python
163,ckeditor5,7709,14,12449,3520,141,2015-01-08T12:13:59Z,2023-11-27T03:04:19Z,Rich Text Format
175,organicmaps,7596,33,3448,720,80,2020-12-27T19:02:26Z,2023-11-27T16:32:12Z,C++
378,materialize,5392,70,7072,447,76,2019-02-22T17:15:15Z,2023-11-27T03:27:18Z,Rust


## Scrape all issues of a repo

In [1]:
import time

import pandas as pd
import requests

from config import TOKEN

# GitHub GraphQL endpoint (the access token itself is the TOKEN imported from config)
GITHUB_GRAPHQL_URL = 'https://api.github.com/graphql'

# GraphQL query template for issues.
# Issues are requested 100 per page; pass the previous page's `endCursor` as $cursor
# to get the next page. Per issue, only the first 10 assignees, first 100 comments
# and first 10 labels are requested.
# NOTE: this rebinds `query_template`, replacing the repository summary query defined earlier.
query_template = """
query ($repoOwner: String!, $repoName: String!, $cursor: String) {
  repository(owner: $repoOwner, name: $repoName) {
    issues(first: 100, after: $cursor) {
      pageInfo {
        endCursor
        hasNextPage
      }
      nodes {
        title
        number
        state
        body
        author {
          login
        }
        assignees(first: 10) {
          nodes {
            login
          }
        }
        comments(first: 100) {
          nodes {
            body
            author {
              login
            }
          }
        }
        labels(first: 10) {
          nodes {
            name
          }
        }
      }
    }
  }
}
"""

In [2]:
def run_query(query, variables, max_attempts=5, retry_delay=5):
    """POST a GraphQL query to the GitHub API and return the decoded JSON body.

    Args:
        query: GraphQL query string.
        variables: Dict of variables referenced by the query.
        max_attempts: How many times to try the request when it fails before an
            HTTP response is received (values below 1 are treated as 1).
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.exceptions.RequestException: If every attempt fails at the
            network level (connection error, timeout, ...).
        Exception: If the API answers with a status code other than 200.
    """
    headers = {'Authorization': f'Bearer {TOKEN}'}
    payload = {'query': query, 'variables': variables}
    attempts = max(1, max_attempts)

    for attempt in range(1, attempts + 1):
        try:
            # Explicit timeout: without one a stalled connection blocks forever
            response = requests.post(GITHUB_GRAPHQL_URL, json=payload, headers=headers, timeout=30)
            break
        except requests.exceptions.RequestException:
            # Only transport-level failures are retried; KeyboardInterrupt and
            # programming errors propagate immediately.
            if attempt == attempts:
                raise
            time.sleep(retry_delay)

    if response.status_code == 200:
        return response.json()
    raise Exception(f"Query failed with status code {response.status_code}. {response.text}")

# Function to fetch all issues from a repository
def fetch_all_issues(repo_owner, repo_name):
    """Fetch every issue of a repository by following the query's pagination.

    Args:
        repo_owner: Login of the user or organisation that owns the repository.
        repo_name: Name of the repository.

    Returns:
        A list of issue nodes (dicts) exactly as returned by query_template,
        accumulated over all pages.

    Raises:
        RuntimeError: If a response carries no repository data, e.g. when the
            API reports GraphQL errors with an HTTP 200 status.
    """
    issues = []
    cursor = None  # None asks for the first page

    while True:
        variables = {"repoOwner": repo_owner, "repoName": repo_name, "cursor": cursor}
        result = run_query(query_template, variables)

        # Fail with the API's own error payload instead of an opaque TypeError
        # when `data` or `repository` comes back null.
        repository = (result.get('data') or {}).get('repository')
        if repository is None:
            raise RuntimeError(
                f"No repository data returned for {repo_owner}/{repo_name}: {result.get('errors')}"
            )

        issues_data = repository['issues']
        issues.extend(issues_data['nodes'])

        # Print progress
        print(f"Repository: {repo_owner}/{repo_name}, Issues fetched so far: {len(issues)}")

        # Pagination handling
        page_info = issues_data['pageInfo']
        if not page_info['hasNextPage']:
            return issues
        cursor = page_info['endCursor']

In [None]:
load = False  # True: start from the rows already saved in issues.csv

# Column order of the issues table. 'Issue State' and 'Issue Number' are listed
# explicitly (at the end, where row-wise appends used to add them).
columns = ['Repo Owner', 'Repo Name', 'Issue Title', 'Issue Body', 'Author', 'Assignees', 'Comments', 'Labels', 'Issue State', 'Issue Number']

# Read any existing data up front so a missing file fails before the slow fetch
existing = pd.read_csv('issues.csv') if load else None

# List of repositories (replace with your list)
repositories = [('ckeditor', 'ckeditor5')]  # Replace with your repo list

# Process each repository
rows = []
for owner, name in repositories:
    issues = fetch_all_issues(owner, name)
    for issue in issues:
        # Safely extract nested fields (author can be null; `or []` guards null node lists)
        author_login = issue['author']['login'] if issue['author'] else None
        assignees = [assignee['login'] for assignee in (issue['assignees']['nodes'] or [])]
        comments = [
            {'body': comment['body'], 'author': comment['author']['login'] if comment['author'] else None}
            for comment in (issue['comments']['nodes'] or [])
        ]
        labels = [label['name'] for label in (issue['labels']['nodes'] or [])]
        rows.append({
            'Repo Owner': owner,
            'Repo Name': name,
            'Issue State': issue['state'],
            'Issue Number': issue['number'],  # This is the issue's number in the repository, not the issue ID
            'Issue Title': issue['title'],
            'Issue Body': issue['body'],
            'Author': author_login,
            'Assignees': assignees,
            'Comments': comments,
            'Labels': labels
        })

# Build the frame once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every row.
new_rows = pd.DataFrame(rows, columns=columns)
if existing is not None:
    df = pd.concat([existing, new_rows], ignore_index=True)
else:
    df = new_rows

# Output or process the DataFrame
print(df.head())  # Prints first few rows of the DataFrame

In [6]:
# Write one CSV per repository. The file name comes from the 'Repo Name' column,
# so this cell does not depend on a loop variable left over from another cell.
for repo_name, repo_issues in df.groupby('Repo Name'):
    repo_issues.to_csv('/data/' + repo_name + '_issues.csv', index=False)