# Get all commit SHAs for the repo from GitHub

In [1]:
import requests
import os

# Pieces of the GitHub REST API URL for the repository whose commits are read.
base_url = 'https://api.github.com'
repo_path = '/repos/flutter/flutter'
end_point = '/commits'
# Query string for the first listing request: 100 results per page, page 1.
page_url_query = "?per_page=100&page=1"

# Headers sent with every request. The token is read from the environment so
# that it never appears in the notebook itself.
# NOTE(review): GitHub expects a scheme prefix ("token ..." / "Bearer ...") in
# this header -- presumably GITHUB_AUTH_TOKEN already contains it; confirm.
# An explicit 'Accept': 'application/vnd.github+json' header is currently not sent.
my_headers = dict(Authorization=os.getenv('GITHUB_AUTH_TOKEN'))

In [2]:
def get_paginated_data(url, timeout=30):
    """Fetch a paginated endpoint and return the items of all its pages.

    The first page is requested from `url` using the module-level
    `my_headers`; further pages are requested for as long as the latest
    response advertises a `next` relation in its `Link` header.

    Args:
        url: URL of the first page.
        timeout: Seconds to wait for each individual request before giving
            up, so a stalled connection cannot hang the notebook forever.

    Returns:
        The decoded JSON of the first page, extended in place with the
        decoded JSON of every following page.

    Raises:
        requests.exceptions.HTTPError: If any page answers with a 4xx/5xx
            status. Without this check an error body (a JSON object rather
            than a list) would be merged into, or returned as, the result.
        requests.exceptions.Timeout: If a request exceeds `timeout`.
    """
    res = requests.get(url, headers=my_headers, timeout=timeout)
    res.raise_for_status()
    json_data = res.json()

    # `res.links` is the parsed Link header; it has a 'next' entry only while
    # more pages remain.
    while 'next' in res.links:
        res = requests.get(
            res.links['next']['url'], headers=my_headers, timeout=timeout
        )
        res.raise_for_status()
        json_data.extend(res.json())

    return json_data

In [3]:
def get_json_data(url):
    """Issue a single GET request and return its decoded JSON body.

    Args:
        url: Address to request; the module-level `my_headers` are sent along.

    Returns:
        Whatever the response body decodes to. The HTTP status is not
        inspected here, so an error payload is returned like any other body.
    """
    response = requests.get(url, headers=my_headers)
    # Debug aid when requests start failing: print(response.headers['Retry-After'])
    return response.json()

In [4]:
# Address of the first page of the repository's commit listing.
url = ''.join([base_url, repo_path, end_point, page_url_query])

# Walk every page of the listing and keep the combined result.
commits = get_paginated_data(url)

In [5]:
# Keep only the SHA of each listed commit, in listing order.
sha_list = [commit['sha'] for commit in commits]

In [6]:
# import csv

# with open('sha_list.csv', 'w') as file:
#     writer = csv.writer(file)
#     writer.writerow(sha_list)

In [7]:
# validate if all sha in the sha_list are of length 40
# print( all( (len(sha_list[idx]) == 40) for idx in range(len(sha_list)) ) )

# Generate slices of sha_list. Each slice shall contain at most 4,500 SHAs.

In [6]:
import numpy as np
import csv
from math import ceil

# Upper bound on the number of SHAs placed in one slice.
max_slice_len = 1000

# Number of slices needed so that no slice exceeds max_slice_len.
slice_count = ceil(len(sha_list) / max_slice_len)
sliced_sha_list = np.array_split(sha_list, slice_count)

# Persist the slices, one CSV row per slice, so later sessions can reload
# them instead of listing the commits again.
with open('sha_list.csv', 'w', newline='') as file:
    csv.writer(file).writerows(sliced_sha_list)

In [7]:
# Reload the slices from disk: every CSV row becomes one list of SHA strings.
with open('sha_list.csv', newline='') as f:
    sliced_sha_list = list(csv.reader(f))

# Number of slices available.
len(sliced_sha_list)

31

In [5]:
# with open('sha_list.csv') as f:
#     sliced_sha_list = f.read().splitlines()
    
# len(sliced_sha_list)

31

# Generate dataset by fetching data using commit SHA

In [8]:
# final_commit_list = []

import json

# Resume from the commit records saved by an earlier session.
with open('final_commit_list_20.json') as f:
    final_commit_list = json.loads(f.read())

# Number of records loaded.
len(final_commit_list)

20409

In [9]:
def is_exception(json_data):
    """Tell whether a commit payload cannot be turned into a dataset record.

    Args:
        json_data: Decoded JSON of a single-commit response, or None.

    Returns:
        True when `json_data` is None, or when any of its 'commit', 'author'
        or 'stats' entries is missing or None; False otherwise.
    """
    if json_data is None:
        return True

    required_keys = ('commit', 'author', 'stats')

    # .get() yields None for an absent key as well as for an explicit null,
    # so one test covers both rejection reasons.
    return any(json_data.get(key) is None for key in required_keys)

In [10]:
def get_commit_data(sha):
    """Fetch one commit and flatten the fields kept for the dataset.

    Args:
        sha: Commit SHA; it is appended to the commits endpoint URL.

    Returns:
        A one-entry dict mapping `sha` to a flat dict of author, verification
        and change-size fields, or an empty dict when `is_exception` rejects
        the response.
    """
    payload = get_json_data(''.join((base_url, repo_path, end_point, '/', sha)))

    # An empty dict is the failure marker callers look for.
    if is_exception(payload):
        return {}

    git_commit = payload['commit']   # git-level commit object
    git_author = git_commit['author']
    account = payload['author']      # top-level 'author' object of the response
    stats = payload['stats']

    return {
        sha: {
            'author_name':            git_author['name'],
            'author_email':           git_author['email'],
            'date':                   git_author['date'],
            'comment_count':          git_commit['comment_count'],
            'verified':               git_commit['verification']['verified'],
            'author_login':           account['login'],
            'author_id':              account['id'],
            'author_type':            account['type'],
            'author_is_site_admin':   account['site_admin'],
            'total':                  stats['total'],
            'additions':              stats['additions'],
            'deletions':              stats['deletions'],
        }
    }

In [39]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Fetch slice 27 concurrently: one get_commit_data call per SHA, spread over
# a pool of at most 50 worker threads. `futures` keeps the submission order.
with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [
        executor.submit(get_commit_data, sha)
        for sha in sliced_sha_list[27]
    ]

In [40]:
# url = url = base_url + repo_path + end_point + page_url_query + '&author=' + author[0]
# commits_data = get_paginated_data(url)
# commits_data

In [41]:
# Count the fetches that came back as the empty-dict failure marker.
count = sum(1 for future in as_completed(futures) if future.result() == {})

count

0

In [42]:
# Add the fetched records to the dataset.
# The futures are walked in submission order rather than through
# as_completed(): result() blocks until its future is done anyway, and this
# keeps the appended records in the same order as the SHA slice on every run.
# An empty dict is what get_commit_data returns for a rejected response, so it
# is skipped instead of being stored as a record.
for future in futures:
    commit_record = future.result()
    if commit_record:
        final_commit_list.append(commit_record)

In [43]:
# Number of records in the dataset after appending this slice's results.
len(final_commit_list)

26241

In [81]:
import json
# Serialise first and open the file afterwards: opening in 'w' mode truncates
# the file immediately, so a failure inside json.dumps would otherwise wipe
# the previously saved dataset and leave an empty file behind.
json_object = json.dumps(final_commit_list)
with open('final_commit_list.json', 'w', newline='') as fcl_file:
    fcl_file.write(json_object)