In [1]:
import os
import json
import glob
import shutil
import requests
import subprocess
import pandas as pd

In [2]:
def flatten_json(data, output, prefix=""):
    """Flatten a nested dictionary into ``output`` using underscore-joined keys.

    Every non-dict value found in ``data`` is stored in ``output`` under a key
    built from the full path of keys leading to it, joined with ``_``
    (``{"a": {"b": {"c": 1}}}`` becomes ``{"a_b_c": 1}``).  Keeping the whole
    path matters: two different branches that end in the same key names
    (e.g. ``owner.login`` and ``parent.owner.login``) must not overwrite
    each other.

    Args:
        data: Dictionary to flatten; values may themselves be dictionaries.
        output: Dictionary that receives the flattened entries (mutated in place).
        prefix: Key path accumulated so far; callers normally leave the default.

    Returns:
        The same ``output`` object that was passed in.  Empty nested
        dictionaries contribute no entries.
    """
    for key, value in data.items():
        # Carry the complete ancestor path down, not just the immediate parent key.
        full_key = "{}_{}".format(prefix, key) if prefix else key
        if isinstance(value, dict):
            flatten_json(value, output, full_key)
        else:
            output[full_key] = value

    return output

In [3]:
def change_permissions(direc):
    """Recursively set mode 0o777 on every file and directory below ``direc``.

    The directory ``direc`` itself is not modified, only the entries that
    ``os.walk`` reports underneath it.  Symbolic links are skipped.

    Args:
        direc: Path of the directory tree to walk.
    """
    for root, dirs, files in os.walk(direc):
        for name in files + dirs:
            path = os.path.join(root, name)
            # os.chmod follows symbolic links, so a link inside the tree could
            # change the mode of a file outside of it, and a dangling link
            # would raise FileNotFoundError.  Leave links untouched.
            if os.path.islink(path):
                continue
            os.chmod(path, 0o777)

In [4]:
def _remove_non_code_files(repo_dir, file_extensions):
    """Delete files whose extension is not in ``file_extensions``, then prune empty directories."""
    for root, _, files in os.walk(repo_dir):
        for f in files:
            # The text after the last "." is compared, so extensions are listed without a dot.
            if f.split(".")[-1] not in file_extensions:
                os.remove(os.path.join(root, f))

    # Walk bottom-up: a directory that only contained empty directories becomes
    # empty once its children are removed, and must then be removed as well.
    for root, dirs, _ in os.walk(repo_dir, topdown=False):
        for d in dirs:
            path = os.path.join(root, d)
            # os.rmdir cannot remove a symbolic link, so leave links alone.
            if not os.path.islink(path) and not os.listdir(path):
                os.rmdir(path)


def download_repositories(repository_list, github_token, file_extensions):
    """Clone the repositories listed in a CSV file and keep only their code files.

    For every row the GitHub API is queried for the repository metadata, the
    repository is cloned into a directory named after the row's ``filename``,
    its ``.git`` folder and all files with a non-listed extension are removed,
    and the flattened metadata is written to ``<filename>/<filename>.csv``.
    Repositories with no remaining files are deleted again.  Rows whose
    ``filename`` already exists on disk are skipped.  Progress and failures
    are reported with ``print``; a failure for one row does not stop the loop.

    Args:
        repository_list: Path of a CSV file with ``repo_url`` and ``filename`` columns.
        github_token: GitHub API token; when empty, no Authorization header is sent.
        file_extensions: Container of file extensions (without the leading dot) to keep.
    """
    api_endpoint = "https://api.github.com/repos"
    api_headers = {"Accept": "application/vnd.github.mercy-preview+json"}
    # Only attach credentials when a token was actually supplied, so an empty
    # token results in an anonymous request instead of a malformed header.
    if github_token:
        api_headers["Authorization"] = "Bearer {}".format(github_token)

    for _, row in pd.read_csv(repository_list).iterrows():
        repo_url = row["repo_url"]
        filename = row["filename"]
        repo_name = repo_url.split("https://github.com/")[-1]

        # Ensure repository has not already been downloaded
        if os.path.exists(filename):
            print("Repository {} already downloaded".format(filename))
            continue

        try:
            # Make request to Github API to get repository metadata
            api_request = requests.get(
                "{}/{}".format(api_endpoint, repo_name),
                headers=api_headers,
                timeout=30,  # never let one unresponsive request stall the whole run
            )

            # Check to make sure API request was successful
            if api_request.status_code != 200:
                # This error occurs if the repo no longer exists or you have reached the API request limit
                print("Could not download {} due to a network error (HTTP status {})".format(
                    filename, api_request.status_code))
                continue

            request_df = pd.DataFrame([flatten_json(api_request.json(), {})])

            # Clone the repo into a new directory named after the arxiv ID of the paper associated with the repo.
            # The history is discarded right below, so a shallow clone is enough;
            # "--" stops a URL starting with "-" from being parsed as a git option.
            exit_code = subprocess.call(["git", "clone", "--depth", "1", "--", repo_url, filename])
            if exit_code != 0:
                raise RuntimeError("git clone exited with status {}".format(exit_code))

            # Change permissions of all files/folders so they can be deleted
            change_permissions(filename)

            # Delete .git folder
            shutil.rmtree(os.path.join(filename, ".git"))

            # Remove non-code files and empty directories
            _remove_non_code_files(filename, file_extensions)

            # Delete repository if empty (this is because the repo has no code)
            if not os.listdir(filename):
                shutil.rmtree(filename)
            else:
                request_df.to_csv("{}/{}.csv".format(filename, filename), encoding="utf-8", index=False)
                print("Successfully downloaded {}".format(filename))

        except Exception as e:
            # The directory did not exist before this attempt, so anything left
            # behind is a partial download; remove it so that a later run
            # retries instead of reporting the repository as already downloaded.
            shutil.rmtree(filename, ignore_errors=True)
            print("Could not download {} because {}".format(filename, str(e)))

In [None]:
# Load the mapping of language -> list of file extensions; the context manager
# guarantees the file handle is closed once the JSON has been parsed.
with open("file_extensions.json", "r") as extensions_file:
    file_extensions = json.load(extensions_file)

# Collapse every per-language list into one set of extensions to keep.
languages = {extension for extensions in file_extensions.values() for extension in extensions}

# Read the GitHub token from the environment so it never has to be written into
# the notebook; an unset variable yields the same empty token as before.
download_repositories("dataset.csv", os.environ.get("GITHUB_TOKEN", ""), languages)

Successfully downloaded 1806.06098v1
Successfully downloaded 1712.05889v2
Successfully downloaded 1904.02399v4
Successfully downloaded 1806.05507v1
Successfully downloaded 1905.02925v1
Successfully downloaded 1801.09319v2
Successfully downloaded 1806.02988v1
Successfully downloaded 1705.03854v3
Successfully downloaded 1206.2944v2
Successfully downloaded 1807.05118v1
Successfully downloaded 1806.01175v1
Repository 1206.2944v2 already downloaded
Successfully downloaded 1805.12048v1
Successfully downloaded 1803.01299v2
Successfully downloaded 1805.10799v1
Successfully downloaded 1711.11511v5
Successfully downloaded 1711.04956v5
Successfully downloaded 1903.11114v2
Successfully downloaded 1604.07316v1
Successfully downloaded 1708.02551v1
Successfully downloaded 1805.07499v1
Successfully downloaded 1611.01603v6
Successfully downloaded 1402.0929v3
Successfully downloaded 1805.05480v2
Successfully downloaded 1805.09843v1
Successfully downloaded 1805.06504v1
Successfully downloaded 1403.5607v1

Successfully downloaded 1810.12576v1
Successfully downloaded 1802.05668v1
Successfully downloaded 1808.00897v1
Successfully downloaded 1506.05198v3
Successfully downloaded 1412.7753v2
Successfully downloaded 1502.00750v1
Successfully downloaded 1502.00130v1
Successfully downloaded 1502.02761v1
Successfully downloaded 1503.02424v2
Successfully downloaded 1712.05690v2
Successfully downloaded 1705.02801v4
Successfully downloaded 1408.5093v1
Successfully downloaded 1401.6169v2
Successfully downloaded 1503.03578v1
Successfully downloaded 1307.5302v3
Successfully downloaded 1404.4893v1
Repository 1712.01815v1 already downloaded
Successfully downloaded 1705.02364v5
Successfully downloaded 1811.08988v3
Successfully downloaded 1703.06345v1
Successfully downloaded 1711.11575v2
Successfully downloaded 1905.13391v2
Successfully downloaded 1803.08375v2
Repository 1703.06870v3 already downloaded
Successfully downloaded 1811.09621v2
Successfully downloaded 1508.07909v5
Successfully downloaded 1506.03

Successfully downloaded 1411.0292v2
Repository 1810.04805v2 already downloaded
Successfully downloaded 1312.4400v3
Successfully downloaded 1701.06547v5
Successfully downloaded 1705.08841v1
Repository 1707.06347v2 already downloaded
Successfully downloaded 1804.06208v2
Successfully downloaded 1502.03240v3
Successfully downloaded 1812.01483v2
Successfully downloaded 1606.04316v3
Successfully downloaded 1801.06126v3
Successfully downloaded 1702.00288v3
Successfully downloaded 1502.05477v5
Successfully downloaded 1810.08427v1
Successfully downloaded 1902.11004v1
Successfully downloaded 1901.07031v1
Successfully downloaded 1702.08835v3
Successfully downloaded 1511.06410v3
Successfully downloaded 1806.09907v1
Successfully downloaded 1901.03353v1
Successfully downloaded 1703.03906v2
Successfully downloaded 1711.06396v1
Successfully downloaded 1906.06059v2
Repository 1312.5602v1 already downloaded
Successfully downloaded 1704.06125v1
Successfully downloaded 1408.5882v2
Successfully downloaded 

Successfully downloaded 1808.04487v1
Successfully downloaded 1808.06876v3
Successfully downloaded 1804.00823v4
Successfully downloaded 1606.00298v1
Successfully downloaded 1806.01946v3
Successfully downloaded 1901.09590v2
Repository 1703.03130v1 already downloaded
Successfully downloaded 1412.6575v4
Successfully downloaded 1611.07932v2
Successfully downloaded 1802.02950v4
Successfully downloaded 1902.10903v1
Successfully downloaded 1807.06906v1
Successfully downloaded 1705.09307v1
Repository 1707.06347v2 already downloaded
Successfully downloaded 1511.06581v3
Repository 1611.07004v3 already downloaded
Successfully downloaded 1702.03814v3
Successfully downloaded 1811.07417v1
Successfully downloaded 1809.02847v2
Successfully downloaded 1809.02701v4
Successfully downloaded 1611.05425v1
Successfully downloaded 1606.00094v2
Successfully downloaded 1603.01360v3
Successfully downloaded 1806.06259v1
Successfully downloaded 1411.5799v2
Successfully downloaded 1703.00395v1
Repository 1810.04805v

Successfully downloaded 1809.10853v3
Successfully downloaded 1810.07842v1
Successfully downloaded 1806.05975v2
Successfully downloaded 1502.05767v4
Successfully downloaded 1602.00904v2
Successfully downloaded 1802.09957v1
Successfully downloaded 1804.10686v1
Successfully downloaded 1605.06595v2
Successfully downloaded 1703.02504v1
Successfully downloaded 1602.02410v2
Successfully downloaded 1808.00793v1
Successfully downloaded 1809.06309v3
Successfully downloaded 1808.04699v1
Successfully downloaded 1705.11040v2
Successfully downloaded 1512.02325v5
Repository 1701.07875v3 already downloaded
Successfully downloaded 1703.06103v4
Successfully downloaded 1901.04095v2
Successfully downloaded 1708.04439v2
Successfully downloaded 1411.5928v4
Repository 1505.04597v1 already downloaded
Successfully downloaded 1812.05692v1
Successfully downloaded 1809.02058v3
Successfully downloaded 1810.09305v1
Successfully downloaded 1809.02196v2
Successfully downloaded 1804.00497v3
Successfully downloaded 180

Successfully downloaded 1807.10076v3
Successfully downloaded 1610.02391v3
Successfully downloaded 1801.02203v1
Successfully downloaded 1703.09554v5
Repository 1602.07360v4 already downloaded
Successfully downloaded 1908.11587v1
Successfully downloaded 1903.06708v2
Successfully downloaded 1805.10255v1
Successfully downloaded 1811.08305v1
Successfully downloaded 1603.01354v5
Successfully downloaded 1704.03477v4
Successfully downloaded 1904.07475v4
Successfully downloaded 1712.08290v2
Successfully downloaded 1708.04782v1
Successfully downloaded 1512.04150v1
Repository 1408.5882v2 already downloaded
Successfully downloaded 1703.03329v2
Successfully downloaded 1902.05625v2
Successfully downloaded 1510.07493v1
Successfully downloaded 1702.03044v2
Successfully downloaded 1706.04638v3
Successfully downloaded 1706.05137v1
Successfully downloaded 1807.03819v3
Successfully downloaded 1807.00775v1
Repository 1506.01497v3 already downloaded
Successfully downloaded 1802.07606v1
Successfully download

Successfully downloaded 1803.02291v3
Successfully downloaded 1612.07837v2
Successfully downloaded 1610.00465v1
Repository 1603.01417v1 already downloaded
Successfully downloaded 1611.08387v1
Successfully downloaded 1707.03333v1
Successfully downloaded 1703.05884v2
Successfully downloaded 1808.10692v1
Successfully downloaded 1705.08623v2
Successfully downloaded 1808.05469v2
Repository 1403.6652v2 already downloaded
Repository 1602.07360v4 already downloaded
Successfully downloaded 1707.02392v3
Successfully downloaded 1802.10250v3
Successfully downloaded 1710.10951v2
Repository 1805.03359v2 already downloaded
Successfully downloaded 1706.04957v2
Repository 1810.04805v2 already downloaded
Successfully downloaded 1805.07549v1
Successfully downloaded 1801.00062v1
Repository 1412.7753v2 already downloaded
Successfully downloaded 1807.02125v2
Repository 1901.03353v1 already downloaded
Successfully downloaded 1801.00926v3
Successfully downloaded 1810.05475v1
Successfully downloaded 1712.01076v

Successfully downloaded 1810.00774v1
Successfully downloaded 1610.08735v2
Repository 1809.02627v1 already downloaded
Successfully downloaded 1610.10087v1
Successfully downloaded 1607.02488v2
Repository 1702.02138v2 already downloaded
Successfully downloaded 1701.05363v3
Successfully downloaded 1901.00158v2
Successfully downloaded 1506.04834v3
Successfully downloaded 1902.03155v1
Successfully downloaded 1803.03178v1
Successfully downloaded 1906.01083v1
Successfully downloaded 1806.06457v2
Repository 1605.00316v1 already downloaded
Repository 1706.03762v5 already downloaded
Successfully downloaded 1809.04206v3
Successfully downloaded 1712.01521v1
Repository 1706.03762v5 already downloaded
Repository 1706.03762v5 already downloaded
Successfully downloaded 1702.02447v2
Repository 1607.04606v2 already downloaded
Successfully downloaded 1909.05289v2
Successfully downloaded 1904.09925v4
Successfully downloaded 1904.03441v1
Successfully downloaded 1712.02136v3
Successfully downloaded 1908.1082