In [None]:
from top_github_scraper import (get_top_repo_urls, get_top_repos, get_top_contributors, 
get_top_user_urls, get_top_users)
#import datapane as dp 
import pandas as pd 
import numpy as np
from tqdm import tqdm
import requests
import os
from bs4 import BeautifulSoup
import time
import matplotlib.pyplot as plt
# GitHub credentials for authenticated API requests. They are taken from the
# environment so that they never have to be written into the notebook; each
# value is None when the corresponding variable is not set.
USERNAME = os.environ.get("GITHUB_USERNAME")
TOKEN = os.environ.get("GITHUB_TOKEN")

### Define your search keywords

We scrape 38 curated `github_topics` to get a broad and diverse selection of GitHub repositories.
Feel free to change these topics to suit your own interests or needs. (The example calls at the end of this notebook use the shorter `keywords` list.)

In [None]:
# Short keyword list used by the example scraping calls at the end of the notebook.
keywords = ["data science", "api"]

# Curated search topics (38 entries). Every entry is sent to the GitHub search
# verbatim, so spelling matters.
github_topics = [
    '3D', 'Algorithm', 'Android', 'API', 'Arduino', 'Atom', 'aws', 'azure',
    'bash', 'bootstrap', 'chrome', 'compiler', 'cryptocurrency',
    'data structures', 'database', 'data visualization', 'deep learning',
    'data science', 'deployment', 'flask', 'front end', 'git', 'google',
    'iOS', 'json', 'library', 'machine learning', 'macOS', 'mobile',
    'modeling', 'natural language processing', 'neural network',
    'operating system', 'parsing', 'software', 'server', 'virtual reality',
    'windows',
]

# Browser-like User-Agent that is sent along with the contributor profile requests.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'}

#### Scraping functions

##### GitHub Repo Scraping Functions
In the following cell we define the GitHub repository scraping functions. They are based on the following [Medium article](https://towardsdatascience.com/i-scraped-more-than-1k-top-machine-learning-github-profiles-and-this-is-what-i-found-1ab4fb0c0474)

In [None]:
def get_repo_info(keyword, start=0, stop=10):
    """
    Scrapes important information from pages 'start' up to (excluding) 'stop' of the GitHub
    repositories queried with 'keyword'.

    Grabs 'name', 'stargazers_count', 'forks_count', 'subscribers_count', 'topics', 'language',
    'created_at', 'updated_at' information from all of these repositories and adds the 'url'
    as well as the 'search_word' to it.

    Repositories whose API request does not succeed (any status other than 200 once rate
    limiting has cleared) are reported on stdout and skipped, so that a single deleted or
    blocked repository does not abort the whole scrape.

    Parameters:
    keyword (string): Keyword to search GitHub repositories for
    start (int): First page of the query to be taken. Can be used to make subsequent, smaller query calls. Default = 0.
    stop (int): Limits the amount of pages to be scraped by the query. Default = 10.

    Returns:
    dataframe: A dataframe indexed by the GitHub repository id, combining all the scraped
               information for the relevant repositories.

    """
    # First gather the repo URLs of the requested result pages for that keyword
    repos = get_repo_urls(keyword, start=start, stop=stop)

    all_repo_info = dict()
    # Information to be scraped from every repo
    info_to_scrape = ['name', 'stargazers_count', 'forks_count', 'subscribers_count', 'topics', 'language', 'created_at', 'updated_at']
    for repo_url in tqdm(repos, desc="Scraping top repo info..."):
        repo_info_url = f"https://api.github.com/repos{repo_url}"
        while True:
            # Issue an API request
            response = requests.get(repo_info_url, auth=(USERNAME, TOKEN))
            # GitHub signals an exhausted rate limit either with 429 or with a 403
            # whose X-RateLimit-Remaining header is "0".
            rate_limited = response.status_code == 429 or (
                response.status_code == 403
                and response.headers.get("X-RateLimit-Remaining") == "0"
            )
            if not rate_limited:
                break
            # Too many requests have been sent: wait a bit and try again
            print("Timeout, retrying to fetch repository information...")
            time.sleep(30)
        if response.status_code != 200:
            # Error bodies carry no 'id' or statistics; indexing them would raise KeyError.
            print(f"Skipping {repo_url}: GitHub API answered with status {response.status_code}")
            continue
        repo_info = response.json()
        repo_important_info = {info: repo_info[info] for info in info_to_scrape}
        repo_important_info['url'] = repo_url
        repo_important_info['search_word'] = keyword
        # Rows are keyed by the numeric repository id
        all_repo_info[repo_info['id']] = repo_important_info
    # Build a dataframe out of the scraped data
    repo_df = pd.DataFrame.from_dict(all_repo_info, orient='index', columns=info_to_scrape+['url','search_word'])
    return repo_df

def all_repo_info(keywords, start = 0, stop=10):
    """
    Scrapes important information from pages 'start' up to (excluding) 'stop' of all GitHub
    repositories queried with all keywords in 'keywords'.
    Writes the scraped data to disk as "data/most_updated_repo_info_stop'start'to'stop'.csv"
    after every keyword, so that partial progress survives an interrupted run. The 'data'
    directory is created if it does not exist yet.

    Grabs 'name', 'stargazers_count', 'forks_count', 'subscribers_count', 'topics', 'language',
    'created_at', 'updated_at' information from all of these repositories and adds the 'url'
    as well as the 'search_word' to it.

    Parameters:
    keywords (List[string]): Keywords to search GitHub repositories for
    start (int): First page of the query to be taken. Can be used to make subsequent, smaller query calls. Default = 0.
    stop (int): Limits the amount of pages to be scraped by the query. Default = 10.

    Returns:
    dataframe: A dataframe combining all the scraped information for the relevant repositories.
               It is empty (but has all columns) when 'keywords' is empty.

    """
    columns = ['name', "stargazers_count", "forks_count", 'subscribers_count', 'topics', 'language', 'created_at', 'updated_at', 'url', 'search_word']
    # Returned unchanged when there are no keywords to process
    repo_df = pd.DataFrame(columns=columns)
    # Make sure the output directory exists before any scraping time is spent
    os.makedirs('data', exist_ok=True)
    output_path = f'data/most_updated_repo_info_stop{start}to{stop}.csv'

    per_keyword_frames = []
    for k in keywords:
        # Gather all important information about the repos associated with a specific keyword
        new_repo = get_repo_info(k, start=start, stop=stop)
        print(k, len(new_repo.index))
        per_keyword_frames.append(new_repo)
        # Combine the dataframes of all keywords scraped so far. Concatenating only the
        # scraped frames (no empty seed frame) keeps the resulting dtypes well defined.
        repo_df = pd.concat(per_keyword_frames)
        # Write partial result to disk to keep progress
        repo_df.to_csv(output_path)
    return repo_df
            

def topic_relationship_table(repo_df, output_file):
    """
    Builds a GitHub repository id to repo topic relationship table, writes it to
    'output_file' as CSV and returns it.

    The repository id is taken from the index of 'repo_df'. Rows are read positionally, so
    an id that occurs more than once (e.g. a repository found under several search words)
    is handled correctly; identical (id, topic) pairs are emitted only once. Rows whose
    'topics' value is missing (None/NaN) contribute no pairs.

    Parameters:
    repo_df (Dataframe): GitHub repo dataframe to build id-topic-relationship table for.
                         Needs a 'topics' column holding an iterable of topics per row.
    output_file (string): Path the relationship table is written to (CSV format)

    Returns:
    dataframe: A new dataframe with the columns 'id' and 'topic', containing a separate
               mapping from a repo ID to all its related topics.

    """
    pairs = []
    for repo_id, topics in zip(repo_df.index, repo_df['topics']):
        # A missing value shows up as None or as a float NaN; neither can be iterated
        if topics is None or isinstance(topics, float):
            continue
        # Build the individual pairings
        pairs.extend((repo_id, topic) for topic in topics)
    # dict.fromkeys drops repeated pairs while keeping the first-seen order
    unique_pairs = list(dict.fromkeys(pairs))
    df = pd.DataFrame(unique_pairs, columns=['id', 'topic'])
    df.to_csv(output_file)
    return df


# CSS class of the result links on the GitHub search page, per search type
SCRAPE_CLASS = {'Users': 'mr-1', 'Repositories': "v-align-middle"}
# Search type used by get_repo_urls
TYPE = 'Repositories'
def get_repo_urls(keyword, start=0, stop=10):
    """
    Queries the result pages 'start' up to (excluding) 'stop' of the GitHub search for the
    given keyword and returns the links found on them as a list.

    Also takes into consideration timeouts (HTTP 429) and subsequently waits for 60 seconds
    upon such a timeout, so it may take a while. Independently of that, the function pauses
    5 seconds after every page.

    The keyword is passed to 'requests' as a query parameter, which URL-encodes it: spaces
    become '+', and characters such as '+', '&' or '#' no longer corrupt the query string.

    Parameters:
    keyword (string): Keyword to search GitHub repositories for
    start (int): First page of the query to be taken. Can be used to make subsequent, smaller query calls. Default = 0.
    stop (int): Limits the amount of pages to be scraped by the query. Default = 10.

    Returns:
    list: A list containing all GitHub repo urls (the 'href' values of the result links)
          associated with the keyword.

    """
    search_url = "https://github.com/search"
    request_headers = {'User-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15'}
    urls = []
    # NOTE(review): GitHub search pages appear to be 1-indexed; confirm that the default
    # start=0 does not simply return the first page twice.
    for page_num in tqdm(range(start, stop), desc="Scraping top GitHub URLs..."):
        query = {'o': 'desc', 'p': page_num, 'q': keyword, 's': '', 'type': TYPE}
        page = requests.get(search_url, params=query, headers=request_headers)
        while page.status_code == 429:
            # The page was not fetched: time out and retry
            print("Timeout, retrying to fetch repo urls...")
            time.sleep(60)
            page = requests.get(search_url, params=query, headers=request_headers)
        # Extract contents
        soup = BeautifulSoup(page.text, "html.parser")
        a_tags = soup.find_all("a", class_=SCRAPE_CLASS[TYPE])
        # Gather the relevant urls; anchors without an href would otherwise add None
        hrefs = (a_tag.get("href") for a_tag in a_tags)
        urls.extend(href for href in hrefs if href)
        time.sleep(5)
    return urls

##### GitHub Repo Contributor Scraping Functions
Next, we scrape information about the top ten contributors of each repo, so that we can later combine it to illustrate connections between different repositories.

In [None]:
def get_repo_contributors(repo_url, repo_contributor_rel, repo_id, contributors_set, n_contributors=10, max_retries=3):
    """
    Scrapes the top 'n_contributors' of the repo given under 'repo_url' and saves the mapping into
    'repo_contributor_rel' with the given 'repo_id' and returns a new dataframe with all the
    contributor information.

    A profile is only fetched for contributors with more than 10 contributions that are not
    yet contained in 'contributors_set'; those logins are added to 'contributors_set'.

    Parameters:
    repo_url (string): repo to scrape contributors for
    repo_contributor_rel (set): set to be populated with tuples of (repo_id, contrib_username, # of contrib)
    repo_id (int): the id of the GitHub repo
    contributors_set (set): set of all already known contributors
    n_contributors (int): number of ordered contributors to maximally scrape from a given repository. Default = 10.
    max_retries (int): how often a 403 answer of the contributors endpoint is retried (after
                       sleeping 1500 seconds each time) before giving up. Default = 3.

    Returns:
    dataframe: A df containing all the information for the top n contributors of the given
               GitHub repo. It is empty when the endpoint returns no content (204) or
               anything other than a list of contributors.

    Throws:
    RuntimeError if the contributors page still answers with status code 403 after
    'max_retries' retries

    """
    contributor_url = (f"https://api.github.com/repos{repo_url}/contributors")
    contributor_page = requests.get(contributor_url, auth=(USERNAME, TOKEN))
    all_contributors = dict()

    # A 403 is treated as a rate limit first: wait and retry. The number of retries is
    # bounded because a 403 that is not caused by rate limiting never goes away, which
    # would otherwise keep this loop sleeping forever.
    retries = 0
    while contributor_page.status_code == 403 and retries < max_retries:
        print('Sleeping')
        time.sleep(1500)
        contributor_page = requests.get(contributor_url, auth=(USERNAME, TOKEN))
        retries += 1
    if contributor_page.status_code == 403:
        # If this happens, restart the script
        raise RuntimeError(f"GitHub still answers 403 for {contributor_url} after {retries} retries")

    # 204 means there is no content, hence no JSON body to parse
    if contributor_page.status_code != 204:
        contributor_list = contributor_page.json()
        profile_features = ["login", "url", "type", "name", "company", "location", "hireable", "bio", "public_repos", "public_gists", "followers", "following", "created_at"]
        # Error answers are JSON objects rather than lists and yield no contributors
        if isinstance(contributor_list, list):
            for contributor in contributor_list[:max(n_contributors, 0)]:
                # Add an entry into the repo_contributor_relation set consisting of the repo id, the contributor username and the amount of contributions of that user on this repo
                repo_contributor_rel.add((repo_id, contributor["login"], contributor["contributions"]))
                if contributor["login"] not in contributors_set and contributor["contributions"] > 10:
                    # Save all users with a significant amount of contributions to any repo, but only do it once
                    contributors_set.add(contributor["login"])
                    profile = requests.get(contributor["url"], auth=(USERNAME, TOKEN),headers=headers)
                    while profile.status_code == 429:
                        # Check for timeouts and retry again after waiting
                        print("Timeout, retrying to fetch contributor profile...")
                        time.sleep(30)
                        profile = requests.get(contributor["url"], auth=(USERNAME, TOKEN),headers=headers)
                    all_contributors[contributor["login"]] = {key: val for key, val in profile.json().items() if key in profile_features}
    return pd.DataFrame.from_dict(all_contributors,orient='index')

def get_all_contributors(repos,repo_contributor_rel,contributors_set,n_contributors=10,file_suffix='stop75'):
    """
    Scrapes the top 'n_contributors' of all the 'repos' given.

    After every repository the progress is written to disk (the 'data' directory is created
    if it does not exist yet):
    - the contributor profile information as
      'data/most_updated_(n_contributors)_contributor_info_(file_suffix).csv'
    - the repo-contributor mapping collected in 'repo_contributor_rel' as
      'data/repo_contributor_relationship_table_(file_suffix).csv'

    Parameters:
    repos (dataframe): repos to scrape contributors for; needs the columns 'url' and 'id'
    repo_contributor_rel (set): set to be populated with tuples of (repo_id, contrib_username, # of contrib)
    contributors_set (set): set of all already known contributors
    n_contributors (int): number of ordered contributors to maximally scrape from a given repository. Default = 10.
    file_suffix (string): suffix used in the names of both output files. Default = 'stop75'.

    Returns:
    nothing

    Throws:
    whatever get_repo_contributors raises while fetching a contributors page

    """
    # Make sure the output directory exists before any scraping time is spent
    os.makedirs('data', exist_ok=True)
    contributor_path = f'data/most_updated_{n_contributors}_contributor_info_{file_suffix}.csv'
    relationship_path = f'data/repo_contributor_relationship_table_{file_suffix}.csv'

    contributor_df = pd.DataFrame(columns=["login", "url", "type", "name", "company", "location", "hireable", "bio", "public_repos", "public_gists", "followers", "following", "created_at"])
    repos_zip = list(zip(repos.url,repos.id))
    for url, r_id in tqdm(repos_zip, desc="Scraping top contributors info..."):
        # Get top n_contributors of this repo
        new_contributors = get_repo_contributors(url, repo_contributor_rel, r_id, contributors_set, n_contributors=n_contributors)
        # Add it to the already scraped ones; an empty result leaves the table unchanged
        if not new_contributors.empty:
            contributor_df = pd.concat([contributor_df, new_contributors]).drop_duplicates()
        # Save progress of contributor data to disk
        contributor_df.to_csv(contributor_path)
        # Save relationship df to disk
        pd.DataFrame(repo_contributor_rel, columns=['Repo','Contributor','Contributions']).sort_values('Repo').to_csv(relationship_path)

### Scraping the data: Example function calls


In [None]:
# First get repo information based on the search words
start = 0
stop = 75

repos = all_repo_info(keywords, start=start, stop=stop)
topic_rel = topic_relationship_table(repos, output_file = f'data/topic_relationship_table_stop{stop}')

# Next get contributor information for all repositories previously scraped
contributors = set()
repo_contributor_rel = set()

# Reload the repositories from the CSV written by all_repo_info. Its first column holds the
# dataframe index, i.e. the repository ids; expose it as the 'id' column that
# get_all_contributors reads next to 'url'.
repos_from_disk = (
    pd.read_csv(f'data/most_updated_repo_info_stop{start}to{stop}.csv', index_col=0)
    .rename_axis('id')
    .reset_index()
)
get_all_contributors(repos_from_disk, repo_contributor_rel, contributors)