# Denison CS181/DA210 APIs, Authenticated

---

In [1]:
import os
import os.path
import sys
import importlib
import matplotlib.pyplot as plt
import pandas as pd

# Locate the shared `modules` directory: prefer the copy three levels up and
# fall back to the one two levels up when that directory does not exist.
module_dir = os.path.join("../../..", "modules")
if not os.path.isdir(module_dir):
    module_dir = os.path.join("../..", "modules")

# Register the absolute path once so modules stored there can be imported
module_path = os.path.abspath(module_dir)
if module_path not in sys.path:
    sys.path.append(module_path)

import util
importlib.reload(util)

import requests
from requests.auth import HTTPBasicAuth

---

## Part A: Authenticated users

So far, we've used the GitHub API without authentication.  As a result:
- We have been limited to 60 requests per hour.
- We haven't been able to view private data, like user data or private repos.

The GitHub API can be used with "Basic Authentication" (i.e., username and password) or OAuth2 (the primary framework used on the internet for delegated authority).  For simplicity, we'll focus on Basic Authentication.

Once we're authenticated, we can use the `/user` endpoint to get information about the authenticated user.  First, let's see what happens without authenticating:

In [2]:
# Get information about the current user via: /user
host = "api.github.com"
resource_path = "/user"
# `url` is reused by a later cell that repeats this request with credentials
url = util.buildURL(resource_path, host, protocol="https")

# No `auth` argument is supplied, so this request carries no credentials
response = requests.get(url)
print(response.status_code)
print(response.json())

401
{'message': 'Requires authentication', 'documentation_url': 'https://docs.github.com/rest/reference/users#get-the-authenticated-user'}


In [3]:
# Inspect the headers: we've used one of our 60-per-hour requests
# (see the X-RateLimit-Limit and X-RateLimit-Remaining entries in the output)
response.headers

{'Server': 'GitHub.com', 'Date': 'Fri, 29 Apr 2022 14:14:14 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '131', 'X-GitHub-Media-Type': 'github.v3; format=json', 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset', 'Access-Control-Allow-Origin': '*', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Frame-Options': 'deny', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '0', 'Referrer-Policy': 'origin-when-cross-origin, strict-origin-when-cross-origin', 'Content-Security-Policy': "default-src 'none'", 'Vary': 'Accept-Encoding, Accept, X-Requested-With', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '56', 'X-RateLimit-Reset': '1651245101', 'X-RateLimit

It is worth noting that as of 2020, GitHub no longer allows username+password for authentication.  Instead, we need to generate a Personal Access Token (much like you did at the start of the semester).  Then, we'll use that in place of our password.

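To start, generate a new PAT and store it, together with your username, in `creds.json` (the code below looks up the `github` credentials and reads their `username` and `pat` entries).  Keep this file out of version control so the token is never committed.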

As discussed in the [Python `requests` documentation](https://docs.python-requests.org/en/master/user/authentication/), Basic Authentication is so common online that it is extremely straightforward with `requests`.  In fact, their example uses this very GitHub API:
```
from requests.auth import HTTPBasicAuth
requests.get('https://api.github.com/user', auth=HTTPBasicAuth('user', 'pass'))
```

Let's try that endpoint again, now with our username and PAT:

In [4]:
# Read username and Personal Access Token (PAT) from creds.json
# (the "github" credentials must provide "username" and "pat" entries)
github_creds = util.read_creds("github", ".", "creds.json")
username = github_creds["username"]
pat = github_creds["pat"]

# Try again, authenticated
# The PAT is passed in the password position of HTTP Basic Auth;
# `url` is the /user URL built in the earlier unauthenticated cell
response = requests.get(url, auth=HTTPBasicAuth(username, pat))
response.status_code

200

In [5]:
# Inspect the headers again -- now we have 4999 requests remaining!
# (X-RateLimit-Limit in the output is now 5000 instead of 60)
response.headers

{'Server': 'GitHub.com', 'Date': 'Fri, 29 Apr 2022 14:15:31 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Cache-Control': 'private, max-age=60, s-maxage=60', 'Vary': 'Accept, Authorization, Cookie, X-GitHub-OTP, Accept-Encoding, Accept, X-Requested-With', 'ETag': 'W/"f589b7bebaf8dcc9d33f8cee476495b2e50fb659ca0e839a68e4a249ce1f85b8"', 'Last-Modified': 'Thu, 07 Apr 2022 11:55:54 GMT', 'X-OAuth-Scopes': 'read:org, read:user, repo, user:email', 'X-Accepted-OAuth-Scopes': '', 'github-authentication-token-expiration': '2022-05-29 01:51:42 UTC', 'X-GitHub-Media-Type': 'github.v3; format=json', 'X-RateLimit-Limit': '5000', 'X-RateLimit-Remaining': '4999', 'X-RateLimit-Reset': '1651245331', 'X-RateLimit-Used': '1', 'X-RateLimit-Resource': 'core', 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accept

In [None]:
# Look at the data for the authenticated user (that's you!)
# `response` here is the authenticated /user response from the cell above.
# NOTE: the output may include personal account details -- consider clearing
# it before sharing this notebook.
user_data = response.json()
user_data

---

## Part B: Getting commit information for private repositories

Last time, we explored the `/repos/{owner}/{repo}/commits` endpoint to get commit information about a public repository.  Once we've authenticated, we can do the same for private repositories to which we have access.

First, let's copy over our `getRepositoryCommits` function, with an added parameter for our auth information.

In [None]:
def getRepositoryCommits(owner, repo, path, num_per_page=10, page=1, auth=None):
    """
    Uses the /repos/{owner}/{repo}/commits GitHub API endpoint to
    retrieve one page of commit information for a given repository.

    Parameters:
        owner: account or organization that owns the repository
        repo: name of the repository
        path: sent as the `path` query parameter; pass None to ask about
            all files (requests leaves None-valued parameters out of the URL)
        num_per_page: sent as the `per_page` query parameter
        page: sent as the `page` query parameter
        auth: optional requests auth object (e.g. an HTTPBasicAuth instance);
            None sends the request without credentials

    Returns the parsed JSON body of the response.  Callers in this notebook
    expect a JSON array of commit objects on success.  When the status code
    is not 200, a "Failed: ..." message is printed and the parsed body
    (GitHub's error object) is still returned, so callers can inspect it.
    """
    # Build the URL
    host = "api.github.com"
    resource_path = f"/repos/{owner}/{repo}/commits"
    url = util.buildURL(resource_path, host, protocol="https")

    # Make the request; auth=None is already the default for requests.get,
    # so a single call covers both the authenticated and anonymous cases
    query_params = {"path": path,
                    "per_page": num_per_page,
                    "page": page}
    response = requests.get(url, params=query_params, auth=auth)

    # Explicit status check instead of `assert`: assertions are stripped
    # under `python -O`, which would silently drop this failure report
    if response.status_code != 200:
        print(f"Failed: {resource_path} with status code {response.status_code}")

    # Return the parsed JSON object
    return response.json()

Let's try to get commit information for all files in the instructor's version of the course repository, but without providing auth information.

In [None]:
# Try to read the course repository, unauthenticated
owner = "amertt-denison-courses"
repo = "cs181-s22"
# No `auth` is passed, so the request is made without credentials;
# path=None asks about all files rather than one file path
data = getRepositoryCommits(owner, repo, path=None)

# Show whatever JSON came back (getRepositoryCommits returns the parsed body
# even when the status code is not 200)
util.print_json(data, level=4)

Now we'll try again, with auth info:

In [None]:
# Read username and Personal Access Token (PAT) from creds.json
github_creds = util.read_creds("github", ".", "creds.json")
username = github_creds["username"]
pat = github_creds["pat"]

# Try again, with HTTP Basic Auth (via PAT)
owner = "amertt-denison-courses"
repo = "cs181-s22"
auth = HTTPBasicAuth(username, pat)
# Same request as the unauthenticated attempt, now with credentials attached
instructor_repo_data = getRepositoryCommits(owner, repo, path=None, auth=auth)

# A single page is fetched here, using getRepositoryCommits' default
# num_per_page=10 and page=1
print("Number of commits retrieved:", len(instructor_repo_data))

---

## Part C: Building a table of commits for the course repo

Now, we'll copy over the remaining functions to build a table of commit information, again adding a parameter for auth info where appropriate.

In [None]:
def commitResult2LoD(result, maxelements=None):
    """
    Converts a JSON array of commit results to an LoD (list of dictionaries).

    Parameters:
        result: list of commit objects as returned by the
            /repos/{owner}/{repo}/commits endpoint
        maxelements: if not None, convert at most this many commits
            (taken from the front of `result`)

    Returns a list with one dictionary per converted commit, with keys:
        "id"        -- the commit SHA
        "message"   -- the commit message
        "author"    -- the GitHub login of the linked account, or the git
                       author name when the commit is not linked to a
                       GitHub account
        "timestamp" -- the git author date string

    Raises AssertionError if `result` is not a list (for example, when an
    error object was returned instead of an array of commits).
    """
    assert isinstance(result, list)

    LoD = []
    for count, commit_obj in enumerate(result):
        if maxelements is not None and count >= maxelements:
            break

        git_author = commit_obj["commit"]["author"]

        # The top-level "author" is null (or an empty object) when GitHub
        # cannot match the commit to an account; fall back to the name
        # recorded in the commit itself instead of crashing on None["login"]
        github_author = commit_obj.get("author") or {}
        login = github_author.get("login")

        D = {}
        D["id"] = commit_obj["sha"]
        D["message"] = commit_obj["commit"]["message"]
        D["author"] = login if login is not None else git_author.get("name")
        D["timestamp"] = git_author["date"]
        LoD.append(D)

    return LoD

In [None]:
def getCommits(owner, repo, query_path, num_commits=15, num_per_page=10, auth=None):
    """
    Uses the /repos/{owner}/{repo}/commits GitHub API endpoint to
    retrieve commit information for a given repository, and returns
    the results (of possibly several paged requests) in a DataFrame.

    Parameters:
        owner, repo: identify the repository
        query_path: file path passed through to getRepositoryCommits
            (None for all files)
        num_commits: maximum number of commits to return
        num_per_page: number of commits requested per page
        auth: optional requests auth object passed through to
            getRepositoryCommits

    Returns a DataFrame with columns "id", "message", "author" and
    "timestamp", holding at most `num_commits` rows in the order the pages
    were received.  The columns are present even when no commits were found.
    """
    columns = ["id", "message", "author", "timestamp"]
    fullLoD = []

    page = 1
    commits_left = num_commits
    more_pages = True

    while more_pages and commits_left > 0:
        commit_page = getRepositoryCommits(owner, repo, query_path, num_per_page, page, auth)

        # A short page means the repository has no further commits
        if len(commit_page) < num_per_page:
            more_pages = False

        # Cap the conversion at the number still wanted so the total never
        # exceeds num_commits, even when it is not a multiple of num_per_page
        pageLoD = commitResult2LoD(commit_page, commits_left)
        fullLoD.extend(pageLoD)

        commits_left -= len(pageLoD)
        page += 1

    # Naming the columns keeps them available on an empty result, so callers
    # can still sort or select by "timestamp"
    df = pd.DataFrame(fullLoD, columns=columns)
    return df

Now we can load recent commit information for the course repository into a `pandas` `DataFrame`.

In [None]:
def getUserRepoCommits(owner, repo, num_commits=90, num_per_page=30):
    """
    Fetch commit information for a repository using the GitHub
    credentials (username+PAT) stored in creds.json.

    The commits are returned as a pandas DataFrame ordered by timestamp,
    oldest commit first, with a fresh 0-based index.  The number of rows
    is printed before returning.
    """
    # Load the "github" credentials and wrap them for HTTP Basic Auth,
    # with the PAT taking the place of the password
    creds = util.read_creds("github", ".", "creds.json")
    basic_auth = HTTPBasicAuth(creds["username"], creds["pat"])

    # query_path=None: commits for all files
    fetched = getCommits(owner, repo, None, num_commits, num_per_page, basic_auth)

    # Reorder so the oldest commit comes first, then renumber the rows
    oldest_first = fetched.sort_values(by="timestamp").reset_index(drop=True)

    print("Number of commits in DataFrame:", len(oldest_first))
    return oldest_first

In [None]:
# Try it out
owner = "amertt-denison-courses"
repo = "cs181-s22"
# Uses the defaults num_commits=90 and num_per_page=30
commits_df = getUserRepoCommits(owner, repo)
# Preview only the first few rows (oldest commits first)
commits_df.head()

We can ask interesting questions, such as what time of day commits typically occur.

In [None]:
def convertToHourFloatSeries(timestampSeries, utc_offset_hours=-4):
    """
    Convert a Series of UTC timestamps of the form 'YYYY-MM-DDThh:mm:ssZ'
    to floating-point hours on a 24-hour local clock.

    Parameters:
        timestampSeries: Series of timestamp strings
        utc_offset_hours: hours added to the UTC time to obtain local time.
            The default of -4 corresponds to Eastern Daylight Time; use -5
            for Eastern Standard Time.

    Returns a Series (same index) of values in [0, 24).  Seconds are
    ignored.

    Ex: '2022-04-29T14:30:00Z' -> 10.5 (with the default offset)
        '2022-04-29T14:30:00Z' -> 14.5 (with utc_offset_hours=0)
    """
    # Keep the part after 'T' and drop the trailing 'Z'  ->  hh:mm:ss
    clock_times = timestampSeries.apply(lambda t: t.split("T")[1][:-1])
    hours = clock_times.apply(lambda t: int(t.split(':')[0]))
    minutes = clock_times.apply(lambda t: int(t.split(':')[1]))

    utc_hours = (hours * 60 + minutes) / 60

    # Wrap around midnight so shifted times stay on the 24-hour clock
    return (utc_hours + utc_offset_hours) % 24

def plotCommitTimes(df):
    """
    Draw a scatterplot of the time of day of every commit in a DataFrame.

    The x-axis is the DataFrame index and the y-axis is the hour of day
    computed from the `timestamp` column by convertToHourFloatSeries.
    Assumes timestamps are of the form 'YYYY-MM-DDThh:mm:ssZ'.
    """
    commit_numbers = list(df.index)
    hours_of_day = convertToHourFloatSeries(df["timestamp"])

    plt.scatter(commit_numbers, hours_of_day)

    # Label the figure so it can stand on its own
    plt.xlabel("Commit Number (most recent last)")
    plt.ylabel("Time of day, 24-hour clock (Eastern Time)")
    plt.title("Commit Times")
    plt.show()

In [None]:
# Plot the time of day of each commit in commits_df (built in an earlier cell)
plotCommitTimes(commits_df)

In [None]:
def plotCommitTimesByAuthor(df):
    """
    Draw a scatterplot of commit times of day, one series per author.

    Authors (values of the `author` column) are plotted in sorted order;
    each gets a color and marker chosen by cycling through fixed lists,
    and is named in the legend.

    Assumes timestamps are of the form 'YYYY-MM-DDThh:mm:ssZ'.
    """
    palette = ["blue", "red", "green", "yellow"]
    shapes = ['o', 'x', 's']

    for position, login in enumerate(sorted(df["author"].unique())):
        # Rows keep their original index, so x-positions match the full table
        own_commits = df.loc[df["author"] == login]
        hours_of_day = convertToHourFloatSeries(own_commits["timestamp"])

        plt.scatter(list(own_commits.index), hours_of_day,
                    label=login,
                    c=palette[position % len(palette)],
                    marker=shapes[position % len(shapes)])

    plt.xlabel("Commit Number (most recent last)")
    plt.ylabel("Time of day, 24-hour clock (Eastern Time)")
    plt.title("Commit Times By User")
    plt.legend()
    plt.show()

In [None]:
# Try it out for a student repo
# TODO: fill in the repository owner and name before running this cell;
# both are left blank here
owner = ""
repo = ""
# Called with num_commits=100 and num_per_page=30
student_commits_df = getUserRepoCommits(owner, repo, 100, 30)

In [None]:
# Plot commit times per author for the student repo loaded in the cell above
plotCommitTimesByAuthor(student_commits_df)