In [None]:
import requests
import time

In [None]:
import os
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

In [None]:
import os

# Colab stores secrets in `userdata`; that module does not exist outside
# Colab, so fall back to the GITHUB_TOKEN environment variable instead of
# crashing on the import.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")
else:
    GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# Report only presence and length -- never print the token itself.
print("Loaded token:", GITHUB_TOKEN is not None)
print("Token length:", len(GITHUB_TOKEN) if GITHUB_TOKEN else 0)

Loaded token: True
Token length: 93


In [None]:
# Fail fast with a clear message: without this check a missing token would be
# sent as the literal string "Bearer None" and only surface later as a 401.
if not GITHUB_TOKEN:
    raise RuntimeError(
        "GITHUB_TOKEN is not set. Add it as a Colab secret or an environment variable."
    )

# Headers shared by every GitHub API request in this notebook.
headers = {
    "Authorization": f"Bearer {GITHUB_TOKEN}",
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
}

## Authentication Check

Before extracting any data, we verify that the GitHub Personal Access Token
is valid and authorized.

We send a simple authenticated request to the GitHub API and check:
- HTTP status code
- JSON response structure

In [None]:
# This request validates that the GitHub token is correct and active.
# The timeout keeps the cell from hanging forever on a stalled connection.
resp = requests.get("https://api.github.com/user", headers=headers, timeout=30)
print("Status:", resp.status_code)

# Print only the account login: the full profile payload contains personal
# details that should not end up in the notebook's saved output.
if resp.ok:
    print("Authenticated as:", resp.json().get("login"))
else:
    print("Authentication failed:", resp.text)

Status: 200
{'login': 'mikhaillmkuleshov', 'id': 254604577, 'node_id': 'U_kgDODyz1IQ', 'avatar_url': 'https://avatars.githubusercontent.com/u/254604577?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/mikhaillmkuleshov', 'html_url': 'https://github.com/mikhaillmkuleshov', 'followers_url': 'https://api.github.com/users/mikhaillmkuleshov/followers', 'following_url': 'https://api.github.com/users/mikhaillmkuleshov/following{/other_user}', 'gists_url': 'https://api.github.com/users/mikhaillmkuleshov/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/mikhaillmkuleshov/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/mikhaillmkuleshov/subscriptions', 'organizations_url': 'https://api.github.com/users/mikhaillmkuleshov/orgs', 'repos_url': 'https://api.github.com/users/mikhaillmkuleshov/repos', 'events_url': 'https://api.github.com/users/mikhaillmkuleshov/events{/privacy}', 'received_events_url': 'https://api.github.com/users/mikhaillmkulesho

## Report #1 — Search Public Repositories

Client goal:
Search for public GitHub repositories based on a keyword.

This report demonstrates how to:
- Query public repositories
- Control result size
- Inspect the response structure

### API Endpoint

GET /search/repositories

This endpoint allows searching public repositories using keywords
and supports pagination and sorting.

In [None]:
search_url = "https://api.github.com/search/repositories"
search_params = {
    "q": "data",     # keyword to search for
    "per_page": 5,   # keep the sample small
    "page": 1
}

# The timeout keeps the cell from hanging forever on a stalled connection.
search_response = requests.get(
    search_url,
    headers=headers,
    params=search_params,
    timeout=30
)

print("Status code:", search_response.status_code)

# Stop here on an HTTP error instead of carrying an error payload into the
# later cells that index into search_data["items"].
search_response.raise_for_status()

search_data = search_response.json()
print("Total repositories found:", search_data.get("total_count"))
print("Repositories returned in this page:", len(search_data.get("items", [])))

Status code: 200
Total repositories found: 6494829
Repositories returned in this page: 5


In [None]:
# Show the key fields of the top search hit.
first_repo = search_data["items"][0]

for label, field in (
    ("Repository name:", "full_name"),
    ("Description:", "description"),
    ("Stars:", "stargazers_count"),
    ("URL:", "html_url"),
):
    print(label, first_repo[field])

Repository name: fivethirtyeight/data
Description: Data and code behind the articles and graphics at FiveThirtyEight
Stars: 17275
URL: https://github.com/fivethirtyeight/data


In [None]:
import json

# Persist the raw search payload as a pretty-printed sample file.
with open("search_repositories_sample.json", mode="w") as sample_file:
    json.dump(obj=search_data, fp=sample_file, indent=2)

In [None]:
import os

# Confirm that the sample file is present on disk.
sample_exists = os.path.exists("search_repositories_sample.json")
print("File exists:", sample_exists)

File exists: True


### Result

- Status code: 200 (Success)
- Authentication token is valid
- API returns a JSON object with:
  - total_count (total number of repositories matching the query)
  - incomplete_results (true if the search timed out before all matches were found, so the results may be incomplete)
  - items (list of repositories)

This confirms that authenticated requests to the GitHub API work correctly.

## Report #2 — Commits + Pagination

Goal: fetch commits for a selected repository and demonstrate pagination (multiple pages of results).

In [None]:
# Reuse the top hit of the earlier search as the repository to inspect.
first_repo = search_data["items"][0]
owner, repo = first_repo["owner"]["login"], first_repo["name"]

repo_slug = f"{owner}/{repo}"
print("Selected repo:", repo_slug)
print("Repo URL:", first_repo["html_url"])

Selected repo: fivethirtyeight/data
Repo URL: https://github.com/fivethirtyeight/data


### Endpoint
GET https://api.github.com/repos/{owner}/{repo}/commits

Pagination:
- per_page: commits per page (max 100)
- page: page number (1,2,3...)

In [None]:
commits_url = f"https://api.github.com/repos/{owner}/{repo}/commits"

commits_params = {
    "per_page": 5,
    "page": 1
}

# The timeout keeps the cell from hanging forever on a stalled connection.
commits_response = requests.get(
    commits_url, headers=headers, params=commits_params, timeout=30
)

print("Status code:", commits_response.status_code)

commits_data_page1 = commits_response.json()

# Quick sanity check.
# On success the endpoint returns a list of commits; on failure GitHub returns
# an error object (a dict), and indexing that with [0] would raise KeyError.
if isinstance(commits_data_page1, list):
    print("Commits returned:", len(commits_data_page1))
    if commits_data_page1:
        first_commit = commits_data_page1[0]
        print("Sample commit SHA:", first_commit.get("sha"))
        print("Sample commit message:", first_commit.get("commit", {}).get("message"))
else:
    print("Response:", commits_data_page1)

Status code: 200
Commits returned: 5
Sample commit SHA: 4c1ff5e3aef1816ae04af63218015066e186c147
Sample commit message: Add data and README for trump-2-poll-issue-questions


In [None]:
all_commits = []
pages_to_fetch = 3  # 2-3 pages are enough for a demonstration
commits_per_page = 5

for page in range(1, pages_to_fetch + 1):
    page_params = {"per_page": commits_per_page, "page": page}
    # The timeout keeps the loop from hanging forever on a stalled connection.
    page_response = requests.get(
        commits_url, headers=headers, params=page_params, timeout=30
    )

    print(f"Page {page} status:", page_response.status_code)

    if page_response.status_code != 200:
        # Use .text rather than .json(): an error body is not guaranteed to be JSON.
        print("Error response:", page_response.text)
        break

    page_commits = page_response.json()
    print(f"Page {page} commits:", len(page_commits))

    all_commits.extend(page_commits)

    # A short (or empty) page is the last one, so stop without issuing
    # another request that could only come back empty.
    if len(page_commits) < commits_per_page:
        break

print("Total commits collected:", len(all_commits))

Page 1 status: 200
Page 1 commits: 5
Page 2 status: 200
Page 2 commits: 5
Page 3 status: 200
Page 3 commits: 5
Total commits collected: 15


In [None]:
import json

# Write the single-page and the multi-page commit samples to disk.
for sample_name, payload in (
    ("commits_sample_page1.json", commits_data_page1),
    ("commits_sample_paginated.json", all_commits),
):
    with open(sample_name, "w") as sample_file:
        json.dump(payload, sample_file, indent=2)

print("Saved commits samples.")

Saved commits samples.


## Report #3 — Repository Contents

Goal: fetch repository contents (files/folders) using GitHub Contents API and save output samples.

### Endpoint
GET https://api.github.com/repos/{owner}/{repo}/contents/{path}

Notes:
- If {path} is empty → returns contents of repository root
- For a directory (including the repository root) the response is a list of objects with fields like: name, path, type (file/dir), sha, size, download_url; for a single file path it is one such object

In [None]:
contents_url = f"https://api.github.com/repos/{owner}/{repo}/contents"

# The timeout keeps the cell from hanging forever on a stalled connection.
contents_response = requests.get(contents_url, headers=headers, timeout=30)

print("Status code:", contents_response.status_code)

contents_data_root = contents_response.json()

# Quick view: a list means a directory listing was returned.
if isinstance(contents_data_root, list):
    print("Items in root:", len(contents_data_root))
    # Show only the first 10 entries to keep the output short.
    for item in contents_data_root[:10]:
        print(item.get("type"), "-", item.get("name"))
else:
    # Sometimes GitHub returns an error object
    print("Response:", contents_data_root)

Status code: 200
Items in root: 173
file - .gitattributes
file - .gitignore
file - LICENSE
file - README.md
dir - ahca-polls
dir - airline-safety
dir - alcohol-consumption
dir - antiquities-act
dir - august-senate-polls
dir - avengers


In [None]:
import json

# Persist the root directory listing as a pretty-printed sample file.
with open("repo_contents_root_sample.json", mode="w") as root_sample:
    json.dump(obj=contents_data_root, fp=root_sample, indent=2)

print("Saved: repo_contents_root_sample.json")

Saved: repo_contents_root_sample.json


## Rate limits & Error handling

Goal:
- Check current GitHub API rate limits
- Add reusable request wrapper with basic error handling (401/403/429/5xx)

In [None]:
rate_url = "https://api.github.com/rate_limit"
# The timeout keeps the cell from hanging forever on a stalled connection.
rate_resp = requests.get(rate_url, headers=headers, timeout=30)

print("Status code:", rate_resp.status_code)
rate_data = rate_resp.json()

# Most important part: the "core" and "search" entries under "resources".
resources = rate_data.get("resources", {})
core = resources.get("core", {})
search = resources.get("search", {})

for label, bucket in (("CORE", core), ("SEARCH", search)):
    print(
        f"{label} remaining:", bucket.get("remaining"),
        "/", bucket.get("limit"),
        "reset:", bucket.get("reset"),
    )

Status code: 200
CORE remaining: 4993 / 5000 reset: 1768307756
SEARCH remaining: 30 / 30 reset: 1768304650


In [None]:
import time

def github_get(url, headers, params=None, max_retries=3, retry_wait_sec=3, timeout=30):
    """
    GET wrapper for the GitHub API with basic error handling.

    Status handling:
    - 200: the response is returned.
    - 401: raises immediately (bad/expired token).
    - 403: raises immediately; the message carries the rate-limit headers.
    - 429 and 5xx: retried, waiting ``retry_wait_sec * attempt`` seconds
      between attempts. For 429 a numeric ``Retry-After`` header, when
      present, is used as the wait instead.
    - any other status: raises immediately.

    Args:
        url: Full URL to request.
        headers: HTTP headers sent with every attempt.
        params: Optional query-string parameters.
        max_retries: Maximum number of attempts.
        retry_wait_sec: Base wait in seconds, multiplied by the attempt number.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        The response object of the first attempt that returned status 200.

    Raises:
        Exception: on 401, 403, any unexpected status, or when every attempt
            ended in a retryable status.
    """
    last_status = None

    for attempt in range(1, max_retries + 1):
        resp = requests.get(url, headers=headers, params=params, timeout=timeout)
        status = resp.status_code
        last_status = status

        # Success
        if status == 200:
            return resp

        # 401 Unauthorized
        if status == 401:
            raise Exception(f"401 Unauthorized: Bad credentials. Check token. Response: {resp.text}")

        # 403 Forbidden (often rate limit)
        if status == 403:
            # GitHub usually puts details in headers when rate-limited
            remaining = resp.headers.get("X-RateLimit-Remaining")
            reset = resp.headers.get("X-RateLimit-Reset")
            msg = f"403 Forbidden. X-RateLimit-Remaining={remaining}, X-RateLimit-Reset={reset}. Body={resp.text}"
            raise Exception(msg)

        # Only 429 and 5xx are worth retrying; everything else fails right away.
        is_server_error = 500 <= status <= 599
        if status != 429 and not is_server_error:
            raise Exception(f"Unexpected status {status}: {resp.text}")

        # Out of attempts: sleeping now would only delay the final error.
        if attempt == max_retries:
            break

        wait = retry_wait_sec * attempt

        if status == 429:
            # Prefer the server's own hint about how long to back off.
            retry_after = resp.headers.get("Retry-After")
            if retry_after is not None:
                try:
                    wait = max(int(retry_after), 0)
                except ValueError:
                    pass  # non-numeric header: keep the computed backoff
            print(f"429 Too Many Requests. Waiting {wait}s and retrying (attempt {attempt}/{max_retries})...")
        else:
            print(f"{status} Server error. Waiting {wait}s and retrying (attempt {attempt}/{max_retries})...")

        time.sleep(wait)

    raise Exception(f"Max retries exceeded. Last status: {last_status}")

In [None]:
# Smoke-test the wrapper with the same search request used earlier.
test_resp = github_get(url=search_url, headers=headers, params=search_params)
print("Wrapper test status:", test_resp.status_code)

Wrapper test status: 200


In [None]:
def fetch_search_repos_pages(query, pages=2, per_page=5):
    """
    Collect repository search results across several result pages.

    Args:
        query: Search keyword(s) sent as the ``q`` parameter.
        pages: Maximum number of result pages to request.
        per_page: Number of repositories requested per page.

    Returns:
        A list with the ``items`` of every fetched page, in page order.

    Raises:
        Exception: propagated from ``github_get`` when a request fails.
    """
    all_items = []
    for page in range(1, pages + 1):
        params = {"q": query, "per_page": per_page, "page": page}
        resp = github_get("https://api.github.com/search/repositories", headers=headers, params=params)
        items = resp.json().get("items", [])
        print(f"Fetched page {page}: {len(items)} repos")

        # An empty page means there is nothing further to fetch; stop so the
        # remaining requests are not spent against the search rate limit.
        if not items:
            break

        all_items.extend(items)
    return all_items

# Example run (small, safe)
repos_items = fetch_search_repos_pages("data", pages=2, per_page=5)
print("Total repos collected:", len(repos_items))

Fetched page 1: 5 repos
Fetched page 2: 5 repos
Total repos collected: 10


In [None]:
from google.colab import files
import os

# Sample files produced by the earlier cells.
files_to_download = [
    "search_repositories_sample.json",
    "commits_sample_page1.json",
    "commits_sample_paginated.json",
    "repo_contents_root_sample.json"
]

# First report which files are present...
print("Checking files...")
for sample_name in files_to_download:
    print(sample_name, "exists:", os.path.exists(sample_name))

# ...then download the ones that exist, skipping any that are missing.
print("\nDownloading files...")
for sample_name in filter(os.path.exists, files_to_download):
    files.download(sample_name)

Checking files...
search_repositories_sample.json exists: True
commits_sample_page1.json exists: True
commits_sample_paginated.json exists: True
repo_contents_root_sample.json exists: True

Downloading files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>