In [1]:
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

from env import github_token, github_username

import acquire_zach as az
import wrangle as w

import numpy as np
import pandas as pd

# acquire
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

# prepare
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# explore
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# model
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

# Use helper functions to acquire a list of url suffixes from all of Google's Github Repos

In [2]:
def get_githubpgs():
    """
    This function will create a list of all the url pages in the google repos section
    """
    urls = []
    for i in range(1,68):
        
        url = f'https://github.com/google?page={i}'
        urls.append(url)
    return urls

In [3]:
urls = get_githubpgs()

In [4]:
urls

['https://github.com/google?page=1',
 'https://github.com/google?page=2',
 'https://github.com/google?page=3',
 'https://github.com/google?page=4',
 'https://github.com/google?page=5',
 'https://github.com/google?page=6',
 'https://github.com/google?page=7',
 'https://github.com/google?page=8',
 'https://github.com/google?page=9',
 'https://github.com/google?page=10',
 'https://github.com/google?page=11',
 'https://github.com/google?page=12',
 'https://github.com/google?page=13',
 'https://github.com/google?page=14',
 'https://github.com/google?page=15',
 'https://github.com/google?page=16',
 'https://github.com/google?page=17',
 'https://github.com/google?page=18',
 'https://github.com/google?page=19',
 'https://github.com/google?page=20',
 'https://github.com/google?page=21',
 'https://github.com/google?page=22',
 'https://github.com/google?page=23',
 'https://github.com/google?page=24',
 'https://github.com/google?page=25',
 'https://github.com/google?page=26',
 'https://github.com/

In [6]:
urls = ['https://github.com/google?page=1',
 'https://github.com/google?page=2',
 'https://github.com/google?page=3',
 'https://github.com/google?page=4',
 'https://github.com/google?page=5',
 'https://github.com/google?page=6',
 'https://github.com/google?page=7',
 'https://github.com/google?page=8',
 'https://github.com/google?page=10',
 'https://github.com/google?page=11',
 'https://github.com/google?page=12',
 'https://github.com/google?page=13',
 'https://github.com/google?page=14',
 'https://github.com/google?page=15',
 'https://github.com/google?page=16',
 'https://github.com/google?page=17',
 'https://github.com/google?page=18',
 'https://github.com/google?page=19',
 'https://github.com/google?page=20',
 'https://github.com/google?page=21',
 'https://github.com/google?page=22',
 'https://github.com/google?page=23',
 'https://github.com/google?page=24',
 'https://github.com/google?page=25',
 'https://github.com/google?page=27',
 'https://github.com/google?page=28',
 'https://github.com/google?page=29',
 'https://github.com/google?page=30',
 'https://github.com/google?page=31',
 'https://github.com/google?page=32',
 'https://github.com/google?page=33',
 'https://github.com/google?page=35',
 'https://github.com/google?page=36',
 'https://github.com/google?page=37',
 'https://github.com/google?page=39',
 'https://github.com/google?page=40',
 'https://github.com/google?page=41',
 'https://github.com/google?page=42',
 'https://github.com/google?page=43',
 'https://github.com/google?page=44',
 'https://github.com/google?page=45',
 'https://github.com/google?page=46',
 'https://github.com/google?page=47',
 'https://github.com/google?page=48',
 'https://github.com/google?page=49',
 'https://github.com/google?page=50']

In [16]:
len(urls)

46

In [7]:
def make_soup(url):
    '''
    This helper function takes in a url and requests and parses HTML
    returning a soup object.
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = get(url, headers=headers)    
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [8]:
def get_repo_linksuffix():
    """
    This function will concat a base url with the repo suffix
    and create a list of all appended repo links on a single page.
    """
    
    repos = []
    
    for n in range(30):
        link_suffix = soup.find_all('a', itemprop='name codeRepository')[n].get('href')
        repos.append(link_suffix)
    return repos


In [48]:
repo_links = []

for url in urls[:34]:
    soup = make_soup(url)
    repos = get_repo_linksuffix()
    repo_links.append(repos)
    repo_links

In [49]:
len(repo_links)

34

In [50]:
repo_links

[['/google/llvm-bazel',
  '/google/paseos',
  '/google/filament',
  '/google/it-cert-automation-practice',
  '/google/jax',
  '/google/zx',
  '/google/flatbuffers',
  '/google/wikiloop-doublecheck',
  '/google/closure-compiler',
  '/google/iree',
  '/google/nodejs-container-image-builder',
  '/google/CFU-Playground',
  '/google/pytype',
  '/google/gvisor',
  '/google/styleguide',
  '/google/nomulus',
  '/google/agi',
  '/google/j2objc',
  '/google/trax',
  '/google/personfinder',
  '/google/sentencepiece',
  '/google/evcxr',
  '/google/proto-quic',
  '/google/bulkan',
  '/google/XNNPACK',
  '/google/autocxx',
  '/google/model-viewer',
  '/google/ksp',
  '/google/rust_icu',
  '/google/BIG-bench'],
 ['/google/blockly',
  '/google/pigweed',
  '/google/blockly-samples',
  '/google/web-stories-wp',
  '/google/cssi-labs',
  '/google/gfbuild-glslang',
  '/google/glog',
  '/google/martian',
  '/google/exposure-notifications-private-analytics-ingestion',
  '/google/GoogleUtilities',
  '/google/

In [16]:
for url in urls[39:50]:
    soup = make_soup(url)
    repos = get_repo_linksuffix()
    repo_links.append(repos)
    repo_links

In [17]:
len(repo_links)

44

In [18]:
repo_links

[['/google/sentencepiece',
  '/google/llvm-bazel',
  '/google/flatbuffers',
  '/google/trax',
  '/google/evcxr',
  '/google/proto-quic',
  '/google/bulkan',
  '/google/XNNPACK',
  '/google/autocxx',
  '/google/model-viewer',
  '/google/iree',
  '/google/ksp',
  '/google/rust_icu',
  '/google/BIG-bench',
  '/google/blockly',
  '/google/pigweed',
  '/google/filament',
  '/google/blockly-samples',
  '/google/web-stories-wp',
  '/google/gvisor',
  '/google/cssi-labs',
  '/google/jax',
  '/google/gfbuild-glslang',
  '/google/nomulus',
  '/google/pytype',
  '/google/glog',
  '/google/zx',
  '/google/martian',
  '/google/exposure-notifications-private-analytics-ingestion',
  '/google/GoogleUtilities'],
 ['/google/ecclesia-machine-management',
  '/google/ground-platform',
  '/google/CFU-Playground',
  '/google/shaka-player',
  '/google/turbinia',
  '/google/gtm-session-fetcher',
  '/google/ffn',
  '/google/earthenterprise',
  '/google/it-cert-automation-practice',
  '/google/gin-config',
  '/g

In [51]:
import itertools
flatten_list = list(itertools.chain(*repo_links))

In [52]:
len(flatten_list)

1020

In [53]:
flatten_list = list(set(flatten_list))

In [54]:
len(flatten_list)

1020

In [55]:
reposuffixes = pd.Series(flatten_list)

In [56]:
REPOS = reposuffixes.str[1:].to_list()

In [57]:
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

from env import github_token, github_username


headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}

if headers["Authorization"] == "token " or headers["User-Agent"] == "":
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    response = requests.get(url, headers=headers)
    response_data = response.json()
    if response.status_code != 200:
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {json.dumps(response_data)}"
        )
    return response_data


def get_repo_language(repo: str) -> str:
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if type(repo_info) is dict:
        repo_info = cast(Dict, repo_info)
        if "language" not in repo_info:
            raise Exception(
                "'language' key not round in response\n{}".format(json.dumps(repo_info))
            )
        return repo_info["language"]
    raise Exception(
        f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
    )


def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    url = f"https://api.github.com/repos/{repo}/contents/"
    contents = github_api_request(url)
    if type(contents) is list:
        contents = cast(List, contents)
        return contents
    raise Exception(
        f"Expecting a list response from {url}, instead got {json.dumps(contents)}"
    )


def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.
    """
    for file in files:
        if file["name"].lower().startswith("readme"):
            return file["download_url"]
    return ""


def process_repo(repo: str) -> Dict[str, str]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns a
    dictionary with the language of the repo and the readme contents.
    """
    contents = get_repo_contents(repo)
    readme_download_url = get_readme_download_url(contents)
    if readme_download_url == "":
        readme_contents = ""
    else:
        readme_contents = requests.get(readme_download_url).text
    return {
        "repo": repo,
        "language": get_repo_language(repo),
        "readme_contents": readme_contents,
    }


def scrape_github_data() -> List[Dict[str, str]]:
    """
    Loop through all of the repos and process them. Returns the processed data.
    """
    return [process_repo(repo) for repo in REPOS]


if __name__ == "__main__":
    data = scrape_github_data()
    json.dump(data, open("data1.json", "w"), indent=1)

In [58]:
# run acquire.py script to create a .json file of all READMEs
# this takes awhile and produced an error message
# that I used my token too much
# but still produced complete .json file
scrape_github_data()

ConnectionError: ('Connection aborted.', TimeoutError(60, 'Operation timed out'))

In [59]:
f = open('data1.json')

In [60]:
data = json.load(f)

In [61]:
df = pd.DataFrame(data)

In [62]:
# create DF of all READMEs
df.head(2)

Unnamed: 0,repo,language,readme_contents
0,google/ci_edit,Python,# What is ci_edit\n\nci_edit is a text editor....
1,google/bms-toolkit,Shell,# bms-toolkit\n\nToolkit for installing and cr...


In [63]:
df.to_csv('google_readmes1020.csv')