In [1]:
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

from env import github_token, github_username

import acquire_zach as az
import wrangle as w

import numpy as np
import pandas as pd

# acquire
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

# prepare
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# explore
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# model
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

# Use helper functions to acquire a list of url suffixes from all of Google's Github Repos

In [2]:
def get_githubpgs():
    """
    This function will create a list of all the url pages in the google repos section
    """
    urls = []
    for i in range(1,68):
        
        url = f'https://github.com/google?page={i}'
        urls.append(url)
    return urls

In [15]:
urls = get_githubpgs()

In [16]:
urls

['https://github.com/google?page=1',
 'https://github.com/google?page=2',
 'https://github.com/google?page=3',
 'https://github.com/google?page=4',
 'https://github.com/google?page=5',
 'https://github.com/google?page=6',
 'https://github.com/google?page=7',
 'https://github.com/google?page=8',
 'https://github.com/google?page=9',
 'https://github.com/google?page=10',
 'https://github.com/google?page=11',
 'https://github.com/google?page=12',
 'https://github.com/google?page=13',
 'https://github.com/google?page=14',
 'https://github.com/google?page=15',
 'https://github.com/google?page=16',
 'https://github.com/google?page=17',
 'https://github.com/google?page=18',
 'https://github.com/google?page=19',
 'https://github.com/google?page=20',
 'https://github.com/google?page=21',
 'https://github.com/google?page=22',
 'https://github.com/google?page=23',
 'https://github.com/google?page=24',
 'https://github.com/google?page=25',
 'https://github.com/google?page=26',
 'https://github.com/

In [13]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = get(url, headers=headers)

for url in urls:
    #print(url)
    soup = BeautifulSoup(response.text)

In [14]:
soup

<!DOCTYPE html>
<html data-color-mode="auto" data-dark-theme="dark" data-light-theme="light" lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/frameworks-b1fd4fa68a3c095c7e600e3b8888b621.css" integrity="sha512-sf1Ppoo8CVx+YA47iIi2IUMYtjH17Errad0dPA+lo7DVd6VW1Mdy+TBcpD06Z6FN8MKI7TH5fpYU33+DhP25kg==" media="all" rel="stylesheet"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/behaviors-c3616181c4e10c65445ad536097898a6.css" integrity="sha512-w2FhgcThDGVEWtU2CXiYptn/ycIg+HR7eUYhF7D4995er23AAjfjfNafEQcf8I8b8j9w0mogqXElpFTkpWnIZg==" media="all" rel="stylesheet"/>
<link crossorigin="anonymous" href="https://github.gi

In [10]:
def make_soup(url):
    '''
    This helper function takes in a url and requests and parses HTML
    returning a soup object.
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = get(url, headers=headers)    
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [17]:
make_soup(urls[0])


<!DOCTYPE html>

<html data-color-mode="auto" data-dark-theme="dark" data-light-theme="light" lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/frameworks-b1fd4fa68a3c095c7e600e3b8888b621.css" integrity="sha512-sf1Ppoo8CVx+YA47iIi2IUMYtjH17Errad0dPA+lo7DVd6VW1Mdy+TBcpD06Z6FN8MKI7TH5fpYU33+DhP25kg==" media="all" rel="stylesheet">
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/behaviors-c3616181c4e10c65445ad536097898a6.css" integrity="sha512-w2FhgcThDGVEWtU2CXiYptn/ycIg+HR7eUYhF7D4995er23AAjfjfNafEQcf8I8b8j9w0mogqXElpFTkpWnIZg==" media="all" rel="stylesheet">
<link crossorigin="anonymous" href="https://github.gi

In [19]:
repo_links = []

soup = make_soup(urls[0])
repos = get_repo_linksuffix()
repo_links.append(repos)
repo_links

[['/google/model-viewer',
  '/google/BIG-bench',
  '/google/filament',
  '/google/rust_icu',
  '/google/autocxx',
  '/google/blockly',
  '/google/iree',
  '/google/blockly-samples',
  '/google/llvm-bazel',
  '/google/web-stories-wp',
  '/google/gvisor',
  '/google/cssi-labs',
  '/google/jax',
  '/google/gfbuild-glslang',
  '/google/XNNPACK',
  '/google/nomulus',
  '/google/pytype',
  '/google/glog',
  '/google/zx',
  '/google/martian',
  '/google/exposure-notifications-private-analytics-ingestion',
  '/google/GoogleUtilities',
  '/google/ecclesia-machine-management',
  '/google/ground-platform',
  '/google/CFU-Playground',
  '/google/shaka-player',
  '/google/turbinia',
  '/google/gtm-session-fetcher',
  '/google/ffn']]

In [20]:
repo_links = []

soup = make_soup(urls[1])
repos = get_repo_linksuffix()
repo_links.append(repos)
repo_links

[['/google/flatbuffers',
  '/google/evcxr',
  '/google/it-cert-automation-practice',
  '/google/gin-config',
  '/google/vscode-bigquery',
  '/google/docsy-example',
  '/google/privacy-sandbox-aggregation-service',
  '/google/fonts',
  '/google/clasp',
  '/google/nearby-connections',
  '/google/cadvisor',
  '/google/create-service-account',
  '/google/oss-fuzz',
  '/google/dopamine',
  '/google/closure-compiler-npm',
  '/google/elements-sk',
  '/google/fhir',
  '/google/CommonLoopUtils',
  '/google/dagger',
  '/google/go-attestation',
  '/google/uncertainty-baselines',
  '/google/j2cl',
  '/google/tf-quant-finance',
  '/google/closure-compiler',
  '/google/exposure-notifications-verification-server',
  '/google/angular-directed-graph',
  '/google/glazier',
  '/google/perfetto',
  '/google/error-prone']]

In [33]:
def get_repo_linksuffix():
    """
    This function will concat a base url with the repo suffix
    and create a list of all appended repo links on a single page.
    """
    
    repos = []
    
    for n in range(30):
        link_suffix = soup.find_all('a', itemprop='name codeRepository')[n].get('href')
        repos.append(link_suffix)
    return repos


In [37]:
def get_all_repo_links():
    """
    
    """  
    urls = get_githubpgs()
    
    repo_links = []
    
    
    for i in range(66):
        soup = make_soup(urls[i])
        
        repos = get_repo_linksuffix()
        repo_links.append(repos)
        i += 1
    
    return repo_links

In [38]:
repo_links = get_all_repo_links()

In [39]:
repo_links

[['/google/flatbuffers',
  '/google/evcxr',
  '/google/it-cert-automation-practice',
  '/google/gin-config',
  '/google/vscode-bigquery',
  '/google/docsy-example',
  '/google/privacy-sandbox-aggregation-service',
  '/google/fonts',
  '/google/clasp',
  '/google/nearby-connections',
  '/google/cadvisor',
  '/google/create-service-account',
  '/google/oss-fuzz',
  '/google/dopamine',
  '/google/closure-compiler-npm',
  '/google/elements-sk',
  '/google/fhir',
  '/google/CommonLoopUtils',
  '/google/dagger',
  '/google/go-attestation',
  '/google/uncertainty-baselines',
  '/google/j2cl',
  '/google/tf-quant-finance',
  '/google/closure-compiler',
  '/google/exposure-notifications-verification-server',
  '/google/angular-directed-graph',
  '/google/glazier',
  '/google/perfetto',
  '/google/error-prone',
  '/google/ksp'],
 ['/google/flatbuffers',
  '/google/evcxr',
  '/google/it-cert-automation-practice',
  '/google/gin-config',
  '/google/vscode-bigquery',
  '/google/docsy-example',
  '/