In [70]:
import os
import json
import itertools as it
from typing import List, Dict

import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

In [1]:
BASE_URL = 'https://inshorts.com/en/read'
SECTIONS = ['business', 'sports', 'technology', 'entertainment']

def handle_article(article: BeautifulSoup) -> Dict[str, str]:
    '''
    Given a single article, extracts the title and content
    '''
    return {
        'title': article.find(class_='news-card-title').find('a').text.strip(),
        'content': (article.find(class_='news-card-content')
                    .find('div', attrs={'itemprop': 'articleBody'})
                    .text.strip())
    }

def fetch_section(section: str) -> List[Dict[str, str]]:
    '''
    Makes a request for the given section and processes all the articles in it
    '''
    url = f'{BASE_URL}/{section}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, features="lxml")
    articles = [handle_article(article) for article in soup.find_all(class_='news-card')]
    for article in articles:
        article['category'] = section
        
    return articles

def get_all_sections() -> List[Dict[str, str]]:
    '''
    Returns the processed article data for all of the sections we defined in
    SECTIONS
    '''
    sections = [fetch_section(section) for section in SECTIONS]

    return list(it.chain(*sections))

def get_news_articles(use_cache=True) -> List[Dict[str, str]]:
    if use_cache and os.path.exists('news_articles.json'):
        articles = json.load(open('news_articles.json'))
    else:
        articles = get_all_sections()
        json.dump(articles, open('news_articles.json', 'w'))
        
    return articles

def get_news_data() -> pd.DataFrame:
    '''
    Returns all the articles from all the sections as a pandas DataFrame
    '''
    return pd.DataFrame(get_all_sections())

In [29]:
def handle_article(article: BeautifulSoup) -> Dict[str, str]:
    '''
    Given a single article, extracts the title and content
    '''
    return {
        'title': article.find(class_='news-card-title').find('a').text.strip(),
        'content': (article.find(class_='news-card-content')
                    .find('div', attrs={'itemprop': 'articleBody'})
                    .text.strip())
    }

def fetch_readme(section: str) -> List[Dict[str, str]]:
    '''
    Makes a request for the given section and processes all the articles in it
    '''
    url = f'{BASE_URL}/{section}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, features="lxml")
    articles = [handle_article(article) for article in soup.find_all(class_='news-card')]
    for article in articles:
        article['category'] = section
        
    return articles




url = 'https://github.com/search?o=desc&q=stars:%3E1&s=forks&type=Repositories'
headers = {'User-Agent': 'Eric'} # codeup.com doesn't like our default user-agent
response = get(url, headers=headers)


soup = BeautifulSoup(response.content, 'html.parser')
article = soup.find('div', class_='col-12 col-md-8 pr-md-3')

print(article.find('href'))

None


In [25]:
url = 'https://github.com/jtleek/datasharing'
headers = {'User-Agent': 'Eric'} # codeup.com doesn't like our default user-agent
response = get(url, headers=headers)


soup = BeautifulSoup(response.content, 'html.parser')
article = soup.find('div', class_="Box Box--condensed instapaper_body md js-code-block-container")

print(article.text)





        README.md
      


How to share data with a statistician
This is a guide for anyone who needs to share data with a statistician or data scientist. The target audiences I have in mind are:

Collaborators who need statisticians or data scientists to analyze data for them
Students or postdocs in various disciplines looking for consulting advice
Junior statistics students whose job it is to collate/clean/wrangle data sets

The goals of this guide are to provide some instruction on the best way to share data to avoid the most common pitfalls
and sources of delay in the transition from data collection to data analysis. The Leek group works with a large
number of collaborators and the number one source of variation in the speed to results is the status of the data
when they arrive at the Leek group. Based on my conversations with other statisticians this is true nearly universally.
My strong feeling is that statisticians should be able to handle the data in whatever state they arr

In [63]:
url = 'https://github.com/rdpeng/ProgrammingAssignment2'
headers = {'User-Agent': 'Eric'} # codeup.com doesn't like our default user-agent
response = get(url, headers=headers)


soup = BeautifulSoup(response.content, 'html.parser')
# thing1 = soup.find('div', class_="repository-lang-stats")
# thing2 = thing1.find('ol')
# thing3 = thing2.find('li')
# thing4 = thing3.find('a')
# lang = thing4.find('span', class_='lang')
# lang_percent = thing4.find('span', class_='percent')

lang = soup.find('span', class_='lang')
lang_percent = soup.find('span', class_='percent')

print(lang.text + ' ' + lang_percent.text)

R 100.0%


In [53]:
type(thing1)

bs4.element.Tag

In [54]:
str(thing1)

'<div class="repository-lang-stats">\n<ol class="repository-lang-stats-numbers">\n<li>\n<a data-ga-click="Repository, language stats search click, location:repo overview" href="/rdpeng/ProgrammingAssignment2/search?l=r">\n<span class="color-block language-color" style="background-color:#198CE7;"></span>\n<span class="lang">R</span>\n<span class="percent">100.0%</span>\n</a>\n</li>\n</ol>\n</div>'

In [92]:
url = 'https://github.com/trending'
headers = {'User-Agent': 'Eric'} # codeup.com doesn't like our default user-agent
response = get(url, headers=headers)


soup = BeautifulSoup(response.content, 'html.parser')
# thing1 = soup.find('div', class_="repository-lang-stats")
# thing2 = thing1.find('ol')
# thing3 = thing2.find('li')
# thing4 = thing3.find('a')
# lang = thing4.find('span', class_='lang')
# lang_percent = thing4.find('span', class_='percent')

# for repo in repo_title in soup.find_all(href)
repo_title = soup.find('ol', class_='repo-list').find('h3').find('a')
print(repo_title)



# h3 = soup.find('h3')
# url_class = h3.find('a',href ='/jtleek/datasharing')

# print(url_class)

<a href="/microsoft/Terminal">
<span class="text-normal">microsoft / </span>Terminal
</a>


In [8]:
import os
import json
from typing import Dict, List
import requests

# TODO: make a github personal access token
# TODO: replace YOUR_GITHUB_USERNAME with your github username

# Go here and generate a personal access token
# https://github.com/settings/tokens
# save it in your env.py file
from env import github_token

DATA_DIR = 'data'

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

headers = {
    'Authorization': f'token {github_token}',
    'User-Agent': 'YOUR_GITHUB_USERNAME'
}

def github_api_request(url: str) -> requests.Response:
    return requests.get(url, headers=headers)

def get_repo_language(repo: str) -> str:
    url = f'https://api.github.com/repos/{repo}'
    return github_api_request(url).json()['language']
    
def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    url = f'https://api.github.com/repos/{repo}/contents/'
    return github_api_request(url).json()

def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    '''
    Takes in a response from the github api that lists
    the files in a repo and returns the url that can be
    used to download the repo's README file.
    '''
    for file in files:
        if file['name'].lower().startswith('readme'):
            return file['download_url']

def process_repo(repo: str) -> Dict[str, str]:
    '''
    Takes a repo name like "gocodeup/codeup-setup-script" and returns
    a dictionary with the language of the repo and the readme contents.
    '''
    contents = get_repo_contents(repo)
    return {
        'repo': repo,
        'language': get_repo_language(repo),
        'readme_contents': requests.get(get_readme_download_url(contents)).text
    }

# TODO: put a lot of repos here (or generate the list progromatically)
repos = [
    '/microsoft/Terminal',
]

def scrape_github_data():
    data = [process_repo(repo) for repo in repos]
    json.dump(data, open('data.json', 'w'))
    
if __name__ == '__main__':
    scrape_github_data()

KeyError: 'language'

In [None]:
import pandas as pd
pd.read_json('data.json')

In [33]:
from bs4 import BeautifulSoup
from urllib import request
import re

html_page = request.urlopen("https://github.com/search?{}q=stars%3A%3E0&s=stars&type=Repositories")
'https://github.com/search?p=2&q=stars%3A%3E0&s=stars&type=Repositories'
soup = BeautifulSoup(html_page)
for link in soup.findAll('a', class_="v-align-middle"):
#     if link in soup.findAll('a'):
        print (link.get('href'))

/freeCodeCamp/freeCodeCamp
/996icu/996.ICU
/vuejs/vue
/twbs/bootstrap
/facebook/react
/tensorflow/tensorflow
/EbookFoundation/free-programming-books
/sindresorhus/awesome
/getify/You-Dont-Know-JS
/robbyrussell/oh-my-zsh


In [28]:
def handle_starttag(self,header, tag, attrs):
        if header == "h3":
            # Only parse the 'anchor' tag.
            if tag == "a":
                # Check the list of defined attributes.
                for name, value in attrs:
                    # If href is defined, print it.
                    if name == "href":
                        print (name, "=", value)


soup = BeautifulSoup(response.content, 'html.parser')

ModuleNotFoundError: No module named 'HTMLParser'

In [None]:
url = 'https://github.com/jtleek/datasharing'
headers = {'User-Agent': 'Eric'} # codeup.com doesn't like our default user-agent
response = get(url, headers=headers)


soup = BeautifulSoup(response.content, 'html.parser')
article = soup.find('div', class_="Box Box--condensed instapaper_body md js-code-block-container")

print(article.text)