# Scraping READMEs from GitHub

## Approach
- Two approaches discussed:
    - MVP: accept a list of project repos to extract the READMEs from.
    - AAB: start with the user's GitHub profile and extract the READMEs of its pinned repositories.

In [43]:
import numpy as np
from urllib.parse import urljoin
import requests, bs4, time

class ReadmeGetter:
    """Collect README text from the repositories pinned on a GitHub profile.

    Typical use: call ``get_repos(profile_url)`` to fill ``repo_links``, then
    ``get_readmes()`` to download every linked page and store the text of its
    first ``<article>`` element in ``READMES``.

    Attributes:
        READMES: dict mapping a repository URL with the ``BASE_URL + '/'``
            prefix removed (e.g. ``"owner/repo"``) to the scraped text.
        repo_links: list of absolute repository URLs gathered by
            ``get_repos()``; ``get_readmes()`` iterates over it.
    """

    # Root used both to make the relative hrefs found on a profile page
    # absolute and to strip repository URLs down to their READMES key.
    BASE_URL = "https://www.github.com"
    # Class of the element that get_repos() searches for pinned-repo links.
    PINNED_CLASS = "js-pinned-items-reorder-container"
    # Candidate pauses (in seconds) between consecutive page requests.
    SLEEP_CHOICES = [1.9, 1.2, 1.34, 1.1, 0.9]
    # Seconds to wait on a request before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.READMES = {}
        self.repo_links = []

    def get_readmes(self):
        """Download every page in ``repo_links`` and store its README text.

        Must be used AFTER ``get_repos()`` (or after filling ``repo_links``
        by hand). For each link, the text of the first ``<article>`` element
        is saved in ``READMES``; pages without one are reported with a
        printed message and skipped. A randomly chosen pause follows every
        request.

        Raises:
            requests.RequestException: if a page cannot be fetched.
        """
        for repo in self.repo_links:
            response = requests.get(repo, timeout=self.REQUEST_TIMEOUT)
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = bs4.BeautifulSoup(response.content, "html.parser")

            readme = soup.find("article")
            if readme is None:
                print(f"No <article> element (README) found at {repo}")
            else:
                key = repo.replace(self.BASE_URL + "/", "")
                self.READMES[key] = readme.text

            # Pause between requests so the pages are not fetched in a burst.
            time.sleep(np.random.choice(self.SLEEP_CHOICES))

    def get_repos(self, profile_link):
        """Gather links to all pinned repos of a GitHub profile.

        Every ``<a href=...>`` inside the pinned-items container is turned
        into an absolute URL and appended to ``repo_links``, except the
        "stargazers" and "forks" links and URLs that are already stored.

        Args:
            profile_link: URL of the GitHub profile page to scrape.

        Raises:
            requests.RequestException: if the profile page cannot be fetched.
            IndexError: if the page has no pinned-items container.
        """
        response = requests.get(profile_link, timeout=self.REQUEST_TIMEOUT)
        soup = bs4.BeautifulSoup(response.content, "html.parser")

        pinned_repos = soup.find(class_=self.PINNED_CLASS)
        if pinned_repos is None:
            raise IndexError(
                f"no element with class {self.PINNED_CLASS!r} found at "
                f"{profile_link}"
            )

        for link in pinned_repos.find_all("a", href=True):
            # hrefs on the page are relative; make them absolute.
            abs_link = urljoin(self.BASE_URL, link["href"])

            # Skip the star/fork counters that sit next to each repo link.
            if abs_link.endswith(("stargazers", "forks")):
                continue
            # Calling get_repos() again must not queue a repo twice.
            if abs_link not in self.repo_links:
                self.repo_links.append(abs_link)
                
                

In [44]:
# Create a scraper; its READMES dict and repo_links list start out empty.
Getter = ReadmeGetter()

In [45]:
# Gather the pinned-repository links of this profile into Getter.repo_links.
Getter.get_repos('https://github.com/Caellwyn')

In [46]:
# Inspect the absolute repository URLs collected by get_repos().
Getter.repo_links

['https://www.github.com/Caellwyn/ou_student_predictions',
 'https://www.github.com/Caellwyn/Seattle-Home-Sales',
 'https://www.github.com/Violet-Spiral/covid-xprize',
 'https://www.github.com/Caellwyn/product-flexible-twitter-sentiment-analysis',
 'https://www.github.com/Caellwyn/pet-predictor',
 'https://www.github.com/learn-co-curriculum/streamlit-image-classifier-demo',
 'https://www.github.com/Caellwyn/chat-with-a-philosopher']

In [47]:
# Fetch every page in Getter.repo_links and store its README text
# (pauses between requests, so this cell takes a few seconds per repo).
Getter.get_readmes()
# List the keys under which the READMEs were stored.
Getter.READMES.keys()

dict_keys(['Caellwyn/ou_student_predictions', 'Caellwyn/Seattle-Home-Sales', 'Violet-Spiral/covid-xprize', 'Caellwyn/product-flexible-twitter-sentiment-analysis', 'Caellwyn/pet-predictor', 'learn-co-curriculum/streamlit-image-classifier-demo', 'Caellwyn/chat-with-a-philosopher'])

In [51]:
# Show the README text stored second (dicts keep insertion order).
list(Getter.READMES.values())[1]

'Seattle Home Sales\nData Science Projects worthy of sharing\nThis is a project that I created for the capstone class of the IBM Data Science professional certificate from Coursera in March 2020.  It explores the Seattle housing market and tries to estimate home prices using data on the properties and data about nearby services.\nThe notebooks for this project are split into 3 sequential pieces.\n\nData Wrangling: where I collect and format my dataset\nData Exploration: where I explore, map, and graph correlations within the dataset.\nData Modeling: where I test several linear regression models to determine how best to model the sales data and analyze the results.\n\n'