# Introdução

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [None]:
html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html)

In [None]:
for link in bsObj.findAll("a"):
    if 'href' in link.attrs:
        print(link.attrs['href'])

In [None]:
import re

for link in bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])

# Percorrendo um único domínio

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime, random, re

In [None]:
random.seed(datetime.datetime.now())

In [None]:
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org"+articleUrl)
    bsObj = BeautifulSoup(html)
    return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

In [None]:
links = getLinks("/wiki/Kevin_Bacon")

In [None]:
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)

# Rastreando um site inteiro

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    
    global pages
    
    html = urlopen("http://en.wikipedia.org"+pageUrl)
    bsObj = BeautifulSoup(html)
    
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        
        if 'href' in link.attrs:
            
            if link.attrs['href'] not in pages:
                
                # Encontramos uma página nova
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks("")

# Coletando dados em um site inteiro

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org"+pageUrl)
    bsObj = BeautifulSoup(html)
    
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id = "mw-content-text").findAll("p")[0])
        print(bsObj.find(id = "ca-edit").find("span").find("a").attrs['href'])
    
    except AttributeError:
        
        print("This page is missing somenthing! No worries though!")
    
        
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        
        if 'href' in link.attrs:
            
            if link.attrs['href'] not in pages:
                
                # Encontramos uma página nova
                newPage = link.attrs['href']
                print("--------------\n" + newPage)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks("")

# Rastreando na Internet

In [17]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

In [18]:
#Recupera uma lista de todos os links internos encontrados em uma página
def getInternalLinks(bsObj, includeUrl):
    
    internalLinks = []
    
    # Encontra todos os links que começa com a "/"
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    
    return internalLinks

In [19]:
# Recupera uma lista de todos os links externos encontrados em uma página
def getExernalLinks(bsObj, excludeUrl):
        
    externalLinks = []
    
    # Encontra todos os links que começa com a "http" ou "www" que não contêm url atual
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    
    return externalLinks

In [20]:
def splitAddress(address):
    
    addressParts = address.replace("http://", "").split("/")
    return addressParts

In [21]:
def getRandomExternalLink(startingPage):
    
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html)
    
    externalLinks = getExernalLinks(bsObj, splitAddress(startingPage)[0])
    
    if len(externalLinks) == 0:
        internalLinks = getInternalLinks(startingPage)
        return getNextExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

In [22]:
def followExternalOnly(startingSite):
    
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is:" + externalLink)
    followExternalOnly(externalLink)

In [25]:
followExternalOnly("http://oreilly.com")

Random external link is:https://play.google.com/store/apps/details?id=com.safariflow.queue
Random external link is:https://play.google.com/store/paymentmethods
Random external link is:https://accounts.google.com/signin/usernamerecovery?continue=https%3A%2F%2Fplay.google.com%2Fstore%2Fpaymentmethods&hl=pt
Random external link is:https://accounts.google.com/TOS?loc=BR&hl=pt
Random external link is:https://docs.google.com/spreadsheets/?usp=sheets_alc
Random external link is:https://accounts.google.com/SignUp?service=wise&continue=https%3A%2F%2Fdocs.google.com%2Fspreadsheets%2Fcreate%3Fusp%3Dsheets_alc&ltmpl=sheets
Random external link is:https://accounts.google.com/TOS?loc=BR&hl=pt&privacy=true
Random external link is:https://www.google.com/about/datacenters/inside/locations?hl=pt_BR
Random external link is:https://support.google.com/?hl=pt-BR
Random external link is:https://news.google.com/?tab=un
Random external link is:https://www.blogger.com/?tab=nj


TypeError: getInternalLinks() missing 1 required positional argument: 'includeUrl'

In [28]:
# Coleta uma lista de todos os URLS

allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
    
    html = urlopen(siteUrl)
    bsObj = BeautifulSoup(html)
    
    internalLinks = getInternalLinks(bsObj, splitAddress(siteUrl)[0])
    externalLinks = getExernalLinks(bsObj, splitAddress(siteUrl)[0])
    
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    
    for link in internalLinks:
        if link not in allIntLinks:
            print("About to get link: {}".format(link))
            allIntLinks.add(link)
            getAllExternalLinks(link)
            
getAllExternalLinks("http://www.mrafaelbatista.dev")

https://mrafaelbatista.dev/site/
https://mrafaelbatista.dev/site/index.php/pagina-principal/
https://mrafaelbatista.dev/site/#about-me
https://mrafaelbatista.dev/site/#my-social-medias
https://www.youtube.com/c/messiasbatista
https://medium.com/@mrafaelbatista
https://mrafaelbatista.dev/site/instagram.com/gdgjoaopessoa
https://www.instagram.com/mrafaelbatista.dev
https://twitter.com/mrafaelbatista
https://www.linkedin.com/in/messiasbatista
https://github.com/mrafaelbatista
https://extendthemes.com/go/built-with-highlight/
